In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score


from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import pipeline
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
import seaborn as sns

import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stopwords_nltk = nltk.corpus.stopwords.words('portuguese')

import matplotlib.pyplot as plt

%matplotlib inline

import graphviz
import os


os.environ['PATH'] = os.environ['PATH'] + ';C:/Program Files (x86)/Graphviz2.38/bin'

plt.style.use('bmh')

datapath = '../../Data/Processed/intervencao_eqp.parquet'

### **<span style="color:MediumSlateBlue"> 1.Carga dos Dados </span>**

In [2]:
dataframe = pd.read_parquet(datapath)

print('shape:', dataframe.shape)
print('columns:', dataframe.columns)

shape: (267318, 2)
columns: Index(['CLASSE', 'TEXTO'], dtype='object')


### <span style="color:MediumSlateBlue"> **2. Treino / Teste** </span>

In [3]:
X = dataframe['TEXTO'].values.reshape(-1).tolist()
Y = (dataframe['CLASSE'].values.reshape(-1) == 'POS').tolist()

# Dividir 20% dos dados para o conjunto out-of-sample de teste
Xtrain, Xtest, Ytrain, Ytest = model_selection.train_test_split( X,
                                                                 Y,
                                                                 test_size=0.25,
                                                                 random_state=0,
                                                                 stratify=Y  )

# Objeto de validacao cruzada
cvfold = model_selection.StratifiedKFold(n_splits = 5, random_state = 0, shuffle = True)

### <span style="color:MediumSlateBlue"> **3.Construção do Pipeline** </span>

In [None]:
# Configure Pipeline
model_dict = {
    'RandomForest': ensemble.RandomForestClassifier(n_estimators=10, max_features='sqrt'),
    'LogisticRegr': LogisticRegression(),
    'kNN': KNeighborsClassifier(weights='distance'),
    'SVM': SVC(probability=False, gamma='auto'),
}

model_conf = {
    'RandomForest': {
        'Model__max_depth': [4, 6],
        'Model__n_estimators': [10, 50],
    },
    'LogisticRegr': {
        'Model__C': [0.001, 0.1, 1, 10],
    },
    'kNN': {
        'Model__n_neighbors': [5, 10, 20],
    },
    'SVM': {
        'Model__kernel': ['rbf',],
        'Model__C' : [0.1, 1, 10],
    }
}

scorer = metrics.make_scorer(metrics.f1_score)


model_list = {}

for model_name in model_dict.keys():
    conf_train_pipe = [
        ('TfIDf', TfidfVectorizer(Xtrain,stop_words=stopwords_nltk)),
        ('Model', model_dict[model_name]),
    ]
    
    #cross_val_score
    # Create Pipeline
    model_pipe = pipeline.Pipeline(conf_train_pipe)
    param_grid = model_conf[model_name].copy()
    model_pipe = model_selection.GridSearchCV(model_pipe, param_grid,
                                              scoring=scorer,
                                              #fit_params=None,
                                              cv=cvfold,
                                              return_train_score = True)
    
    

    model_pipe.fit(Xtrain, Ytrain)
    model_list[model_name] = model_pipe    
    
         

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

### <span style="color:MediumSlateBlue"> **4. Curva ROC** </span>

In [None]:
plt.figure(figsize=(6,4))

for model_name in model_dict.keys():
    model = model_list[model_name]
    # estimate Y
    if model_name in ['RandomForest', 'SVM']:
        final_model = LogisticRegressionCV(Cs=20,cv=5)
        if model_name == 'SVM':
            Yhat_train = model.decision_function(Xtrain)
            Yhat = model.decision_function(Xtest)
        else:
            Yhat_train = model.predict_proba(Xtrain)[:,1]
            Yhat = model.predict_proba(Xtest)[:,1]
            
        final_model.fit(Yhat_train.reshape(-1, 1), Ytrain)
        Yhat  = final_model.predict_proba(Yhat.reshape(-1,1))[:,1]
        Ypred = final_model.predict(Yhat.reshape(-1,1))
    else:
        Yhat = model.predict_proba(Xtest)[:,1]
        Ypred = model.predict(Xtest)
    
    
    fpr, tpr, thr = metrics.roc_curve(Ytest, Yhat)
    auc = metrics.roc_auc_score(Ytest, Yhat)
    f1 = metrics.f1_score(Ytest, Ypred)
    label = model_name # + " AUC: %.3f - F1: %.3f"%(auc, f1)
    plt.plot(fpr, tpr, '-', lw=2, label=label)

    
plt.legend()
plt.title('Classificador Vinhos de Alta Qualidade')
plt.grid()
plt.xlabel('Taxa de Falso Alarme')
plt.ylabel('Taxa de Detecção')

# Distribuição de Probabilidades do Modelo 

In [None]:
model_name = 'LogisticRegr'
model = model_list[model_name]
Yhat = model.predict_proba(Xtest)[:,1]

for i in [False, True]:
    # 2 Classes case
    sns.distplot(Yhat[np.array(Ytest) == i], label=['NEG','POS'][i])

plt.title(model_name)
plt.ylabel('Densidade Estimada KDE')
plt.xlabel('Probabilidade Sentimento Positivo')
plt.grid()
plt.legend()

# Matriz de Confusão 

In [None]:
model_name = 'LogisticRegr'


model = model_list[model_name]
Ypred = model.predict(Xtest)

col_names = ['Model ' + s for s in ['Neg','Pos']]
idx_names = ['Real ' + s for s in ['Neg','Pos']]

cmat = metrics.confusion_matrix(Ytest, Ypred)
cmat = pandas.DataFrame(cmat, index=idx_names,
                              columns=col_names)
cmat['Real Total'] = cmat.sum(axis=1)
cmat.loc['Model Total',:] = cmat.sum(axis=0)
cmat = cmat.astype(int)
cmat


In [None]:
print(metrics.classification_report(Ytest, Ypred))

# Coeficientes de Regressão 

In [None]:
plt.figure(figsize=(6,4))

nshow = 10
model_name = 'LogisticRegr'
model = model_list[model_name].best_estimator_.steps[1][1]
vocabulary = model_list[model_name].best_estimator_.steps[0][1].vocabulary_
vocabulary = np.array(list(vocabulary.keys()))

coefs = model.coef_[0]

idx = np.argsort(np.abs(coefs))[-nshow:]

yaxis = np.arange(nshow)


plt.barh(yaxis, coefs[idx])
plt.yticks(yaxis, vocabulary[idx])

plt.title('Coeficientes da Regressão')
plt.xlabel('Coeficiente')


# Explicação pela Árvore de Decisão 

In [None]:
plt.figure(figsize=(6,4))

nshow = 10
model_name = 'RandomForest'
model = model_list[model_name].best_estimator_.steps[1][1]
vocabulary = model_list[model_name].best_estimator_.steps[0][1].vocabulary_
vocabulary = np.array(list(vocabulary.keys()))

coefs = model.feature_importances_

idx = np.argsort(np.abs(coefs))[-nshow:]

yaxis = np.arange(nshow)


plt.barh(yaxis, coefs[idx])
plt.yticks(yaxis, vocabulary[idx])

plt.title(model_name + '  - Importância Features')

plt.xlabel('Importância Relativa')

# Exportação do Resultado 

In [None]:
model_name = 'LogisticRegr'

test_data = [Xtest, Ytest,]
df_cols = ['text', 'class'] 
for model_name in model_dict.keys():
    model = model_list[model_name]
    if model_name in ['RandomForest', 'SVM']:
        final_model = LogisticRegressionCV(Cs=20,cv=5)
        if model_name == 'SVM':
            Yhat_train = model.decision_function(Xtrain)
            Yhat = model.decision_function(Xtest)
        else:
            Yhat_train = model.predict_proba(Xtrain)[:,1]
            Yhat = model.predict_proba(Xtest)[:,1]
            
        final_model.fit(Yhat_train.reshape(-1, 1), Ytrain)
        Yhat  = final_model.predict_proba(Yhat.reshape(-1,1))[:,1]
    else:
        Yhat = model.predict_proba(Xtest)[:,1]

    test_data.append(Yhat)
    df_cols.append(model_name)
        
test_data = np.array(test_data).T
df_test = pd.DataFrame(data=test_data, columns=df_cols)

df_test.to_excel('../../Data/Modeling/results.xlsx')

df_test.head()