In [113]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.svm import SVC
import spacy
import re
import string
from unidecode import unidecode

In [114]:
data = pd.read_csv('final.csv')
data = data.drop('Unnamed: 0', axis=1)
#data['review_rate'] = data['review_rate'].fillna(' ')
#data = data.fillna(data.mean())
data = data.drop_duplicates(subset=['review_body'])

In [115]:
data['review_rate'].value_counts()

50    4012
40    2234
30     514
20     144
10      88
Name: review_rate, dtype: int64

In [116]:
def DATA_BINARIZADOR_RESULTADOS(data):
    RESULTADO_BINARIO = []

    for item in data['review_rate']:
        if item < 40:
            RESULTADO_BINARIO.append(0)
        elif item >= 40:
            RESULTADO_BINARIO.append(1)
        else:
            print("Deu feijoada aqui hein")

    data['RESULTADO_BINARIO'] = RESULTADO_BINARIO
    return data

In [117]:
data = DATA_BINARIZADOR_RESULTADOS(data)
data.head()

Unnamed: 0,review_rate,review_body,RESULTADO_BINARIO
0,10,"Pois bem...as fotos dos pratos, bebidas e doce...",0
1,30,Espero que utilizem essa avaliação para rever ...,0
2,20,"Fomos comer a sobremesa as 20h40, sentamos, pe...",0
3,10,Pedimos o cardápio e ao chamar o atendente ped...,0
4,50,"Fui com algumas amigas em uma segunda-feira, d...",1


In [118]:
def clear_text(text):
    #remove pontuacao, palavras com numeros, deixa o texto em caixa baixa e remove o texto entre colchetes
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = unidecode(text)
    return text

In [119]:
clear = lambda x: clear_text(x)
data_clean = pd.DataFrame(data.review_body.apply(clear))
#data_clean['review_rate'] = data['review_rate'].values
data_clean['RESULTADO_BINARIO'] = data['RESULTADO_BINARIO'].values


In [120]:
X_train, X_test, y_train, y_test = train_test_split(data_clean[['review_body']], data_clean['RESULTADO_BINARIO'], test_size=0.20, random_state=40)

In [121]:
t_vector = TfidfVectorizer()
t_vector.fit(data_clean['review_body'])
train_X = t_vector.transform(X_train['review_body'])
test_X = t_vector.transform(X_test['review_body'])


In [122]:
def validar(model, model_name, y, pred):
    print("Os escores do ", model_name," são:")
    print(' acc = ', accuracy_score(y, pred), ("\n"),
      'prec = ', precision_score(y, pred), ("\n"),
      'recall = ', recall_score(y, pred), ("\n"),
      'f1 = ', f1_score(y, pred))

    fpr, tpr, threshold = roc_curve(y, pred)
    roc_auc = auc(fpr, tpr)
    print(' auc = ', roc_auc)
    return True

<h1>Modelos</h1>

<h2> Regressão Logística </h2>

<h2>Treino</h2>

In [123]:
log = LogisticRegression()
log.fit(train_X, y_train)
log_pred = log.predict(train_X)


print(' acc = ', accuracy_score(y_train, log_pred), ("\n"),
      'prec = ', precision_score(y_train, log_pred), ("\n"),
      'recall = ', recall_score(y_train, log_pred), ("\n"),
      'f1 = ', f1_score(y_train, log_pred))

fpr, tpr, threshold = roc_curve(y_train, log_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)


 acc =  0.9195422849991061 
 prec =  0.9180931345481318 
 recall =  0.9989985980372521 
 f1 =  0.9568386725493957
 auc =  0.6286659656852926


<h2>Teste</h2>

In [124]:
log_pred_test = log.predict(test_X)

print(' acc = ', accuracy_score(y_test, log_pred_test), ("\n"),
      'prec = ', precision_score(y_test, log_pred_test), ("\n"),
      'recall = ', recall_score(y_test, log_pred_test), ("\n"),
      'f1 = ', f1_score(y_test, log_pred_test))

fpr, tpr, threshold = roc_curve(y_test, log_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)


 acc =  0.9056468906361687 
 prec =  0.9082301529497451 
 recall =  0.9952114924181963 
 f1 =  0.9497334348819497
 auc =  0.5660988968940297


<h2>Random Forest otimizando default</h2>

In [125]:
def random_grid(x, labels):
    param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]}, 
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2, 3, 4]},
    ]   
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
    grid_search.fit(x, labels)
    print(grid_search.best_estimator_)
    return grid_search.best_estimator_

model = random_grid(train_X, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


<h2>Treino</h2>

In [126]:
rf_pred = model.predict(train_X)
print(' acc = ', accuracy_score(y_train, rf_pred), ("\n"),
      'prec = ', precision_score(y_train, rf_pred), ("\n"),
      'recall = ', recall_score(y_train, rf_pred), ("\n"),
      'f1 = ', f1_score(y_train, rf_pred))

fpr, tpr, threshold = roc_curve(y_train, rf_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)

 acc =  1.0 
 prec =  1.0 
 recall =  1.0 
 f1 =  1.0
 auc =  1.0


<h2>Teste</h2>

In [127]:
rf_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, rf_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, rf_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, rf_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, rf_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, rf_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.89707 
 prec =  0.89921 
 recall =  0.99681 
 f1 =  0.9455
 auc =  0.51895


<h2>Random Forest: grid seach otimizando acurácia</h2>

In [128]:
def random_grid(x, labels):
    param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]}, 
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2, 3, 4]},
    ]   
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(x, labels)
    print(grid_search.best_estimator_)
    return grid_search.best_estimator_

model = random_grid(train_X, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


<h2>Treino</h2>

In [129]:
rf_pred = model.predict(train_X)
print(' acc = ', accuracy_score(y_train, rf_pred), ("\n"),
      'prec = ', precision_score(y_train, rf_pred), ("\n"),
      'recall = ', recall_score(y_train, rf_pred), ("\n"),
      'f1 = ', f1_score(y_train, rf_pred))

fpr, tpr, threshold = roc_curve(y_train, rf_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)

 acc =  0.9953513320221705 
 prec =  0.9948196851962542 
 recall =  1.0 
 f1 =  0.9974031162604875
 auc =  0.9783333333333333


<h2>Teste</h2>

In [130]:
rf_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, rf_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, rf_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, rf_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, rf_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, rf_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.8935 
 prec =  0.89598 
 recall =  0.99681 
 f1 =  0.94371
 auc =  0.50183


<h2>Random Forest otimizando AUC </h2>

In [131]:
def random_grid(x, labels):
    param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]}, 
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2, 3, 4]},
    ]   
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='roc_auc')
    grid_search.fit(x, labels)
    print(grid_search.best_estimator_)
    return grid_search.best_estimator_

model = random_grid(train_X, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


<h2>Treino</h2>

In [132]:
rf_pred = model.predict(train_X)
print(' acc = ', accuracy_score(y_train, rf_pred), ("\n"),
      'prec = ', precision_score(y_train, rf_pred), ("\n"),
      'recall = ', recall_score(y_train, rf_pred), ("\n"),
      'f1 = ', f1_score(y_train, rf_pred))

fpr, tpr, threshold = roc_curve(y_train, rf_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)

 acc =  0.9992848203111032 
 prec =  0.9991995197118271 
 recall =  1.0 
 f1 =  0.9995995995995995
 auc =  0.9966666666666666


<h2>Teste</h2>

In [133]:
rf_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, rf_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, rf_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, rf_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, rf_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, rf_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.89564 
 prec =  0.89564 
 recall =  1.0 
 f1 =  0.94495
 auc =  0.5


<h2>SVM otimizando default</h2>

In [134]:
def svm_grid(x, labels):
    param_grid = [
    {'kernel': ['rbf', 'sigmoid'],
        'C': [0.1, 1.0, 10.0, 100.0],
        'degree': [1]}
    ]   
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(x, labels)
    print(grid_search.best_estimator_)
    return grid_search.best_estimator_

model_svc = svm_grid(train_X, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


<h2>Treino</h2>

In [135]:
svm_pred = model.predict(train_X)
print(' acc = ', round(accuracy_score(y_train, svm_pred), 5), ("\n"),
      'prec = ', round(precision_score(y_train, svm_pred), 5), ("\n"),
      'recall = ', round(recall_score(y_train, svm_pred), 5), ("\n"),
      'f1 = ', round(f1_score(y_train, svm_pred), 5))

fpr, tpr, threshold = roc_curve(y_train, svm_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.99928 
 prec =  0.9992 
 recall =  1.0 
 f1 =  0.9996
 auc =  0.99667


<h2>Teste</h2>

In [136]:
svm_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, svm_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, svm_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, svm_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, svm_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, svm_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.89564 
 prec =  0.89564 
 recall =  1.0 
 f1 =  0.94495
 auc =  0.5


<h2>SVM otimizando acurácia</h2>

In [137]:
def svm_grid(x, labels):
    param_grid = [
    {'kernel': ['rbf', 'sigmoid'],
        'C': [0.1, 1.0, 10.0, 100.0],
        'degree': [1]}
    ]   
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(x, labels)
    print(grid_search.best_estimator_)
    return grid_search.best_estimator_

model_svc = svm_grid(train_X, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [138]:
SVC().get_params().keys() #os parametros que dá pra alterar

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

<h2>Treino</h2>

In [139]:
svm_pred = model.predict(train_X)
print(' acc = ', round(accuracy_score(y_train, svm_pred), 5), ("\n"),
      'prec = ', round(precision_score(y_train, svm_pred), 5), ("\n"),
      'recall = ', round(recall_score(y_train, svm_pred), 5), ("\n"),
      'f1 = ', round(f1_score(y_train, svm_pred), 5))

fpr, tpr, threshold = roc_curve(y_train, svm_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.99928 
 prec =  0.9992 
 recall =  1.0 
 f1 =  0.9996
 auc =  0.99667


<h2>Teste</h2>

In [140]:
svm_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, svm_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, svm_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, svm_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, svm_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, svm_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.89564 
 prec =  0.89564 
 recall =  1.0 
 f1 =  0.94495
 auc =  0.5


In [141]:
#refazendo os grid search, optando por otimizar a auc

<h2>SVM otimizando AUC</h2>

In [142]:
def svm_grid(x, labels):
    param_grid = [
    {'kernel': ['rbf', 'sigmoid'],
        'C': [0.1, 1.0, 10.0, 100.0],
        'degree': [1]}
    ]   
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='roc_auc')
    grid_search.fit(x, labels)
    print(grid_search.best_estimator_)
    return grid_search.best_estimator_

model_svc = svm_grid(train_X, y_train)

SVC(C=10.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


<h2>Treino</h2>

In [143]:
svm_pred = model.predict(train_X)
print(' acc = ', round(accuracy_score(y_train, svm_pred), 5), ("\n"),
      'prec = ', round(precision_score(y_train, svm_pred), 5), ("\n"),
      'recall = ', round(recall_score(y_train, svm_pred), 5), ("\n"),
      'f1 = ', round(f1_score(y_train, svm_pred), 5))

fpr, tpr, threshold = roc_curve(y_train, svm_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.99928 
 prec =  0.9992 
 recall =  1.0 
 f1 =  0.9996
 auc =  0.99667


<h2>Teste</h2>

In [144]:
svm_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, svm_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, svm_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, svm_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, svm_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, svm_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.89564 
 prec =  0.89564 
 recall =  1.0 
 f1 =  0.94495
 auc =  0.5
