In [0]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.svm import SVC
import spacy
import re
import string

In [0]:
!pip install unidecode #para tirar os acentos, dado que tem gente que usa e outros que não



In [0]:
from unidecode import unidecode

In [0]:
data = pd.read_csv('/content/drive/My Drive/BERT/CSV_FINAL_EQUALIZADO.csv')
data = data.drop('Unnamed: 0', axis=1)
data = data.drop_duplicates(subset=['review_body'])

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
data['review_rate'].value_counts()

50    11580
30    10558
40     4847
20     3335
10     2534
Name: review_rate, dtype: int64

In [0]:
data.head()

Unnamed: 0,review_rate,review_body,RESULTADO_BINARIO
0,50,Excelente experiência !! Atendimento impecável...,1
1,30,"Local agradável, porém, já fui em melhores, co...",0
2,50,Esta é a “milésima” vez que venho a este resta...,1
3,40,"O local, o atendimento e a comida, sensacionai...",1
4,50,"Comida espetacular, atendimento ímpar! Frank e...",1


In [0]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.snowball import PortugueseStemmer

def clear_text(text):
    #remove pontuacao, palavras com numeros, deixa o texto em caixa baixa e remove o texto entre colchetes
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = unidecode(text)  
    stemizador = PortugueseStemmer()
    #removendo as stopwords 
    text = " ".join([word for word in text.split()
                  if word not in stopwords.words('portuguese')])
    text = stemizador.stem(text)
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
clear = lambda x: clear_text(x)
data_clean = pd.DataFrame(data.review_body.apply(clear))
#data_clean['review_rate'] = data['review_rate'].values
data_clean['RESULTADO_BINARIO'] = data['RESULTADO_BINARIO'].values


In [0]:
print(data['review_body'][12])
data_clean['review_body'][12]

A melhor pedida para um feriadinho são os drinks sofisticados servidos na Adega. Obrigada Abel, Wellington e Antonio: O atendimento de vocês é de excelência!


'melhor pedida feriadinho sao drinks sofisticados servidos adega obrigada abel wellington antonio atendimento voces excelenc'

In [0]:
X_train, X_test, y_train, y_test = train_test_split(data_clean[['review_body']], data_clean['RESULTADO_BINARIO'], test_size=0.20, random_state=2020)

In [0]:
t_vector = TfidfVectorizer()
t_vector.fit(data_clean['review_body'])
train_X = t_vector.transform(X_train['review_body']) #Tf-idf-weighted document-term matriz
test_X = t_vector.transform(X_test['review_body'])


In [0]:
def validar(model, model_name, y, pred):
    print("Os escores do ", model_name," são:")
    print(' acc = ', accuracy_score(y, pred), ("\n"),
      'prec = ', precision_score(y, pred), ("\n"),
      'recall = ', recall_score(y, pred), ("\n"),
      'f1 = ', f1_score(y, pred))

    fpr, tpr, threshold = roc_curve(y, pred)
    roc_auc = auc(fpr, tpr)
    print(' auc = ', roc_auc)
    return True

<h1>Modelos</h1>

<h2> Regressão Logística </h2>

<h2>Treino</h2>

In [0]:
log = LogisticRegression()
log.fit(train_X, y_train)
log_pred = log.predict(train_X)


print(' acc = ', accuracy_score(y_train, log_pred), ("\n"),
      'prec = ', precision_score(y_train, log_pred), ("\n"),
      'recall = ', recall_score(y_train, log_pred), ("\n"),
      'f1 = ', f1_score(y_train, log_pred))

fpr, tpr, threshold = roc_curve(y_train, log_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)


 acc =  0.915040140014458 
 prec =  0.9169282597044637 
 recall =  0.9125266686985675 
 f1 =  0.9147221691808287
 auc =  0.9150367973783893


<h2>Teste</h2>

In [0]:
log_pred_test = log.predict(test_X)

print(' acc = ', accuracy_score(y_test, log_pred_test), ("\n"),
      'prec = ', precision_score(y_test, log_pred_test), ("\n"),
      'recall = ', recall_score(y_test, log_pred_test), ("\n"),
      'f1 = ', f1_score(y_test, log_pred_test))

fpr, tpr, threshold = roc_curve(y_test, log_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)


 acc =  0.870187186120834 
 prec =  0.873020706455542 
 recall =  0.8679987889797154 
 f1 =  0.8705025049339608
 auc =  0.8701989048937745


<h2>Random Forest</h2>

In [0]:
def random_grid(x, labels):
    param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]}, 
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2, 3, 4]},
    ]   
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
    grid_search.fit(x, labels)
    print(grid_search.best_estimator_)
    return grid_search.best_estimator_

model = random_grid(train_X, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


<h2>Treino</h2>

In [0]:
rf_pred = model.predict(train_X)
print(' acc = ', accuracy_score(y_train, rf_pred), ("\n"),
      'prec = ', precision_score(y_train, rf_pred), ("\n"),
      'recall = ', recall_score(y_train, rf_pred), ("\n"),
      'f1 = ', f1_score(y_train, rf_pred))

fpr, tpr, threshold = roc_curve(y_train, rf_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', roc_auc)

 acc =  0.9999239051858616 
 prec =  0.9999238037183785 
 recall =  0.9999238037183785 
 f1 =  0.9999238037183785
 auc =  0.9999239050509211


<h2>Teste</h2>

In [0]:
rf_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, rf_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, rf_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, rf_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, rf_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, rf_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

 acc =  0.81844 
 prec =  0.82322 
 recall =  0.8135 
 f1 =  0.81833
 auc =  0.81847


<h2>SVM Polinomial</h2>

In [0]:
model = SVC(kernel='poly', C=100000)
model.fit(train_X, y_train)

<h2>Treino</h2>

In [0]:
svm_pred = model.predict(train_X)
print(' acc = ', round(accuracy_score(y_train, svm_pred), 5), ("\n"),
      'prec = ', round(precision_score(y_train, svm_pred), 5), ("\n"),
      'recall = ', round(recall_score(y_train, svm_pred), 5), ("\n"),
      'f1 = ', round(f1_score(y_train, svm_pred), 5))

fpr, tpr, threshold = roc_curve(y_train, svm_pred)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))

<h2>Teste</h2>

In [0]:
svm_pred_test = model.predict(test_X)
print(' acc = ', round(accuracy_score(y_test, svm_pred_test), 5), ("\n"),
      'prec = ', round(precision_score(y_test, svm_pred_test), 5), ("\n"),
      'recall = ', round(recall_score(y_test, svm_pred_test), 5), ("\n"),
      'f1 = ', round(f1_score(y_test, svm_pred_test), 5))

fpr, tpr, threshold = roc_curve(y_test, svm_pred_test)
roc_auc = auc(fpr, tpr)
print(' auc = ', round(roc_auc, 5))