In [3]:
import requests
import json
import time
import tweepy
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
%matplotlib inline

In [2]:
df = pd.read_csv('imdb-reviews-pt-br.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49459 entries, 0 to 49458
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         49459 non-null  int64 
 1   text_en    49459 non-null  object
 2   text_pt    49459 non-null  object
 3   sentiment  49459 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


In [4]:
df.sentiment.value_counts()

neg    24765
pos    24694
Name: sentiment, dtype: int64

In [5]:
# mapeando as classes de resposta para valores numéricos
df["sentiment_int"] = df["sentiment"].map({"pos": 0, "neg": 1})

In [6]:
# identificando valores nulos
df.isnull().sum()
#df.dropna(inplace=True)

id               0
text_en          0
text_pt          0
sentiment        0
sentiment_int    0
dtype: int64

In [7]:
import spacy
nlp = spacy.load('pt_core_news_sm')

2021-09-29 22:19:24.506584: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-29 22:19:24.506602: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [8]:
nltk.download('stopwords')
nltk.download('rslp')

[nltk_data] Downloading package stopwords to /home/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/thiago/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [8]:
stop_words = nltk.corpus.stopwords.words('portuguese')
#stop_words = nlp.Defaults.stop_words

In [10]:
# determinando forma básica (lema) das palavras
def lemmatizer(text):
  sent = []
  doc = nlp(text)
  for word in doc:
    if word.pos_ == "VERB":
      sent.append(word.lemma_)
    else:
      sent.append(word.orth_)
  return " ".join(sent)

In [None]:
# padronizando texto com letras minusculas, acentuação e lematização
from unidecode import unidecode
df['text'] = df.text_pt.apply(str.lower)
df['text'] = df.text.apply(unidecode)
df['text'] = df.text.apply(lemmatizer)

In [None]:
# salvando pré processamento para reutilização
df.to_csv('imdb_pre_processing.csv')

In [4]:
# reutilizando base com pré processamento
df2 = pd.read_csv('imdb_pre_processing.csv')

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [9]:
# unigrama CV
vect_uni_cv = CountVectorizer(ngram_range=(1,1), stop_words=stop_words)
vect_uni_cv.fit(df2.text)
text_vect_uni_cv = vect_uni_cv.transform(df2.text)

# separando base de treinamento x teste do Unigrama com Count Vectorizer - UCV
X_trainUCV, X_testUCV, y_trainUCV, y_testUCV = train_test_split(text_vect_uni_cv, df2["sentiment_int"], test_size = 0.2, random_state = 123)

# unigrama IDF
vect_uni_idf = TfidfVectorizer(ngram_range=(1,1), use_idf=True, norm='l2', stop_words=stop_words)
vect_uni_idf.fit(df2.text)
text_vect_uni_idf = vect_uni_idf.transform(df2.text)

# separando base de treinamento x teste do Unigrama com Tfidf Vectorizer - UIDF
X_trainUIDF, X_testUIDF, y_trainUIDF, y_testUIDF = train_test_split(text_vect_uni_idf, df2["sentiment_int"], test_size = 0.2, random_state = 123)

In [7]:
# bigrama CV
vect_bi_cf = CountVectorizer(ngram_range=(2,2), stop_words=stop_words)
vect_bi_cf.fit(df2.text)
text_vect_bi_cf = vect_bi_cf.transform(df2.text)

# separando base de treinamento x teste do Bigrama com Count Vectorizer - BCV
X_trainBCV, X_testBCV, y_trainBCV, y_testBCV = train_test_split(text_vect_bi_cf, df2["sentiment_int"], test_size = 0.2, random_state = 123)

# bigrama IDF
vect_bi_idf = TfidfVectorizer(ngram_range=(2,2), use_idf=True, norm='l2', stop_words=stop_words)
vect_bi_idf.fit(df2.text)
text_vect_bi_idf = vect_bi_idf.transform(df2.text)

# separando base de treinamento x teste do Bigrama com Tfidf Vectorizer - BIDF
X_trainBIDF, X_testBIDF, y_trainBIDF, y_testBIDF = train_test_split(text_vect_bi_idf, df2["sentiment_int"], test_size = 0.2, random_state = 123)

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [9]:
from sklearn.naive_bayes import MultinomialNB

In [10]:
# unigrama
nbUCV = MultinomialNB().fit(X_trainUCV, y_trainUCV)
nbUIDF = MultinomialNB().fit(X_trainUIDF, y_trainUIDF)

In [11]:
# bigrama
nbBCV = MultinomialNB().fit(X_trainBCV, y_trainBCV)
nbBIDF = MultinomialNB().fit(X_trainBIDF, y_trainBIDF)

In [12]:
from nltk.tokenize import word_tokenize as tokenizador

In [13]:
from sklearn.linear_model import LogisticRegression
lr_UIDF = LogisticRegression()
lr_UIDF.fit(X_trainUIDF, y_trainUIDF)

LogisticRegression()

In [14]:
y_prediction_lrUIDF = lr_UIDF.predict(X_testUIDF)
accuracy_lrUIDF = accuracy_score(y_prediction_lrUIDF, y_testUIDF)
print("Acurácia do Unigrama Regressão Logística: ", accuracy_lrUIDF)

Acurácia do Unigrama Regressão Logística:  0.8883946623534169


In [15]:
lr_UIDF_grid = LogisticRegression()
lr_UIDF_grid_values = { 'penalty': ['l1', 'l2'], 'C': [0.001,.009,0.01,.09,1,5,10,25] }

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

In [17]:
lr_UIDF_grid_cv = GridSearchCV(lr_UIDF_grid, param_grid=lr_UIDF_grid_values, scoring='accuracy')
lr_UIDF_grid_cv.fit(X_trainUIDF, y_trainUIDF)

Traceback (most recent call last):
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_

Traceback (most recent call last):
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_

Traceback (most recent call last):
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/thiago/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
        nan 0.85447461        nan 0.88781042        nan 0.89332009
        nan 0.89129817        nan 0.88861918]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.009, 0.01, 0.09, 1, 5, 10, 25],
                         'penalty': ['l1', 'l2']},
             scoring='accuracy')

In [18]:
y_prediction_lr_grid_UIDF = lr_UIDF_grid_cv.predict(X_testUIDF)
accuracy_lr_grid_UIDF = accuracy_score(y_prediction_lr_grid_UIDF, y_testUIDF)
print("Acurácia do Unigrama Regressão Logística com Grid Search: ", accuracy_lr_grid_UIDF)

Acurácia do Unigrama Regressão Logística com Grid Search:  0.8955721795390215


In [19]:
testes = ['J conseguiu ser pior que o firmino',
          'Sandra Bullock é impecável em tudo o que faz',
          'Que zagueiraço é Van Dijk!! Quem fala que ele só teve uma temporada não sabe NADA de futebol. Já chegaram a compará-lo com Kimpembe, Rudiger, Varane etc. Não tem comparação!! Depois de Sérgio Ramos, Van Dijk é DISPARADO o melhor do mundo!',
          'Obg Atalanta por fazer eu perder 3 mil time de fdp do crl tô cheio de ódio',
          'florence linda te amo fada mãe']

In [22]:
# Instancia o objeto que faz a vetorização dos dados de texto:
vectorizer = CountVectorizer(analyzer="word", tokenizer=tokenizador)

In [23]:
texto = df2['text_pt']

In [25]:
# Aplica o vetorizador nos dados de texto e retorna uma matriz esparsa ( contendo vários zeros):
freq_textos = vectorizer.fit_transform(texto)

scipy.sparse.csr.csr_matrix

In [26]:
freq_testes = vectorizer.transform(testes)

In [35]:
freq_testes

<5x163723 sparse matrix of type '<class 'numpy.int64'>'
	with 67 stored elements in Compressed Sparse Row format>

In [39]:
# 1= negativo
# 0= positivo
for t, c in zip (testes,nbBIDF.predict(freq_testes)):
    print (t)
    print (c)

ValueError: dimension mismatch