In [185]:
import nltk
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import string
import json
%matplotlib inline

#### Stopwords are words which are filtered out before or after processing of natural language data, usually don't affect meaning

In [186]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tiagoapolo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [187]:
nltk.download('rslp')

[nltk_data] Downloading package rslp to /Users/tiagoapolo/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [188]:
with open('data/cleaned_base_flat.json') as json_file:
  json_comp = json.load(json_file)

In [189]:
json_df = pd.DataFrame(json_comp)

In [190]:
json_df.head()

Unnamed: 0,link,stars,text,sentiment
0,https://produto.mercadolivre.com.br/MLB-151998...,5,Produto totalmente original .!,positive
1,https://produto.mercadolivre.com.br/MLB-151998...,5,Excelente,positive
2,https://produto.mercadolivre.com.br/MLB-151998...,1,O cheiro não permanece!,negative
3,https://produto.mercadolivre.com.br/MLB-151998...,5,Ta bom,positive
4,https://produto.mercadolivre.com.br/MLB-151998...,2,Não comprem.,negative


In [191]:
db = []
for data in json_comp:
  db.append((data['text'], data['sentiment']))

In [208]:
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('portuguese'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

#### Stemming is the process of reducing inflected (or sometimes derived) words to their word stem

In [209]:
def stemmer_treatment(texto):
  stemmer = nltk.stem.RSLPStemmer()
  frases_sem_stemming = []
  for (palavras, sentimento) in texto:
    tokens_list = clean_doc(palavras)
    com_stemming = [str(stemmer.stem(p)) for p in tokens_list]
    frases_sem_stemming.append((com_stemming, sentimento))
  return frases_sem_stemming

In [210]:
def busca_palavras(frases):
  all = []
  for (palavras, sentimento) in frases:
    all.extend(palavras)
  return all

In [211]:
def busca_freq(palavras):
  return nltk.FreqDist(palavras)

In [212]:
def busca_palavras_unicas(frequencia):
  freq = frequencia.keys()
  return freq

#### Return document's unique words

In [248]:
def extrator_palavras_treino(documento):
  doc = set(documento)
  caract = {}
  for palavras in unique_treino:
    caract['%s' % palavras] = (palavras in doc)
  return caract

In [249]:
def extrator_palavras_teste(documento):
  doc = set(documento)
  caract = {}
  for palavras in unique_teste:
    caract['%s' % palavras] = (palavras in doc)
  return caract

#### Check database labeling for balace check

In [250]:
db_frame = pd.DataFrame(stemmer_treatment(db))
db_frame.columns = ['text', 'sentiment']

In [251]:
print('-- Classes ratio --')
print((db_frame.sentiment.value_counts() / db_frame.shape[0]) * 100)

-- Classes ratio --
positive    78.158295
negative    13.318113
neutral      8.523592
Name: sentiment, dtype: float64


#### Split traning in holdout

In [252]:
X = db_frame['text']
y = db_frame['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

In [253]:
base_palavras = busca_palavras(stemmer_treatment(db))

treino_palavras = np.concatenate([p for p in X_train])
teste_palavras = np.concatenate([p for p in X_test])

print('Quantidade de palavras na base: {}'.format(pd.DataFrame(base_palavras).count()))
print('Quantidade de palavras para treino: {}'.format(pd.DataFrame(treino_palavras).count()))
print('Quantidade de palavras para teste: {}'.format(pd.DataFrame(teste_palavras).count()))

Quantidade de palavras na base: 0    2811
dtype: int64
Quantidade de palavras para treino: 0    1389
dtype: int64
Quantidade de palavras para teste: 0    1422
dtype: int64


#### Gets word frequency on both test and training lists

In [254]:
freq_treino = busca_freq(treino_palavras)
freq_teste = busca_freq(teste_palavras)

#### Gets unique words

In [255]:
unique_treino = busca_palavras_unicas(freq_treino)
unique_teste = busca_palavras_unicas(freq_teste)

#### Merged features and labels

In [256]:
base_train_merged = [(X_train.get(key), y_train.get(key)) for key in X_train.keys()]
base_teste_merged = [(X_test.get(key), y_test.get(key)) for key in X_test.keys()]

#### The primary purpose of this function is to avoid the memory overhead involved in storing all the featuresets for every token in a corpus

In [257]:
base_treino_features = nltk.classify.apply_features(extrator_palavras_treino, base_train_merged)
base_teste_features = nltk.classify.apply_features(extrator_palavras_treino, base_teste_merged)

#### Naive Bayes

In [258]:
model = nltk.NaiveBayesClassifier.train(base_treino_features)

#### Labels

In [259]:
model.labels()

['positive', 'negative', 'neutral']

In [260]:
model.show_most_informative_features(10)

Most Informative Features
                     mei = True           neutra : positi =     21.1 : 1.0
                     bom = True           neutra : negati =     18.2 : 1.0
                    frac = True           negati : positi =     17.6 : 1.0
                benefici = True           neutra : positi =     15.1 : 1.0
                     não = True           negati : positi =     12.5 : 1.0
                     pac = True           neutra : positi =      9.0 : 1.0
                    mais = True           neutra : positi =      9.0 : 1.0
                  rabann = True           neutra : positi =      9.0 : 1.0
                     fix = True           negati : positi =      8.6 : 1.0
                   excel = True           positi : neutra =      8.2 : 1.0


In [269]:
nova_frase = 'produto nao sofisticado'

testeStem = []

stemmer = nltk.stem.RSLPStemmer()
for (p_train) in nova_frase.split():
  st = [p for p in p_train.split()]
  testeStem.append(str(stemmer.stem(st[0])))

words_extracted = extrator_palavras_treino(testeStem)
distrib = model.prob_classify(words_extracted)

for classe in distrib.samples():
  print('%s: %f' % (classe, distrib.prob(classe)))

positive: 0.466620
negative: 0.529191
neutral: 0.004189
