** Análise de Sentimentos – reviews_Clothing_Shoes_and_Jewelry Amazon

In [1]:
import nltk
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from pymongo import MongoClient
from imblearn.under_sampling import RandomUnderSampler 
from collections import Counter
from sklearn import preprocessing

**Criando uma conexão com o MongoDB**

In [2]:
client = MongoClient()
print (client)
db = client.amazon
collection = db.reviews

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)


**Lê arquivo de dados e conta a quantidade de linhas**

In [3]:
def getDF():
    i = 0
    df = {}
    for d in collection.find():
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

dataset = getDF()

dataset.count()

_id               278677
reviewerID        278677
asin              278677
reviewerName      278225
helpful           278677
reviewText        278677
overall           278677
summary           278677
unixReviewTime    278677
reviewTime        278677
classification    278677
dtype: int64

**Conta a quantidade de linhas de reviews neutros, positivos e negativos**

In [4]:
dataset[dataset.classification=='Neutral'].count()

_id               30425
reviewerID        30425
asin              30425
reviewerName      30387
helpful           30425
reviewText        30425
overall           30425
summary           30425
unixReviewTime    30425
reviewTime        30425
classification    30425
dtype: int64

In [5]:
dataset[dataset.classification=='Positive'].count()

_id               221597
reviewerID        221597
asin              221597
reviewerName      221228
helpful           221597
reviewText        221597
overall           221597
summary           221597
unixReviewTime    221597
reviewTime        221597
classification    221597
dtype: int64

In [6]:
dataset[dataset.classification=='Negative'].count()

_id               26655
reviewerID        26655
asin              26655
reviewerName      26610
helpful           26655
reviewText        26655
overall           26655
summary           26655
unixReviewTime    26655
reviewTime        26655
classification    26655
dtype: int64

** Separando reviews e suas classes
**

In [7]:
reviews = dataset['reviewText'].values

classifications = dataset['classification'].values

** Random under-sampling **

In [8]:
print('Original dataset shape {}'.format(Counter(classifications)))

le = preprocessing.LabelEncoder()
le.fit(reviews)

reviews_transformed = le.transform(reviews).reshape(-1, 1)

rus = RandomUnderSampler()
X_res, y_res = rus.fit_sample(reviews_transformed, classifications)

print('Resampled dataset shape {}'.format(Counter(y_res)))

reviews = le.inverse_transform(X_res)
classifications = y_res

Original dataset shape Counter({'Positive': 221597, 'Neutral': 30425, 'Negative': 26655})
Resampled dataset shape Counter({'Negative': 26655, 'Neutral': 26655, 'Positive': 26655})


** Pre-Processamento **

In [9]:
def PreprocessamentoSemStopWords(instancia):
    # remove links, pontos, virgulas, ponto e virgulas dos reviews
    # coloca tudo em minusculo
    # remove stopwords (palavras muito frequentes)
    instancia = re.sub(r"http\S+", "", instancia).lower().replace(',','').replace('.','').replace(';','').replace('-','')
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palavras = [i for i in instancia.split() if not i in stopwords]
    return (" ".join(palavras))

In [10]:
# Stemming reduz o termo ao seu radical, removendo afixos e vogais temáticas
def Stemming(instancia):
    stemmer = nltk.stem.RSLPStemmer()
    palavras=[]
    for w in instancia.split():
        palavras.append(stemmer.stem(w))
    return (" ".join(palavras))    

In [11]:
instancia = "My 3yr old loved this tutu skirt in pink! Was hoping to order more in different colors.She had hardly used this,the stitching came apart in 2weeks.now it's lying in her closet..Altogether she wore it like 4-5 times for 20 mins or so.wish the stitching was of better quality to hold up while little ones wear it. Can't recommend."

instancia = PreprocessamentoSemStopWords(instancia)

print(instancia)

Stemming(instancia)

3yr old loved tutu skirt pink! hoping order different colorsshe hardly used thisthe stitching came apart 2weeksnow it's lying closetaltogether wore like 45 times 20 mins sowish stitching better quality hold little ones wear can't recommend


"3yr old loved tutu skirt pink! hoping ord different colorssh hardly used thisth stitching cam apart 2weeksnow it' lying closetaltogeth wor lik 45 tim 20 mim sowish stitching bett quality hold littl one we can't recommend"

In [12]:
i = 0

for r in reviews:
    text = ''.join(map(str, r));
    text = PreprocessamentoSemStopWords(text)
    reviews[i] = Stemming(text)
    i += 1

**Treina o modelo usando o algoritmo Naive Bayes Multinomial**

In [19]:
vectorizer = CountVectorizer(analyzer="word")
freq_reviews = vectorizer.fit_transform(reviews.ravel())
modelo = MultinomialNB()
modelo.fit(freq_reviews, classifications)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

** Testando o modelo com algumas instâncias simples**

In [20]:
# Definindo instâncias de teste dentro de uma lista

#Neutral
#Positive
#Negative
#Neutral

testes = ["My 3yr old loved this tutu skirt in pink! Was hoping to order more in different colors.She had hardly used this,the stitching came apart in 2weeks.now it's lying in her closet..Altogether she wore it like 4-5 times for 20 mins or so.wish the stitching was of better quality to hold up while little ones wear it. Can't recommend.",
         "What can I say... my daughters have it in orange, black, white and pink and I am thinking to buy for they the fuccia one. It is a very good way for exalt a dancer outfit: great colors, comfortable, looks great, easy to wear, durables and little girls love it. I think it is a great buy for costumer and play too.",
        "Never GOT this item - but gave a 1 STAR because the replies from the SUPPLIER was GREAT.They tried to send the item more than once.My $ was refunded in a timely manner too.It was a shame I never got it for my daughter - it would of looked great with her OUTFIT for Dr. Seuss WEEK at school.Most original.Maybe next time.", 
       "I already own this particular Shining Image jewelry box in brown, so this was my second buy. It arrived with some of the leather scratched, even though it looked like I was the first one opening the box. This particular color pink also looked pretty bad in person. The quality of this box seemed lesser than the brown one I own.I returned this pink case for a refund without a problem.  Got another brown Shining Image jewelry case and it's fine!"]


In [21]:
i = 0

for t in testes:
    result = PreprocessamentoSemStopWords(t)
    testes[i] = Stemming(result)
    i += 1

In [22]:
freq_testes = vectorizer.transform(testes)

In [23]:
# Fazendo a classificação com o modelo treinado.

In [24]:
modelo.predict(freq_testes)

array(['Positive', 'Positive', 'Negative', 'Negative'],
      dtype='<U8')

** Avaliando o modelo **

In [25]:
# Fazendo o cross validation do modelo

In [26]:
resultados = cross_val_predict(modelo, freq_reviews, classifications, cv=10)

In [27]:
# Medindo a acurácia média do modelo

In [28]:
metrics.accuracy_score(classifications,resultados)

0.64638279247170638

In [29]:
# Medidas de validação do modelo

In [30]:
sentimento=['Positive','Negative','Neutral']
print (metrics.classification_report(classifications,resultados,sentimento))

             precision    recall  f1-score   support

   Positive       0.74      0.75      0.75     26655
   Negative       0.68      0.60      0.64     26655
    Neutral       0.53      0.58      0.56     26655

avg / total       0.65      0.65      0.65     79965



In [31]:
# Matriz de confusão

In [32]:
print(pd.crosstab(classifications, resultados, rownames=['True'], colnames=['Predicted'], margins=True))


Predicted  Negative  Neutral  Positive    All
True                                         
Negative      16092     8492      2071  26655
Neutral        6272    15496      4887  26655
Positive       1453     5102     20100  26655
All           23817    29090     27058  79965


** Melhorando resultados com Bigrams**

In [33]:
vectorizer = CountVectorizer(ngram_range=(1,2))
freq_reviews = vectorizer.fit_transform(reviews.ravel())
modelo = MultinomialNB()
modelo.fit(freq_reviews,classifications)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
resultados = cross_val_predict(modelo, freq_reviews, classifications, cv=10)

In [35]:
metrics.accuracy_score(classifications,resultados)

0.66263990495841929

In [36]:
sentimento=['Positive','Negative','Neutral']
print (metrics.classification_report(classifications,resultados,sentimento))

             precision    recall  f1-score   support

   Positive       0.77      0.76      0.77     26655
   Negative       0.70      0.62      0.65     26655
    Neutral       0.54      0.61      0.57     26655

avg / total       0.67      0.66      0.66     79965



In [37]:
print (pd.crosstab(classifications, resultados, rownames=['Real'], colnames=['Predito'], margins=True))

Predito   Negative  Neutral  Positive    All
Real                                        
Negative     16405     8577      1673  26655
Neutral       6053    16267      4335  26655
Positive      1106     5233     20316  26655
All          23564    30077     26324  79965
