## Bibliotecas

In [0]:
#===============================================
# APIs necessárias para instalar no Colab
#===============================================

#!pip install neuralcoref
#!pip install scikit-multilearn
#!pip install gensim
#!pip install spacy==2.1.0
#!python -m spacy download en
#!python -m spacy download en_core_web_lg

In [0]:
#===============================================
# Importando as bibliotecas
#===============================================
import xml.etree.ElementTree as ET
import os
import pandas as pd
import numpy as np
import pickle
import re
import matplotlib.pyplot as plt
import gensim
import spacy
import en_core_web_lg
import neuralcoref

from seg.newline.segmenter import NewLineSegmenter
from spacy import displacy
from collections import Counter, defaultdict
from google.colab import files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import LabelPowerset
from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize, word_tokenize 
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec

## Construção do modelo

In [0]:
# Upload the project files through the google.colab `files.upload()` helper.
# The returned value is kept in `arquivo`.
arquivo = files.upload()

In [0]:
# Load the pretrained English spaCy model (en_core_web_lg) and add neuralcoref
# to its pipeline; replace_pronouns reads doc._.coref_resolved from this `nlp`.
# NOTE(review): a later cell rebinds `nlp` to spacy.blank('en') — confirm which
# pipeline the sentiment-analysis functions are meant to use.
nlp = en_core_web_lg.load()
neuralcoref.add_to_pipe(nlp)

In [0]:
# Load the Opinion Lexicon word lists (one entry per line, ISO-8859-1 encoded).
# Context managers guarantee both file handles are closed once they are read.
with open("neg_words.txt", encoding = "ISO-8859-1") as neg_file:
    neg = [line.strip() for line in neg_file]
with open("pos_words.txt", encoding = "ISO-8859-1") as pos_file:
    pos = [line.strip() for line in pos_file]

# Combined vocabulary used to decide whether a token is an opinion word at all
opinion_words = neg + pos

In [0]:
# Parse the training dataset XML and keep its root element;
# the next cell iterates over the root's "sentence" elements.
tree = ET.parse("Laptops_Train.xml")
root = tree.getroot()

In [34]:
# Collect, for every review sentence, its text and the aspect terms labelled on it
labeled_reviews = []
for sentence in root.findall("sentence"):
    aspect_terms = sentence.find("aspectTerms")

    # Compare against None explicitly: find() returns None when the element is
    # absent, whereas an Element's truth value only reflects whether it has
    # children (and relying on it is deprecated).
    if aspect_terms is None:
        aspects = []
    else:
        aspects = [aterm.get("term") for aterm in aspect_terms.findall("aspectTerm")]

    # The review text is read from the first child of <sentence>
    # (presumably the <text> element — verify against the dataset schema).
    labeled_reviews.append({"text": sentence[0].text, "aspects": aspects})

labeled_df = pd.DataFrame(labeled_reviews)
print("Existem", len(labeled_reviews), "reviews no conjunto de treinamento.")

Existem 3048 reviews no conjunto de treinamento.


In [35]:
# Select the positions of every review that has no labelled aspect, or whose
# aspects do not include any of the most relevant aspects listed in `out`.
out = ['screen', 'price', 'keyboard', 'use', 'features', 'programs', 'Photobooth', 'processor', 'battery', 'software', 'hardware']

# Iterate positionally: `x` is consumed through labeled_df.index[x], so the
# positions must not depend on the index labels still being 0..n-1.
# any() over an empty aspect list is False, so unlabelled reviews are selected too.
x = [
    position
    for position, review_aspects in enumerate(labeled_df.aspects)
    if not any(aspect in out for aspect in review_aspects)
]
print("Existem", len(x), "reviews selecionados para exclusão.")

Existem 2711 reviews selecionados para exclusão.


In [0]:
#x = []
#for i in range (len(labeled_df)):
#    test1 = True
#    if (labeled_df.aspects[i] == []):
#        x.append(i)

In [0]:
#out = ['screen', 'price', 'keyboard', 'use', 'features', 'programs', 'Photobooth',
# 'screen', 'processor', 'battery', 'software', 'hardware']
#"scree" not in out
#len(x)

In [36]:
# Drop the rows selected in `x`: reviews with no labelled aspect and reviews
# whose aspects are all outside the list of relevant aspects.
# Note: this rebinds labeled_df, so the cell is not idempotent on re-run.
labeled_df = labeled_df.drop(index=labeled_df.index[x])
print("O tamanho final do conjunto de treinamento é", len(labeled_df))

O tamanho final do conjunto de treinamento é 337


In [0]:
#asp = labeled_df['aspects'].apply(lambda x: ' '.join(sorted([i for i in x])) if x != [] else '')

In [0]:
# Show how many times each aspect term occurs across the remaining reviews
list_asp = labeled_df['aspects'].apply(lambda terms: '' if terms == [] else list(terms))
Counter(term for review_terms in list_asp for term in review_terms).most_common()

In [0]:
#unique_aspect = []
#for i in labeled_df['aspects']:
#    for f in range (len(i)):
#        if (not(i[f] in unique_aspect)):    
#            unique_aspect.append(i[f])

In [0]:
# Render a word cloud of the review texts (stop words removed first)
text = " ".join(labeled_df['text'])
text = remove_stopwords(text)
wordcloud = WordCloud(background_color="white").generate(text)

# Explicit figure/axes instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(20, 30))
ax.imshow(wordcloud, interpolation='bilinear')
# Title spelling corrected ("Nuvem" carries no accent in Portuguese)
ax.set_title('Nuvem de palavras da lista de reviews')
ax.axis("off")
plt.show()

In [0]:
# Persist the filtered, annotated reviews to a pickle file and preview the first rows
labeled_df.to_pickle("annotated_reviews_df.pkl")
labeled_df.head()

In [0]:
# Pronoun replacement helper
def replace_pronouns(text):
    """Return `text` as rewritten by the pipeline's coreference resolution.

    Runs the module-level `nlp` pipeline on `text` and returns the
    `coref_resolved` extension attribute of the resulting document.
    """
    return nlp(text)._.coref_resolved

In [0]:
# Reload the annotated reviews saved by the earlier cell
annotated_reviews_df = pd.read_pickle("annotated_reviews_df.pkl")

# Add a column holding each review text after pronoun replacement
annotated_reviews_df["text_pro"] = annotated_reviews_df["text"].apply(replace_pronouns)

# Store the extended dataframe in a second pickle
annotated_reviews_df.to_pickle("annotated_reviews_df2.pkl")

# Preview the first entries of the new column
annotated_reviews_df["text_pro"].head(5)

In [0]:
# Build the model inputs (pronoun-resolved text) and the binarized multi-label targets
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(annotated_reviews_df.aspects)
X = annotated_reviews_df.text_pro

# Hold out 25% of the reviews for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Persist the fitted binarizer; the context manager closes the file handle
filename = 'mlb.pkl'
with open(filename, 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)

In [0]:
# Build a multinomial naive Bayes classifier: unigram counts with English stop
# words removed -> term-frequency weighting (IDF disabled) -> LabelPowerset
# wrapper around MultinomialNB.
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Accuracy on the held-out set.
# NOTE(review): this averages element-wise matches of the label indicator
# matrices, which is presumably per-label accuracy rather than exact-match
# accuracy — confirm that this is the intended metric.
np.mean(predicted == y_test)

In [0]:
# Evaluate a linear SVM alternative: counts -> TF-IDF -> LabelPowerset wrapper
# around an SGDClassifier with hinge loss and L2 penalty.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', LabelPowerset(
                             SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=6, random_state=42)))])
_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

# Accuracy on the held-out set, computed the same way as for the naive Bayes model
np.mean(predicted_svm == y_test)

In [0]:
# Retrain the naive Bayes pipeline on the whole dataset (train + test).
# Note: this rebinds text_clf, replacing the model fitted on the training split.
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X, y)

# Save the model for the sentiment-analysis stage (classify_and_sent loads this
# file); the context manager closes the file handle after writing.
filename = 'naive_model1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

## Análise de sentimentos

In [0]:
#-------------------------------------------------------------
# Load a word embedding trained on the Google News dataset
#-------------------------------------------------------------
# The location can be overridden through the GOOGLE_NEWS_VECTORS environment
# variable; the original local path is kept as the default.
google_news_path = os.environ.get(
    'GOOGLE_NEWS_VECTORS',
    'C:/Users/rpga0/Documents/GoogleNews-vectors-negative300.bin')

# Load the Google News vectors in gensim
model = gensim.models.KeyedVectors.load_word2vec_format(google_news_path, binary=True)

# Init a blank English spaCy nlp object.
# NOTE(review): this rebinds `nlp`, which an earlier cell bound to
# en_core_web_lg with neuralcoref; every function that calls nlp(...) uses this
# blank pipeline once this cell has run — confirm that is intended.
nlp = spacy.blank('en')

# Words in the same order as the rows of the embedding matrix. Taken from the
# model itself instead of looping over a hard-coded count of 3,000,000 entries.
keys = list(model.index2word)

# The vectors are attached to nlp.vocab in the following cell.

In [0]:
# Attach the Google News embedding matrix to the spaCy vocab, keyed by the
# words collected in `keys`, then display the resulting table shape.
nlp.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)
nlp.vocab.vectors.shape

In [0]:
# Alias the gensim vectors under the name that check_similarity reads
word2vec = model

# Sanity check: similarity between two words
model.n_similarity(['windows'], ["software"])

In [0]:
#------------------------------------------------------------------------------------
# Checks whether a word is similar to any item of the aspect list
#------------------------------------------------------------------------------------
def check_similarity(aspects, word, threshold=0.30):
    """Return the aspect most similar to `word`, or None if none is similar enough.

    Parameters
    ----------
    aspects : sequence of str
        Candidate aspect names.
    word : str
        Word to compare against every aspect using the module-level `word2vec`
        vectors (`n_similarity`).
    threshold : float, optional
        Minimum similarity, exclusive, for a match. Defaults to 0.30.

    Returns
    -------
    str or None
        The aspect with the highest similarity when that similarity is strictly
        greater than `threshold`; otherwise None. An empty `aspects` yields None.

    Any exception raised by `word2vec.n_similarity` propagates to the caller.
    """
    # Nothing to compare against
    if not aspects:
        return None

    similarity = [word2vec.n_similarity([aspect], [word]) for aspect in aspects]

    # First index holding the highest similarity
    best = int(np.argmax(similarity))
    if similarity[best] > threshold:
        return aspects[best]
    return None

In [0]:
#------------------------------------------------------------------------------------
# Maps terms to aspects and records whether the sentiment is positive or negative
#------------------------------------------------------------------------------------
def assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred):
    """Attribute every scored term of a sentence to an aspect.

    Parameters
    ----------
    aspect_sent : dict
        Maps aspect name -> Counter with "pos" / "neg" totals. Updated in place.
    terms_dict : dict
        Maps aspect name -> Counter of term -> accumulated score. Updated in place.
    sent_dict : mapping
        Term -> sentiment score for the sentence being processed.
    pred : sequence of str
        Aspect labels predicted for the sentence by the classifier.

    Returns
    -------
    tuple
        The updated `(aspect_sent, terms_dict)`.

    For each term the aspect is chosen in this order:
      1. the aspect returned by check_similarity for the term's last word;
      2. the classifier prediction, when exactly one label was predicted
         (terms are skipped when the first label is "anecdotes/miscellaneous");
      3. the "misc" bucket.
    Buckets that are missing from the dictionaries are created on demand.
    Previously a missing "misc" or predicted-label bucket raised a KeyError that
    was swallowed by a bare `except` and misreported as "not in vocab".
    """
    # Main aspects
    aspects = ['screen', 'price', 'hardware', 'software', 'battery']

    for term, score in sent_dict.items():
        try:
            # Evaluate the similarity once per term; only the last word is compared
            aspect = check_similarity(aspects, term.split()[-1])
        except (KeyError, IndexError):
            # KeyError: lookup failed for the word (gensim presumably raises it
            # for out-of-vocabulary words); IndexError: the term has no words.
            print(term, "not in vocab")
            continue

        if aspect is None:
            # Fall back to the classifier prediction
            if len(pred) > 0 and pred[0] == "anecdotes/miscellaneous":
                continue
            # A single predicted label is trusted; anything else goes to "misc"
            aspect = pred[0] if len(pred) == 1 else "misc"

        terms_dict.setdefault(aspect, Counter())[term] += score
        polarity = "pos" if score > 0 else "neg"
        aspect_sent.setdefault(aspect, Counter())[polarity] += abs(score)

    return aspect_sent, terms_dict

In [0]:
# Second spaCy pipeline (en_core_web_lg) with the NewLineSegmenter's
# set_sent_starts component inserted before the parser.
# Within the visible notebook, nlp2 is only used by the final displacy cell.
nlseg = NewLineSegmenter()
nlp2 = spacy.load('en_core_web_lg')
nlp2.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')

In [0]:
#------------------------------------------------------------------------------------
# Adds terms to the dictionary and scores their sentiment
#------------------------------------------------------------------------------------
def feature_sentiment(sentence):
    '''
    Score the terms of a sentence that are targeted by opinion words.

    input: sentence (str), parsed with the module-level `nlp` pipeline
    function: for every token found in `opinion_words`, starts from +1 (token in
              `pos`) or -1 and attributes that sentiment to related terms using
              the token's dependency label, children and head
    output: Counter mapping term text -> accumulated sentiment score

    NOTE(review): the rules read token.dep_ and token.pos_, so `nlp` must provide
    dependency and part-of-speech annotations; a later cell rebinds `nlp` to
    spacy.blank('en') — confirm which pipeline is active when this runs.
    '''
    sentence = nlp(sentence)
    sent_dict = Counter()

    for token in sentence:
        # Only opinion words carry sentiment
        if token.text not in opinion_words:
            continue
        sentiment = 1 if token.text in pos else -1

        # if target is an adverb modifier (i.e. pretty, highly, etc.)
        # but happens to be an opinion word, ignore and pass
        if token.dep_ == "advmod":
            continue

        # adjectival modifier: the sentiment goes to the word being modified
        if token.dep_ == "amod":
            sent_dict[token.head.text] += sentiment
            continue

        # for opinion words that are adjectives, adverbs, verbs...
        for child in token.children:
            # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
            # This could be better updated for modifiers that either positively or negatively emphasize
            if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                sentiment *= 1.5
            # check for negation words and flip the sign of sentiment
            if child.dep_ == "neg":
                sentiment *= -1

        # Second pass on purpose: the weight/negation above must be fully
        # applied before the sentiment is attributed to any direct object.
        for child in token.children:
            # if verb, check if there's a direct object
            if (token.pos_ == "VERB") and (child.dep_ == "dobj"):
                sent_dict[child.text] += sentiment
                # check for conjugates (a AND b), then add both to dictionary
                subchildren = []
                conj = 0
                for subchild in child.children:
                    if subchild.text == "and":
                        conj = 1
                    if (conj == 1) and (subchild.text != "and"):
                        subchildren.append(subchild.text)
                        conj = 0
                for subchild in subchildren:
                    sent_dict[subchild] += sentiment

        # check the head's children for intensifiers and negation
        for child in token.head.children:
            if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                sentiment *= 1.5
            # check for negation words and flip the sign of sentiment
            if child.dep_ == "neg":
                sentiment *= -1

        # check for nouns attached to the head that have not been scored yet
        for child in token.head.children:
            if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                noun = child.text
                # Check for compound nouns
                for subchild in child.children:
                    if subchild.dep_ == "compound":
                        noun = subchild.text + " " + noun
                sent_dict[noun] += sentiment
    return sent_dict

#------------------------------------------------------------------------------------
# Classifies a sentence into a category and associates it with a sentiment
#------------------------------------------------------------------------------------
def classify_and_sent(sentence, aspect_sent, terms_dict):
    """Predict the aspects of `sentence` and fold its sentiment into the totals.

    Parameters
    ----------
    sentence : str
        Sentence text to classify and score.
    aspect_sent : dict
        Per-aspect sentiment totals, passed on to assign_term_to_aspect.
    terms_dict : dict
        Per-aspect term scores, passed on to assign_term_to_aspect.

    Returns
    -------
    tuple
        `(aspect_sent, terms_dict)` as returned by assign_term_to_aspect.
    """
    # Classify the sentence with the saved naive Bayes model. The context manager
    # closes the file handle (the model is re-read from disk on every call).
    # SECURITY: pickle.load executes arbitrary code from the file — only load
    # model files produced by this notebook.
    with open("naive_model1.pkl", 'rb') as model_file:
        naive_model1 = pickle.load(model_file)
    predicted = naive_model1.predict([sentence])
    # Map the binary prediction back to label names with the module-level binarizer
    pred = mlb.inverse_transform(predicted)

    sent_dict = feature_sentiment(sentence)
    # pred holds one entry per input sentence; only one sentence was passed
    aspect_sent, terms_dict = assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred[0])

    return aspect_sent, terms_dict

#------------------------------------------------------------------------------------
# Splits the review into a list of sentences using the pipeline's sentence starts
#------------------------------------------------------------------------------------
def split_sentence(text):
    """Split `text` into consecutive sentence spans.

    The text is parsed with the module-level `nlp` pipeline and cut at every
    token flagged as a sentence start. Each span runs from its first token up
    to and including the token right before the next sentence start, so no
    token is dropped and the spans together cover the whole document.

    Previously each span stopped one token early (`token.i - 1` as the
    exclusive end), and the first token produced a bogus span covering the
    whole review minus its last token (`review[0:-1]`).

    Returns a list of spans (slices of the parsed document); the list is empty
    for an empty document. If the pipeline sets no sentence starts, the whole
    document is returned as a single span.
    """
    review = nlp(text)
    bag_sentence = []
    start = 0

    for token in review:
        # A new sentence begins here: close the one collected so far.
        # `token.i > start` skips the very first token, which has nothing before it.
        if token.is_sent_start and token.i > start:
            bag_sentence.append(review[start:token.i])
            start = token.i

    # Remaining tokens form the last sentence
    if start < len(review):
        bag_sentence.append(review[start:])

    return bag_sentence

#------------------------------------------------------------------------------------
# Strips special characters with a regular expression
#------------------------------------------------------------------------------------
def remove_special_char(sentence):
    """Replace each run of characters outside `a-zA-Z0-9.',:;?` with one space."""
    disallowed_run = re.compile(r"[^a-zA-Z0-9.',:;?]+")
    cleaned = disallowed_run.sub(' ', sentence)
    return cleaned

#------------------------------------------------------------------------------------
# Runs the sentiment analysis on every sentence of a review
#------------------------------------------------------------------------------------
def review_pipe(review, aspect_sent, terms_dict=None):
    """Run the full sentiment pipeline over one review.

    Parameters
    ----------
    review : str
        Raw review text.
    aspect_sent : dict
        Per-aspect sentiment totals to update.
    terms_dict : dict, optional
        Per-aspect term scores to update. When omitted, a fresh dictionary with
        an empty Counter for each main aspect is created for this call.
        (The previous mutable default was shared between calls, so scores
        leaked from one call into the next.)

    Returns
    -------
    tuple
        The updated `(aspect_sent, terms_dict)`.
    """
    if terms_dict is None:
        terms_dict = {name: Counter()
                      for name in ('screen', 'price', 'hardware', 'software', 'battery')}

    # Replace pronouns before splitting, while the whole review is available
    review = replace_pronouns(review)
    sentences = split_sentence(review)
    for sentence in sentences:
        sentence = remove_special_char(str(sentence))
        # The sentence is lower-cased before classification and scoring
        aspect_sent, terms_dict = classify_and_sent(sentence.lower(), aspect_sent, terms_dict)
    return aspect_sent, terms_dict

In [0]:
# Try the sentiment extraction on a single sentence.
# `sentence` is reused by the final displacy cell.
sentence = "I bought it from HSN because it was \"bundled\" with extra software, but as it turns out, that software just crashes it more often"
feature_sentiment(sentence)

In [0]:
# Try the full sentiment pipeline on a single review.
# Both accumulators start with an empty Counter per main aspect; the test-set
# loop in the next cell keeps adding to these same objects.
terms_dict={'screen':Counter(), 'price':Counter(), 'hardware':Counter(),
            'software':Counter(), 'battery':Counter()}
aspect_sent={'screen':Counter(), 'price':Counter(), 'hardware':Counter(),
            'software':Counter(),'battery':Counter()}
review = "I bought it from HSN because it was \"bundled\" with extra Software, but as it turns out, that software just crashes it more often"
review_pipe(review, aspect_sent, terms_dict)

In [0]:
# Run the sentiment analysis over the whole test set, accumulating into
# aspect_sent / terms_dict and printing the running totals after each review.
for review in X_test:
    aspect_sent, terms_dict = review_pipe(review, aspect_sent, terms_dict)
    # "\n" (newline) replaces the mistyped "/n", which was printed literally
    print("Lista de aspectos ", aspect_sent, "\n")
    print("Lista de termos", terms_dict, "\n")

In [0]:
# Display the accumulated "pos" / "neg" totals per aspect
aspect_sent

In [0]:
# Display the accumulated score of every term, grouped by aspect
terms_dict

In [0]:
# Render the dependency parse of the test sentence, parsed with the nlp2 pipeline
displacy.render(nlp2(sentence), style='dep')