## Trabalho de Grupo - Análise de Sentimento
Para este trabalho decidimos utilizar o dataset SFU Review Corpus.

In [1]:
import os
import csv
import json
import pandas as pd
import numpy as np
from textblob import TextBlob
import nltk
import nltk.stem as stem
from nltk import FeatDict
from nltk.classify import naivebayes, maxent
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import collections
import string
import sklearn
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [2]:
df_train = pd.read_csv("TM/data/en-sentiment/SFU_Review_Corpus_train.csv", encoding="utf-8")
df_test = pd.read_csv("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8")

In [3]:
test_text = df_test['text'].tolist()
test_recommended = df_test['recommended'].tolist()

In [4]:
train_text = df_train['text'].tolist()
train_recommended = df_train['recommended'].tolist()

# 1.2 - Preparação dos dados e criação de uma baseline

### 1.2.1 - TexBlob

In [5]:
list_test = list(zip(df_test['text'], df_test['recommended']))

In [6]:
somaclassificacao =0
for text, recommended in list_test:
    classification = TextBlob(text).sentiment.polarity
    if(classification <0 and recommended == "no"):
        somaclassificacao +=1
    elif(classification >=0 and recommended =="yes"):
        somaclassificacao +=1  

In [7]:
print ("A Accuracy do TextBlob é", somaclassificacao/len(list_test))

A Accuracy do TextBlob é 0.6375


# 1.3 Aplicação de um léxico de sentimentos


### 1.3.1 Sem tratamento da negação

In [8]:
#Leitura do léxico
lex={}
with open("TM/data/en/NCR-lexicon.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="," )
    for i, d in enumerate(reader):
        lex[ d["English"] ] = int(d["Positive"]) - int(d["Negative"])

In [9]:
#Leitura dos dados de teste, a variável temp serve para não perdermos a lista original
list_test_data=[]
temp=[]
with open("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile )
    for d in reader:
        list_test_data.append( (d["text"],d["recommended"]) )
        temp.append( (d["text"],d["recommended"]) )

In [10]:
#Divisão dos dados por palavras através do .split()
for i, text in enumerate(list_test_data):
    list_test_data[i] = text[0].split()

In [11]:
#Contagem dos scores de cada review
count_pos=0
count_neg=0

list_scores=[]

for i in range(len(list_test_data)):
    for j in range(len(list_test_data[i])):
        if list_test_data[i][j] in lex:
            valor = lex[str(list_test_data[i][j])]
            if valor == 1 :
                count_pos+=1
            elif valor == -1 :
                count_neg+=1
        
    list_scores.append((count_pos - count_neg)/(count_pos + count_neg + 2))
    count_pos=0
    count_neg=0
        
#print(list_scores)
#print(temp[:3])

In [12]:
#Criação da lista final com a review e com a classificação de acordo com os scores obtidos
list_final = []

for i in range(len(list_scores)):
    if list_scores[i] >=0 :
        list_final.append([temp[i][0],"yes"])
    else :
        list_final.append([temp[i][0],"no"])
        
#list_final[:3]

In [13]:
#Criação da lista apenas com as novas classificações para ser utilizada no cálculo da accuracy
test_recommended_pred = []

for i in range(len(list_final)):
    test_recommended_pred.append(list_final[i][1])

In [14]:
#Cálculo da accuracy
score = sklearn.metrics.accuracy_score(test_recommended,test_recommended_pred)
print("A Accuracy do Lexicon é:   %0.3f" % score)

A Accuracy do Lexicon é:   0.562


### 1.3.2 Com tratamento da negação

In [15]:
#Leitura do léxico
lex2={}
with open("TM/data/en/NCR-lexicon.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="," )
    for i, d in enumerate(reader):
        lex2[ d["English"] ] = int(d["Positive"]) - int(d["Negative"])

In [16]:
#Leitura dos dados de teste, a variável temp2 e temp3 servem para não perdermos a lista original
#Neste caso sentimos a necessidade de criar duas variáveis temp, uma vez que tivémos que utilizar uma para aplicar a negação

list_test_data2=[]
temp2=[]
temp3=[]

with open("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile )
    for d in reader:
        list_test_data2.append( (d["text"],d["recommended"]) )
        temp2.append( (d["text"],d["recommended"]) )
        temp3.append( (d["text"],d["recommended"]) )

In [17]:
#Divisão dos dados por palavras através do .split()
for i, text in enumerate(list_test_data2):
    list_test_data2[i] = text[0].split()

In [18]:
for i, text in enumerate(temp2):
    temp2[i] = text[0].split()

In [19]:
#Aplicação do tratamento da negação
list_word_neg = ['not', 'no', "can't", "haven't", "isn't", "aren't", "won't", 'nor', "doesn't","don't",
                 "couldn't", "daren't", "didn't", "hasn't", "hadn't", "mayn't", "mightn't", "mustn't",
                 "needn't", "oughtn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't", "mayn't", 
                 "Ain't"
                ]
delims = ["?",".",",","!",":",";"]

for i in range(len(list_test_data2)):
    for j in range(len(list_test_data2[i])):
        tmp = list_test_data2[i][j]
        if list_test_data2[i][j] in list_word_neg:
            list_test_data2[i][j+1] = ("NOT_" + list_test_data2[i][j+1])
        elif "NOT_" in list_test_data2[i][j-1] and j!= 0:
            list_test_data2[i][j] = ("NOT_" + list_test_data2[i][j])
            for p in delims:
                if p in list_test_data2[i][j-1]:
                    list_test_data2[i][j] = tmp
                    
#print(list_test_data2[62])
#print(list_word_neg)

In [20]:
#Contagem dos scores de cada review
count_pos=0
count_neg=0

list_scores2=[]

for i in range(len(temp2)):
    for j in range(len(temp2[i])):
        if temp2[i][j] in lex2 and "NOT_" in list_test_data2[i][j]:
            valor = lex2[str(temp2[i][j])] * (-1)
        elif temp2[i][j] in lex2 and "NOT_" not in list_test_data2[i][j]: 
            valor = lex2[str(temp2[i][j])]
        if valor == 1 :
            count_pos+=1
        elif valor == -1 :
            count_neg+=1
        
    list_scores2.append((count_pos - count_neg)/(count_pos + count_neg + 2))
    count_pos=0
    count_neg=0

#print(len(list_scores2))
#print(list_scores2)

In [21]:
#Criação da lista final com a review e com a classificação de acordo com os scores obtidos
list_final2 = []

for i in range(len(list_scores2)):
    if list_scores2[i] >=0 :
        list_final2.append([temp3[i][0],"yes"])
    else :
        list_final2.append([temp3[i][0],"no"])
        
#list_final[:3]

In [22]:
#Criação da lista apenas com as novas classificações para ser utilizada no cálculo da accuracy
test_recommended_pred2 = []

for i in range(len(list_final2)):
    test_recommended_pred2.append(list_final2[i][1])

In [23]:
#Cálculo da accuracy
score = sklearn.metrics.accuracy_score(test_recommended,test_recommended_pred2)
print("A Accuracy do Lexicon com tratamento da negação é:   %0.3f" % score)

A Accuracy do Lexicon com tratamento da negação é:   0.600


### 1.3.3 Pre-processamento e sem tratamento da negação

In [24]:
#Leitura do léxico
lex3={}
with open("TM/data/en/NCR-lexicon.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="," )
    for i, d in enumerate(reader):
        lex3[ d["English"] ] = int(d["Positive"]) - int(d["Negative"])

In [25]:
#Leitura dos dados de teste, a variável temp4 serve para não perdermos a lista original
list_test_data3=[]
temp4=[]

with open("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile )
    for d in reader:
        list_test_data3.append( (d["text"],d["recommended"]) )
        temp4.append( (d["text"],d["recommended"]) )

In [26]:
#Divisão dos dados por palavras através do .split()
for i, text in enumerate(list_test_data3):
    list_test_data3[i] = text[0].split()

In [27]:
 # Faz Lemmatização
lemmatizer = WordNetLemmatizer()

for i in range(len(list_test_data3)):
    for j in range(len(list_test_data3[i])):
        list_test_data3[i][j] = lemmatizer.lemmatize(list_test_data3[i][j], 'v') #verbo
        list_test_data3[i][j] = lemmatizer.lemmatize(list_test_data3[i][j], 'a') #adjetivo
        list_test_data3[i][j] = lemmatizer.lemmatize(list_test_data3[i][j], 'n') #nomes
        list_test_data3[i][j] = lemmatizer.lemmatize(list_test_data3[i][j], 's') #adjetivo saturado
        list_test_data3[i][j] = lemmatizer.lemmatize(list_test_data3[i][j], 'r') #adverbio


#print(list_test_data4[0])

In [28]:
#Aplicação do stemming
stemmer = stem.porter.PorterStemmer()

for i in range(len(list_test_data3)):
    for j in range(len(list_test_data3[i])):
        list_test_data3[i][j] = stemmer.stem(list_test_data3[i][j])

#print(list_test_data3[:1])

In [29]:
#Contagem dos scores de cada review
count_pos=0
count_neg=0

list_scores3=[]

for i in range(len(list_test_data3)):
    for j in range(len(list_test_data3[i])):
        if list_test_data3[i][j] in lex3:
            valor = lex3[str(list_test_data3[i][j])]
            if valor == 1 :
                count_pos+=1
            elif valor == -1 :
                count_neg+=1
        
    list_scores3.append((count_pos - count_neg)/(count_pos + count_neg + 2))
    count_pos=0
    count_neg=0
        
#print(list_scores)
#print(temp[:3])

In [30]:
#Criação da lista final com a review e com a classificação de acordo com os scores obtidos
list_final3 = []

for i in range(len(list_scores3)):
    if list_scores3[i] >=0 :
        list_final3.append([temp4[i][0],"yes"])
    else :
        list_final3.append([temp4[i][0],"no"])
        
#list_final3[:3]

In [31]:
#Criação da lista apenas com as novas classificações para ser utilizada no cálculo da accuracy
test_recommended_pred3 = []

for i in range(len(list_final3)):
    test_recommended_pred3.append(list_final3[i][1])

In [32]:
#Cálculo da accuracy
score = sklearn.metrics.accuracy_score(test_recommended,test_recommended_pred3)
print("A Accuracy do Lexicon com stemming:   %0.3f" % score)

A Accuracy do Lexicon com stemming:   0.625


### 1.3.4 Pre-processamento e com tratamento da negação

In [33]:
#Leitura do léxico
lex4={}
with open("TM/data/en/NCR-lexicon.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="," )
    for i, d in enumerate(reader):
        lex4[ d["English"] ] = int(d["Positive"]) - int(d["Negative"])

In [34]:
#Leitura dos dados de teste, a variável temp5 e temp6 servem para não perdermos a lista original
#Neste caso sentimos a necessidade de criar duas variáveis temp, uma vez que tivémos que utilizar uma para aplicar a negação
list_test_data4=[]
temp5=[]
temp6=[]

with open("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile )
    for d in reader:
        list_test_data4.append( (d["text"],d["recommended"]) )
        temp5.append( (d["text"],d["recommended"]) )
        temp6.append( (d["text"],d["recommended"]) )

In [35]:
for i, text in enumerate(temp5):
    temp5[i] = text[0].split()
#print(temp5)

In [36]:
#Divisão dos dados por palavras através do .split()
for i, text in enumerate(list_test_data4):
    list_test_data4[i] = text[0].split()

In [37]:
#Aplicação do tratamento da negação
list_word_neg = ['not', 'no', "can't", "haven't", "isn't", "aren't", "won't", 'nor', "doesn't","don't",
                 "couldn't", "daren't", "didn't", "hasn't", "hadn't", "mayn't", "mightn't", "mustn't",
                 "needn't", "oughtn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't", "mayn't", 
                 "ain't", "NOT", "Not"
                ]

delims = ["?",".",",","!",":",";"]

for i in range(len(list_test_data4)):
    for j in range(len(list_test_data4[i])):
        tmp = list_test_data4[i][j]
        if list_test_data4[i][j] in list_word_neg:
            list_test_data4[i][j+1] = ("NOT_" + list_test_data4[i][j+1])
        elif "NOT_" in list_test_data4[i][j-1] and j!= 0:
            list_test_data4[i][j] = ("NOT_" + list_test_data4[i][j])
            for p in delims:
                if p in list_test_data4[i][j-1]:
                    list_test_data4[i][j] = tmp
                    
#print(list_test_data4[:2])
#print(list_word_neg)

In [38]:
 # Faz Lemmatização
lemmatizer = WordNetLemmatizer()

for i in range(len(list_test_data4)):
    for j in range(len(list_test_data4[i])):
        list_test_data4[i][j] = lemmatizer.lemmatize(list_test_data4[i][j], 'v') #verbo
        list_test_data4[i][j] = lemmatizer.lemmatize(list_test_data4[i][j], 'a') #adjetivo
        list_test_data4[i][j] = lemmatizer.lemmatize(list_test_data4[i][j], 'n') #nomes
        list_test_data4[i][j] = lemmatizer.lemmatize(list_test_data4[i][j], 's') #adjetivo saturado
        list_test_data4[i][j] = lemmatizer.lemmatize(list_test_data4[i][j], 'r') #adverbio


#print(list_test_data4[0])

In [39]:
#Aplicação do stemming
stemmer = stem.porter.PorterStemmer()

for i in range(len(list_test_data4)):
    for j in range(len(list_test_data4[i])):
        list_test_data4[i][j] = stemmer.stem(list_test_data4[i][j])

#print(list_test_data4[:1])

In [40]:
#Contagem dos scores de cada review
count_pos=0
count_neg=0

list_scores4=[]

for i in range(len(temp5)):
    for j in range(len(temp5[i])):
        if temp5[i][j] in lex4 and "not_" in list_test_data4[i][j]:
            valor = lex4[str(temp5[i][j])] * (-1)
        elif temp5[i][j] in lex4 and "not_" not in list_test_data4[i][j]: 
            valor = lex4[str(temp5[i][j])]
        if valor == 1 :
            count_pos+=1
        elif valor == -1 :
            count_neg+=1
        
    list_scores4.append((count_pos - count_neg)/(count_pos + count_neg + 2))
    count_pos=0
    count_neg=0

#print(len(list_scores2))
#print(list_scores4)

In [41]:
list_final4 = []

for i in range(len(list_scores4)):
    if list_scores4[i] >=0 :
        list_final4.append([temp6[i][0],"yes"])
    else :
        list_final4.append([temp6[i][0],"no"])
        
#list_final4[:3]

In [42]:
test_recommended_pred4 = []

for i in range(len(list_final4)):
    test_recommended_pred4.append(list_final4[i][1])

In [43]:
score = sklearn.metrics.accuracy_score(test_recommended,test_recommended_pred4)
print("A Accuracy do Lexicon com tratamento da negação e stemming é:   %0.3f" % score)

A Accuracy do Lexicon com tratamento da negação e stemming é:   0.613


# 2.0 - Aprendizagem automática
## 2.1 - Aprendizagem automática a usar a biblioteca NLTK

In [44]:
# LÊ CSV COM OS DADOS DO CONJUNTO DE TREINO
recommended=('yes', 'no')
text = []
y = []

with open("TM/data/en-sentiment/SFU_Review_Corpus_train.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader):
        if row["recommended"] == recommended[1]:
            y.append(1)
        else:
            y.append(0)
        text.append(row["text"])
        
        
#LÊ CSV COM OS DADOS DO CONJUNTO DE TESTE        
recommended_testing=('yes', 'no')
Test_text = []
test_y = []

with open("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader):
        if row["recommended"] == recommended_testing[1]:
            test_y.append(1)
        else:
            test_y.append(0)
        Test_text.append(row["text"])

In [45]:
#treino
#tokenizer = word_tokenize()
docs = []

for words in text:
    doc = collections.Counter()
    for w in word_tokenize(words):
        doc[w] += 1
    docs.append(doc)
    
#test
#tokenizer_test = word_tokenize()
docs_test = []

for words in Test_text:
    doc_test = collections.Counter()
    for w in word_tokenize(words):
        doc_test[w] += 1
    docs_test.append(doc_test)

In [46]:
#VOCABULÁRIO TREINO
voc_length = 1500 #DEFINE A QUANTIDADE DE FEATURES

tf = collections.Counter()
df = collections.Counter()

for d in docs:
    for w in d:
        tf[w] += d[w]
        df[w] += 1

idfs = {}
for w in tf:
    if tf[w] > 2:
        idfs[w] = np.log(len(docs)/df[w])
        
#CRIA O VOCÁBULÁRIO COM AS FEATURES MAIS IMPORTANTES DO CONJUNTO DE TREINO
voc = sorted(idfs, key=idfs.get, reverse=True)[:voc_length]


#VOCABULÁRIO TESTE
test_voc_length = 1500 #DEFINE A QUANTIDADE DE FEATURES

tf_test = collections.Counter()
df_test = collections.Counter()

for d in docs_test:
    for w in d:
        tf_test[w] += d[w]
        df_test[w] += 1

idfs_test = {}
for w in tf_test:
    if tf_test[w] > 2:
        idfs_test[w] = np.log(len(docs_test)/df_test[w])
        
#CRIA O VOCÁBULÁRIO COM AS FEATURES MAIS IMPORTANTES DO CONJUNTO DE TESTE
voc_test = sorted(idfs_test, key=idfs_test.get, reverse=True)[:test_voc_length]

In [47]:
#cria um índice para o conjunto treino - ou seja, associa o vocabulário a um índice
indice = {}
for i,w in enumerate(sorted(voc)):
    indice[w] = i
    
#cria um índice para o conjunto teste - ou seja, associa o vocabulário a um índice
indice_test = {}
for i,w in enumerate(sorted(voc_test)):
    indice_test[w] = i

In [48]:
#treino
docrep = []
for d in docs:
    valores = np.zeros([len(voc)])
    for w in d:
        if w in indice:
            valores[ indice[w] ] = d[w]
    docrep.append (valores)
    
#teste    
docrep_test = []
for d in docs_test:
    valores_test = np.zeros([len(voc_test)])
    for w in d:
        if w in indice_test:
            valores_test[indice_test[w] ] = d[w]
    docrep_test.append (valores_test)

In [49]:
#treino
newdocrep = []
for d,c in zip(docs, y):
    docwords={}
    for w in d:
        if w in indice:
            docwords[w] = d[w]
    newdocrep.append ( (docwords, recommended[c] ) )

#teste
newdocrep_test = []
for d,c in zip(docs_test, test_y):
    docwords_test={}
    for w in d:
        if w in indice_test:
            docwords_test[w] = d[w]
    newdocrep_test.append ( (docwords_test, recommended_testing[c] ) )

In [50]:
nbc = naivebayes.NaiveBayesClassifier.train(newdocrep)
nltk.classify.accuracy(nbc,newdocrep)

0.94375

In [51]:
mec = maxent.MaxentClassifier.train(newdocrep, bernoulli=False,max_iter=15, trace=3)
nltk.classify.accuracy(mec,newdocrep)

  ==> Training (15 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.491
             2          -0.48233        0.912
             3          -0.38165        0.925
             4          -0.32115        0.928
             5          -0.28037        0.928
             6          -0.25084        0.931
             7          -0.22838        0.941
             8          -0.21067        0.944
             9          -0.19631        0.944
            10          -0.18441        0.944
            11          -0.17436        0.944
            12          -0.16577        0.944
            13          -0.15831        0.944
            14          -0.15178        0.944
         Final          -0.14601        0.944


0.94375

In [52]:
#cria variável final para a classificação do conjunto de teste
final_test = [(text) for text, recommended in newdocrep_test]

In [53]:
#FAZ A CLASSIFICAÇÃO DO CONJUNTO DE TESTE ULIZANDO DOIS MÉTODOS: 

#NBC - Naive Bayes
final_recommended_testing = nbc.classify_many(final_test)

#MEC - Maximum Entropy (Logistic Regression)
final_recommended_testing2 = mec.classify_many(final_test)

In [54]:
#CRIA A FUNÇÃO PARA CALCULAR A ACCURACY DO NLTK APLICADO A OUTROS CONJUNTOS DE DADOS
def accuracy_nltk(reference, test):
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    return sum(x == y for x, y in zip(reference, test)) / len(test)

In [55]:
#NBC
NBC = accuracy_nltk(test_recommended,final_recommended_testing)
print(NBC)

0.6


In [56]:
#MEC
MEC = accuracy_nltk(test_recommended,final_recommended_testing2)
print(MEC)

0.5375


In [57]:
NLTK_accuracy_media = (NBC + MEC) /2
print(NLTK_accuracy_media)

0.56875


## 2.2 - Aprendizagem automática a usar a biblioteca SKLEARN - CountVectorizer()

In [58]:
#reset os inputs
df_train = pd.read_csv("TM/data/en-sentiment/SFU_Review_Corpus_train.csv", encoding="utf-8")
df_test = pd.read_csv("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8")
test_text = df_test['text'].tolist()
test_recommended = df_test['recommended'].tolist()
train_text = df_train['text'].tolist()
train_recommended = df_train['recommended'].tolist()

In [59]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.

vectorizer = CountVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X = vectorizer.fit_transform(train_text)
test_X = vectorizer.transform(test_text)
print(train_X.shape, test_X.shape)


(320, 1500) (80, 1500)


In [60]:
train_X = train_X.todense()
test_X = test_X.todense()

In [61]:
#cria modelo 
nb = MultinomialNB()
model_count_nb = nb.fit(train_X,train_recommended)
y_pred = model_count_nb.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

nb_count = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 32 out of 80
Accuracy:  0.6
Precision:  0.5991244527829893
Recall:  0.5996228786926461
F1-measure:  0.5989974937343359


In [62]:
#cria modelo 
lr = LogisticRegression(max_iter=500)
model_count_lr=lr.fit(train_X, train_recommended)
y_pred = model_count_lr.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

lr_count = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 22 out of 80
Accuracy:  0.725
Precision:  0.7458854509545754
Recall:  0.7347580138277813
F1-measure:  0.7234443746071653


In [63]:
#cria modelo 
svmc = LinearSVC(max_iter=500)
model_count_svc = svmc.fit(train_X, train_recommended)
y_pred = model_count_svc.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

svmc_count = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 23 out of 80
Accuracy:  0.7125
Precision:  0.7366666666666667
Recall:  0.7231301068510372
F1-measure:  0.7102818453786804


In [64]:
#cria modelo 
gnb = GaussianNB()
model_count_gnb = gnb.fit(train_X, train_recommended)
y_pred = model_count_gnb.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), test_X.shape[0]))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

gnb_count = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 27 out of 80
Accuracy:  0.6625
Precision:  0.671994884910486
Recall:  0.669076052796983
F1-measure:  0.6620247222656861


In [65]:
#Faz validação cruzada  - k = 10
scores_count_cv_gnb = cross_val_score(model_count_gnb, train_X, y=train_recommended, cv=10)
scores_count_cv_svc = cross_val_score(model_count_svc, train_X, y=train_recommended, cv=10)
scores_count_cv_lr = cross_val_score(model_count_lr, train_X, y=train_recommended, cv=10)
scores_count_cv_nb = cross_val_score(model_count_nb, train_X, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
count_cv_gnb = scores_count_cv_gnb.mean()
count_cv_svc = scores_count_cv_svc.mean()
count_cv_lr = scores_count_cv_lr.mean()
count_cv_nb = scores_count_cv_nb.mean()

#faz média global das validações cruzadas
count_media = (count_cv_gnb + count_cv_svc + count_cv_lr + count_cv_nb) / 4 

print(count_media)
print(count_cv_gnb)
print(count_cv_svc)
print(count_cv_lr)
print(count_cv_nb)

0.7359375
0.696875
0.7625
0.7875
0.696875


# 2.2.1 - Aprendizagem Automática com Pre-processamento

##  Cenário 1 
Remoção da pontuação; Lowerization; Tokenization; Lemmatization; Stemming

In [66]:
# Remove pontuação
import string

train_text_pp = train_text
for text in range(len(train_text)):
    train_text_new = ["".join([char for char in text if char.isalnum() or char == " "]) for text in train_text_pp]

In [67]:
cenario1 =[]

for i, text in enumerate(train_text_new):
    cenario1.append(text)
    cenario1[i] = text.lower()

In [68]:
# Faz Tokenização
for i in range(len(cenario1)):
    cenario1[i] = cenario1[i].split()
#print(cenario1)

In [69]:
 # Faz Lemmatização
lemmatizer = WordNetLemmatizer()

for i in range(len(cenario1)):
    for j in range(len(cenario1[i])):
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'v') #verbo
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'a') #adjetivo
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'n') #nomes
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 's') #adjetivo saturado
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'r') #adverbio


#print(cenario1[0])

In [70]:
stemmer = stem.porter.PorterStemmer()

for i in range(len(cenario1)):
    for j in range(len(cenario1[i])):
        cenario1[i][j] = stemmer.stem(cenario1[i][j])
#print(cenario1[:2])

In [71]:
def untokenize(cenario1):
    for tokens in cenario1:
        yield ' '.join(tokens)

untokenized_data_cenario1 = list(untokenize(cenario1))
#print(untokenized_data)

In [72]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario1

vectorizer = CountVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario1 = vectorizer.fit_transform(train_pp)
test_X_cenario1 = vectorizer.transform(test_text)
print(train_X_cenario1.shape, test_X_cenario1.shape)

(320, 1500) (80, 1500)


In [73]:
train_X_cenario1 = train_X_cenario1.todense()
test_X_cenario1 = test_X_cenario1.todense()

In [74]:
#cria modelo 
nb = MultinomialNB()
model_count_nb_cenario1 = nb.fit(train_X_cenario1,train_recommended)
y_pred = model_count_nb_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario1)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 31 out of 80
Accuracy:  0.6125
Precision:  0.6098484848484849
Recall:  0.609365179132621
F1-measure:  0.6095103133364824


In [75]:
#cria modelo 
lr = LogisticRegression(max_iter=500)
model_count_lr_cenario1=lr.fit(train_X_cenario1, train_recommended)
y_pred = model_count_lr_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario1)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 33 out of 80
Accuracy:  0.5875
Precision:  0.6416666666666666
Recall:  0.6068510370835952
F1-measure:  0.567992145311733


In [76]:
#cria modelo 
svmc = LinearSVC(max_iter=500)
model_count_svc_cenario1 = svmc.fit(train_X_cenario1, train_recommended)
y_pred = model_count_svc_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario1)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))



Mislabeled points: 35 out of 80
Accuracy:  0.5625
Precision:  0.6083333333333334
Recall:  0.5817096165933375
F1-measure:  0.5418098510882017


In [77]:
#cria modelo 
gnb = GaussianNB()
model_count_gnb_cenario1 = gnb.fit(train_X_cenario1, train_recommended)
y_pred = model_count_gnb_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), test_X_cenario1.shape[0]))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))



Mislabeled points: 27 out of 80
Accuracy:  0.6625
Precision:  0.7249216300940439
Recall:  0.680389692017599
F1-measure:  0.6502024291497976


In [78]:
#Faz validação cruzada  - k = 10
scores_count_cv_gnb_cenario1 = cross_val_score(model_count_gnb_cenario1, train_X_cenario1, y=train_recommended, cv=10)
scores_count_cv_svc_cenario1 = cross_val_score(model_count_svc_cenario1, train_X_cenario1, y=train_recommended, cv=10)
scores_count_cv_lr_cenario1 = cross_val_score(model_count_lr_cenario1, train_X_cenario1, y=train_recommended, cv=10)
scores_count_cv_nb_cenario1 = cross_val_score(model_count_nb_cenario1, train_X_cenario1, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
count_cv_gnb_cenario1 = scores_count_cv_gnb_cenario1.mean()
count_cv_svc_cenario1 = scores_count_cv_svc_cenario1.mean()
count_cv_lr_cenario1 = scores_count_cv_lr_cenario1.mean()
count_cv_nb_cenario1 = scores_count_cv_nb_cenario1.mean()

#faz média global das validações cruzadas
count_media_cenario1 = (count_cv_gnb_cenario1 + count_cv_svc_cenario1 + count_cv_lr_cenario1 + count_cv_nb_cenario1) / 4 

print(count_media_cenario1)
print(count_cv_gnb_cenario1)
print(count_cv_svc_cenario1)
print(count_cv_lr_cenario1)
print(count_cv_nb_cenario1)

0.72734375
0.675
0.7625
0.790625
0.68125


## Cenário 2 
Remoção da pontuação; Lowerization; Tokenization; Lemmatization;

In [79]:
# Remove pontuação
import string

train_text_pp = train_text
for text in range(len(train_text)):
    train_text_new = ["".join([char for char in text if char.isalnum() or char == " "]) for text in train_text_pp]

In [80]:
cenario2 =[]

for i, text in enumerate(train_text_new):
    cenario2.append(text)
    cenario2[i] = text.lower()

In [81]:
# Faz Tokenização
for i in range(len(cenario2)):
    cenario2[i] = cenario2[i].split()
#print(cenario2)

In [82]:
 # Faz Lemmatização
lemmatizer = WordNetLemmatizer()

for i in range(len(cenario2)):
    for j in range(len(cenario2[i])):
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'v') #verbo
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'a') #adjetivo
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'n') #nomes
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 's') #adjetivo saturado
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'r') #adverbio


#print(cenario2[0])

In [83]:
def untokenize(cenario2):
    for tokens in cenario2:
        yield ' '.join(tokens)

untokenized_data_cenario2 = list(untokenize(cenario2))
#print(untokenized_data)

In [84]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario2

vectorizer = CountVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario2 = vectorizer.fit_transform(train_pp)
test_X_cenario2 = vectorizer.transform(test_text)
print(train_X_cenario2.shape, test_X_cenario2.shape)

(320, 1500) (80, 1500)


In [85]:
train_X_cenario2 = train_X_cenario2.todense()
test_X_cenario2 = test_X_cenario2.todense()

In [86]:
#Cria modelo
nb = MultinomialNB()
model_count_nb_cenario2 = nb.fit(train_X_cenario2,train_recommended)
y_pred = model_count_nb_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

nb_count_cenario2 = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 32 out of 80
Accuracy:  0.6
Precision:  0.5968253968253968
Recall:  0.5958516656191075
F1-measure:  0.595959595959596


In [87]:
lr = LogisticRegression(max_iter=500)
model_count_lr_cenario2 = lr.fit(train_X_cenario2, train_recommended)
y_pred = model_count_lr_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 25 out of 80
Accuracy:  0.6875
Precision:  0.71
Recall:  0.6979886863607794
F1-measure:  0.6850889623681311


In [88]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_count_svc_cenario2 = svmc.fit(train_X_cenario2, train_recommended)
y_pred = model_count_svc_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 29 out of 80
Accuracy:  0.6375
Precision:  0.6566666666666666
Recall:  0.647705845380264
F1-measure:  0.6347031963470319


In [89]:
#Cria modelo
gnb = GaussianNB()
model_count_gnb_cenario2 = gnb.fit(train_X_cenario2, train_recommended)
y_pred = model_count_gnb_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

gnb_count_cenario2 = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 22 out of 80
Accuracy:  0.725
Precision:  0.7651991614255764
Recall:  0.73852922690132
F1-measure:  0.7206349206349206


In [90]:
#Faz validação cruzada  - k = 10
scores_count_cv_gnb_cenario2 = cross_val_score(model_count_gnb_cenario2, train_X_cenario2, y=train_recommended, cv=10)
scores_count_cv_svc_cenario2 = cross_val_score(model_count_svc_cenario2, train_X_cenario2, y=train_recommended, cv=10)
scores_count_cv_lr_cenario2 = cross_val_score(model_count_lr_cenario2, train_X_cenario2, y=train_recommended, cv=10)
scores_count_cv_nb_cenario2 = cross_val_score(model_count_nb_cenario2, train_X_cenario2, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
count_cv_gnb_cenario2 = scores_count_cv_gnb_cenario2.mean()
count_cv_svc_cenario2 = scores_count_cv_svc_cenario2.mean()
count_cv_lr_cenario2 = scores_count_cv_lr_cenario2.mean()
count_cv_nb_cenario2 = scores_count_cv_nb_cenario2.mean()

#faz média global das validações cruzadas
count_media_cenario2 = (count_cv_gnb_cenario2 + count_cv_svc_cenario2 + count_cv_lr_cenario2 + count_cv_nb_cenario2) / 4 

print(count_media_cenario2)
print(count_cv_gnb_cenario2)
print(count_cv_svc_cenario2)
print(count_cv_lr_cenario2)
print(count_cv_nb_cenario2)

0.7281249999999999
0.7
0.76875
0.76875
0.675


## Cenário 3

Remoção da pontuação; Lowerization; Tokenization; Stemming

In [91]:
# Remove pontuação
import string

train_text_pp = train_text
for text in range(len(train_text)):
    train_text_new = ["".join([char for char in text if char.isalnum() or char == " "]) for text in train_text_pp]

In [92]:
cenario3 =[]

for i, text in enumerate(train_text_new):
    cenario3.append(text)
    cenario3[i] = text.lower()

In [93]:
# Faz Tokenização
for i in range(len(cenario3)):
    cenario3[i] = cenario3[i].split()
#print(cenario3)

In [94]:
stemmer = stem.porter.PorterStemmer()

for i in range(len(cenario3)):
    for j in range(len(cenario3[i])):
        cenario3[i][j] = stemmer.stem(cenario3[i][j])
#print(cenario3[:2])

In [95]:
def untokenize(cenario3):
    for tokens in cenario3:
        yield ' '.join(tokens)

untokenized_data_cenario3 = list(untokenize(cenario3))
#print(untokenized_data)

In [96]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario3

vectorizer = CountVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario3 = vectorizer.fit_transform(train_pp)
test_X_cenario3 = vectorizer.transform(test_text)
print(train_X_cenario3.shape, test_X_cenario3.shape)

(320, 1500) (80, 1500)


In [97]:
train_X_cenario3 = train_X_cenario3.todense()
test_X_cenario3 = test_X_cenario3.todense()

In [98]:
#Cria modelo
nb = MultinomialNB()
model_count_nb_cenario3 = nb.fit(train_X_cenario3,train_recommended)
y_pred = model_count_nb_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario3)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

nb_count_cenario3 = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 27 out of 80
Accuracy:  0.6625
Precision:  0.6625
Recall:  0.6634192331866751
F1-measure:  0.6620247222656861


In [99]:
#Cria modelo
lr = LogisticRegression(max_iter=500)
model_count_lr_cenario3 = lr.fit(train_X_cenario3, train_recommended)
y_pred = model_count_lr_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario3)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

lr_count_cenario3 = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 33 out of 80
Accuracy:  0.5875
Precision:  0.6550179211469533
Recall:  0.6087366436203645
F1-measure:  0.5628415300546448


In [100]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_count_svc_cenario3 = svmc.fit(train_X_cenario3, train_recommended)
y_pred = model_count_svc_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario3)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

svmc_count_cenario3 = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 35 out of 80
Accuracy:  0.5625
Precision:  0.6083333333333334
Recall:  0.5817096165933375
F1-measure:  0.5418098510882017


In [101]:
#Cria modelo
gnb = GaussianNB()
model_count_gnb_cenario3 = gnb.fit(train_X_cenario3, train_recommended)
y_pred = model_count_gnb_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), train_X_cenario3.shape[0]))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

gnb_count_cenario3 = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 29 out of 320
Accuracy:  0.6375
Precision:  0.7083333333333333
Recall:  0.6571338780641106
F1-measure:  0.6203567337587956


In [102]:
#Cria modelo

#Faz validação cruzada  - k = 10
scores_count_cv_gnb_cenario3 = cross_val_score(model_count_gnb_cenario3, train_X_cenario3, y=train_recommended, cv=10)
scores_count_cv_svc_cenario3 = cross_val_score(model_count_svc_cenario3, train_X_cenario3, y=train_recommended, cv=10)
scores_count_cv_lr_cenario3 = cross_val_score(model_count_lr_cenario3, train_X_cenario3, y=train_recommended, cv=10)
scores_count_cv_nb_cenario3 = cross_val_score(model_count_nb_cenario3, train_X_cenario3, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
count_cv_gnb_cenario3 = scores_count_cv_gnb_cenario3.mean()
count_cv_svc_cenario3 = scores_count_cv_svc_cenario3.mean()
count_cv_lr_cenario3 = scores_count_cv_lr_cenario3.mean()
count_cv_nb_cenario3 = scores_count_cv_nb_cenario3.mean()

#faz média global das validações cruzadas
count_media_cenario3 = (count_cv_gnb_cenario3 + count_cv_svc_cenario3 + count_cv_lr_cenario3 + count_cv_nb_cenario3) / 4 

print(count_media_cenario3)

0.7257812499999999


## Cenário 4

Tokenization; Stemming

In [103]:
# Faz Tokenização
cenario4 = train_text
for i in range(len(cenario4)):
    cenario4[i] = cenario4[i].split()
#print(cenario4)

In [104]:
stemmer = stem.porter.PorterStemmer()

for i in range(len(cenario4)):
    for j in range(len(cenario4[i])):
        cenario4[i][j] = stemmer.stem(cenario4[i][j])
#print(cenario4[:2])

In [105]:
def untokenize(cenario4):
    for tokens in cenario4:
        yield ' '.join(tokens)

untokenized_data_cenario4 = list(untokenize(cenario4))
#print(untokenized_data)

In [106]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario4

vectorizer = CountVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario4 = vectorizer.fit_transform(train_pp)
test_X_cenario4 = vectorizer.transform(test_text)
print(train_X_cenario4.shape, test_X_cenario4.shape)

(320, 1500) (80, 1500)


In [107]:
train_X_cenario4 = train_X_cenario4.todense()
test_X_cenario4 = test_X_cenario4.todense()

In [108]:
#Cria modelo
nb = MultinomialNB()
model_count_nb_cenario4 = nb.fit(train_X_cenario4,train_recommended)
y_pred = model_count_nb_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario4)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 30 out of 80
Accuracy:  0.625
Precision:  0.6260162601626016
Recall:  0.6266499057196732
F1-measure:  0.6247654784240149


In [109]:
#Cria modelo

lr = LogisticRegression(max_iter=500)
model_count_lr_cenario4 = lr.fit(train_X_cenario4, train_recommended)
y_pred = model_count_lr_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario4)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 31 out of 80
Accuracy:  0.6125
Precision:  0.6908602150537635
Recall:  0.6338780641106223
F1-measure:  0.5893359827786058


In [110]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_count_svc_cenario4 = svmc.fit(train_X_cenario4, train_recommended)
y_pred = model_count_svc_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario4)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 33 out of 80
Accuracy:  0.5875
Precision:  0.6308777429467085
Recall:  0.6049654305468259
F1-measure:  0.5724696356275304


In [111]:
#Cria modelo

gnb = GaussianNB()
model_count_gnb_cenario4 = gnb.fit(train_X_cenario4, train_recommended)
y_pred = model_count_gnb_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), train_X_cenario4.shape[0]))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 24 out of 320
Accuracy:  0.7
Precision:  0.737246680642907
Recall:  0.7133878064110621
F1-measure:  0.6952380952380952


In [112]:
#Faz validação cruzada  - k = 10
scores_count_cv_gnb_cenario4 = cross_val_score(model_count_gnb_cenario4, train_X_cenario4, y=train_recommended, cv=10)
scores_count_cv_svc_cenario4 = cross_val_score(model_count_svc_cenario4, train_X_cenario4, y=train_recommended, cv=10)
scores_count_cv_lr_cenario4 = cross_val_score(model_count_lr_cenario4, train_X_cenario4, y=train_recommended, cv=10)
scores_count_cv_nb_cenario4 = cross_val_score(model_count_nb_cenario4, train_X_cenario4, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
count_cv_gnb_cenario4 = scores_count_cv_gnb_cenario4.mean()
count_cv_svc_cenario4 = scores_count_cv_svc_cenario4.mean()
count_cv_lr_cenario4 = scores_count_cv_lr_cenario4.mean()
count_cv_nb_cenario4 = scores_count_cv_nb_cenario4.mean()

#faz média global das validações cruzadas
count_media_cenario4 = (count_cv_gnb_cenario4 + count_cv_svc_cenario4 + count_cv_lr_cenario4 + count_cv_nb_cenario4) / 4 

print(count_media_cenario4)

0.7015625


In [113]:
print(count_media)
print(count_media_cenario1)
print(count_media_cenario2)
print(count_media_cenario3)
print(count_media_cenario4)

0.7359375
0.72734375
0.7281249999999999
0.7257812499999999
0.7015625


## 2.3 - Aprendizagem automática a usar a biblioteca SKLEARN - TfidfVectorizer()

In [114]:
#resetar o input 
df_train = pd.read_csv("TM/data/en-sentiment/SFU_Review_Corpus_train.csv", encoding="utf-8")
df_test = pd.read_csv("TM/data/en-sentiment/SFU_Review_Corpus_test.csv", encoding="utf-8")

test_text = df_test['text'].tolist()
test_recommended = df_test['recommended'].tolist()
train_text = df_train['text'].tolist()
train_recommended = df_train['recommended'].tolist()

In [115]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.

vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X = vectorizer.fit_transform(train_text)
test_X = vectorizer.transform(test_text)
print(train_X.shape, test_X.shape)

(320, 1500) (80, 1500)


In [116]:
train_X = train_X.todense()
test_X = test_X.todense()

In [117]:
#Cria modelo
nb = MultinomialNB()
model_tfidf_nb = nb.fit(train_X,train_recommended)
y_pred = model_tfidf_nb.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 24 out of 80
Accuracy:  0.7
Precision:  0.7195523370638578
Recall:  0.7096165933375236
F1-measure:  0.6983029541169077


In [118]:
#Cria modelo
lr = LogisticRegression(max_iter=500)
model_tfidf_lr=lr.fit(train_X, train_recommended)
y_pred = model_tfidf_lr.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 24 out of 80
Accuracy:  0.7
Precision:  0.7275185936443542
Recall:  0.711502199874293
F1-measure:  0.696969696969697


In [119]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_tfidf_svc = svmc.fit(train_X, train_recommended)
y_pred = model_tfidf_svc.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 18 out of 80
Accuracy:  0.775
Precision:  0.7841269841269841
Recall:  0.781269641734758
F1-measure:  0.7748592870544091


In [120]:
#Cria modelo
gnb = GaussianNB()
model_tfidf_gnb = gnb.fit(train_X, train_recommended)
y_pred = model_tfidf_gnb.predict(test_X)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), test_X.shape[0]))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 20 out of 80
Accuracy:  0.75
Precision:  0.7510944340212633
Recall:  0.7523570081709616
F1-measure:  0.7498436522826766


In [121]:
#Faz validação cruzada  - k = 10
scores_tfidf_cv_gnb = cross_val_score(model_tfidf_gnb, train_X, y=train_recommended, cv=10)
scores_tfidf_cv_svc = cross_val_score(model_tfidf_svc, train_X, y=train_recommended, cv=10)
scores_tfidf_cv_lr = cross_val_score(model_tfidf_lr, train_X, y=train_recommended, cv=10)
scores_tfidf_cv_nb = cross_val_score(model_tfidf_nb, train_X, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
tfidf_cv_gnb = scores_tfidf_cv_gnb.mean()
tfidf_cv_svc = scores_tfidf_cv_svc.mean()
tfidf_cv_lr = scores_tfidf_cv_lr.mean()
tfidf_cv_nb = scores_tfidf_cv_nb.mean()

#faz média global das validações cruzadas
tfidf_media = (tfidf_cv_gnb + tfidf_cv_svc + tfidf_cv_lr + tfidf_cv_nb) / 4 

print(tfidf_media)

0.734375


# 2.3.1 - Aprendizagem Automática com Pre-processamento

## Cenário 1 
Remoção da pontuação; Lowerization; Tokenization; Lemmatization; Stemming

In [122]:
# Remove pontuação
import string

train_text_pp = train_text
for text in range(len(train_text)):
    train_text_new = ["".join([char for char in text if char.isalnum() or char == " "]) for text in train_text_pp]

In [123]:
cenario1 =[]

for i, text in enumerate(train_text_new):
    cenario1.append(text)
    cenario1[i] = text.lower()

In [124]:
# Faz Tokenização
for i in range(len(cenario1)):
    cenario1[i] = cenario1[i].split()
#print(cenario1)

In [125]:
 # Faz Lemmatização
lemmatizer = WordNetLemmatizer()

for i in range(len(cenario1)):
    for j in range(len(cenario1[i])):
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'v') #verbo
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'a') #adjetivo
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'n') #nomes
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 's') #adjetivo saturado
        cenario1[i][j] = lemmatizer.lemmatize(cenario1[i][j], 'r') #adverbio


#print(cenario1[0])

In [126]:
stemmer = stem.porter.PorterStemmer()

for i in range(len(cenario1)):
    for j in range(len(cenario1[i])):
        cenario1[i][j] = stemmer.stem(cenario1[i][j])
#print(cenario1[:2])

In [127]:
def untokenize(cenario1):
    for tokens in cenario1:
        yield ' '.join(tokens)

untokenized_data_cenario1 = list(untokenize(cenario1))
#print(untokenized_data)

In [128]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario1

vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario1 = vectorizer.fit_transform(train_pp)
test_X_cenario1 = vectorizer.transform(test_text)
print(train_X_cenario1.shape, test_X_cenario1.shape)

(320, 1500) (80, 1500)


In [129]:
train_X_cenario1 = train_X_cenario1.todense()
test_X_cenario1 = test_X_cenario1.todense()

In [130]:
#Cria modelo
nb = MultinomialNB()
model_tfidf_nb_cenario1 = nb.fit(train_X_cenario1,train_recommended)
y_pred = model_tfidf_nb_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario1)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 23 out of 80
Accuracy:  0.7125
Precision:  0.7184343434343434
Recall:  0.7174732872407291
F1-measure:  0.7124550711048601


In [131]:
#Cria modelo
lr = LogisticRegression(max_iter=500)
model_tfidf_lr_cenario1=lr.fit(train_X_cenario1, train_recommended)
y_pred = model_tfidf_lr_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario1)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 32 out of 80
Accuracy:  0.6
Precision:  0.6414950419527079
Recall:  0.61659333752357
F1-measure:  0.5873629916183107


In [132]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_tfidf_svc_cenario1 = svmc.fit(train_X_cenario1, train_recommended)
y_pred = model_tfidf_svc_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario1)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 34 out of 80
Accuracy:  0.575
Precision:  0.6036363636363636
Recall:  0.5895663104965431
F1-measure:  0.5652173913043478


In [133]:
#Cria modelo
gnb = GaussianNB()
model_tfidf_gnb_cenario1 = gnb.fit(train_X_cenario1, train_recommended)
y_pred = model_tfidf_gnb_cenario1.predict(test_X_cenario1)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), test_X_cenario1.shape[0]))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 21 out of 80
Accuracy:  0.7375
Precision:  0.7362155388471178
Recall:  0.7369578881206789
F1-measure:  0.7364705882352942


In [134]:
#Faz validação cruzada  - k = 10
scores_tfidf_cv_gnb_cenario1 = cross_val_score(model_tfidf_gnb_cenario1, train_X_cenario1, y=train_recommended, cv=10)
scores_tfidf_cv_svc_cenario1 = cross_val_score(model_tfidf_svc_cenario1, train_X_cenario1, y=train_recommended, cv=10)
scores_tfidf_cv_lr_cenario1 = cross_val_score(model_tfidf_lr_cenario1, train_X_cenario1, y=train_recommended, cv=10)
scores_tfidf_cv_nb_cenario1 = cross_val_score(model_tfidf_nb_cenario1, train_X_cenario1, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
tfidf_cv_gnb_cenario1 = scores_tfidf_cv_gnb_cenario1.mean()
tfidf_cv_svc_cenario1 = scores_tfidf_cv_svc_cenario1.mean()
tfidf_cv_lr_cenario1 = scores_tfidf_cv_lr_cenario1.mean()
tfidf_cv_nb_cenario1 = scores_tfidf_cv_nb_cenario1.mean()

#faz média global das validações cruzadas
tfidf_media_cenario1 = (tfidf_cv_gnb_cenario1 + tfidf_cv_svc_cenario1 + tfidf_cv_lr_cenario1 + tfidf_cv_nb_cenario1) / 4 

print(tfidf_media_cenario1)

0.75625


## Cenário 2 
Remoção da pontuação; Lowerization; Tokenization; Lemmatization;

In [135]:
# Remove pontuação
import string

train_text_pp = train_text
for text in range(len(train_text)):
    train_text_new = ["".join([char for char in text if char.isalnum() or char == " "]) for text in train_text_pp]

In [136]:
cenario2 =[]

for i, text in enumerate(train_text_new):
    cenario2.append(text)
    cenario2[i] = text.lower()

In [137]:
# Faz Tokenização
for i in range(len(cenario2)):
    cenario2[i] = cenario2[i].split()
#print(cenario2)

In [138]:
 # Faz Lemmatização
lemmatizer = WordNetLemmatizer()

for i in range(len(cenario2)):
    for j in range(len(cenario2[i])):
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'v') #verbo
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'a') #adjetivo
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'n') #nomes
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 's') #adjetivo saturado
        cenario2[i][j] = lemmatizer.lemmatize(cenario2[i][j], 'r') #adverbio


#print(cenario2[0])

In [139]:
def untokenize(cenario2):
    for tokens in cenario2:
        yield ' '.join(tokens)

untokenized_data_cenario2 = list(untokenize(cenario2))
#print(untokenized_data)

In [140]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario2

vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario2 = vectorizer.fit_transform(train_pp)
test_X_cenario2 = vectorizer.transform(test_text)
print(train_X_cenario2.shape, test_X_cenario2.shape)

(320, 1500) (80, 1500)


In [141]:
train_X_cenario2 = train_X_cenario2.todense()
test_X_cenario2 = test_X_cenario2.todense()

In [142]:
#Cria modelo
nb = MultinomialNB()
model_tfidf_nb_cenario2 = nb.fit(train_X_cenario2,train_recommended)
y_pred = model_tfidf_nb_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 25 out of 80
Accuracy:  0.6875
Precision:  0.6975703324808185
Recall:  0.6942174732872407
F1-measure:  0.6870599280237835


In [143]:
#Cria modelo
lr = LogisticRegression(max_iter=500)
model_tfidf_lr_cenario2=lr.fit(train_X_cenario2, train_recommended)
y_pred = model_tfidf_lr_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 23 out of 80
Accuracy:  0.7125
Precision:  0.7231457800511509
Recall:  0.7193588937774984
F1-measure:  0.7120951337818808


In [144]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_tfidf_svc_cenario2 = svmc.fit(train_X_cenario2, train_recommended)
y_pred = model_tfidf_svc_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))



Mislabeled points: 23 out of 80
Accuracy:  0.7125
Precision:  0.7231457800511509
Recall:  0.7193588937774984
F1-measure:  0.7120951337818808


In [145]:
#Cria modelo
gnb = GaussianNB()
model_tfidf_gnb_cenario2 = gnb.fit(train_X_cenario2, train_recommended)
y_pred = model_tfidf_gnb_cenario2.predict(test_X_cenario2)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario2)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 19 out of 80
Accuracy:  0.7625
Precision:  0.7625
Recall:  0.7639849151477058
F1-measure:  0.7621655452980753


In [146]:
#Faz validação cruzada  - k = 10
scores_tfidf_cv_gnb_cenario2 = cross_val_score(model_tfidf_gnb_cenario2, train_X_cenario2, y=train_recommended, cv=10)
scores_tfidf_cv_svc_cenario2 = cross_val_score(model_tfidf_svc_cenario2, train_X_cenario2, y=train_recommended, cv=10)
scores_tfidf_cv_lr_cenario2 = cross_val_score(model_tfidf_lr_cenario2, train_X_cenario2, y=train_recommended, cv=10)
scores_tfidf_cv_nb_cenario2 = cross_val_score(model_tfidf_nb_cenario2, train_X_cenario2, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
tfidf_cv_gnb_cenario2 = scores_tfidf_cv_gnb_cenario2.mean()
tfidf_cv_svc_cenario2 = scores_tfidf_cv_svc_cenario2.mean()
tfidf_cv_lr_cenario2 = scores_tfidf_cv_lr_cenario2.mean()
tfidf_cv_nb_cenario2 = scores_tfidf_cv_nb_cenario2.mean()

#faz média global das validações cruzadas
tfidf_media_cenario2 = (tfidf_cv_gnb_cenario2 + tfidf_cv_svc_cenario2 + tfidf_cv_lr_cenario2 + tfidf_cv_nb_cenario2) / 4 

print(tfidf_media_cenario2)

0.7445312499999999


## Cenário 3

Remoção da pontuação; Lowerization; Tokenization; Stemming

In [147]:
# Remove pontuação
import string

train_text_pp = train_text
for text in range(len(train_text)):
    train_text_new = ["".join([char for char in text if char.isalnum() or char == " "]) for text in train_text_pp]

In [148]:
cenario3 =[]

for i, text in enumerate(train_text_new):
    cenario3.append(text)
    cenario3[i] = text.lower()

In [149]:
# Faz Tokenização
for i in range(len(cenario3)):
    cenario3[i] = cenario3[i].split()
#print(cenario3)

In [150]:
stemmer = stem.porter.PorterStemmer()

for i in range(len(cenario3)):
    for j in range(len(cenario3[i])):
        cenario3[i][j] = stemmer.stem(cenario3[i][j])
#print(cenario3[:2])

In [151]:
def untokenize(cenario3):
    for tokens in cenario3:
        yield ' '.join(tokens)

untokenized_data_cenario3 = list(untokenize(cenario3))
#print(untokenized_data)

In [152]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario3

vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario3 = vectorizer.fit_transform(train_pp)
test_X_cenario3 = vectorizer.transform(test_text)
print(train_X_cenario3.shape, test_X_cenario3.shape)


(320, 1500) (80, 1500)


In [153]:
train_X_cenario3 = train_X_cenario3.todense()
test_X_cenario3 = test_X_cenario3.todense()

In [154]:
#Cria modelo

nb = MultinomialNB()
model_tfidf_nb_cenario3 = nb.fit(train_X_cenario3,train_recommended)
y_pred = model_tfidf_nb_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario3)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 25 out of 80
Accuracy:  0.6875
Precision:  0.6931818181818181
Recall:  0.6923318667504714
F1-measure:  0.6874511642444132


In [155]:
#Cria modelo
lr = LogisticRegression(max_iter=500)
model_tfidf_lr_cenario3=lr.fit(train_X_cenario3, train_recommended)
y_pred = model_tfidf_lr_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario3)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 32 out of 80
Accuracy:  0.6
Precision:  0.652138821630347
Recall:  0.6184789440603394
F1-measure:  0.5833333333333334


In [156]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_tfidf_svc_cenario3 = svmc.fit(train_X_cenario3, train_recommended)
y_pred = model_tfidf_svc_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario3)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 32 out of 80
Accuracy:  0.6
Precision:  0.6327272727272727
Recall:  0.6147077309868008
F1-measure:  0.5907928388746803


In [157]:
#Cria modelo
gnb = GaussianNB()
model_tfidf_gnb_cenario3 = gnb.fit(train_X_cenario3, train_recommended)
y_pred = model_tfidf_gnb_cenario3.predict(test_X_cenario3)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), train_X_cenario3.shape[0]))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

gnb_tfidf_cenario3 = metrics.accuracy_score(test_recommended, y_pred)

Mislabeled points: 22 out of 320
Accuracy:  0.725
Precision:  0.7291011942174732
Recall:  0.7291011942174732
F1-measure:  0.725


In [158]:
#Faz validação cruzada  - k = 10
scores_tfidf_cv_gnb_cenario3 = cross_val_score(model_tfidf_gnb_cenario3, train_X_cenario3, y=train_recommended, cv=10)
scores_tfidf_cv_svc_cenario3 = cross_val_score(model_tfidf_svc_cenario3, train_X_cenario3, y=train_recommended, cv=10)
scores_tfidf_cv_lr_cenario3 = cross_val_score(model_tfidf_lr_cenario3, train_X_cenario3, y=train_recommended, cv=10)
scores_tfidf_cv_nb_cenario3 = cross_val_score(model_tfidf_nb_cenario3, train_X_cenario3, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
tfidf_cv_gnb_cenario3 = scores_tfidf_cv_gnb_cenario3.mean()
tfidf_cv_svc_cenario3 = scores_tfidf_cv_svc_cenario3.mean()
tfidf_cv_lr_cenario3 = scores_tfidf_cv_lr_cenario3.mean()
tfidf_cv_nb_cenario3 = scores_tfidf_cv_nb_cenario3.mean()

#faz média global das validações cruzadas
tfidf_media_cenario3 = (tfidf_cv_gnb_cenario3 + tfidf_cv_svc_cenario3 + tfidf_cv_lr_cenario3 + tfidf_cv_nb_cenario3) / 4 

print(tfidf_media_cenario3)

0.74765625


## Cenário 4

Tokenization; Stemming

In [159]:
# Faz Tokenização
cenario4 = train_text
for i in range(len(cenario4)):
    cenario4[i] = cenario4[i].split()
#print(cenario4)

In [160]:
stemmer = stem.porter.PorterStemmer()

for i in range(len(cenario4)):
    for j in range(len(cenario4[i])):
        cenario4[i][j] = stemmer.stem(cenario4[i][j])
#print(cenario4[:2])

In [161]:
def untokenize(cenario4):
    for tokens in cenario4:
        yield ' '.join(tokens)

untokenized_data_cenario4 = list(untokenize(cenario4))
#print(untokenized_data)

In [162]:
#converte palavras em números usando a abordagem do bag of words e define que vamos utilizar as 1500 palavras mais ocorrentes como features
#Min_df = 3 significa que essas palavras têm de ocorrer em pelo menos 3 comentários
#max_df = 0.7 significa que vamos incluir as palavras que ocorrem em no máximo 70% de todos os documentos, uma vez Palavras que ocorrem em quase todos os documentos geralmente não são adequadas para classificação porque não fornecem informações exclusivas sobre o documento.
train_pp = untokenized_data_cenario4

vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words= 'english')
train_X_cenario4 = vectorizer.fit_transform(train_pp)
test_X_cenario4 = vectorizer.transform(test_text)
print(train_X_cenario4.shape, test_X_cenario4.shape)

(320, 1500) (80, 1500)


In [163]:
train_X_cenario4 = train_X_cenario4.todense()
test_X_cenario4 = test_X_cenario4.todense()

In [164]:
#Cria modelo
nb = MultinomialNB()
model_tfidf_nb_cenario4 = nb.fit(train_X_cenario4,train_recommended)
y_pred = model_tfidf_nb_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario4)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

Mislabeled points: 24 out of 80
Accuracy:  0.7
Precision:  0.737246680642907
Recall:  0.7133878064110621
F1-measure:  0.6952380952380952


In [165]:
#Cria modelo
lr = LogisticRegression(max_iter=500)
model_tfidf_lr_cenario4=lr.fit(train_X_cenario4, train_recommended)
y_pred = model_tfidf_lr_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario4)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))


Mislabeled points: 26 out of 80
Accuracy:  0.675
Precision:  0.7489911218724778
Recall:  0.6939032055311125
F1-measure:  0.6614583333333333


In [166]:
#Cria modelo
svmc = LinearSVC(max_iter=500)
model_tfidf_svc_cenario4 = svmc.fit(train_X_cenario4, train_recommended)
y_pred = model_tfidf_svc_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario4)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))



Mislabeled points: 26 out of 80
Accuracy:  0.675
Precision:  0.7330282227307399
Recall:  0.6920175989943432
F1-measure:  0.6647324306898774


In [167]:
#Cria modelo
gnb = GaussianNB()
model_tfidf_gnb_cenario4 = gnb.fit(train_X_cenario4, train_recommended)
y_pred = model_tfidf_gnb_cenario4.predict(test_X_cenario4)
print("Mislabeled points: %d out of %d"% ((test_recommended!=y_pred).sum(), len(test_X_cenario4)))
print("Accuracy: ", metrics.accuracy_score(test_recommended, y_pred))
print("Precision: ", metrics.precision_score(test_recommended, y_pred, average="macro"))
print("Recall: ", metrics.recall_score(test_recommended, y_pred, average="macro"))
print("F1-measure: ", metrics.f1_score(test_recommended, y_pred, average="macro"))

Mislabeled points: 21 out of 80
Accuracy:  0.7375
Precision:  0.7436868686868687
Recall:  0.7426147077309868
F1-measure:  0.7374589779653071


In [168]:
#Faz validação cruzada  - k = 10
scores_tfidf_cv_gnb_cenario4 = cross_val_score(model_tfidf_gnb_cenario4, train_X_cenario4, y=train_recommended, cv=10)
scores_tfidf_cv_svc_cenario4 = cross_val_score(model_tfidf_svc_cenario4, train_X_cenario4, y=train_recommended, cv=10)
scores_tfidf_cv_lr_cenario4 = cross_val_score(model_tfidf_lr_cenario4, train_X_cenario4, y=train_recommended, cv=10)
scores_tfidf_cv_nb_cenario4 = cross_val_score(model_tfidf_nb_cenario4, train_X_cenario4, y=train_recommended, cv=10)

#faz média da validação cruzada para cada modelo
tfidf_cv_gnb_cenario4 = scores_tfidf_cv_gnb_cenario4.mean()
tfidf_cv_svc_cenario4 = scores_tfidf_cv_svc_cenario4.mean()
tfidf_cv_lr_cenario4 = scores_tfidf_cv_lr_cenario4.mean()
tfidf_cv_nb_cenario4 = scores_tfidf_cv_nb_cenario4.mean()

#faz média global das validações cruzadas
tfidf_media_cenario4 = (tfidf_cv_gnb_cenario4 + tfidf_cv_svc_cenario4 + tfidf_cv_lr_cenario4 + tfidf_cv_nb_cenario4) / 4 

print(tfidf_media_cenario4)

0.7343750000000001


In [169]:
#print médias global de cada experiência com o TF-IDF
print(tfidf_media)
print(tfidf_media_cenario1)
print(tfidf_media_cenario2)
print(tfidf_media_cenario3)
print(tfidf_media_cenario4)

0.734375
0.75625
0.7445312499999999
0.74765625
0.7343750000000001


In [170]:
#print médias global de cada experiência com vectorização por contagem de palavras
print(count_media)
print(count_media_cenario1)
print(count_media_cenario2)
print(count_media_cenario3)
print(count_media_cenario4)

0.7359375
0.72734375
0.7281249999999999
0.7257812499999999
0.7015625
