# NLTK + Scikit-Learn Integration

## Imports and Setup

In [37]:
# Importing libraries
import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics import confusionmatrix  # try using this later maybe and not just the accuracy score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# My own functions
from NLP_Functions import clean_up, tokenize, stem_and_lemmatize, remove_stopwords

In [38]:
# Importing the 5k processed comments sample DF
df = pd.read_json('Datasets/comments_5ksample.json')

In [40]:
df

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed,Target
0,Dungeon Defenders,PC,8,"To me, it's a ""Meh"" game. It does include some...",DragYou,"[meh, game, doe, includ, uniqu, featur, love, ...",Pos
1,Dota 2,PC,0,"Dead game is dead, even LoL has their harrowee...",LeoneRUSSSIANS,"[dead, game, dead, even, lol, ha, harroween, g...",Neg
2,Dragon Age II,PC,5,After having played DA2 for a while and calmin...,Rosiello,"[play, da, calm, initi, rage, took, month, bel...",Pos
3,Pokemon Moon,3DS,10,Story - Very GoodOnline - All rightMusic - The...,FlaffyJasmine,"[stori, veri, goodonlin, rightmus, best, pokem...",Pos
4,Bulletstorm,PlayStation3,10,My favorite FPS of all time! A bloody and fun ...,Freddo222,"[favorit, fp, time, bloodi, fun, shooter, love...",Pos
...,...,...,...,...,...,...,...
4995,FIFA Soccer 12,PC,6,"The gameplay can be as smooth as possible, but...",Minidivine,"[gameplay, smooth, possibl, one, problem, game...",Pos
4996,Call of Duty: Modern Warfare 2,PC,0,"The lack of dedicated server support, along w...",AlanB.,"[lack, dedic, server, support, along, player, ...",Neg
4997,Grand Theft Auto IV,PlayStation3,10,"Absolutely the best game this year, last year,...",MattWix,"[absolut, best, game, thi, year, last, year, p...",Pos
4998,Grand Theft Auto: San Andreas,PC,10,The first game I played seriously.And also my ...,John67,"[first, game, play, serious, also, favorit, ma...",Pos


In [None]:
# Testing opening the pickled naive bayes
## not using this for now
classifier_f = open('naivebayes.pickle', 'rb')
classifier_og = pickle.load(classifier_f)
classifier_f.close()

classifier_og  # this one is the trained algo with the whole comments df

para ter uma ideia do score exato posso fazer o sentiment analysis às reviews e criar uma coluna com esses valores
e depois pegar na label do classifier + o sentiment e tentar usar isso como X de um modelo de regressão e ver se ele consegue fazer algum tipo de regressão

In [41]:
# Defining again needed functions
# Building the features
def find_features(document, bow):
    text = document.lower()
    features = dict()
    for w, c in bow:
        features[w] = w in text
    return features

def make_matrix(series_text, series_target, bow):
    return [(find_features(s, bow), t) for s, t in zip(series_text.values, series_target.values)]

In [42]:
print(df['Target'].value_counts())  # 80.7% accuracy

# Changing labels to see how accuracy responds
df['Target_2'] = np.where((df['Userscore'] <= 5), 'Negative', 'Positive')
df['Target_2'] = np.where(((df['Userscore'] >= 6) & (df['Userscore'] <= 8)), 'Neutral', df['Target_2'])

df['Target_2'].value_counts()  # this made my accuracy drop to 57%!!!

Pos    4155
Neg     845
Name: Target, dtype: int64


Positive    2970
Negative    1019
Neutral     1011
Name: Target_2, dtype: int64

In [43]:
# Trying another label change to see how accuracy responds
df['Target_3'] = np.where((df['Userscore'] <= 5), 'Negative', 'Positive')
df['Target_3'] = np.where(((df['Userscore'] >= 6) & (df['Userscore'] <= 8)), 'Negative', df['Target_2'])

df['Target_3'].value_counts()  # 63.9% accuracy

Positive    2970
Negative    2030
Name: Target_3, dtype: int64

In [44]:
df.head()

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed,Target,Target_2,Target_3
0,Dungeon Defenders,PC,8,"To me, it's a ""Meh"" game. It does include some...",DragYou,"[meh, game, doe, includ, uniqu, featur, love, ...",Pos,Neutral,Negative
1,Dota 2,PC,0,"Dead game is dead, even LoL has their harrowee...",LeoneRUSSSIANS,"[dead, game, dead, even, lol, ha, harroween, g...",Neg,Negative,Negative
2,Dragon Age II,PC,5,After having played DA2 for a while and calmin...,Rosiello,"[play, da, calm, initi, rage, took, month, bel...",Pos,Negative,Negative
3,Pokemon Moon,3DS,10,Story - Very GoodOnline - All rightMusic - The...,FlaffyJasmine,"[stori, veri, goodonlin, rightmus, best, pokem...",Pos,Positive,Positive
4,Bulletstorm,PlayStation3,10,My favorite FPS of all time! A bloody and fun ...,Freddo222,"[favorit, fp, time, bloodi, fun, shooter, love...",Pos,Positive,Positive


## Initializing the Original NLTK NB Model

In [49]:
# test the whole confusion matrix instead of just accuracy to see if it all matches nicely

# INITIALIZING EVERYTHING TO BEGIN TESTING WITH DIFFERENT MODELS

# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features and making the matrix
matrix = make_matrix(df['Comment'], df['Target'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.20)  # xxx -> 20% of the data

# Training with 80% of the data and testing against the remaining 20%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier_1 = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
classifier_1.show_most_informative_features(15)

# Printing the model's accuracy
print('\n', 'Original NLTK NB accuracy:', str(round(nltk.classify.accuracy(classifier_1, testing_set) * 100, 2)) + '%')

Most Informative Features
                  refund = True              Neg : Pos    =     24.8 : 1.0
                 monitor = True              Neg : Pos    =     18.2 : 1.0
                 exercis = True              Neg : Pos    =     18.2 : 1.0
                 blatant = True              Neg : Pos    =     14.9 : 1.0
                 disgrac = True              Neg : Pos    =     14.9 : 1.0
                 lesbian = True              Neg : Pos    =     14.9 : 1.0
                 billion = True              Neg : Pos    =     14.9 : 1.0
                 unenjoy = True              Neg : Pos    =     14.9 : 1.0
                 stutter = True              Neg : Pos    =     14.9 : 1.0
                uninstal = True              Neg : Pos    =     14.9 : 1.0
                  insult = True              Neg : Pos    =     13.1 : 1.0
                  redeem = True              Neg : Pos    =     12.1 : 1.0
                  lesson = True              Neg : Pos    =     11.6 : 1.0

### Testing with Target_2 (Pos, Neg, Neutral)

In [50]:
# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features and making the matrix
matrix = make_matrix(df['Comment'], df['Target_2'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.20)  # xxx -> 20% of the data

# Training with 80% of the data and testing against the remaining 20%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier_2 = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
classifier_2.show_most_informative_features(15)

# Printing the model's accuracy
print('\n', 'Original NLTK NB accuracy with Target_2 labels:', 
      str(round(nltk.classify.accuracy(classifier_2, testing_set) * 100, 2)) + '%')

Most Informative Features
                laughabl = True           Negati : Positi =     25.9 : 1.0
                unbalanc = True           Negati : Positi =     22.1 : 1.0
                   worst = True           Negati : Positi =     21.0 : 1.0
                  recycl = True           Negati : Positi =     18.2 : 1.0
                  insult = True           Negati : Positi =     17.8 : 1.0
                    hash = True           Negati : Positi =     17.8 : 1.0
                  pathet = True           Negati : Positi =     16.0 : 1.0
                  garbag = True           Negati : Positi =     14.8 : 1.0
                   invis = True           Negati : Positi =     14.4 : 1.0
                     cow = True           Negati : Positi =     14.4 : 1.0
                  forest = True           Neutra : Positi =     14.2 : 1.0
                  rental = True           Neutra : Positi =     14.2 : 1.0
                 smaller = True           Neutra : Positi =     13.0 : 1.0

### Testing with Target_3 (Pos, Neg -> pretty well-balanced now)

In [51]:
# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features and making the matrix
matrix = make_matrix(df['Comment'], df['Target_3'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.20)  # xxx -> 20% of the data

# Training with 80% of the data and testing against the remaining 20%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier_3 = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
classifier_3.show_most_informative_features(15)

# Printing the model's accuracy
print('\n', 'Original NLTK NB accuracy with more balanced labels (Target_3):', 
      str(round(nltk.classify.accuracy(classifier_3, testing_set) * 100, 2)) + '%')

Most Informative Features
                unbalanc = True           Negati : Positi =     18.6 : 1.0
                laughabl = True           Negati : Positi =     15.7 : 1.0
                  recycl = True           Negati : Positi =     15.7 : 1.0
                   worst = True           Negati : Positi =     13.5 : 1.0
                   invis = True           Negati : Positi =     12.9 : 1.0
                    hash = True           Negati : Positi =     11.1 : 1.0
                   circl = True           Negati : Positi =     11.0 : 1.0
                  pathet = True           Negati : Positi =     10.8 : 1.0
                  insult = True           Negati : Positi =     10.0 : 1.0
                  forest = True           Negati : Positi =     10.0 : 1.0
                   queue = True           Negati : Positi =      9.0 : 1.0
                  disast = True           Negati : Positi =      9.0 : 1.0
                  atroci = True           Negati : Positi =      9.0 : 1.0

## Trying to Predict the Label

In [52]:
# pode ser interessante ter uma coluna com o sentiment score e depois fazer um scale p estar de 0-10 e um t-test no
# final para comparar a media deles com a original
# se houver correspondencia posso usar o input p fazer uma regressão?

# List comprehension to make the predictions for the whole testing_set
pred_list_1 = [classifier_1.classify(testing_set[i][0]) for i in range(len(testing_set))]

In [53]:
# This was with the original Target label column
pred_list_1

['Pos',
 'Neg',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Neg',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Neg',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Neg',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Neg',
 'Neg',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',
 'Pos',


In [54]:
# Predictions with the 'Target_2' column
pred_list_2 = [classifier_2.classify(testing_set[i][0]) for i in range(len(testing_set))]

In [56]:
pred_list_2

['Positive',
 'Negative',
 'Neutral',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Neutral',
 'Positive',
 'Neutral',
 'Neutral',
 'Positive',
 'Positive',
 'Neutral',
 'Neutral',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Negative',
 'Negative',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Neutral',
 'Neutral',
 'Negative',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Neutral',
 'Neutral',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Neutral',
 'Neutral',
 'Positiv

In [55]:
# Predictions with the 'Target_3' column
pred_list_3 = [classifier_3.classify(testing_set[i][0]) for i in range(len(testing_set))]

In [57]:
pred_list_3

['Positive',
 'Negative',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Negative',
 'Positive',
 'Positive',
 'Negative',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Negative',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Negative',
 'Negative',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Negative',
 'Negative',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Negative',
 'Positive',
 'Positive',
 'Negative',

## Testing with other models

### Multinomial NB

In [61]:
# Testing the Multinomial NB model with Target column

# Building the features and making the matrix
matrix_1 = make_matrix(df['Comment'], df['Target'], most_common)

# Defining the size to use for the training and testing
size_1 = int(len(matrix_1) * 0.20)

# Training with 80% of the data and testing against the remaining 20%
training_set_1 = matrix_1[size_1:]
testing_set_1 = matrix_1[:size_1]

MNB_classifier_1 = SklearnClassifier(MultinomialNB())
MNB_classifier_1.train(training_set_1)
#MNB_classifier_1.show_most_informative_features(15)  # thought this would work, but guess not
## would prob be the same features maybe??

print('MultinomialNB accuracy (Target):', 
      str(round(nltk.classify.accuracy(MNB_classifier_1, testing_set_1) * 100, 2)) + '%')

MultinomialNB accuracy (Target): 87.2%


In [64]:
# Testing the Multinomial NB model with Target_2 column

# Building the features and making the matrix
matrix_2 = make_matrix(df['Comment'], df['Target_2'], most_common)

# Defining the size to use for the training and testing
size_2 = int(len(matrix_2) * 0.20)

# Training with 80% of the data and testing against the remaining 20%
training_set_2 = matrix_2[size_2:]
testing_set_2 = matrix_2[:size_2]

MNB_classifier_2 = SklearnClassifier(MultinomialNB())
MNB_classifier_2.train(training_set_2)
#MNB_classifier_1.show_most_informative_features(15)  # thought this would work, but guess not
## would prob be the same features maybe??

print('MultinomialNB accuracy (Target_2):', 
      str(round(nltk.classify.accuracy(MNB_classifier_2, testing_set_2) * 100, 2)) + '%')

MultinomialNB accuracy (Target_2): 67.5%


In [65]:
# Testing the Multinomial NB model with Target_3 column

# Building the features and making the matrix
matrix_3 = make_matrix(df['Comment'], df['Target_3'], most_common)

# Defining the size to use for the training and testing
size_3 = int(len(matrix_3) * 0.20)

# Training with 80% of the data and testing against the remaining 20%
training_set_3 = matrix_3[size_3:]
testing_set_3 = matrix_3[:size_3]

MNB_classifier_3 = SklearnClassifier(MultinomialNB())
MNB_classifier_3.train(training_set_3)
#MNB_classifier_3.show_most_informative_features(15)  # thought this would work, but guess not
## would prob be the same features maybe??

print('MultinomialNB accuracy (Target_3):', 
      str(round(nltk.classify.accuracy(MNB_classifier_3, testing_set_3) * 100, 2)) + '%')

MultinomialNB accuracy (Target_3): 73.8%


### Bernoulli NB

In [19]:
# Testing the Bernoulli NB model
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)

print('BernoulliNB accuracy:', str(round(nltk.classify.accuracy(BNB_classifier, testing_set) * 100, 2)) + '%')

BernoulliNB accuracy: 83.2%


### Logistic Regression

In [68]:
LogisticRegression_classifier_1 = SklearnClassifier(LogisticRegression(max_iter = 200))
LogisticRegression_classifier_1.train(training_set_1)

print('Logistic Regression accuracy (Target):', 
      str(round(nltk.classify.accuracy(LogisticRegression_classifier_1, testing_set_1) * 100, 2)) + '%')

Logistic Regression accuracy (Target): 87.4%


In [69]:
LogisticRegression_classifier_2 = SklearnClassifier(LogisticRegression(max_iter = 200))
LogisticRegression_classifier_2.train(training_set_2)

print('Logistic Regression accuracy (Target_2):', 
      str(round(nltk.classify.accuracy(LogisticRegression_classifier_2, testing_set_2) * 100, 2)) + '%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression accuracy (Target_2): 67.8%


In [70]:
LogisticRegression_classifier_3 = SklearnClassifier(LogisticRegression(max_iter = 200))
LogisticRegression_classifier_3.train(training_set_3)

print('Logistic Regression accuracy (Target_3):', 
      str(round(nltk.classify.accuracy(LogisticRegression_classifier_3, testing_set_3) * 100, 2)) + '%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression accuracy (Target_3): 75.4%


### SGD Classifier

In [21]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print('SGDClassifier_classifier accuracy:', 
      str(round(nltk.classify.accuracy(SGDClassifier_classifier, testing_set) * 100, 2)) + '%')

SGDClassifier_classifier accuracy: 84.4%


### SVC

In [22]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)

print('SVC_classifier accuracy:', 
      str(round(nltk.classify.accuracy(SVC_classifier, testing_set) * 100, 2)) + '%')

SVC_classifier accuracy: 85.2%


### Linear SVC

In [23]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print('LinearSVC_classifier accuracy:', 
      str(round(nltk.classify.accuracy(LinearSVC_classifier, testing_set) * 100, 2)) + '%')

LinearSVC_classifier accuracy: 85.2%


### NuSVC

In [None]:
# This one isn't working
'''NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print('NuSVC_classifier accuracy:', 
      str(round(nltk.classify.accuracy(NuSVC_classifier, testing_set) * 100, 2)) + '%')'''