# NLTK + Scikit-Learn Integration

## Imports and Setup

In [20]:
# Importing libraries
import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics import confusionmatrix  # try using this later maybe and not just the accuracy score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# My own functions
from NLP_Functions import find_features, make_matrix ,clean_up, tokenize, stem_and_lemmatize, remove_stopwords

In [2]:
# Load the pre-processed sample of 5k user comments from its JSON export,
# then echo the frame so its columns can be inspected.
df = pd.read_json(
    'Datasets/comments_5ksample.json',
)
df

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed,Target
0,Dungeon Defenders,PC,8,"To me, it's a ""Meh"" game. It does include some...",DragYou,"[meh, game, doe, includ, uniqu, featur, love, ...",Pos
1,Dota 2,PC,0,"Dead game is dead, even LoL has their harrowee...",LeoneRUSSSIANS,"[dead, game, dead, even, lol, ha, harroween, g...",Neg
2,Dragon Age II,PC,5,After having played DA2 for a while and calmin...,Rosiello,"[play, da, calm, initi, rage, took, month, bel...",Pos
3,Pokemon Moon,3DS,10,Story - Very GoodOnline - All rightMusic - The...,FlaffyJasmine,"[stori, veri, goodonlin, rightmus, best, pokem...",Pos
4,Bulletstorm,PlayStation3,10,My favorite FPS of all time! A bloody and fun ...,Freddo222,"[favorit, fp, time, bloodi, fun, shooter, love...",Pos
...,...,...,...,...,...,...,...
4995,FIFA Soccer 12,PC,6,"The gameplay can be as smooth as possible, but...",Minidivine,"[gameplay, smooth, possibl, one, problem, game...",Pos
4996,Call of Duty: Modern Warfare 2,PC,0,"The lack of dedicated server support, along w...",AlanB.,"[lack, dedic, server, support, along, player, ...",Neg
4997,Grand Theft Auto IV,PlayStation3,10,"Absolutely the best game this year, last year,...",MattWix,"[absolut, best, game, thi, year, last, year, p...",Pos
4998,Grand Theft Auto: San Andreas,PC,10,The first game I played seriously.And also my ...,John67,"[first, game, play, serious, also, favorit, ma...",Pos


In [None]:
# Testing opening the pickled naive bayes
## not using this for now
#classifier_f = open('naivebayes.pickle', 'rb')
#classifier_og = pickle.load(classifier_f)
#classifier_f.close()

#classifier_og  # this one is the trained algo with the whole comments df

To get an idea of the exact score, I can run sentiment analysis on the reviews and create a column with those values,
and then take the classifier's label plus the sentiment score, try using them as the X of a regression model, and see whether it can produce some kind of regression.

In [4]:
# Class balance of the original binary label
print(df['Target'].value_counts())  # 80.7% accuracy

# Three-class relabelling, to see how accuracy responds:
#   Userscore <= 5 -> Negative, 6 to 8 -> Neutral, everything else -> Positive
# TODO: try resampling here - leave some positives out so the classes are more balanced
df['Target_2'] = np.select(
    [df['Userscore'] <= 5, (df['Userscore'] >= 6) & (df['Userscore'] <= 8)],
    ['Negative', 'Neutral'],
    default='Positive',
)

df['Target_2'].value_counts()  # this made my accuracy drop to 57%!!!

Pos    4155
Neg     845
Name: Target, dtype: int64


Positive    2970
Negative    1019
Neutral     1011
Name: Target_2, dtype: int64

In [5]:
# Trying another label change to see how accuracy responds
df['Target_3'] = np.where((df['Userscore'] <= 5), 'Negative', 'Positive')
df['Target_3'] = np.where(((df['Userscore'] >= 6) & (df['Userscore'] <= 8)), 'Negative', df['Target_2'])

df['Target_3'].value_counts()  # 63.9% accuracy

Positive    2970
Negative    2030
Name: Target_3, dtype: int64

In [6]:
df.head()

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed,Target,Target_2,Target_3
0,Dungeon Defenders,PC,8,"To me, it's a ""Meh"" game. It does include some...",DragYou,"[meh, game, doe, includ, uniqu, featur, love, ...",Pos,Neutral,Negative
1,Dota 2,PC,0,"Dead game is dead, even LoL has their harrowee...",LeoneRUSSSIANS,"[dead, game, dead, even, lol, ha, harroween, g...",Neg,Negative,Negative
2,Dragon Age II,PC,5,After having played DA2 for a while and calmin...,Rosiello,"[play, da, calm, initi, rage, took, month, bel...",Pos,Negative,Negative
3,Pokemon Moon,3DS,10,Story - Very GoodOnline - All rightMusic - The...,FlaffyJasmine,"[stori, veri, goodonlin, rightmus, best, pokem...",Pos,Positive,Positive
4,Bulletstorm,PlayStation3,10,My favorite FPS of all time! A bloody and fun ...,Freddo222,"[favorit, fp, time, bloodi, fun, shooter, love...",Pos,Positive,Positive


## Initializing the Original NLTK NB Model

In [7]:
# TODO: test the whole confusion matrix instead of just accuracy to see if it all matches nicely

# SHARED SETUP FOR ALL THE MODELS TESTED BELOW

# Bag of words: every token of every processed comment, flattened into one list
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Vocabulary: the 5k most frequent tokens, as (token, count) pairs
most_common = fdist.most_common(5000)

# Feature matrix for the raw comments, labelled with the original 'Target' column
matrix_1 = make_matrix(df['Comment'], df['Target'], most_common)

# The first 20% of the rows are held out for testing; the remaining 80% are for training
size_1 = int(len(matrix_1) * 0.20)
testing_set_1, training_set_1 = matrix_1[:size_1], matrix_1[size_1:]

# Train NLTK's own Naive Bayes and list its 15 most informative features
classifier_1 = nltk.NaiveBayesClassifier.train(training_set_1)
classifier_1.show_most_informative_features(15)

# Accuracy on the held-out rows, as a percentage with two decimals
print('\n', 'Original NLTK NB accuracy (Target):',
      f'{round(nltk.classify.accuracy(classifier_1, testing_set_1) * 100, 2)}%')

Most Informative Features
                  refund = True              Neg : Pos    =     24.8 : 1.0
                 monitor = True              Neg : Pos    =     18.2 : 1.0
                 exercis = True              Neg : Pos    =     18.2 : 1.0
                 unenjoy = True              Neg : Pos    =     14.9 : 1.0
                uninstal = True              Neg : Pos    =     14.9 : 1.0
                 stutter = True              Neg : Pos    =     14.9 : 1.0
                 blatant = True              Neg : Pos    =     14.9 : 1.0
                 lesbian = True              Neg : Pos    =     14.9 : 1.0
                 disgrac = True              Neg : Pos    =     14.9 : 1.0
                 billion = True              Neg : Pos    =     14.9 : 1.0
                  insult = True              Neg : Pos    =     13.1 : 1.0
                  redeem = True              Neg : Pos    =     12.1 : 1.0
                    blop = True              Neg : Pos    =     11.6 : 1.0

### Testing with Target_2 (Positive, Negative, Neutral)

In [8]:
# Feature matrix for the raw comments, labelled with the three-class 'Target_2' column
matrix_2 = make_matrix(df['Comment'], df['Target_2'], most_common)

# The first 20% of the rows are held out for testing; the remaining 80% are for training
size_2 = int(len(matrix_2) * 0.20)
testing_set_2, training_set_2 = matrix_2[:size_2], matrix_2[size_2:]

# Train NLTK's own Naive Bayes and list its 15 most informative features
classifier_2 = nltk.NaiveBayesClassifier.train(training_set_2)
classifier_2.show_most_informative_features(15)

# Accuracy on the held-out rows, as a percentage with two decimals
print('\n', 'Original NLTK NB accuracy (Target_2):',
      f'{round(nltk.classify.accuracy(classifier_2, testing_set_2) * 100, 2)}%')

Most Informative Features
                laughabl = True           Negati : Positi =     25.9 : 1.0
                unbalanc = True           Negati : Positi =     22.1 : 1.0
                   worst = True           Negati : Positi =     21.0 : 1.0
                  recycl = True           Negati : Positi =     18.2 : 1.0
                  insult = True           Negati : Positi =     17.8 : 1.0
                    hash = True           Negati : Positi =     17.8 : 1.0
                  pathet = True           Negati : Positi =     16.0 : 1.0
                  garbag = True           Negati : Positi =     14.8 : 1.0
                     cow = True           Negati : Positi =     14.4 : 1.0
                   invis = True           Negati : Positi =     14.4 : 1.0
                  rental = True           Neutra : Positi =     14.2 : 1.0
                  forest = True           Neutra : Positi =     14.2 : 1.0
                 smaller = True           Neutra : Positi =     13.0 : 1.0

### Testing with Target_3 (Positive, Negative --> pretty well-balanced now)

In [9]:
# Feature matrix for the raw comments, labelled with the binary 'Target_3' column
matrix_3 = make_matrix(df['Comment'], df['Target_3'], most_common)

# The first 20% of the rows are held out for testing; the remaining 80% are for training
size_3 = int(len(matrix_3) * 0.20)
testing_set_3, training_set_3 = matrix_3[:size_3], matrix_3[size_3:]

# Train NLTK's own Naive Bayes and list its 15 most informative features
classifier_3 = nltk.NaiveBayesClassifier.train(training_set_3)
classifier_3.show_most_informative_features(15)

# Accuracy on the held-out rows, as a percentage with two decimals
print('\n', 'Original NLTK NB accuracy (Target_3):',
      f'{round(nltk.classify.accuracy(classifier_3, testing_set_3) * 100, 2)}%')

Most Informative Features
                unbalanc = True           Negati : Positi =     18.6 : 1.0
                  recycl = True           Negati : Positi =     15.7 : 1.0
                laughabl = True           Negati : Positi =     15.7 : 1.0
                   worst = True           Negati : Positi =     13.5 : 1.0
                   invis = True           Negati : Positi =     12.9 : 1.0
                    hash = True           Negati : Positi =     11.1 : 1.0
                   circl = True           Negati : Positi =     11.0 : 1.0
                  pathet = True           Negati : Positi =     10.8 : 1.0
                  insult = True           Negati : Positi =     10.0 : 1.0
                  forest = True           Negati : Positi =     10.0 : 1.0
                  atroci = True           Negati : Positi =      9.0 : 1.0
                   queue = True           Negati : Positi =      9.0 : 1.0
                  disast = True           Negati : Positi =      9.0 : 1.0

## Predicting the Label of the 'testing_set'

In [10]:
# Idea: it could be interesting to add a sentiment-score column, rescale it to 0-10 and
# run a t-test at the end to compare its mean with the original one.
# If they match, could that be used as the input for a regression?

# Predicted label for each entry of testing_set_1 (entry[0] is what .classify receives)
pred_list_1 = [classifier_1.classify(features) for features, _ in testing_set_1]

In [11]:
# Predicted class counts with the original Target label column, one per line
# (recorded run: 780 'Pos', 220 'Neg')
print(pred_list_1.count('Pos'), pred_list_1.count('Neg'), sep='\n')

780
220


In [None]:
# Creating the bag of words
#bow = [word for lst in df['Comments_Processed'] for word in lst]
#fdist = FreqDist(bow)

# Getting just the 5k most common words
#most_common = fdist.most_common(5000)

# Building the features and making the matrix
#matrix_1 = make_matrix(df['Comment'], df['Target'], most_common)

In [184]:
df[['Comment', 'Target']].values#[0]

array([['To me, it\'s a "Meh" game. It does include some unique features, in which, I love, and the game also includes a certain magic that always drags me back to the game. An easy choice, go buy it, best game I\'ve played for awhile that was worth my money.',
        'Pos'],
       ['Dead game is dead, even LoL has their harroween.Garbage matchmakingGarbage report systemGarbage game engine results in 9 games without someone being able to load and you reenter the queue0 communication and 0 visibility no visible MMR no ladders)No region locks.You take a perfect game and turn it into worst game ever created.GG WP',
        'Neg'],
       ['After having played DA2 for a while and calming myself down from the initial rage - it took months believe - I can write a review of this game with reason.Is DA2 a bad game per se? No, it\'s not. It\'s fun in its own way, but If the game was called "Grandma Hawke killing ugly monsters" I wouldn\'t have problems with it, maybe I would have enjoyed it, 

In [224]:
sorted(most_common, key = lambda x: x[0])[1100:]
#['hate']

[('delux', 9),
 ('demand', 27),
 ('demasiado', 10),
 ('demo', 62),
 ('demograph', 6),
 ('demon', 92),
 ('demonstr', 5),
 ('den', 17),
 ('deni', 18),
 ('depart', 24),
 ('departur', 9),
 ('depend', 73),
 ('depress', 23),
 ('depth', 162),
 ('der', 10),
 ('derail', 6),
 ('dernier', 6),
 ('descent', 7),
 ('describ', 57),
 ('descript', 16),
 ('desd', 11),
 ('desert', 23),
 ('deserv', 217),
 ('design', 531),
 ('desir', 36),
 ('desk', 5),
 ('desktop', 10),
 ('desper', 9),
 ('despis', 7),
 ('despit', 164),
 ('despu', 6),
 ('destin', 9),
 ('destini', 86),
 ('destroy', 89),
 ('destruct', 59),
 ('detail', 270),
 ('detal', 10),
 ('detect', 40),
 ('determin', 18),
 ('detest', 5),
 ('detract', 20),
 ('deu', 32),
 ('dev', 67),
 ('devast', 6),
 ('develop', 489),
 ('devic', 11),
 ('devil', 22),
 ('devot', 12),
 ('dew', 5),
 ('dewitt', 6),
 ('dexter', 8),
 ('di', 37),
 ('diablo', 239),
 ('dialog', 43),
 ('dialogu', 171),
 ('diamond', 12),
 ('diari', 6),
 ('dice', 40),
 ('dick', 8),
 ('didnt', 80),
 ('die

In [225]:
# Build a featureset for a hand-written, clearly negative review and check that
# the word 'hate' is flagged in it.
a = find_features(
    'This game sucks so much. I hate it a lot. This is complete garbage',
    most_common,
)
a['hate']

True

In [227]:
classifier_3.classify(a)

'Positive'

In [207]:
# It seems I need a featureset (a dict) to be able to predict something

def text_cleaner(text):
    '''Run a raw string through the project's text-processing steps.

    .classify takes a featureset: a dict whose keys are the words (already
    processed, I think) and whose values are True or False - presumably whether
    the word is in the 5k most_common list (TODO confirm against find_features).
    With that in mind, this function cleans up the string it receives so that it
    is somewhat closer to the type of input .classify receives (aka the
    testing_set).

    Steps, in order: clean_up -> tokenize -> stem_and_lemmatize -> remove_stopwords.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    Whatever remove_stopwords returns for the processed text, e.g.
    ['love', 'thi', 'game'] for 'I love this game!! @@'.
    '''
    return remove_stopwords(stem_and_lemmatize(tokenize(clean_up(text))))
text_cleaner('I love this game!! @@')

#classifier_1.classify()
#testing_set_1[0][0]
#matrix_1[0][0]

['love', 'thi', 'game']

In [12]:
# Predicted label for each entry of testing_set_2 (model trained on the 'Target_2' column)
pred_list_2 = [classifier_2.classify(features) for features, _ in testing_set_2]

In [13]:
# Predicted class counts for Target_2, one per line
# (recorded run: 609 'Positive', 98 'Negative', 293 'Neutral')
print(
    pred_list_2.count('Positive'),
    pred_list_2.count('Negative'),
    pred_list_2.count('Neutral'),
    sep='\n',
)

609
98
293


In [16]:
# Predicted label for each entry of testing_set_3 (model trained on the 'Target_3' column)
pred_list_3 = [classifier_3.classify(features) for features, _ in testing_set_3]

In [17]:
# Predicted class counts for Target_3, one per line
# (recorded run: 620 'Positive', 380 'Negative')
print(pred_list_3.count('Positive'), pred_list_3.count('Negative'), sep='\n')

620
380


## Testing with other models

### Multinomial NB

In [None]:
# Multinomial NB (scikit-learn estimator wrapped for NLTK), trained on the Target column.
# NOTE: MNB_classifier_1.show_most_informative_features(15) was tried here and did not
# work on this wrapper; the features would probably be the same ones anyway.
MNB_classifier_1 = SklearnClassifier(MultinomialNB()).train(training_set_1)

print('MultinomialNB accuracy (Target):',
      f'{round(nltk.classify.accuracy(MNB_classifier_1, testing_set_1) * 100, 2)}%')

In [None]:
# Predictions with the Target column
pred_list_MNB_1 = [MNB_classifier_1.classify(testing_set_1[i][0]) for i in range(len(testing_set_1))]

In [None]:
print(pred_list_MNB_1.count('Pos'))
print(pred_list_MNB_1.count('Neg'))

In [None]:
# Multinomial NB (scikit-learn estimator wrapped for NLTK), trained on the Target_2 column.
# NOTE: show_most_informative_features(15) was tried on this kind of wrapper and did not
# work; the features would probably be the same ones anyway.
MNB_classifier_2 = SklearnClassifier(MultinomialNB()).train(training_set_2)

print('MultinomialNB accuracy (Target_2):',
      f'{round(nltk.classify.accuracy(MNB_classifier_2, testing_set_2) * 100, 2)}%')

In [None]:
# Multinomial NB (scikit-learn estimator wrapped for NLTK), trained on the Target_3 column.
# NOTE: MNB_classifier_3.show_most_informative_features(15) was tried here and did not
# work on this wrapper; the features would probably be the same ones anyway.
MNB_classifier_3 = SklearnClassifier(MultinomialNB()).train(training_set_3)

print('MultinomialNB accuracy (Target_3):',
      f'{round(nltk.classify.accuracy(MNB_classifier_3, testing_set_3) * 100, 2)}%')

### Bernoulli NB

In [None]:
# Bernoulli NB (scikit-learn estimator wrapped for NLTK), trained on the Target column
BNB_classifier = SklearnClassifier(BernoulliNB()).train(training_set_1)

print('BernoulliNB accuracy:',
      f'{round(nltk.classify.accuracy(BNB_classifier, testing_set_1) * 100, 2)}%')

### Logistic Regression

In [None]:
# Logistic regression (max 200 solver iterations), trained on the Target column
LogisticRegression_classifier_1 = SklearnClassifier(
    LogisticRegression(max_iter=200)
).train(training_set_1)

print('Logistic Regression accuracy (Target):',
      f'{round(nltk.classify.accuracy(LogisticRegression_classifier_1, testing_set_1) * 100, 2)}%')

In [None]:
# Logistic regression (max 200 solver iterations), trained on the Target_2 column
LogisticRegression_classifier_2 = SklearnClassifier(
    LogisticRegression(max_iter=200)
).train(training_set_2)

print('Logistic Regression accuracy (Target_2):',
      f'{round(nltk.classify.accuracy(LogisticRegression_classifier_2, testing_set_2) * 100, 2)}%')

In [None]:
# Logistic regression (max 300 solver iterations), trained on the Target_3 column
LogisticRegression_classifier_3 = SklearnClassifier(
    LogisticRegression(max_iter=300)
).train(training_set_3)

print('Logistic Regression accuracy (Target_3):',
      f'{round(nltk.classify.accuracy(LogisticRegression_classifier_3, testing_set_3) * 100, 2)}%')

### SGD Classifier

In [None]:
# SGD-trained linear classifier (scikit-learn estimator wrapped for NLTK), trained on
# the Target column.
# SGDClassifier shuffles the training data between epochs, so without a fixed seed the
# accuracy printed below changes from run to run; random_state pins it so the number
# is reproducible under Restart & Run All.
SGDClassifier_classifier = SklearnClassifier(SGDClassifier(random_state=42))
SGDClassifier_classifier.train(training_set_1)

print('SGDClassifier_classifier accuracy:', 
      str(round(nltk.classify.accuracy(SGDClassifier_classifier, testing_set_1) * 100, 2)) + '%')

### SVC

In [None]:
# Support vector classifier with scikit-learn's default settings, trained on the Target column
SVC_classifier = SklearnClassifier(SVC()).train(training_set_1)

print('SVC_classifier accuracy:',
      f'{round(nltk.classify.accuracy(SVC_classifier, testing_set_1) * 100, 2)}%')

### Linear SVC

In [None]:
# Linear support vector classifier with scikit-learn's default settings, trained on the Target column
LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training_set_1)

print('LinearSVC_classifier accuracy:',
      f'{round(nltk.classify.accuracy(LinearSVC_classifier, testing_set_1) * 100, 2)}%')