# NLTK + Scikit-Learn Integration

## Imports and Setup

In [17]:
# Importing libraries
import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [2]:
# Importing the 5k processed comments sample DF
df = pd.read_csv('Datasets/comments_5ksample.csv')

# Dropping the unnamed column
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [None]:
# Testing opening the pickled naive bayes
classifier_f = open('naivebayes.pickle', 'rb')
classifier_og = pickle.load(classifier_f)
classifier_f.close()

classifier_og  # this one is the trained algo with the whole comments df

In [10]:
# Functions that shouldn't be needed since I already had the cleaned dataframe from the previous notebook...
## still, had to get all of this again and now the outputs are working as expected...

# Defining functions to cleanup and process the comments
def clean_up(s):
    return re.sub(r'  *', ' ', re.sub(r'[^a-z]', ' ', 
                                      re.sub(r'www\.\S*', ' ', re.sub(r'http[s]?://\S*', ' ', s.lower())))).strip()

def tokenize(s):
    return nltk.word_tokenize(s)

def stem_and_lemmatize(l):
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()
    
    stemmed_list = [ps.stem(w) for w in l]
    lemmed_on_stemmed_list = [wnl.lemmatize(w) for w in stemmed_list]
    
    return lemmed_on_stemmed_list

def remove_stopwords(lst, lang = 'english'):
    stop_words = stopwords.words(lang)
    return [word for word in lst if word not in stop_words]


# actually needed functions
# Building the features
def find_features(document, bow):
    text = document.lower()
    features = dict()
    for w, c in bow:
        features[w] = w in text
    return features

def make_matrix(series_text, series_target, bow):
    return [(find_features(s, bow), t) for s, t in zip(series_text.values, series_target.values)]

In [11]:
# Doing this again, even though it shouldn't be necessary, but since it wasn't working...
df['Comments_Processed'] = df['Comment'].apply(lambda x: remove_stopwords(stem_and_lemmatize(tokenize(clean_up(x)))))
df

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed,Target
0,Pro Evolution Soccer 2015,PlayStation4,10,this game is absolutely awesome. i am happy th...,call_of_duty,"[thi, game, absolut, awesom, happi, took, one,...",Pos
1,Battlefield: Bad Company 2,PC,10,"The guy who said his PC can't run the game, I...",LentiniM,"[guy, said, hi, pc, run, game, guess, singl, c...",Pos
2,Sly Cooper and the Thievius Raccoonus,PlayStation2,8,A solid game however it does get repetitive ov...,AKthaBeast,"[solid, game, howev, doe, get, repetit, awhil,...",Pos
3,Call of Duty: Black Ops II,Xbox360,4,I was excited to pick this game up after readi...,ctruluck1324,"[wa, excit, pick, thi, game, read, review, cri...",Neg
4,Middle-earth: Shadow of Mordor,PlayStation4,10,This is by far the most fun I have had dismemb...,shadowmancer66,"[thi, far, fun, dismemb, orc, long, time, shad...",Pos
...,...,...,...,...,...,...,...
4995,The Legend of Zelda: A Link Between Worlds,3DS,10,A great follow up to 'A Link To The Past'. The...,ForeverFalling,"[great, follow, link, past, soundtrack, one, b...",Pos
4996,The Witcher 2: Assassins of Kings,PC,5,Alright let me begin by saying that this game ...,akelz7,"[alright, let, begin, say, thi, game, goti, ma...",Pos
4997,Fallout 4,PlayStation4,10,"This review contains spoilers, cli...",Ninja-Puffs,"[thi, review, contain, spoiler, click, expand,...",Pos
4998,NBA 2K18,PlayStation4,0,MyTeam is a joke. MyGm is the worst manager mo...,zizhazhu,"[myteam, joke, mygm, worst, manag, mode, exper...",Neg


## Initializing the Original NLTK NB Model

In [12]:
# INITIALIZING EVERYTHING TO BEGIN TESTING WITH DIFFERENT MODELS

# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features and making the matrix
matrix = make_matrix(df['Comment'], df['Target'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.25)  # xxx -> 25% of the data

# Training with 75% of the data and testing against the remaining 25%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
classifier.show_most_informative_features(15)

# Printing the model's accuracy
print('\n', 'Original NLTK NB accuracy:', str(round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)) + '%')

Most Informative Features
                   grace = True              Neg : Pos    =     13.6 : 1.0
               fundament = True              Neg : Pos    =     13.6 : 1.0
                 acclaim = True              Neg : Pos    =     13.6 : 1.0
                    peer = True              Neg : Pos    =     13.6 : 1.0
                    reus = True              Neg : Pos    =     13.6 : 1.0
                   messi = True              Neg : Pos    =     13.6 : 1.0
                    scam = True              Neg : Pos    =     13.6 : 1.0
                   queue = True              Neg : Pos    =     11.8 : 1.0
                  ticket = True              Neg : Pos    =     11.0 : 1.0
                 desktop = True              Neg : Pos    =     10.6 : 1.0
                  tester = True              Neg : Pos    =     10.6 : 1.0
                 fastest = True              Neg : Pos    =     10.6 : 1.0
                   drain = True              Neg : Pos    =     10.6 : 1.0

## Testing with other models

### Multinomial NB

In [18]:
# Testing the Multinomial NB model
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
#MNB_classifier.show_most_informative_features(15)  # thought this would work, but guess not

print('MultinomialNB accuracy:', str(round(nltk.classify.accuracy(MNB_classifier, testing_set) * 100, 2)) + '%')

MultinomialNB accuracy: 87.76%


### Bernoulli NB

In [19]:
# Testing the Bernoulli NB model
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)

print('BernoulliNB accuracy:', str(round(nltk.classify.accuracy(BNB_classifier, testing_set) * 100, 2)) + '%')

BernoulliNB accuracy: 83.2%


### Logistic Regression

In [20]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

print('LogisticRegression_classifier accuracy:', 
      str(round(nltk.classify.accuracy(LogisticRegression_classifier, testing_set) * 100, 2)) + '%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression_classifier accuracy: 87.2%


### SGD Classifier

In [21]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print('SGDClassifier_classifier accuracy:', 
      str(round(nltk.classify.accuracy(SGDClassifier_classifier, testing_set) * 100, 2)) + '%')

SGDClassifier_classifier accuracy: 84.4%


### SVC

In [22]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)

print('SVC_classifier accuracy:', 
      str(round(nltk.classify.accuracy(SVC_classifier, testing_set) * 100, 2)) + '%')

SVC_classifier accuracy: 85.2%


### Linear SVC

In [23]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print('LinearSVC_classifier accuracy:', 
      str(round(nltk.classify.accuracy(LinearSVC_classifier, testing_set) * 100, 2)) + '%')

LinearSVC_classifier accuracy: 85.2%


### NuSVC

In [None]:
# This one isn't working
'''NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print('NuSVC_classifier accuracy:', 
      str(round(nltk.classify.accuracy(NuSVC_classifier, testing_set) * 100, 2)) + '%')'''