# Application of chosen model on whole dataset

## Imports and setup

In [6]:
# Importing libraries
import pandas as pd
import numpy as np
from collections import Counter
import pickle

import nltk
from nltk.probability import FreqDist
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics import ConfusionMatrix

from sklearn.linear_model import LogisticRegression

# My own functions
from NLP_Functions import find_features, make_matrix, clean_up, tokenize, stem_and_lemmatize, remove_stopwords

In [7]:
# Load the cleaned comments dataset, using the first CSV column as the row index
df = pd.read_csv(
    'Datasets/games_comments_cleaned.csv',
    index_col=0,
)

# Preparing the dataset

## Creating 'Target' column

The model uses the 'Target' column. The 'Target_NPS' column is created as well, only to see how the class proportions differ between the two labelings across the whole dataset.

In [8]:
# Binary 'Target' label derived from 'Userscore'
## 9-10 positive
## 8 and below negative
scored_8_or_less = df['Userscore'] <= 8
df['Target'] = np.where(scored_8_or_less, 'Negative', 'Positive')

# Three-way 'Target_NPS' label
## NPS Scale - https://en.wikipedia.org/wiki/Net_Promoter
## 9-10 = positive
## 7-8 = neutral
## 0-6 = negative
# The two conditions never overlap; any row matching neither falls back to 'Positive'.
nps_conditions = [
    df['Userscore'] <= 6,
    df['Userscore'].between(7, 8),  # inclusive on both ends
]
nps_labels = ['Negative', 'Neutral']
df['Target_NPS'] = np.select(nps_conditions, nps_labels, default='Positive')

# Checking the class proportions of both labelings
## unsure if I should balance these or not
print(df['Target'].value_counts())
df['Target_NPS'].value_counts()

Positive    164629
Negative    117572
Name: Target, dtype: int64


Positive    164629
Negative     70385
Neutral      47187
Name: Target_NPS, dtype: int64

## Processing the comments

In [9]:
def _preprocess_comment(raw_comment):
    """Run one raw comment through the NLP_Functions helpers, in order:
    clean_up -> tokenize -> stem_and_lemmatize -> remove_stopwords.
    """
    cleaned = clean_up(raw_comment)
    tokens = tokenize(cleaned)
    normalized = stem_and_lemmatize(tokens)
    return remove_stopwords(normalized)


# Creating the 'Comments_Processed' column, one processed result per comment
df['Comments_Processed'] = df['Comment'].apply(_preprocess_comment)
## this took 25 mins to run...

In [10]:
# Preview the first rows to check the newly added 'Comments_Processed' column
df.head()

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Target,Target_NPS,Comments_Processed
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus,Positive,Positive,"[everyth, oot, near, perfect, realli, wonder, ..."
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin,Positive,Positive,"[bore, everyon, alreadi, say, amaz, thi, game,..."
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody,Positive,Positive,"[anyon, give, masterpiec, either, hate, astoun..."
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman,Positive,Positive,"[one, peopl, think, thi, greatest, game, time,..."
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA,Positive,Positive,"[thi, game, highest, rate, game, metacrit, goo..."


In [11]:
# Rearranging the column order so the processed comments sit right next to the raw ones
column_order = [
    'Title', 'Platform', 'Userscore',
    'Comment', 'Comments_Processed',
    'Username', 'Target', 'Target_NPS',
]
df = df[column_order]
df.head()

Unnamed: 0,Title,Platform,Userscore,Comment,Comments_Processed,Username,Target,Target_NPS
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...","[everyth, oot, near, perfect, realli, wonder, ...",SirCaestus,Positive,Positive
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,"[bore, everyon, alreadi, say, amaz, thi, game,...",Kaistlin,Positive,Positive
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,"[anyon, give, masterpiec, either, hate, astoun...",Jacody,Positive,Positive
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,"[one, peopl, think, thi, greatest, game, time,...",doodlerman,Positive,Positive
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,"[thi, game, highest, rate, game, metacrit, goo...",StevenA,Positive,Positive


In [12]:
# Checkpoint: write the processed DF to disk now, just to be safe,
# so the slow preprocessing step does not have to be repeated
df.to_json(path_or_buf='Datasets/comments_processed.json')

## Initializing the basic variables for the model

In [None]:
# Bag of words: every token of every processed comment, flattened into one list
bow = [token for tokens in df['Comments_Processed'] for token in tokens]
fdist = FreqDist(bow)

# Keeping just the 10k most common words
# (this cutoff is a tunable choice; trying with 10k this time)
most_common = fdist.most_common(10000)

# --- FOR THE TARGET COLUMN ---
# Building the features and making the matrix.
# NOTE(review): the raw 'Comment' text is passed here, not 'Comments_Processed';
# presumably make_matrix does its own preprocessing - confirm in NLP_Functions.
matrix = make_matrix(df['Comment'], df['Target'], most_common)

# Number of rows held out for testing (20% of the matrix)
size = int(len(matrix) * 0.20)

# The first 20% of the rows are used for testing, the remaining 80% for training.
# NOTE(review): the split is positional; unless make_matrix shuffles its output,
# the test set is simply the first rows of the dataset - verify.
testing_set = matrix[:size]
training_set = matrix[size:]

## Logistic Regression Model Application

In [None]:
# TESTING WITH TARGET
# 'saga' is a stochastic solver, so random_state is pinned to make the fitted
# model (and the accuracy printed below) reproducible across runs.
LogisticRegression_classifier = SklearnClassifier(
    LogisticRegression(solver='saga', n_jobs=-1, random_state=42)
)
LogisticRegression_classifier.train(training_set)
# result noted from an earlier (unseeded) run with the saga solver: 78.7%

# Share of testing_set rows whose predicted label matches the reference label
target_accuracy = nltk.classify.accuracy(LogisticRegression_classifier, testing_set)
print('Logistic Regression accuracy (Target):',
      str(round(target_accuracy * 100, 2)) + '%')

## Confusion Matrix

In [None]:
# Predictions of the testing_set with the Target column.
# classify_many labels the whole batch in a single call instead of calling
# classify once per row, which is much faster on a large testing set.
pred_list_LR = LogisticRegression_classifier.classify_many(
    [sample[0] for sample in testing_set]  # sample[0] is the feature set, sample[1] the label
)
print(pred_list_LR.count('Positive'))  # an earlier run noted 1809
print(pred_list_LR.count('Negative'))  # an earlier run noted 1191

In [None]:
# Reference labels (Positive or Negative) of the testing set, used for the confusion matrix
ref = [sample[1] for sample in testing_set]  # pos: 1748 | neg: 1258

# Predicted labels produced in the previous cell
tagged = pred_list_LR

# The actual confusion matrix: reference labels vs. predicted labels
cm = ConfusionMatrix(ref, tagged)
print(cm)

labels = {'Positive', 'Negative'}

# Per-label tallies read off the confusion matrix
true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for expected in labels:
    for predicted in labels:
        cell_count = cm[expected, predicted]
        if expected != predicted:
            # Off-diagonal cell: a miss for the expected label and a
            # false alarm for the predicted one
            false_negatives[expected] += cell_count
            false_positives[predicted] += cell_count
            continue
        # Diagonal cell: prediction matched the reference label
        true_positives[expected] += cell_count

# Correct vs. incorrect predictions; the totals should agree with the accuracy reported earlier
print('TP + TN:', sum(true_positives.values()), true_positives)
print('FP + FN:', sum(false_positives.values()), false_positives)

## Function to Predict a Review's Label

In [None]:
def predictor(text):
    """Print the label the trained Logistic Regression classifier assigns to `text`.

    The text is turned into a feature set by `find_features`, using the
    notebook-level `most_common` word list, and that feature set is classified
    by `LogisticRegression_classifier`.

    Returns None; the predicted label is only printed.
    """
    features = find_features(text, most_common)
    label = LogisticRegression_classifier.classify(features)
    print('Prediction:', label)

# Quick sanity check on two short hand-written reviews
for sample_review in ('This game is amazing!', 'This game is terrible.'):
    predictor(sample_review)

## Exporting Files for Future Use

In [None]:
# Saving the most_common words of this whole DF.
# NOTE: the file name says "5k", but the list is built with most_common(10000);
# the name is left unchanged in case other code loads the file by this name.
# The `with` block closes the file even if pickling fails.
with open('5k_most_common_final.pickle', 'wb') as save_most_common:
    pickle.dump(most_common, save_most_common)

In [None]:
# Saving the trained LR algorithm.
# The `with` block closes the file even if pickling fails.
with open('Logistic_Regression_final.pickle', 'wb') as save_classifier:
    pickle.dump(LogisticRegression_classifier, save_classifier)