# Natural Language Processing

## Imports and Setup

In [38]:
# Importing libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import time
import pickle

In [2]:
# Importing datasets
games = pd.read_csv('Datasets/games_info_clean.csv')
games_scores = pd.read_csv('Datasets/games_scores_grouped.csv')
comments = pd.read_csv('Datasets/games_comments_cleaned.csv')
games_scores_comments = pd.read_csv('Datasets/avg_commented_userscore.csv')

In [3]:
# Dropping the 'Unnamed: 0' column from all dataframes
games.drop('Unnamed: 0', axis = 1, inplace = True)
games_scores.drop('Unnamed: 0', axis = 1, inplace = True)
comments.drop('Unnamed: 0', axis = 1, inplace = True)
games_scores_comments.drop('Unnamed: 0', axis = 1, inplace = True)

In [4]:
# Defining functions to cleanup and process the comments
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    return re.sub(r'  *', ' ', re.sub(r'[^a-z]', ' ', 
                                      re.sub(r'www\.\S*', ' ', re.sub(r'http[s]?://\S*', ' ', s.lower())))).strip()


def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    return nltk.word_tokenize(s)


def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()
    
    stemmed_list = [ps.stem(w) for w in l]
    lemmed_on_stemmed_list = [wnl.lemmatize(w) for w in stemmed_list]
    
    return lemmed_on_stemmed_list


def remove_stopwords(lst, lang = 'english'):
    """
    Remove English (default) stopwords from a list of strings.

    Args:
        lst: A list of strings.
        l = Language of the stopwords.

    Returns:
        A list of strings after stop words are removed.
    """
    stop_words = stopwords.words(lang)
    return [word for word in lst if word not in stop_words]

## Cleaning up the Comments

In [5]:
# Starting with a small 5k sample to see if it all works
df = comments.sample(5000)
df

Unnamed: 0,Title,Platform,Userscore,Comment,Username
243925,Pro Evolution Soccer 2015,PlayStation4,10,this game is absolutely awesome. i am happy th...,call_of_duty
151957,Battlefield: Bad Company 2,PC,10,"The guy who said his PC can't run the game, I...",LentiniM
176881,Sly Cooper and the Thievius Raccoonus,PlayStation2,8,A solid game however it does get repetitive ov...,AKthaBeast
228261,Call of Duty: Black Ops II,Xbox360,4,I was excited to pick this game up after readi...,ctruluck1324
206222,Middle-earth: Shadow of Mordor,PlayStation4,10,This is by far the most fun I have had dismemb...,shadowmancer66
...,...,...,...,...,...
83669,The Legend of Zelda: A Link Between Worlds,3DS,10,A great follow up to 'A Link To The Past'. The...,ForeverFalling
147524,The Witcher 2: Assassins of Kings,PC,5,Alright let me begin by saying that this game ...,akelz7
164234,Fallout 4,PlayStation4,10,"This review contains spoilers, cli...",Ninja-Puffs
279697,NBA 2K18,PlayStation4,0,MyTeam is a joke. MyGm is the worst manager mo...,zizhazhu


In [6]:
# Preparing the text for the analysis
%time  # wanted to try and see execution time - but I don't quite get this output?
df['Comments_Processed'] = df['Comment'].apply(lambda x: remove_stopwords(stem_and_lemmatize(tokenize(clean_up(x)))))
df

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed
243925,Pro Evolution Soccer 2015,PlayStation4,10,this game is absolutely awesome. i am happy th...,call_of_duty,"[thi, game, absolut, awesom, happi, took, one,..."
151957,Battlefield: Bad Company 2,PC,10,"The guy who said his PC can't run the game, I...",LentiniM,"[guy, said, hi, pc, run, game, guess, singl, c..."
176881,Sly Cooper and the Thievius Raccoonus,PlayStation2,8,A solid game however it does get repetitive ov...,AKthaBeast,"[solid, game, howev, doe, get, repetit, awhil,..."
228261,Call of Duty: Black Ops II,Xbox360,4,I was excited to pick this game up after readi...,ctruluck1324,"[wa, excit, pick, thi, game, read, review, cri..."
206222,Middle-earth: Shadow of Mordor,PlayStation4,10,This is by far the most fun I have had dismemb...,shadowmancer66,"[thi, far, fun, dismemb, orc, long, time, shad..."
...,...,...,...,...,...,...
83669,The Legend of Zelda: A Link Between Worlds,3DS,10,A great follow up to 'A Link To The Past'. The...,ForeverFalling,"[great, follow, link, past, soundtrack, one, b..."
147524,The Witcher 2: Assassins of Kings,PC,5,Alright let me begin by saying that this game ...,akelz7,"[alright, let, begin, say, thi, game, goti, ma..."
164234,Fallout 4,PlayStation4,10,"This review contains spoilers, cli...",Ninja-Puffs,"[thi, review, contain, spoiler, click, expand,..."
279697,NBA 2K18,PlayStation4,0,MyTeam is a joke. MyGm is the worst manager mo...,zizhazhu,"[myteam, joke, mygm, worst, manag, mode, exper..."


In [13]:
# Creating the 'target' column
df['Target'] = np.where((df['Userscore'] >= 5), 'Pos', 'Neg')

# Not sure if I need 3 labels or not, if I need this 'neutral' or if I should leave it like this
#df['Target'] = np.where((df['Userscore'] == 5), '2', df['Target'])

In [14]:
# Checking the values
df['Target'].value_counts()

Pos    4105
Neg     895
Name: Target, dtype: int64

In [15]:
# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features
def find_features(document, bow):
    text = document.lower()
    features = dict()
    for w, c in bow:
        features[w] = w in text
    return features

def make_matrix(series_text, series_target, bow):
    # initially I had bool(t) in the return, and then I decided to remove it to see if I could actually get the
    ## labels to show up in the most_inf_feats instead of just true and false things, and it worked, but now it
    ## won't work again with bool(t) for some reason, the most_inf_feats always returns without any results
    ## I tried with the same values as in the NLP lab (0, 2, 4 - or just 0 - 4) still didn't work, so yeah
    return [(find_features(s, bow), t) for s, t in zip(series_text.values, series_target.values)]

matrix = make_matrix(df['Comment'], df['Target'], most_common)

# Testing the model
size = int(len(matrix) * 0.2)  # xxx -> 20% of the data

# Tried inversing the order and training with 80% of the data and testing against the 20% left
training_set = matrix[size:]
testing_set = matrix[:size]

classifier = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
classifier.show_most_informative_features(15)

Most Informative Features
                 acclaim = True              Neg : Pos    =     13.8 : 1.0
                 desktop = True              Neg : Pos    =     13.8 : 1.0
                   messi = True              Neg : Pos    =     13.8 : 1.0
                    reus = True              Neg : Pos    =     13.8 : 1.0
                    rmah = True              Neg : Pos    =     13.8 : 1.0
                    rins = True              Neg : Pos    =     13.8 : 1.0
               fundament = True              Neg : Pos    =     13.8 : 1.0
                    peer = True              Neg : Pos    =     13.8 : 1.0
                   queue = True              Neg : Pos    =     12.0 : 1.0
                 auction = True              Neg : Pos    =     11.8 : 1.0
                 disgust = True              Neg : Pos    =     11.2 : 1.0
                   fleet = True              Neg : Pos    =     10.8 : 1.0
                   drain = True              Neg : Pos    =     10.8 : 1.0

In [33]:
#most_common
#matrix[0]

In [19]:
# This second time I ran the algo the results increased dramatically for some reason :x -> from 66% to 83%
print('Model accuracy:', str(round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)) + '%')

Model accuracy: 83.2%


## Training the model in the whole DF

In [34]:
# Preparing the text for the analysis
comments['Comments_Processed'] = \
comments['Comment'].apply(lambda x: remove_stopwords(stem_and_lemmatize(tokenize(clean_up(x)))))

# Creating the 'target' column
comments['Target'] = np.where((comments['Userscore'] >= 5), 'Pos', 'Neg')

# gotta check if this 3rd label improves performance or not
#comments['Target'] = np.where((comments['Userscore'] == 5), 'Neutral', comments['Target'])

# Creating the bag of words
bow = [word for lst in comments['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features and making the matrix
matrix = make_matrix(comments['Comment'], comments['Target'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.25)  # xxx -> 25% of the data

# Training with 75% of the data and testing against the remaining 25%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
most_inf_feat = classifier.show_most_informative_features(15)
print('')

# Printing the model accuracy
print('Model accuracy:', str(round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)) + '%')

Most Informative Features
              ticketsnew = True              Neg : Pos    =     30.2 : 1.0
              diretideno = True              Neg : Pos    =     28.6 : 1.0
                 maestra = True              Pos : Neg    =     19.3 : 1.0
               diretideg = True              Neg : Pos    =     19.1 : 1.0
                  refund = True              Neg : Pos    =     18.6 : 1.0
                  pikmin = True              Pos : Neg    =     18.5 : 1.0
                   glado = True              Pos : Neg    =     17.3 : 1.0
                    duda = True              Pos : Neg    =     15.8 : 1.0
                  sonora = True              Pos : Neg    =     14.0 : 1.0
                 disgrac = True              Neg : Pos    =     13.6 : 1.0
                 juegazo = True              Pos : Neg    =     13.2 : 1.0
                    scam = True              Neg : Pos    =     13.1 : 1.0
                  impecc = True              Pos : Neg    =     11.4 : 1.0

In [36]:
'''
I find it strange that after all this significantly bigger training the model's accuracy has barely increase.
I would expect to see at least a 70% there.
On a side note, I find it very odd that almost all the most informative words are all neutral?? That's just so strange
to me, since neutral only accounts for 10k records, whereas positive has over 220k values!!

After I ran everything again the scores changed, however, the point still stands. After training with a huge amount
of data, the scores barely changed. Don't know if it is a good thing or not.

Not sure if I might have a problem with this dataset being imbalanced? Since I have so many pos over negs? Have to
look into it.
'''
comments.Target.value_counts()

Pos    233865
Neg     48336
Name: Target, dtype: int64

In [37]:
# Just taking a look at the final dataframe
comments

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed,Target
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus,"[everyth, oot, near, perfect, realli, wonder, ...",Pos
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin,"[bore, everyon, alreadi, say, amaz, thi, game,...",Pos
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody,"[anyon, give, masterpiec, either, hate, astoun...",Pos
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman,"[one, peopl, think, thi, greatest, game, time,...",Pos
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA,"[thi, game, highest, rate, game, metacrit, goo...",Pos
...,...,...,...,...,...,...,...
282196,Etrian Odyssey Untold: The Millennium Girl,3DS,7,"Extremely similar to EO:4, which obviously isn...",RileyWRussell,"[extrem, similar, eo, obvious, bad, thing, say...",Pos
282197,Etrian Odyssey Untold: The Millennium Girl,3DS,0,Typical overrated Atlus trash. A game i should...,TemplarGR,"[typic, overr, atlu, trash, game, like, sinc, ...",Neg
282198,Etrian Odyssey Untold: The Millennium Girl,3DS,9,While I find the story mode to have annoying c...,midipon,"[find, stori, mode, annoy, charact, intrus, st...",Pos
282199,Etrian Odyssey Untold: The Millennium Girl,3DS,8,"Pretty good, but it certainly lacks the visual...",night4,"[pretti, good, certainli, lack, visual, audio,...",Pos


In [39]:
# Saving the trained algorithm
save_classifier = open('naivebayes.pickle', 'wb')
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [None]:
# To open the classifier and use it
'''classifier_f = open('naivebayes.pickle', 'rb')
classifier = pickle.load(classifier_f)
classifier_f.close()'''