In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline

import time
import pickle

# NLP modules we will use for text normalization
import re #regex 
import nltk # the natural language toolkit
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag


# import

In [2]:
df = pd.read_csv('processed_remsy\metacritic_comments_remsy.csv')
df

Unnamed: 0,Username,Userscore,Title,Comment
0,SirCaestus,10.0,The Legend of Zelda: Ocarina of Time,"Everything in OoT is so near at perfection, it..."
1,Kaistlin,10.0,The Legend of Zelda: Ocarina of Time,I won't bore you with what everyone is already...
2,Jacody,10.0,The Legend of Zelda: Ocarina of Time,Anyone who gives the masterpiece below a 7 or ...
3,doodlerman,10.0,The Legend of Zelda: Ocarina of Time,I'm one of those people who think that this is...
4,StevenA,10.0,The Legend of Zelda: Ocarina of Time,This game is the highest rated game on Metacr...
...,...,...,...,...
269962,RileyWRussell,7.0,Etrian Odyssey Untold: The Millennium Girl,"Extremely similar to EO:4, which obviously isn..."
269963,TemplarGR,,Etrian Odyssey Untold: The Millennium Girl,Typical overrated Atlus trash. A game i should...
269964,midipon,9.0,Etrian Odyssey Untold: The Millennium Girl,While I find the story mode to have annoying c...
269965,night4,8.0,Etrian Odyssey Untold: The Millennium Girl,"Pretty good, but it certainly lacks the visual..."


In [3]:
XGB_model = pickle.load(open('models/xgb_baseline.pkl', 'rb'))

# SENTIMENTAL IMPUTE

In [4]:
# def a function to process_text for TFIDF
def process_text(text, min_length):

    # drop non-english words
    words = set(nltk.corpus.words.words())
    # " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())

    # get common stop words that we'll remove during tokenization/text normalization
    stop_words = stopwords.words('english')

    #initialize lemmatizer
    wnl = WordNetLemmatizer()

    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
   

    # lower case everything
    text_lower = text.lower()
    
    # remove stop words and punctuations 
    text_norm = [x for x in word_tokenize(text_lower) if ((x.isalpha()) & (x not in stop_words))]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(text_norm))) 

    # cutoff for tokenized length
    if len(wordnet_tagged) <= min_length:
        return ''
    else:
         # rejoins lemmatized sentence 
         text_norm = " ".join([wnl.lemmatize(x[0], x[1]) for x in wordnet_tagged if x[1] is not None])
         return text_norm

In [5]:
# subset the dataset p * origin
# then do text process
# lastly, get rid of null, na and duplicates, because of the limit of length

def subset_and_process(p, df, colname):
    
    start = time.time()

    sample_size = int(df.shape[0]*p)
    print(f'subset size: {sample_size}')
    print(f'full dataset size: {df.shape[0]}')
    print(f'{p*100}%')

    df_sub = df.sample(sample_size,random_state=2333) 
    df_sub[colname].value_counts(normalize=True)

    start = time.time()
    df_sub[colname] = df_sub[colname].apply(process_text, min_length = 10)
    end = time.time()
    print(f'{end-start} seconds slipped.')

    df_new = df_sub[df_sub[colname] != '']
    df_new.drop_duplicates(inplace=True)

    print(f'output dataset size: {df_new.shape[0]}')
    
    end = time.time()
    print(f'{end-start} secs slipped..')

    return df_new

In [6]:
df_nan = df[df['Userscore'].isna()==True]
df_nan

Unnamed: 0,Username,Userscore,Title,Comment
341,Thajocoth,,The Legend of Zelda: Ocarina of Time,"I was very disappointed by this game, as it di..."
346,The_Legend,,The Legend of Zelda: Ocarina of Time,The original Legend of Zelda: Ocarina of Time ...
348,thedaarkbatty,,The Legend of Zelda: Ocarina of Time,I played it for the first time last year and w...
350,sweg,,The Legend of Zelda: Ocarina of Time,The Legend of Zelda: Ocarina of Time for Ninte...
352,J-malJ,,The Legend of Zelda: Ocarina of Time,"Boring game, graphics suck, gameplay is borin..."
...,...,...,...,...
269832,Not_Casual,,Assassin's Creed: Revelations,"I got this solely for the multiplayer, which i..."
269850,monkeylion,,Assassin's Creed: Revelations,"Boring, repetitive, predictable, booooring, I ..."
269851,Mykwon,,Assassin's Creed: Revelations,"At 87%, 39 hours elapsed and finished the game..."
269920,riley794,,Battle Brothers,So much promise so little gameplay. Played al...


In [9]:
# select the review with no score
df_nan_proc = subset_and_process(1, df_nan, 'Comment')

subset size: 19142
full dataset size: 19142
100%
1755.2710671424866 seconds slipped.
output dataset size: 18070
1755.2939529418945 secs slipped..


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
# df_nan_proc.to_csv(r'processed_df\df_nan_proc.csv', index = False)


In [13]:
df_nan_proc = pd.read_csv('processed_df\df_nan_proc.csv')
df_nan_proc

Unnamed: 0,Username,Userscore,Title,Comment
0,halexman99,,God of War,pull teeth god war soft reboot suffers issue c...
1,samsu8,,Dota 2,game good take valve get bad game game become ...
2,Logan,,Call of Duty: Modern Warfare 2,many glitch exploit make game unfairly play al...
3,GIVEDIRETIDESUC,,Dota 2,diretideno new herono updatesno normal matchma...
4,PsyA,,Call of Duty: Modern Warfare 2,wth remove dedicated server avaliable sdk game...
...,...,...,...,...
18065,mickoh20000,,Call of Duty: Modern Warfare 3,probably bad excuse game ever play ever play l...
18066,chakrazz,,Dota 2,want make event new bloom whole new level reta...
18067,NaturalMyst,,Diablo III,good faith give game score high awful drm drm ...
18068,Logikal1,,Dragon Age II,game bad consider unplayable bioware fan decad...


# Getting the data ready for Surprise

In [62]:
data = df[['Username', 'Title', 'Userscore']]

reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(data, reader=reader)

In [63]:
# train test split

trainset, testset = train_test_split(data, test_size=.2, random_state = 2333)

In [64]:
print('Type trainset :',type(trainset),'\n')
print('Type testset :',type(testset))

Type trainset : <class 'surprise.trainset.Trainset'> 

Type testset : <class 'list'>


In [65]:
print(len(testset))
print(testset[0])

50165
('Naessarne', 'Divinity: Original Sin II', 8.0)


In [50]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

Number of users:  109730 

Number of items:  2309 



KNN

- baseline

In [66]:
sim_cos = {'name':'cosine', 'user_based':False}

In [67]:
basic = knns.KNNBasic(sim_options=sim_cos)
basic.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1663954a280>

In [68]:
basic.sim

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [69]:
predictions = basic.test(testset)

In [70]:
print(accuracy.rmse(predictions))

RMSE: 4.0359
4.035938571338445


# SVD

- baseline

In [111]:
svd = SVD()
svd.fit(trainset)

predictions = svd.test(testset)
accuracy.rmse(predictions)

RMSE: 3.9951


3.995106898826457

- GridSearch

In [107]:
start = time.time()

params = {'n_factors': [20, 50, 100], 'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1, cv=5)
g_s_svd.fit(data)

end = time.time()
print(f'time spent {end-start}')

time spent 20.846306324005127


In [108]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

# slightly better

{'rmse': 3.9880269156619725, 'mae': 3.7042668679363695}
{'rmse': {'n_factors': 100, 'reg_all': 0.1}, 'mae': {'n_factors': 100, 'reg_all': 0.02}}


- predict

# IMPORT IMPUTED DF 

In [3]:
df_nan_proc = pd.read_csv('processed_remsy\df_nan_imputed_tf.csv')
df_nan_proc

Unnamed: 0,Username,Userscore,Title,Comment
0,halexman99,7.835098,God of War,pull teeth god war soft reboot suffers issue c...
1,samsu8,6.199385,Dota 2,game good take valve get bad game game become ...
2,Logan,8.605755,Call of Duty: Modern Warfare 2,many glitch exploit make game unfairly play al...
3,GIVEDIRETIDESUC,7.801022,Dota 2,diretideno new herono updatesno normal matchma...
4,PsyA,3.190217,Call of Duty: Modern Warfare 2,wth remove dedicated server avaliable sdk game...
...,...,...,...,...
18065,mickoh20000,3.926747,Call of Duty: Modern Warfare 3,probably bad excuse game ever play ever play l...
18066,chakrazz,7.221959,Dota 2,want make event new bloom whole new level reta...
18067,NaturalMyst,7.545428,Diablo III,good faith give game score high awful drm drm ...
18068,Logikal1,6.495458,Dragon Age II,game bad consider unplayable bioware fan decad...


In [4]:
# merge non nan df with imputed

frames = [df[df['Userscore'].isna()==False], df_nan_proc]
df_con = pd.concat(frames)

- prepare data

In [5]:
data_con = df_con[['Username', 'Title', 'Userscore']]

reader = Reader(line_format='user item rating', sep=',')
data_con = Dataset.load_from_df(data_con, reader=reader)

In [6]:
# train test split

trainset, testset = train_test_split(data_con, test_size=.2, random_state = 2333)

- KNN

In [7]:
sim_cos = {'name':'cosine', 'user_based':False}

basic = knns.KNNBasic(sim_options=sim_cos)
basic.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x24ab6c97b80>

In [8]:
predictions = basic.test(testset)
print(accuracy.rmse(predictions))

RMSE: 3.9404
3.940443248021918


- SVD

In [9]:
svd = SVD()
svd.fit(trainset)

predictions = svd.test(testset)
accuracy.rmse(predictions)

RMSE: 3.8975


3.8975487529571926