In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import pickle
import time

# IMPORT

In [28]:
# df = pd.read_csv('processed_df\metacritic_comments_nlp.csv')
# df.head()

Unnamed: 0,Title,Userscore,Comment
0,The Legend of Zelda: Ocarina of Time,10.0,"Everything in OoT is so near at perfection, it..."
1,The Legend of Zelda: Ocarina of Time,10.0,I won't bore you with what everyone is already...
2,The Legend of Zelda: Ocarina of Time,10.0,Anyone who gives the masterpiece below a 7 or ...
3,The Legend of Zelda: Ocarina of Time,10.0,I'm one of those people who think that this is...
4,The Legend of Zelda: Ocarina of Time,10.0,This game is the highest rated game on Metacr...


In [2]:
df = pickle.load(open('processed_df\preprocessed_nlp_30p.pkl','rb'))
df.head()

Unnamed: 0,Title,Userscore,Comment,lang
211190,Star Wars: Battlefront II,2.0,there nothing quite gun entire platoon clone d...,en
39249,The Witcher 3: Wild Hunt,10.0,game absolutely gorgeous run steady gtx hairwo...,en
204931,This War of Mine,10.0,really good game didnt expect first survival g...,en
75752,Golden Sun,10.0,people eagerly anticipate initial release game...,en
199503,Assassin's Creed IV: Black Flag,8.0,best assassin creed far open world fresh missi...,en


# Train test split

In [3]:
print(f'shape before: {df.shape}')
df = df[df['Comment'] != '']
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
print(f'shape after: {df.shape}')

shape before: (74709, 4)
shape after: (70569, 4)


In [4]:
X = df['Comment']
y = df['Userscore']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# Random Forest

- baseline

In [5]:
steps = [('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)), ('random forest', RandomForestRegressor())]
rf_pipeline = Pipeline(steps)

In [6]:
start = time.time()

rf_pipeline.fit(X_train, y_train)

end = time.time()
print(f'{end-start} secs slipped..')

1448.0000002384186 secs slipped..


In [7]:
y_pred = rf_pipeline.predict(X_test)
print(f'test MSE {mean_squared_error(y_test, y_pred)}')
print(f'test MAE {mean_absolute_error(y_test, y_pred)}')

y_pred = rf_pipeline.predict(X_train)
print(f'train MSE {mean_squared_error(y_train, y_pred)}')
print(f'train MAE {mean_absolute_error(y_train, y_pred)}')

test MSE4.524888242855965
test MAE1.5253740429211553
train MSE0.6388260215558237
train MAE0.5650253586255478


In [15]:
rf_pipeline.score(X_test,y_test)

0.20206640425757916

In [14]:
# seems like overfit a little bit
# it's a very slow model

- simple man tune

In [11]:
steps = [('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)), ('random forest', RandomForestRegressor(max_depth=16))]
rf_pipeline_m = Pipeline(steps)

In [12]:
rf_pipeline_m.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)),
                ('random forest', RandomForestRegressor(max_depth=16))])

In [13]:
y_pred = rf_pipeline_m.predict(X_test)
print(f'test MSE {mean_squared_error(y_test, y_pred)}')
print(f'test MAE {mean_absolute_error(y_test, y_pred)}')

y_pred = rf_pipeline_m.predict(X_train)
print(f'train MSE {mean_squared_error(y_train, y_pred)}')
print(f'train MAE {mean_absolute_error(y_train, y_pred)}')

test MSE 4.594916642394695
test MAE 1.571633311573439
train MSE 3.445144774264481
train MAE 1.3624296415729784
