In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from xgboost import XGBRegressor
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import pickle
import time

# IMPORT

In [7]:
# Load the preprocessed review data: the "30p" file is used for the quick
# experiments below and the "100p" file for the full-data run.
# Paths use forward slashes: the previous 'processed_df\preprocessed_...'
# literals contained the invalid escape sequence '\p' and only resolved on
# Windows. Forward slashes work on Windows, Linux and macOS alike.
#
# Alternative source (only unpickle files you trust):
# df = pickle.load(open('processed_df/preprocessed_nlp_5p.pkl', 'rb'))

df = pd.read_csv('processed_df/preprocessed_nlp_30p.csv')
df100 = pd.read_csv('processed_df/preprocessed_nlp_100p.csv')

# Only the last expression of a cell is rendered, so a single preview is kept.
df100.head()

Unnamed: 0,Title,Userscore,Comment,lang
0,Star Wars: Battlefront II,2.0,there nothing quite gun entire platoon clone d...,en
1,The Witcher 3: Wild Hunt,10.0,game absolutely gorgeous run steady gtx hairwo...,en
2,This War of Mine,10.0,really good game didnt expect first survival g...,en
3,Golden Sun,10.0,people eagerly anticipate initial release game...,en
4,Assassin's Creed IV: Black Flag,8.0,best assassin creed far open world fresh missi...,en


In [8]:
# Build the cleaned frame used for the 30%-data experiments, leaving `df` as is.
# The steps are chained so each returns a new frame: calling
# drop_duplicates/dropna with inplace=True on a boolean-filtered slice of `df`
# can raise SettingWithCopyWarning.
print(f'shape before: {df.shape}')
df_new = (
    df[df['Comment'] != '']   # discard rows whose comment is an empty string
    .drop_duplicates()         # discard exact duplicate rows
    .dropna()                  # discard rows with any missing value
)
print(f'shape after: {df_new.shape}')


shape before: (74709, 4)
shape after: (70569, 4)


In [9]:
# Build the cleaned frame used for the 100%-data experiment, leaving `df100` as is.
# The steps are chained so each returns a new frame: calling
# drop_duplicates/dropna with inplace=True on a boolean-filtered slice of
# `df100` can raise SettingWithCopyWarning.
print(f'shape before: {df100.shape}')
df100_new = (
    df100[df100['Comment'] != '']   # discard rows whose comment is an empty string
    .drop_duplicates()               # discard exact duplicate rows
    .dropna()                        # discard rows with any missing value
)
print(f'shape after: {df100_new.shape}')


shape before: (249031, 4)
shape after: (235301, 4)


# Train test split

In [30]:
# Clean `df` itself: drop exact duplicate rows, then rows with any missing
# value, reporting the frame's shape on either side of the cleanup.
# NOTE(review): the experiments below read `df_new`, not `df`; confirm whether
# this in-place cleanup is still needed.
shape_before = df.shape
print(f'shape before: {shape_before}')
for drop_in_place in (df.drop_duplicates, df.dropna):
    drop_in_place(inplace=True)
print(f'shape after: {df.shape}')

shape before: (74709, 4)
shape after: (70569, 4)


In [10]:
# Features are the preprocessed comment text; the target is the numeric user score.
X = df_new['Comment']
y = df_new['Userscore']

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [12]:
# Same feature/target selection as the 30% experiments, on the full cleaned data.
X100 = df100_new['Comment']
y100 = df100_new['Userscore']

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X100_train, X100_test, y100_train, y100_test = train_test_split(
    X100, y100, test_size=0.3, random_state=42
)

# XGBoost

- Baseline: 30% of the data, 160-term vocabulary

In [13]:
# Baseline regressor: TF-IDF features restricted by document frequency
# (min_df=0.05, max_df=0.95) feeding an XGBoost regressor with default settings.
steps = [
    ('tfidf', TfidfVectorizer(min_df=0.05, max_df=0.95)),
    ('XGBoost', XGBRegressor()),
]
xgb_pipeline = Pipeline(steps=steps)

In [14]:
# Fit both pipeline steps (TF-IDF vectorizer, then the XGBoost regressor) on the
# training split; as the cell's last expression, the fitted pipeline is displayed.
xgb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)),
                ('XGBoost', XGBRegressor())])

In [23]:
# Report MSE / MAE / R^2 for the baseline pipeline on the held-out split first,
# then on the training split, so the two can be compared for over-fitting.
eval_splits = (('test', X_test, y_test), ('train', X_train, y_train))
metric_fns = (('MSE', mean_squared_error), ('MAE', mean_absolute_error), ('r2', r2_score))

for position, (split_name, features, targets) in enumerate(eval_splits):
    if position:
        # spacer line between the test and train reports
        print('  ')

    # y_pred is rebound per split; after the loop it holds the train predictions
    y_pred = xgb_pipeline.predict(features)
    for metric_name, metric_fn in metric_fns:
        print(f'{split_name} {metric_name} {metric_fn(targets, y_pred)}')

test MSE 4.519310029091267
test MAE 1.5819647954861784
test r2 0.24648527166105128
  
train MSE 4.367237188106699
train MAE 1.5534261206566233
train r2 0.265865449814367


In [48]:
# Show the hyperparameters of the regressor step of the baseline pipeline.
xgb_regressor_step = xgb_pipeline.named_steps['XGBoost']
xgb_regressor_step.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'reg:linear',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 1}

- Baseline: 30% of the data, 1000-term vocabulary

In [16]:
# Same baseline regressor, but the TF-IDF vocabulary is capped at 1000 features
# instead of being filtered by document frequency.
steps = [
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('XGBoost', XGBRegressor()),
]
xgb_pipeline_m = Pipeline(steps=steps)

In [17]:
# Time the fit with time.perf_counter(): it is a monotonic, high-resolution
# clock meant for measuring intervals, whereas time.time() follows the wall
# clock and can jump if the system time is adjusted during the run.
start = time.perf_counter()

xgb_pipeline_m.fit(X_train, y_train)

end = time.perf_counter()
print(f'{end-start} secs slipped..')

26.105666399002075 secs slipped..


In [22]:
# Report MSE / MAE / R^2 for the 1000-feature pipeline on the held-out split
# first, then on the training split, so the two can be compared for over-fitting.
eval_splits = (('test', X_test, y_test), ('train', X_train, y_train))
metric_fns = (('MSE', mean_squared_error), ('MAE', mean_absolute_error), ('r2', r2_score))

for position, (split_name, features, targets) in enumerate(eval_splits):
    if position:
        # spacer line between the test and train reports
        print('  ')

    # y_pred is rebound per split; after the loop it holds the train predictions
    y_pred = xgb_pipeline_m.predict(features)
    for metric_name, metric_fn in metric_fns:
        print(f'{split_name} {metric_name} {metric_fn(targets, y_pred)}')

test MSE 3.993797768647329
test MAE 1.4743300248575304
test r2 0.3341051131010562
  
train MSE 3.824718278293081
train MAE 1.445663888501061
train r2 0.35706312437801613


- Baseline: 30% of the data, 5000-term vocabulary

In [26]:
# Baseline regressor with the TF-IDF vocabulary capped at 5000 features,
# trained and evaluated on the 30% data split.
steps = [
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('XGBoost', XGBRegressor()),
]
xgb_pipeline_2 = Pipeline(steps)

# Time the fit with time.perf_counter(): it is a monotonic, high-resolution
# clock meant for measuring intervals, whereas time.time() follows the wall
# clock and can jump if the system time is adjusted during the run.
start = time.perf_counter()
xgb_pipeline_2.fit(X_train, y_train)
end = time.perf_counter()
print(f'training used {end-start} secs')
print('  ')

# Report MSE / MAE / R^2 on the held-out split first, then on the training
# split, so the two can be compared for over-fitting.
eval_splits = (('test', X_test, y_test), ('train', X_train, y_train))
metric_fns = (('MSE', mean_squared_error), ('MAE', mean_absolute_error), ('r2', r2_score))

for position, (split_name, features, targets) in enumerate(eval_splits):
    if position:
        # spacer line between the test and train reports
        print('  ')

    # y_pred is rebound per split; after the loop it holds the train predictions
    y_pred = xgb_pipeline_2.predict(features)
    for metric_name, metric_fn in metric_fns:
        print(f'{split_name} {metric_name} {metric_fn(targets, y_pred)}')


training used 31.541131258010864 secs
  
test MSE 3.9503174621338166
test MAE 1.4706294199531955
test r2 0.34135468242463096
  
train MSE 3.788241365533563
train MAE 1.442666850452882
train r2 0.36319490994116255


- Baseline: 100% of the data, 5000-term vocabulary

In [29]:
# Baseline regressor with the TF-IDF vocabulary capped at 5000 features,
# trained on the full (100%) training split.
steps = [
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('XGBoost', XGBRegressor()),
]
xgb_pipeline_3 = Pipeline(steps)

# Time the fit with time.perf_counter(): it is a monotonic, high-resolution
# clock meant for measuring intervals, whereas time.time() follows the wall
# clock and can jump if the system time is adjusted during the run.
start = time.perf_counter()
xgb_pipeline_3.fit(X100_train, y100_train)
end = time.perf_counter()
print(f'training used {end-start} secs')
print('  ')
print('  ')

training used 107.31179189682007 secs
  


In [28]:

# Report MSE / MAE / R^2 for the full-data pipeline on the held-out split first,
# then on the training split, so the two can be compared for over-fitting.
eval_splits = (('test', X100_test, y100_test), ('train', X100_train, y100_train))
metric_fns = (('MSE', mean_squared_error), ('MAE', mean_absolute_error), ('r2', r2_score))

for position, (split_name, features, targets) in enumerate(eval_splits):
    if position:
        # spacer line between the test and train reports
        print('  ')

    # y_pred is rebound per split; after the loop it holds the train predictions
    y_pred = xgb_pipeline_3.predict(features)
    for metric_name, metric_fn in metric_fns:
        print(f'{split_name} {metric_name} {metric_fn(targets, y_pred)}')


test MSE 3.934551791541657
test MAE 1.4646042223258096
test r2 0.33982759927227246
  
train MSE 3.8727982137592587
train MAE 1.456405781815883
train r2 0.3482727056047701


# GRID SEARCH