In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, plot_confusion_matrix, classification_report, plot_roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from xgboost import XGBRegressor
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.metrics import mean_squared_error, mean_absolute_error

import pickle
import time

# Import data

In [28]:
# Load the pre-processed review data from CSV.
# Forward slashes keep the path valid on both Windows and POSIX systems and
# avoid the invalid '\p' escape sequence that the backslash form contained.
# Alternative input kept from the original cell (a pickled frame):
#   df = pd.read_pickle('processed_df/preprocessed_nlp_5p.pkl')
df = pd.read_csv('processed_df/preprocessed_nlp_30p.csv')
df.head()

Unnamed: 0,Title,Userscore,Comment,lang
0,Star Wars: Battlefront II,2.0,there nothing quite gun entire platoon clone d...,en
1,The Witcher 3: Wild Hunt,10.0,game absolutely gorgeous run steady gtx hairwo...,en
2,This War of Mine,10.0,really good game didnt expect first survival g...,en
3,Golden Sun,10.0,people eagerly anticipate initial release game...,en
4,Assassin's Creed IV: Black Flag,8.0,best assassin creed far open world fresh missi...,en


In [29]:
# Drop rows whose comment is empty. Empty CSV fields are read back as NaN by
# default, so missing values are treated as empty as well as '' / whitespace.
print(f'shape before: {df.shape}')
has_text = df['Comment'].notna() & (df['Comment'].astype(str).str.strip() != '')
df_new = df[has_text]
# Report the filtered frame: printing df.shape a second time could never show
# a change because the filter result is stored in df_new, not df.
print(f'shape after: {df_new.shape}')
# NOTE(review): the cells below keep working on `df`, not `df_new` - confirm
# which frame is meant to feed the train/test split.


shape before: (74709, 4)
shape after: (74709, 4)


# Cleaning and train/test split

In [30]:
# Remove exact duplicate rows first, then every row that still has a missing
# value in any column; the two prints show how many rows were lost.
print(f'shape before: {df.shape}')
df = df.drop_duplicates().dropna()
print(f'shape after: {df.shape}')

shape before: (74709, 4)
shape after: (70569, 4)


In [31]:
# Model input is the 'Comment' column; the target is the 'Userscore' column.
X = df['Comment']
y = df['Userscore']

# Hold out 30% of the rows for testing. The seed is fixed so the split - and
# therefore every metric reported further down - is reproducible after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# XGBoost

- Baseline: TF-IDF features with a default XGBRegressor

In [32]:
# Baseline pipeline: TF-IDF features followed by an XGBoost regressor with
# default hyper-parameters. With float thresholds, max_df/min_df discard terms
# found in more than 95% or fewer than 5% of the documents.
steps = [
    ('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)),
    ('XGBoost', XGBRegressor()),
]
xgb_pipeline = Pipeline(steps=steps)

In [33]:
# Fit both pipeline stages (vectorizer, then regressor) on the training split.
xgb_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)),
                ('XGBoost', XGBRegressor())])

In [45]:
# Score the baseline on the held-out split and on the training split.
# Train predictions get their own name so that `y_pred` still holds the test
# predictions after this cell has run (it used to be overwritten).
y_pred = xgb_pipeline.predict(X_test)
print(f'test MSE {mean_squared_error(y_test, y_pred)}')
print(f'test MAE {mean_absolute_error(y_test, y_pred)}')

y_pred_train = xgb_pipeline.predict(X_train)
print(f'train MSE {mean_squared_error(y_train, y_pred_train)}')
print(f'train MAE {mean_absolute_error(y_train, y_pred_train)}')

test MSE 4.570678465059893
test MAE 1.5924729288488684
train MSE 4.3473159237920385
train MAE 1.5505740932088288


In [48]:
# Show the hyper-parameters the baseline regressor step was built with.
xgb_pipeline.named_steps['XGBoost'].get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'reg:linear',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 1}

- Some manual tuning: XGBClassifier with learning_rate=1 and subsample=0.8

In [38]:
# Manually tuned variant: the same TF-IDF settings as the baseline, but the
# final step is an XGBoost classifier with learning_rate=1 and subsample=0.8.
# NOTE(review): the baseline uses XGBRegressor while this variant treats each
# score as a class - confirm the switch is intended before comparing MSE/MAE.
steps = [
    ('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)),
    ('XGBoost', XGBClassifier(learning_rate=1, subsample=0.8)),
]
xgb_pipeline_m = Pipeline(steps=steps)

In [39]:
start = time.time()

xgb_pipeline_m.fit(X_train, y_train)

end = time.time()
print(f'{end-start} secs slipped..')

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.95, min_df=0.05)),
                ('XGBoost',
                 XGBClassifier(learning_rate=1, objective='multi:softprob',
                               subsample=0.8))])

In [40]:
# Quick check: mean squared error of the tuned pipeline on the test split.
y_pred = xgb_pipeline_m.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred)

6.936375230267819

In [46]:
# Score the tuned pipeline on the held-out split and on the training split.
# Train predictions get their own name so that `y_pred` still holds the test
# predictions after this cell has run (it used to be overwritten).
y_pred = xgb_pipeline_m.predict(X_test)
print(f'test MSE {mean_squared_error(y_test, y_pred)}')
print(f'test MAE {mean_absolute_error(y_test, y_pred)}')

y_pred_train = xgb_pipeline_m.predict(X_train)
print(f'train MSE {mean_squared_error(y_train, y_pred_train)}')
print(f'train MAE {mean_absolute_error(y_train, y_pred_train)}')

test MSE 6.936375230267819
test MAE 1.557507911766095
train MSE 4.927831086278797
train MAE 1.1116037086521722
