In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
import xgboost as xgb

In [31]:
game_train = pd.read_csv("../input/game-review/game_train.csv")
print(len(game_train))
game_train.head()

The dataset has 10494 observations, with 5 columns. The target variable is user_suggestion and the key variables that we need to use is the user_review, check if the game was fun or not.

In [32]:
game_test = pd.read_csv("../input/game-review/game_test.csv")
print(len(game_test))
game_test.head()

The data has 6996 observations with 4 colums. Compared to the train dataset, it doesn't have target variable.

In [33]:
games = pd.read_csv("../input/game-review/games.csv")
print(len(games))
games.head()

The games listed in the games.csv file are 64 in total, let's see if the games listed in train & test set are all in this game list. If yes, then we can merge this file and can have more features to train the model.

In [34]:
game_train_list = game_train.title.unique()
print(f"number of games reviewed(game_train): {len(game_train_list)}")
game_test_list = game_test.title.unique()
print(f"number of games reviewed(game_test): {len(game_test_list)}")
game_list = games.title.unique()
print(f"number of games reviewed(games): {len(game_list)}")

The number of the games in train and test data is not equal as the number of games in games.csv file. We then need to manually check if the games in the training and testing set in the games.csv file.

In [35]:
def all_in_list(compare, base):
    classifier = 1
    for i in compare:
        classifier = classifier * (i in base)
    if classifier == 1:
        return True
    else: 
        return False

In [36]:
print(all_in_list(game_train_list, game_list))
print(all_in_list(game_test_list, game_list))

All the games in the training dataset and testing dataset are in the game list, so we can merge the dataset games with game_test and game_train

In [37]:
game_train = game_train.merge(games, how = 'left', on = 'title')
game_test = game_test.merge(games, how = 'left', on = 'title')

In [38]:
print(len(game_train))
game_train.head()

In [39]:
print(len(game_test))
game_test.head()

Given that any observation wasn't omitted, the dataframes were well merged, we'll further go on analyzing the data.

Since the number of the games are not big enough to make a reasonable assumptions with developer & publisher (the max no. of game developed in same company is 4, which is small to make a hypothesis that the developer affects the quality of games). Moreover, the overview of the game is just an introduction of the game from the publisher, which makes it hard to be a objective data. So we'll drop those features.

However, the tags are important since there might be a correlation between genres of the game and probability that the user will recommend the game, so we'll leave it, but preprocess it to use it in a model.

In [40]:
game_train = game_train.drop(['developer','publisher','overview'], axis = 1)
game_train.head()

In [41]:
game_test = game_test.drop(['developer','publisher','overview'], axis = 1)
game_test.head()

In [42]:
game_train.tags = game_train.tags.apply(lambda x: eval(x))
game_train_df = game_train.explode(column='tags')

In [43]:
dummy_variable_name = []
for i in game_train_df.tags.unique():
    dummy_variable_name.append(f"tags_{i}")
print(dummy_variable_name)

In [44]:
a = pd.get_dummies(game_train_df, columns=['tags']).groupby('review_id', as_index=False)[dummy_variable_name].sum()
game_train = game_train.merge(a, on = "review_id")
game_train = game_train.drop('tags', axis = 1)
game_train.head()

In [45]:
# reordering columns in game_train
cols = game_train.columns.tolist()
cols_1 = cols[:4]
cols_2 = cols[5:]
cols_3 = cols[4]
cols = cols_1 + cols_2
cols.append(cols_3)
game_train = game_train[cols]
game_train.head()

In [46]:
game_train.shape

In [47]:
game_test.tags = game_test.tags.apply(lambda x: eval(x))
game_test_df = game_test.explode(column='tags')

In [48]:
dummy_variable_name = []
for i in game_test_df.tags.unique():
    dummy_variable_name.append(f"tags_{i}")
print(dummy_variable_name)

In [49]:
a = pd.get_dummies(game_test_df, columns=['tags']).groupby('review_id', as_index=False)[dummy_variable_name].sum()
game_test = game_test.merge(a, on = "review_id")
game_test = game_test.drop('tags', axis = 1)
game_test.head()

Since the user_review is the key variable in getting the review, we'll first only use user_review to predict whether the user has recommended or not. We used TfidfVectorizer and CountVectorizer provided in sklearn package.

In [50]:
game_train_sentiment = game_train.loc[:,['review_id','user_review','user_suggestion']]
game_train_sentiment.head()

In [51]:
class_df = game_train_sentiment['user_suggestion']
feature_df = game_train_sentiment['user_review']

X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, 
                                                    test_size = 0.3, random_state = 4300)

In [52]:
feature_test_df = game_test['user_review']

In [53]:
feature_test_df

## TfidfVectorizer with Logistic Regression

In [54]:
for i in range(1,6): 
    print(f"C = {i}")
    pipeline = Pipeline([('cnt_vect', TfidfVectorizer(max_df = .8, stop_words = 'english', ngram_range = (1,2))),
                        ('lr_clf', LogisticRegression(C=i))])
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    pred_probs = pipeline.predict_proba(X_test)

    print(pred_probs)
    print(f"Accuracy Score: {accuracy_score(y_test,pred)}, roc_auc_score: {roc_auc_score(y_test, pred_probs[:,1])}")

Changing C parameters doesn't change the result much. But C=4 shows the best performance

In [55]:
for i in [.2,.4,.6,.7,.8,.9,1]: 
    print(f"Max_df = {i}")
    pipeline = Pipeline([('cnt_vect', TfidfVectorizer(max_df = i, stop_words = 'english', ngram_range = (1,2))),
                        ('lr_clf', LogisticRegression(C=4))])
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    pred_probs = pipeline.predict_proba(X_test)

    print(pred_probs)
    print(f"Accuracy Score: {accuracy_score(y_test,pred)}, roc_auc_score: {roc_auc_score(y_test, pred_probs[:,1])}")

Model with Max_df = 0.9 shows the best performance. Therefore we choose the final model to be with parameter C= 4 and Max_df = 0.9

In [56]:
pipeline = Pipeline([('cnt_vect', TfidfVectorizer(max_df = .9, stop_words = 'english', ngram_range = (1,2))),
                        ('lr_clf', LogisticRegression(C=4))])
pipeline.fit(X_train, y_train)
predicted_result = pipeline.predict(feature_test_df)

In [57]:
TFid_DF = pd.DataFrame({'review_id': game_test['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
TFid_DF = TFid_DF.sort_values(by=['review_id'])
TFid_DF.to_csv('TfidVectorizer_Logistic.csv', index=False)

## CountVectorizer with Logistic Regression

In [58]:
for i in range(1,6):     
    print(f"C = {i}")
    pipeline = Pipeline([('cnt_vect', CountVectorizer(stop_words = 'english', ngram_range = (1,2))),
                        ('lr_clf', LogisticRegression(C=i, max_iter = 200))])
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    pred_probs = pipeline.predict_proba(X_test)

    #print(pred_probs)
    print(f"Accuracy Score: {accuracy_score(y_test,pred)}, roc_auc_score: {roc_auc_score(y_test, pred_probs[:,1])}")

Different with TFidVectorizer, CountVectorizer has the best performance at C = 1.

In [59]:
for i in [.7,.75,.8]:     
    print(f"Max_df = {i}")
    pipeline = Pipeline([('cnt_vect', CountVectorizer(max_df= i, stop_words = 'english', ngram_range = (1,2))),
                        ('lr_clf', LogisticRegression(C= 1, max_iter = 200))])
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    pred_probs = pipeline.predict_proba(X_test)

    #print(pred_probs)
    print(f"Accuracy Score: {accuracy_score(y_test,pred)}, roc_auc_score: {roc_auc_score(y_test, pred_probs[:,1])}")

There is a plateau from 0.6 <= Max_df <= 0.8, so we can choose a max_df within this range.

In [60]:
pipeline = Pipeline([('cnt_vect', CountVectorizer(max_df= 0.6, stop_words = 'english', ngram_range = (1,2))),
                        ('lr_clf', LogisticRegression(C=1, max_iter = 200))])
pipeline.fit(X_train, y_train)
predicted_result = pipeline.predict(feature_test_df)

In [61]:
Count_df = pd.DataFrame({'review_id': game_test['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
Count_df = Count_df.sort_values(by=['review_id'])
Count_df.to_csv('CountVectorizer_Logistic.csv', index=False)

In [62]:
Tfid_vectorizer = TfidfVectorizer(max_df = .8, stop_words = 'english', ngram_range = (1,2))
Tfid_X = Tfid_vectorizer.fit_transform(feature_df)


Count_vectorizer = CountVectorizer(max_df = .8, stop_words = 'english', ngram_range = (1,2))
Count_X = Count_vectorizer.fit_transform(feature_df)


In [63]:
Tfid_test_X = Tfid_vectorizer.transform(feature_test_df)

In [64]:
feature_test_df.shape

## Decision Tree Classifier

In [65]:
kf = KFold(n_splits = 10)
for train_indices, test_indices in kf.split(Tfid_X):
    train_text = Tfid_X[train_indices]
    y_train = class_df[train_indices]
        
    test_text = Tfid_X[test_indices]
    y_test = class_df[test_indices]
    
    tree = DecisionTreeClassifier(max_depth = 10)
    tree.fit(train_text, y_train)
    tree_pred = tree.predict(test_text)
    print (classification_report(y_test, tree_pred))

The model's performance is not so good, so we won't export this result.

## Random Forest

In [66]:
kf = KFold(n_splits = 10)
for train_indices, test_indices in kf.split(Tfid_X):
    train_text = Tfid_X[train_indices]
    y_train = class_df[train_indices]
        
    test_text = Tfid_X[test_indices]
    y_test = class_df[test_indices]
    
    forest = RandomForestClassifier(n_estimators = 20, max_depth = 10)
    forest.fit(train_text, y_train)
        
    forest_pred = forest.predict(test_text)
    print (classification_report(y_test, forest_pred))   

## Linear SVC

In [67]:
kf = KFold(n_splits = 10)
for train_indices, test_indices in kf.split(Tfid_X):
    train_text = Tfid_X[train_indices]
    y_train = class_df[train_indices]
        
    test_text = Tfid_X[test_indices]
    y_test = class_df[test_indices]
    
    linearSVM = LinearSVC()
    linearSVM.fit(train_text, y_train)
        
    svm_pred = linearSVM.predict(test_text)
    print (classification_report(y_test, svm_pred))  

In [68]:
C_list = [0.8,0.9,1,1.1,1.2] 
for i in C_list:
    score_list = []
    print (f"C = {i}")
    kf = KFold(n_splits = 10)
    for train_indices, test_indices in kf.split(Tfid_X):
        train_text = Tfid_X[train_indices]
        y_train = class_df[train_indices]

        test_text = Tfid_X[test_indices]
        y_test = class_df[test_indices]

        linearSVM = LinearSVC(C = i)
        linearSVM.fit(train_text, y_train)

        svm_pred = linearSVM.predict(test_text)
        score_list.append(accuracy_score(y_test, svm_pred))
    print(sum(score_list)/len(score_list))

In [69]:
linearSVM = LinearSVC(C = 1)
linearSVM.fit(train_text, y_train)

In [70]:
predicted_result = linearSVM.predict(Tfid_test_X)

In [71]:
print(Tfid_test_X.shape)
print(Tfid_X.shape)

In [72]:
linearSVM_df = pd.DataFrame({'review_id': game_test['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
linearSVM_df = linearSVM_df.sort_values(by=['review_id'])
linearSVM_df.to_csv('LinearSVC.csv', index=False)

In [73]:
linearSVM_df.head()

## Non-Linear SVM

In [74]:
kf = KFold(n_splits = 10)
for train_indices, test_indices in kf.split(Tfid_X):
    train_text = Tfid_X[train_indices]
    y_train = class_df[train_indices]
        
    test_text = Tfid_X[test_indices]
    y_test = class_df[test_indices]
    
    SVM = SVC(kernel = 'rbf')
    SVM.fit(train_text, y_train)
        
    svm_pred = SVM.predict(test_text)
    print (classification_report(y_test, svm_pred))  

In [75]:
predicted_result = SVM.predict(Tfid_test_X)
SVM_df = pd.DataFrame({'review_id': game_test['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
SVM_df = SVM_df.sort_values(by=['review_id'])
SVM_df.to_csv('SVM.csv', index=False)

In [76]:
SVM_df.head()

## SGD Classifier

In [77]:
kf = KFold(n_splits = 10)
for train_indices, test_indices in kf.split(Tfid_X):
    train_text = Tfid_X[train_indices]
    y_train = class_df[train_indices]
        
    test_text = Tfid_X[test_indices]
    y_test = class_df[test_indices]
    
    SGD = SGDClassifier()
    SGD.fit(train_text, y_train)
        
    sgd_pred = SGD.predict(test_text)
    print (classification_report(y_test, sgd_pred)) 

In [78]:
predicted_result = SGD.predict(Tfid_test_X)
SGD_df = pd.DataFrame({'review_id': game_test['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
SGD_df = SGD_df.sort_values(by=['review_id'])
SGD_df.to_csv('SGD.csv', index=False)

In [79]:
SGD_df.head()

The performance of SVM seems better performing compared to the tree/forest models which is interesting. After running different models with SVM algorithms, linear SVC yields the best result when submitting the results.

In [80]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence),deacc=True))

def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

def lemmatizer(text):
    Lemmatizer = WordNetLemmatizer()
    return Lemmatizer.lemmatize(text)

In [81]:
temp = []
#Splitting pd.Series to list
data_to_list = feature_df.values.tolist()
for i in range(len(data_to_list)):
    temp.append(data_to_list[i])
data_words = list(sent_to_words(temp))
data = []
for i in range(len(data_words)):
    data.append(lemmatizer(detokenize(data_words[i])))
print(data[:5])

In [82]:
max_words = 5000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
reviews = pad_sequences(sequences)
print(reviews)

In [83]:
temp = []
#Splitting pd.Series to list
data_to_list = feature_test_df.values.tolist()
for i in range(len(data_to_list)):
    temp.append(data_to_list[i])
data_words = list(sent_to_words(temp))
data_test = []
for i in range(len(data_words)):
    data_test.append(lemmatizer(detokenize(data_words[i])))
print(data_test[:5])

In [84]:
tokenizer.fit_on_texts(data_test)
sequences_test = tokenizer.texts_to_sequences(data_test)
reviews_test = pad_sequences(sequences_test)
print(reviews_test)

In [85]:
def find_max_list(list):
    list_len = [len(i) for i in list]
    print(max(list_len))

#print output#
find_max_list(sequences_test)
find_max_list(sequences)
find_max_list(reviews)
find_max_list(reviews_test)

In [86]:
print(reviews.shape)
print(class_df.shape)
print(reviews_test.shape)

In [87]:
X_train_tokenized, X_test_tokenized, y_train, y_test = train_test_split(reviews, class_df, 
                                                                        test_size = 0.2, random_state = 4300)
print(X_train_tokenized.shape)
print(X_test_tokenized.shape)
print(y_train.shape)
print(y_test.shape)

In [88]:
X_train, X_validate, y_train, y_validate = train_test_split(X_train_tokenized, y_train,
                                                           test_size = 0.25, random_state = 4300)
print(X_train.shape)
print(X_validate.shape)
print(y_train.shape)
print(y_validate.shape)

## Simple LSTM Model

In [89]:
model1 = Sequential()
model1.add(layers.Embedding(max_words, 64)) #The embedding layer
model1.add(layers.LSTM(64,dropout=0.8)) #Our LSTM layer
model1.add(layers.Dense(32,activation='relu'))
model1.add(layers.Dense(1,activation='sigmoid'))


model1.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True,
                              mode='auto', period=1,save_weights_only=False)
history = model1.fit(X_train, y_train, epochs=30,validation_data=(X_validate, y_validate),
                     callbacks=[checkpoint1])

In [90]:
best_model = keras.models.load_model("best_model1.hdf5")
y_pred_simple = best_model.predict(X_test_tokenized)


In [91]:
for i in range(2099):
    if y_pred_simple[i] >= 0.5:
        y_pred_simple[i] = 1
    else:
        y_pred_simple[i] = 0

In [92]:
print(accuracy_score(y_test, y_pred_simple))

The Model Accuracy Score for Simple LSTM Model is 0.8432586946164841.

## Bidirectional LSTM Model

In [93]:
model2 = Sequential()
model2.add(layers.Embedding(max_words, 40, input_length=max_len))
model2.add(layers.Bidirectional(layers.LSTM(30,dropout=0.8)))
model2.add(layers.Dense(60, activation = 'relu'))
model2.add(layers.Dense(1,activation='sigmoid'))

model2.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

#Implementing model checkpoins to save the best metric and do not lose it on training.
checkpoint2 = ModelCheckpoint("best_model2.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, 
                              mode='auto', period=1,save_weights_only=False)
history = model2.fit(X_train, y_train, epochs=20,validation_data=(X_validate, y_validate),
                     callbacks=[checkpoint2])

In [94]:
best_model = keras.models.load_model("best_model2.hdf5")
y_pred_bi = best_model.predict(X_test_tokenized)


In [95]:
for i in range(2099):
    if y_pred_bi[i] >= 0.5:
        y_pred_bi[i] = 1
    else:
        y_pred_bi[i] = 0

In [97]:
print(accuracy_score(y_test, y_pred_bi))

By running LSTM model, the best accuracy has been achieved from simple LSTM Model.

In [98]:
best_model = keras.models.load_model("best_model1.hdf5")
classify = best_model.predict(reviews_test)
classify = np.squeeze(classify)

In [100]:
for i in range(6996):
    if classify[i] >= 0.5:
        classify[i] = 1
    else:
        classify[i] = 0
classify = classify.astype(int)

In [102]:
LSTM_SVM_df = pd.DataFrame({'review_id': game_test['review_id'],'user_suggestionSVM': predicted_result, 'user_suggestionLSTM': classify})
# you could use any filename. We choose submission here
LSTM_SVM_df.head()

In [103]:
different = []
for i in range(LSTM_SVM_df.shape[0]):
    if (LSTM_SVM_df.iloc[i,1] != LSTM_SVM_df.iloc[i,2]):
        different.append(i)

print(len(different))

Although the validation accuracy of LSTM model is high, the performance of the model is not as good as simple linear SVM model.

In [105]:
LSTM_df = pd.DataFrame({'review_id': game_test['review_id'], 'user_suggestion': classify})
LSTM_df = LSTM_df.sort_values(by=['review_id'])
LSTM_df.to_csv('Simple_LSTM_1.csv', index=False)

In [106]:
# Checking the dimension
print(reviews.shape)
print(reviews_test.shape)

In [107]:
best_model = keras.models.load_model("best_model1.hdf5")
pred_model = best_model.predict(reviews)
pred_model = np.squeeze(pred_model)
pred_model.shape

In [108]:
best_model = keras.models.load_model("best_model1.hdf5")
pred_model_test = best_model.predict(reviews_test)
pred_model_test = np.squeeze(pred_model_test)
pred_model_test.shape

In [109]:
pred_model = pd.Series(pred_model, name = 'sentiment')
game_train_merged = pd.concat([game_train, pred_model], axis = 1)
game_train_merged = game_train_merged.drop("user_review", axis = 1)
game_train_merged.head()

In [110]:
# reordering columns in game_train
cols = game_train_merged.columns.tolist()
cols_1 = cols[:-2]
cols_2 = cols[-1]
cols_3 = cols[-2]
cols_1.append(cols_2)
cols_1.append(cols_3)
game_train_merged = game_train_merged[cols_1]
game_train_merged.head()

In [111]:
pred_model_test = pd.Series(pred_model_test, name = 'sentiment')
game_test_merged = pd.concat([game_test, pred_model_test], axis = 1)
game_test_merged = game_test_merged.drop("user_review", axis = 1)
game_test_merged.head()

In [112]:
train_target = game_train_merged['user_suggestion']
train_target.head()

In [113]:
train_feature = game_train_merged.iloc[:,:-1]
train_feature.head()

In [114]:
train_feature_list = train_feature.columns.tolist()
train_col1 = train_feature_list[:3]
train_col2 = train_feature_list[3:-1]
train_col2.sort()
train_col3 = train_feature_list[-1]
train_col = train_col1 + train_col2
train_col.append(train_col3)
train_feature = train_feature[train_col]
train_feature.head()

In [115]:
game_test_merged_list = game_test_merged.columns.tolist()
test_col1 = game_test_merged_list[:3]
test_col2 = game_test_merged_list[3:-1]
test_col2.sort()
test_col3 = game_test_merged_list[-1]
test_col = test_col1 + test_col2
test_col.append(test_col3)
game_test_merged = game_test_merged[test_col]
game_test_merged.head()

In [116]:
#Make sure if the columns are properly sorted and the columns are same for both training and testing set
for i in range(len(test_col)):
    i = 1
    if train_col[i]==test_col[i]:
        i = i * 1
    else:
        i = i * 0
print(i)

Since we got the continuous variables by using LSTM layer with Sigmoid function, we can include this as a one of the feature variable. Now we can try using xgboost model to predict the 'user_suggestion'.

# Classification Model Including More Features

In [118]:
# Select important features in the model
final_train = train_feature.iloc[:,3:]
final_train_w_year = train_feature.iloc[:,2:]

In [117]:
final_game_test = game_test_merged.iloc[:,3:]
final_game_test_w_year = game_test_merged.iloc[:,2:]

# XGboost Model

## Using Data final_train and train_target

In [119]:
data_dmatrix = xgb.DMatrix(data=final_train,label=train_target)

In [120]:
data_dmatrix

In [121]:
X_train, X_test, y_train, y_test = train_test_split(final_train, train_target, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)

In [122]:
xgb_model_original= xgb.XGBClassifier(n_estimators = 200, 
                             learning_rate = 0.3, 
                             max_depth = 4,
                             objective='binary:logistic', 
                             use_label_encoder=False)

In [123]:
xgb_model_original.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])
y_pred = xgb_model_original.predict(X_test)

In [124]:
print(f'XGBoost model accuracy score: {accuracy_score(y_test, y_pred)}')

In [125]:
predicted_result = xgb_model_original.predict(final_game_test)
print(predicted_result)

In [126]:
XGB_original_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
XGB_original_df = XGB_original_df.sort_values(by=['review_id'])
XGB_original_df.to_csv('XGboost_Original.csv', index=False)

## Using Data final_train_w_year and train_target

In [127]:
X_train, X_test, y_train, y_test = train_test_split(final_train_w_year, train_target, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)

In [128]:
xgb_model_year= xgb.XGBClassifier(n_estimators = 30, 
                             learning_rate = 0.3, 
                             max_depth = 4,
                             objective='binary:logistic', 
                             use_label_encoder=False)

In [129]:
xgb_model_year.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])
y_pred = xgb_model_year.predict(X_test)

In [130]:
print(f'XGBoost model accuracy score: {accuracy_score(y_test, y_pred)}')

In [131]:
predicted_result = xgb_model_year.predict(final_game_test_w_year)
print(predicted_result)

In [132]:
XGB_year_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
XGB_year_df = XGB_year_df.sort_values(by=['review_id'])
XGB_year_df.to_csv('XGboost_year.csv', index=False)

The model using XGboost yields good accuracy score for the training dataset. However the performance in the test dataset is extremely lower than the training set case. We assume it's because of the overfitting issues in using the LSTM model. Since the LSTM model yielded low performing results, therefore affecting the XGboost models. Therefore the models aren't performing as expected given the high accuracy score in the training set.

# XGBoost with SVM sentiment

In [133]:
kf = KFold(n_splits = 10)
for train_indices, test_indices in kf.split(Tfid_X):
    train_text = Tfid_X[train_indices]
    y_train = class_df[train_indices]
        
    test_text = Tfid_X[test_indices]
    y_test = class_df[test_indices]
    
    linearSVM = LinearSVC(C=1)
    linearSVM.fit(train_text, y_train)
        
    svm_pred = linearSVM.predict(test_text)
    print (classification_report(y_test, svm_pred))  

In [135]:
SVM_score = linearSVM.decision_function(Tfid_test_X)
SVM_pred = linearSVM.predict(Tfid_test_X)
score_pred_df = pd.DataFrame({'score': SVM_score, 'pred':SVM_pred})
score_pred_df.head(50)

Using decision_function returns a continuous variable with a decision boundary of 0. If negative then 0, and if positive then 1.

In [136]:
SVM_train_score = linearSVM.decision_function(Tfid_X)
SVM_test_score = linearSVM.decision_function(Tfid_test_X)

In [137]:
train_feature = train_feature.drop('sentiment', axis = 1)


In [138]:
SVM_train_score = pd.Series(SVM_train_score, name = 'sentiment')
train_feature = pd.concat([train_feature, SVM_train_score], axis = 1)
train_feature.head()

In [139]:
game_test_merged = game_test_merged.drop('sentiment', axis = 1)
SVM_test_score = pd.Series(SVM_test_score, name = 'sentiment')
game_test_merged = pd.concat([game_test_merged, SVM_test_score], axis = 1)
game_test_merged.head()

In [140]:
# Select important features in the model
final_train = train_feature.iloc[:,3:]
final_train_w_year = train_feature.iloc[:,2:]
final_game_test = game_test_merged.iloc[:,3:]
final_game_test_w_year = game_test_merged.iloc[:,2:]

## Using Data final_train and train_target

In [191]:
X_train, X_test, y_train, y_test = train_test_split(final_train, train_target, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)

In [192]:
xgb_model_original= xgb.XGBClassifier(n_estimators = 200, 
                             learning_rate = 0.3, 
                             max_depth = 3,
                             objective='binary:logistic', 
                             use_label_encoder=False)


In [193]:
xgb_model_original.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])
y_pred = xgb_model_original.predict(X_test)

In [194]:
print(f'XGBoost model accuracy score: {accuracy_score(y_test, y_pred)}')

In [195]:
predicted_result = xgb_model_original.predict(final_game_test)
print(predicted_result)

In [199]:
XGB_SVM_Original_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
XGB_SVM_Original_df = XGB_SVM_Original_df.sort_values(by=['review_id'])
XGB_SVM_Original_df.to_csv('XGB_SVM_Original_df_1.csv', index=False)

In [196]:
XGB_vs_SVM_Original_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestionSVM': SVM_pred, 'user_suggestionXGboost': predicted_result})

In [197]:
XGB_vs_SVM_Original_df.head()

In [198]:
different = []
for i in range(XGB_vs_SVM_Original_df.shape[0]):
    if (XGB_vs_SVM_Original_df.iloc[i,1] != XGB_vs_SVM_Original_df.iloc[i,2]):
        different.append(i)

print(len(different))

The XGboost model gives exactly same output with just using SVM model. It's because the weight for the sentiment columns is extremly heavy, we can reduce the weight through normalization. But before we can try adding year as an extra feature. 

In [173]:
X_train, X_test, y_train, y_test = train_test_split(final_train_w_year, train_target, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)

In [174]:
xgb_model_year= xgb.XGBClassifier(n_estimators = 30, 
                             learning_rate = 0.3, 
                             max_depth = 3,
                             objective='binary:logistic', 
                             use_label_encoder=False)

In [175]:
xgb_model_year.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])
y_pred = xgb_model_year.predict(X_test)

In [185]:
print(f'XGBoost model accuracy score: {accuracy_score(y_test, y_pred)}')

predicted_result = xgb_model_year.predict(final_game_test_w_year)
print(predicted_result)

In [182]:
XGB_vs_SVM_Year_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestionSVM': SVM_pred, 'user_suggestionXGboost': predicted_result})

In [183]:
XGB_vs_SVM_Year_df.head()

In [186]:
different = []
for i in range(XGB_vs_SVM_Year_df.shape[0]):
    if (XGB_vs_SVM_Year_df.iloc[i,1] != XGB_vs_SVM_Year_df.iloc[i,2]):
        different.append(i)

print(len(different))

In [187]:
XGB_SVM_year_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
XGB_SVM_year_df = XGB_SVM_year_df.sort_values(by=['review_id'])
XGB_SVM_year_df.to_csv('XGB_SVM_year_df.csv', index=False)

## Standardizing SVM value

In [None]:
SVM_train_score = linearSVM.decision_function(Tfid_X)
SVM_test_score = linearSVM.decision_function(Tfid_test_X)

In [218]:
scaler = MinMaxScaler(feature_range=(-1, 1))
SVM_test_score = scaler.fit_transform(SVM_test_score.values.reshape(-1, 1))
SVM_train_score = scaler.fit_transform(SVM_train_score.values.reshape(-1, 1))
train_feature = train_feature.drop('sentiment', axis = 1)

In [220]:
SVM_train_score = pd.Series(np.squeeze(SVM_train_score), name = 'sentiment')
train_feature = pd.concat([train_feature, SVM_train_score], axis = 1)
train_feature.head()

In [221]:
game_test_merged = game_test_merged.drop('sentiment', axis = 1)
SVM_test_score = pd.Series(np.squeeze(SVM_test_score), name = 'sentiment')
game_test_merged = pd.concat([game_test_merged, SVM_test_score], axis = 1)
game_test_merged.head()

In [222]:
# Select important features in the model
final_train = train_feature.iloc[:,3:]
final_train_w_year = train_feature.iloc[:,2:]
final_game_test = game_test_merged.iloc[:,3:]
final_game_test_w_year = game_test_merged.iloc[:,2:]

In [223]:
X_train, X_test, y_train, y_test = train_test_split(final_train, train_target, test_size = 0.2, random_state = 3400)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 3400)


In [224]:
xgb_model_original= xgb.XGBClassifier(n_estimators = 200, 
                             learning_rate = 0.3, 
                             max_depth = 4,
                             objective='binary:logistic', 
                             use_label_encoder=False)

In [225]:
xgb_model_original.fit(X_train, y_train, 
              early_stopping_rounds = 100,
              eval_metric="logloss", 
              eval_set=[(X_val, y_val)])
y_pred = xgb_model_original.predict(X_test)

In [226]:
print(f'XGBoost model accuracy score: {accuracy_score(y_test, y_pred)}')

In [227]:
predicted_result = xgb_model_original.predict(final_game_test)
print(predicted_result)

In [228]:
XGB_vs_SVMScale_Original_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestionSVM': SVM_pred, 'user_suggestionXGboost': predicted_result})

In [229]:
XGB_vs_SVMScale_Original_df.head()

In [230]:
different = []
for i in range(XGB_vs_SVMScale_Original_df.shape[0]):
    if (XGB_vs_SVMScale_Original_df.iloc[i,1] != XGB_vs_SVMScale_Original_df.iloc[i,2]):
        different.append(i)

print(len(different))

In [231]:
XGB_SVMScale_Original_df = pd.DataFrame({'review_id': game_test_merged['review_id'], 'user_suggestion': predicted_result})
# you could use any filename. We choose submission here
XGB_SVMScale_Original_df = XGB_SVMScale_Original_df.sort_values(by=['review_id'])
XGB_SVMScale_Original_df.to_csv('XGB_SVMScale_Original_df.csv', index=False)

Although standardized the model, the model performance doesn't seems to improve. In conclusion, LinearSVC model has made the best performance over XGboost models and LSTM models which is impressive to observe. XGboost and LSTM Model had very high accuracy score on the training set, but it yielded underperforming result in predicting the testing set due to overfitting. 