In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE as smt
from imblearn.under_sampling import RandomUnderSampler as rrs

from pycaret.classification import *

from glob import glob

import os
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv(f'./cltandonehot_final.csv', sep=';')
data.columns

Index(['24_Hour_Peak', 'All_time_peak', 'average_forever', 'average_2weeks',
       'median_forever', 'median_2weeks', 'price', 'initialprice', 'num_lang',
       'required_age',
       ...
       'Linear', 'Grid-Based Movement', 'FMV', 'Lemmings', 'Tile-Matching',
       'Indie', 'Arcade', 'Atmospheric', 'Cats', 'tag_num'],
      dtype='object', length=511)

In [3]:
data.filter(regex=r'c\d+').head().columns

Index(['c0', 'c1', 'c2', 'c3', 'c4', 'c5'], dtype='object')

In [4]:
train, test = train_test_split(data, random_state=1234,
                            stratify=data['Review'])

In [5]:
print(train.shape, test.shape)

(34689, 511) (11563, 511)


In [6]:
caret = setup(train, target='Review', session_id=1234, fold=5)

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Review
2,Target type,Binary
3,Target mapping,"Negative: 0, Positive: 1"
4,Original data shape,"(34689, 511)"
5,Transformed data shape,"(34689, 511)"
6,Transformed train set shape,"(24282, 511)"
7,Transformed test set shape,"(10407, 511)"
8,Numeric features,510
9,Preprocess,True


In [7]:
comp = caret.compare_models(n_select=5, fold=5, sort='Recall')

In [8]:
init_report = pull()
init_report

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.7367,0.7992,0.6145,0.7178,0.6621,0.4486,0.4523,0.6
catboost,CatBoost Classifier,0.7429,0.8079,0.6097,0.7331,0.6656,0.4596,0.4648,0.824
lightgbm,Light Gradient Boosting Machine,0.7384,0.8046,0.607,0.7253,0.6608,0.4506,0.4554,0.832
rf,Random Forest Classifier,0.7375,0.7983,0.5856,0.7352,0.6519,0.4455,0.4529,0.716
dt,Decision Tree Classifier,0.6483,0.6392,0.5824,0.5811,0.5817,0.2783,0.2783,0.526
gbc,Gradient Boosting Classifier,0.7287,0.7914,0.5586,0.732,0.6336,0.4243,0.4342,0.576
ada,Ada Boost Classifier,0.7193,0.7768,0.5575,0.7118,0.6252,0.4061,0.414,0.816
et,Extra Trees Classifier,0.7147,0.7671,0.5505,0.7053,0.6183,0.396,0.4038,0.886
qda,Quadratic Discriminant Analysis,0.5291,0.5325,0.5087,0.4676,0.4105,0.0532,0.0584,0.658
lda,Linear Discriminant Analysis,0.6729,0.7221,0.5033,0.6406,0.5636,0.308,0.3138,0.522


In [9]:
xgb = comp[0]
xgb.random_state

1234

In [10]:
# OVERSAMPLING

sm = smt(n_jobs=-1, random_state=1234)

r_train_x, r_train_y = sm.fit_resample(X =train.drop('Review', axis=1), y = train['Review'])

In [11]:
r_train = r_train_x.copy()
r_train['Review'] = r_train_y

In [12]:
print(train.shape, r_train.shape)

(34689, 511) (40250, 511)


In [13]:
over_set = setup(r_train, fold=5, target='Review', session_id=1234)
or_xgb = over_set.create_model('xgboost', fold=5, return_train_score=True, random_state=1234)

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Review
2,Target type,Binary
3,Target mapping,"Negative: 0, Positive: 1"
4,Original data shape,"(40250, 511)"
5,Transformed data shape,"(40250, 511)"
6,Transformed train set shape,"(28175, 511)"
7,Transformed test set shape,"(12075, 511)"
8,Numeric features,510
9,Preprocess,True


Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8689,0.945,0.8405,0.8912,0.8651,0.7379,0.7391
CV-Train,1,0.8686,0.9445,0.8342,0.8959,0.8639,0.7373,0.739
CV-Train,2,0.8738,0.9471,0.8401,0.9009,0.8694,0.7476,0.7494
CV-Train,3,0.8712,0.9471,0.8378,0.8978,0.8667,0.7424,0.7441
CV-Train,4,0.8702,0.9457,0.8337,0.8993,0.8653,0.7404,0.7423
CV-Val,0,0.7594,0.8396,0.7323,0.7741,0.7526,0.5187,0.5195
CV-Val,1,0.7542,0.8335,0.7153,0.7756,0.7442,0.5084,0.51
CV-Val,2,0.7608,0.8427,0.7291,0.7783,0.7529,0.5216,0.5226
CV-Val,3,0.7656,0.8462,0.7278,0.7873,0.7564,0.5312,0.5327
CV-Val,4,0.7595,0.8376,0.7211,0.7812,0.75,0.5191,0.5206


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
or_xgb_report = pull()

In [15]:
# UNDERSAMPLING

rs = rrs(random_state=1234)

rs_train_x, rs_train_y = rs.fit_resample(X =train.drop('Review', axis=1), y = train['Review'])

In [16]:
rs_train = rs_train_x.copy()
rs_train['Review'] = rs_train_y

In [17]:
print(rs_train.shape, train.shape)

(29128, 511) (34689, 511)


In [18]:
under_set = setup(rs_train, fold=5, target='Review', session_id=1234)
ur_xgb = under_set.create_model('xgboost', fold=5, return_train_score=True, random_state=1234)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8523,0.9295,0.8194,0.8771,0.8473,0.7046,0.7061
CV-Train,1,0.8605,0.9368,0.8256,0.8876,0.8555,0.721,0.7228
CV-Train,2,0.8632,0.9376,0.8289,0.8898,0.8583,0.7263,0.728
CV-Train,3,0.8643,0.9358,0.8245,0.8957,0.8586,0.7285,0.7308
CV-Train,4,0.8561,0.9322,0.8209,0.8831,0.8509,0.7122,0.714
CV-Val,0,0.7146,0.7878,0.691,0.7252,0.7077,0.4291,0.4296
CV-Val,1,0.719,0.7916,0.7013,0.727,0.7139,0.438,0.4382
CV-Val,2,0.7214,0.7985,0.694,0.7343,0.7136,0.4429,0.4435
CV-Val,3,0.7278,0.802,0.692,0.7454,0.7177,0.4556,0.4568
CV-Val,4,0.7268,0.7944,0.709,0.735,0.7218,0.4535,0.4538


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
ur_xgb_report = pull()

In [20]:
data2 = data.drop(['c0', 'c1', 'c2', 'c3', 'c4', 'c5'],axis=1)

train2, test2 = train_test_split(data2, random_state=1234,
                            stratify=data2['Review'])

caret2 = setup(train2, target='Review', session_id=1234, fold=5)
comp2 = caret2.compare_models(n_select=5, fold=5, sort='Recall')
init_report2 = pull()
init_report2

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Review
2,Target type,Binary
3,Target mapping,"Negative: 0, Positive: 1"
4,Original data shape,"(34689, 505)"
5,Transformed data shape,"(34689, 505)"
6,Transformed train set shape,"(24282, 505)"
7,Transformed test set shape,"(10407, 505)"
8,Numeric features,504
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.7377,0.8003,0.6124,0.721,0.6621,0.4501,0.4542,12.288
catboost,CatBoost Classifier,0.7426,0.8075,0.6098,0.7326,0.6655,0.4592,0.4644,15.368
lightgbm,Light Gradient Boosting Machine,0.7394,0.8039,0.6092,0.7261,0.6624,0.4528,0.4575,1.062
rf,Random Forest Classifier,0.7384,0.7997,0.583,0.7389,0.6517,0.4469,0.455,3.482
dt,Decision Tree Classifier,0.6465,0.6375,0.5809,0.579,0.5798,0.2747,0.2748,1.228
ada,Ada Boost Classifier,0.7191,0.7764,0.5575,0.7111,0.6249,0.4056,0.4133,2.864
gbc,Gradient Boosting Classifier,0.7279,0.7912,0.557,0.731,0.6322,0.4225,0.4324,6.372
et,Extra Trees Classifier,0.7122,0.7675,0.5481,0.7012,0.6153,0.3909,0.3985,5.16
knn,K Neighbors Classifier,0.6556,0.6728,0.4918,0.612,0.5452,0.2734,0.2777,3.246
svm,SVM - Linear Kernel,0.468,0.0,0.4774,0.4761,0.4431,-0.0397,-0.056,0.71


In [21]:
xgb2 = comp2[0]

In [22]:
# OVERSAMPLING

sm = smt(n_jobs=-1, random_state=1234)

r_train_x2, r_train_y2 = sm.fit_resample(X =train2.drop('Review', axis=1), y = train2['Review'])

r_train2 = r_train_x2.copy()
r_train2['Review'] = r_train_y2

over_set2 = setup(r_train2, fold=5, target='Review', session_id=1234)
or_xgb2 = over_set2.create_model('xgboost', fold=5, return_train_score=True, random_state=1234)

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Review
2,Target type,Binary
3,Target mapping,"Negative: 0, Positive: 1"
4,Original data shape,"(40250, 505)"
5,Transformed data shape,"(40250, 505)"
6,Transformed train set shape,"(28175, 505)"
7,Transformed test set shape,"(12075, 505)"
8,Numeric features,504
9,Preprocess,True


Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8581,0.9363,0.8247,0.8838,0.8532,0.7162,0.7178
CV-Train,1,0.862,0.9391,0.8267,0.8895,0.8569,0.724,0.7258
CV-Train,2,0.8633,0.9388,0.8254,0.8931,0.8579,0.7266,0.7287
CV-Train,3,0.8567,0.9358,0.8193,0.8856,0.8512,0.7135,0.7155
CV-Train,4,0.8563,0.9358,0.8215,0.8828,0.8511,0.7125,0.7142
CV-Val,0,0.7535,0.8392,0.7288,0.7666,0.7472,0.507,0.5076
CV-Val,1,0.7544,0.8327,0.7164,0.7753,0.7446,0.5088,0.5103
CV-Val,2,0.7613,0.8419,0.7295,0.779,0.7534,0.5226,0.5237
CV-Val,3,0.7642,0.8422,0.7229,0.788,0.754,0.5283,0.5301
CV-Val,4,0.7558,0.8332,0.7246,0.7729,0.748,0.5116,0.5126


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
or_xgb_report2 = pull()

In [24]:
# UNDERSAMPLING

rs = rrs(random_state=1234)

rs_train_x2, rs_train_y2 = rs.fit_resample(X =train2.drop('Review', axis=1), y = train2['Review'])

rs_train2 = rs_train_x2.copy()
rs_train2['Review'] = rs_train_y2

under_set2 = setup(rs_train2, fold=5, target='Review', session_id=1234)
ur_xgb2 = under_set2.create_model('xgboost', fold=5, return_train_score=True, random_state=1234)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8537,0.9314,0.8193,0.8799,0.8485,0.7074,0.7091
CV-Train,1,0.8603,0.9357,0.8282,0.885,0.8556,0.7206,0.722
CV-Train,2,0.8627,0.9371,0.8243,0.8929,0.8572,0.7255,0.7276
CV-Train,3,0.8587,0.9328,0.8178,0.8907,0.8527,0.7175,0.7199
CV-Train,4,0.8543,0.9281,0.8178,0.8823,0.8488,0.7087,0.7106
CV-Val,0,0.7258,0.7931,0.6984,0.739,0.7181,0.4517,0.4524
CV-Val,1,0.716,0.7886,0.7018,0.7224,0.7119,0.4321,0.4322
CV-Val,2,0.7285,0.7963,0.6979,0.7435,0.72,0.4571,0.4579
CV-Val,3,0.7293,0.8007,0.6994,0.7439,0.7209,0.4586,0.4594
CV-Val,4,0.7219,0.798,0.7007,0.7316,0.7158,0.4437,0.4441


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [25]:
ur_xgb_report2 = pull()