In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

from pycaret.classification import *

from glob import glob

import os
import pandas as pd
import numpy as np

In [2]:
pwd = os.getcwd()

genre_path = glob(f'{pwd}\\*genre.csv', recursive=True)[0]
data_path = glob(f'{pwd}\\*data.csv', recursive=True)[0]
tag_path = glob(f'{pwd}\\tag*.csv', recursive=True)[0]

print(genre_path, data_path, tag_path, sep='\n\n')

c:\Users\NT550-045\Desktop\ml2\EDA\LSH\model\one_hot_genre.csv

c:\Users\NT550-045\Desktop\ml2\EDA\LSH\model\pre_data.csv

c:\Users\NT550-045\Desktop\ml2\EDA\LSH\model\tag_merge.csv


In [3]:
g_df = pd.read_csv(genre_path)
r_df = pd.read_csv(data_path)
t_df = pd.read_csv(tag_path)

# Tag X

In [23]:
merge_df = pd.merge(r_df, g_df)
merge_df['genre_num'] = merge_df['genre_set'].apply(lambda x:len(set(eval(x))))
merge_copy = merge_df.copy()

X = merge_copy.drop(['appid','Review', 'Target_', 'positive', 'genre_set' ,'negative',
                    'recommendations', 'genre_set'], axis=1)
y = merge_copy['Target_']

# Tag O

In [13]:
merge_df = pd.merge(r_df, g_df).merge(t_df, on='appid')
merge_df['genre_num'] = merge_df['genre_set'].apply(lambda x:len(set(eval(x))))
merge_copy = merge_df.copy()

X = merge_copy.drop(['appid','Review', 'Target_', 'positive', 'genre_set' ,'negative',
                    'recommendations', 'genre_set'], axis=1)
y = merge_copy['Target_']

In [None]:
print(X.shape, y.shape)

# Basic Dataset

In [24]:
x_train, x_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3,
                                                    random_state=1234,
                                                    stratify=y)

print(f'train_x : {x_train.shape}, train_y : {y_train.shape}')
print(f'test_x : {x_test.shape}, test_y : {y_test.shape}')

train_x : (32376, 60), train_y : (32376,)
test_x : (13876, 60), test_y : (13876,)


# Pycaret DataSet

## SMOTE O

In [25]:
X_resampled, y_resampled = SMOTE(random_state=1234,
                                sampling_strategy='all', n_jobs=-1).fit_resample(x_train, y_train)

print(X_resampled.shape, y_resampled.shape)

# pycaret
Train = X_resampled
Train['Target'] = y_resampled

Test = x_test
Test['Target'] = y_test

(54372, 60) (54372,)


## SMOTE X

In [19]:
Train = x_train
Train['Target'] = y_train

Test = x_test
Test['Target'] = y_test

# Pycaret

In [26]:
exp_clf = setup(data = Train, target = 'Target', session_id=1234)

In [28]:
# 태그 x, SMOTE O

comp = compare_models(include=['lightgbm', 'xgboost', 'rf', 'et'], n_select=4)
model_result = pull()
model_result

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7403,0.9143,0.7403,0.745,0.7385,0.6537,0.6563,2.342
et,Extra Trees Classifier,0.7148,0.9003,0.7148,0.718,0.714,0.6198,0.6212,2.845
xgboost,Extreme Gradient Boosting,0.7096,0.9039,0.7096,0.713,0.7046,0.6128,0.6165,12.091
lightgbm,Light Gradient Boosting Machine,0.6926,0.896,0.6926,0.6961,0.6861,0.5901,0.5946,1.468


In [11]:
model_result.to_csv('./no_tag-yes_smote.csv')

In [49]:
pred = predict_model(comp[2], data=Test)['prediction_label']

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.6358,0.7946,0.6358,0.6126,0.6178,0.4037,0.4078


In [45]:
print(classification_report(pred, Test['Target']))

              precision    recall  f1-score   support

           0       0.81      0.66      0.73      7018
           1       0.20      0.37      0.26      1060
           2       0.09      0.18      0.12       178
           3       0.65      0.68      0.67      5620

    accuracy                           0.64     13876
   macro avg       0.44      0.47      0.44     13876
weighted avg       0.69      0.64      0.66     13876



In [50]:
final_model = finalize_model(comp[2])

In [51]:
pred = predict_model(final_model, data=Test)['prediction_label']
print(classification_report(pred, Test['Target']))

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.6397,0.7998,0.6397,0.6145,0.6191,0.4079,0.4133


              precision    recall  f1-score   support

           0       0.81      0.66      0.73      7069
           1       0.20      0.38      0.27      1042
           2       0.08      0.20      0.11       138
           3       0.65      0.67      0.66      5627

    accuracy                           0.64     13876
   macro avg       0.44      0.48      0.44     13876
weighted avg       0.69      0.64      0.66     13876



In [52]:
final_model.get_params

<bound method Pipeline.get_params of Pipeline(memory=FastMemory(location=C:\Users\NT550-~1\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['24_Hour_Peak', 'All_time_peak',
                                             'average_forever',
                                             'average_2weeks', 'median_forever',
                                             'median_2weeks', 'price',
                                             'initialprice', 'num_lang',
                                             'required_age', 'is_free',
                                             'controller_support', 'dlc',
                                             'metacritic', 'm...
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=

In [53]:
tune_xgb = tune_model(comp[2],
                    search_library='optuna',
                    optimize='Recall')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6624,0.8824,0.6624,0.664,0.6538,0.5498,0.5551
1,0.6616,0.8804,0.6616,0.6633,0.6529,0.5488,0.5541
2,0.6653,0.8815,0.6653,0.6683,0.6558,0.5537,0.5594
3,0.6703,0.8854,0.6703,0.6719,0.663,0.5604,0.5649
4,0.6682,0.8825,0.6682,0.6686,0.6611,0.5576,0.5613
5,0.6684,0.8797,0.6684,0.6689,0.6593,0.5579,0.5628
6,0.6755,0.8823,0.6755,0.6812,0.6687,0.5673,0.5727
7,0.6687,0.8822,0.6687,0.6706,0.6609,0.5582,0.5628
8,0.6689,0.8821,0.6689,0.6698,0.66,0.5586,0.5638
9,0.6708,0.8814,0.6708,0.6692,0.6597,0.561,0.5667


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

[32m[I 2023-05-24 18:41:15,597][0m Searching the best hyperparameters using 38060 samples...[0m
[32m[I 2023-05-24 19:03:46,877][0m Finished hyperparemeter search![0m


In [54]:
pred = predict_model(tune_xgb, data=Test)['prediction_label']
print(classification_report(pred, Test['Target']))

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.6358,0.7946,0.6358,0.6126,0.6178,0.4037,0.4078


              precision    recall  f1-score   support

           0       0.80      0.66      0.72      6915
           1       0.21      0.37      0.27      1136
           2       0.11      0.25      0.15       151
           3       0.65      0.67      0.66      5674

    accuracy                           0.64     13876
   macro avg       0.44      0.49      0.45     13876
weighted avg       0.68      0.64      0.65     13876



In [10]:
# 태그 x, SMOTE X

comp = compare_models(include=['lightgbm', 'xgboost', 'rf', 'et'], n_select=4)
model_result2 = pull()
model_result2

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7403,0.9143,0.7403,0.745,0.7385,0.6537,0.6563,0.456
et,Extra Trees Classifier,0.7148,0.9003,0.7148,0.718,0.714,0.6198,0.6212,0.728
xgboost,Extreme Gradient Boosting,0.7096,0.9039,0.7096,0.713,0.7046,0.6128,0.6165,0.229
lightgbm,Light Gradient Boosting Machine,0.6926,0.896,0.6926,0.6961,0.6861,0.5901,0.5946,0.332


In [12]:
model_result2.to_csv('./no_tag-no_smote.csv')

In [17]:
comp = compare_models(include=['lightgbm', 'xgboost', 'rf', 'et'], n_select=4)
model_result3 = pull()
model_result3

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7872,0.9359,0.7872,0.7925,0.7856,0.7163,0.7191,5.979
et,Extra Trees Classifier,0.7756,0.9268,0.7756,0.7795,0.7746,0.7009,0.7028,9.847
xgboost,Extreme Gradient Boosting,0.7467,0.925,0.7467,0.7487,0.7439,0.6623,0.6646,80.736
lightgbm,Light Gradient Boosting Machine,0.7387,0.9219,0.7387,0.7403,0.7355,0.6516,0.6539,6.215


In [18]:
model_result3.to_csv('./yes_tag-yes_smote.csv')

In [20]:
comp = compare_models(include=['lightgbm', 'xgboost', 'rf', 'et'], n_select=4)
model_result4 = pull()
model_result4

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7872,0.9359,0.7872,0.7925,0.7856,0.7163,0.7191,2.149
et,Extra Trees Classifier,0.7756,0.9268,0.7756,0.7795,0.7746,0.7009,0.7028,2.653
xgboost,Extreme Gradient Boosting,0.7467,0.925,0.7467,0.7487,0.7439,0.6623,0.6646,40.86
lightgbm,Light Gradient Boosting Machine,0.7387,0.9219,0.7387,0.7403,0.7355,0.6516,0.6539,1.865


In [21]:
model_result4.to_csv('./yes_tag-no_smote.csv')

In [22]:
print(model_result, model_result2, model_result3, model_result4, sep='\n\n')

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.7403  0.9143  0.7403  0.7450   
et                 Extra Trees Classifier    0.7148  0.9003  0.7148  0.7180   
xgboost         Extreme Gradient Boosting    0.7096  0.9039  0.7096  0.7130   
lightgbm  Light Gradient Boosting Machine    0.6926  0.8960  0.6926  0.6961   

              F1   Kappa     MCC  TT (Sec)  
rf        0.7385  0.6537  0.6563     2.233  
et        0.7140  0.6198  0.6212     2.727  
xgboost   0.7046  0.6128  0.6165    13.300  
lightgbm  0.6861  0.5901  0.5946     1.341  

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.7403  0.9143  0.7403  0.7450   
et                 Extra Trees Classifier    0.7148  0.9003  0.7148  0.7180   
xgboost         Extreme Gradient Boosting    0.7096  0.9039  0.7096  0.7130   
lightgbm  Light Gradient Boosting Machine    0.6926  0.8960  0