In [88]:
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

from glob import glob

import os
import pandas as pd
import numpy as np

In [72]:
pwd = os.getcwd()

genre_path = glob(f'{pwd}\\*genre.csv', recursive=True)[0]
data_path = glob(f'{pwd}\\*data.csv', recursive=True)[0]
tag_path = glob(f'{pwd}\\tag*.csv', recursive=True)[0]

print(genre_path, data_path, tag_path)

c:\Users\NT550-045\Desktop\ml2\EDA\LSH\model\one_hot_genre.csv c:\Users\NT550-045\Desktop\ml2\EDA\LSH\model\pre_data.csv c:\Users\NT550-045\Desktop\ml2\EDA\LSH\model\tag_merge.csv


In [98]:
g_df = pd.read_csv(genre_path)
r_df = pd.read_csv(data_path)
t_df = pd.read_csv(tag_path)

In [99]:
merge_df = pd.merge(r_df, g_df).merge(t_df, on='appid')
# merge_df = pd.merge(r_df, g_df)
merge_df.head(1)

Unnamed: 0,appid,24_Hour_Peak,All_time_peak,positive,negative,average_forever,average_2weeks,median_forever,median_2weeks,price,...,360 Video,Linear,Grid-Based Movement,FMV,Lemmings,Tile-Matching,Indie_y,Arcade,Atmospheric,Cats
0,294100,20930,60742,157344,3038,12492,1401,5368,921,34.99,...,0.0,0.0,0.0,0.0,0.0,0.0,1223.0,0.0,0.0,0.0


In [100]:
merge_df['genre_num'] = merge_df['genre_set'].apply(lambda x:len(set(eval(x))))

In [101]:
merge_copy = merge_df.copy()

In [102]:
# X = merge_copy.drop(['appid','Review', 'Target_'], axis=1)
X = merge_copy.drop(['appid','Review', 'Target_', 'positive', 'genre_set' ,'negative',
                    'recommendations'], axis=1)
y = merge_copy['Target_']

In [103]:
print(X.shape, y.shape)

(46252, 506) (46252,)


In [104]:
X_resampled, y_resampled = SMOTE(random_state=1234,
                                sampling_strategy='all').fit_resample(X, y)

print(X_resampled.shape, y_resampled.shape)

(77676, 506) (77676,)


In [105]:
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                    test_size=0.3,
                                                    random_state=1234,
                                                    stratify=y_resampled)

In [106]:
rf = RandomForestClassifier(random_state=1234,n_jobs=-1)
rf.fit(x_train, y_train)
pred_rf = rf.predict(x_test)

print(classification_report(pred_rf, y_test))

              precision    recall  f1-score   support

           0       0.83      0.69      0.75      7076
           1       0.74      0.85      0.79      5066
           2       0.97      0.94      0.96      5971
           3       0.64      0.72      0.68      5190

    accuracy                           0.80     23303
   macro avg       0.80      0.80      0.79     23303
weighted avg       0.80      0.80      0.80     23303



In [107]:
imp_rf_df = pd.DataFrame({'col':rf.feature_names_in_, 'imp':rf.feature_importances_})
imp_rf_df = imp_rf_df.sort_values(by='imp',ascending=False)
imp_rf_df.reset_index(drop=True, inplace=True)
imp_rf_df.head(15)

Unnamed: 0,col,imp
0,days_after_release,0.055274
1,All_time_peak,0.034225
2,achievements,0.033997
3,initialprice,0.029479
4,month,0.028542
5,price,0.027523
6,Indie_y,0.022139
7,num_lang,0.020344
8,cluster,0.018972
9,Casual_y,0.017493


In [108]:
xgb = XGBClassifier(n_jobs=-1)
xgb.fit(x_train, y_train)
pred_xgb = xgb.predict(x_test)

print(classification_report(pred_xgb, y_test))

              precision    recall  f1-score   support

           0       0.82      0.68      0.74      6953
           1       0.61      0.75      0.67      4802
           2       0.89      0.84      0.87      6137
           3       0.64      0.69      0.67      5411

    accuracy                           0.74     23303
   macro avg       0.74      0.74      0.74     23303
weighted avg       0.75      0.74      0.74     23303



In [109]:
imp_xgb_df = pd.DataFrame({'col':xgb.feature_names_in_, 'imp':xgb.feature_importances_})
imp_xgb_df = imp_xgb_df.sort_values(by='imp',ascending=False)
imp_xgb_df.reset_index(drop=True, inplace=True)
imp_xgb_df.head(15)

Unnamed: 0,col,imp
0,controller_support,0.041708
1,mac,0.016239
2,All_time_peak,0.015306
3,Casual_x,0.013909
4,Simulation_x,0.013763
5,dlc,0.011278
6,Indie_x,0.011158
7,N,0.009359
8,year,0.009343
9,VR Only,0.009048


In [110]:
lgbm = LGBMClassifier(random_state=1234, n_jobs=-1)
lgbm.fit(x_train, y_train)
pred_lgbm = lgbm.predict(x_test)

print(classification_report(pred_lgbm, y_test))

              precision    recall  f1-score   support

           0       0.81      0.69      0.74      6831
           1       0.60      0.73      0.65      4765
           2       0.88      0.83      0.85      6180
           3       0.65      0.69      0.67      5527

    accuracy                           0.73     23303
   macro avg       0.73      0.73      0.73     23303
weighted avg       0.75      0.73      0.74     23303



In [111]:
imp_lgb_df = pd.DataFrame({'col':lgbm.feature_name_, 'imp':lgbm.feature_importances_})
imp_lgb_df = imp_lgb_df.sort_values(by='imp',ascending=False)
imp_lgb_df.reset_index(drop=True, inplace=True)
imp_lgb_df.head(15)

Unnamed: 0,col,imp
0,days_after_release,494
1,Indie_y,401
2,initialprice,372
3,price,301
4,All_time_peak,289
5,achievements,280
6,Singleplayer,230
7,dlc,223
8,num_lang,192
9,Action_y,180
