In [None]:
# Shell escape: installs the category-encoders package, which provides the
# TargetEncoder / OneHotEncoder classes imported from `category_encoders` below.
# NOTE(review): the version is unpinned, and `!pip` runs whichever pip is first on
# the shell PATH — confirm that it targets the same environment as this kernel.
!pip install category-encoders

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

from imblearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, make_scorer, f1_score
from imblearn.pipeline import Pipeline
from datetime import datetime
import shap
import json
import joblib

In [None]:
# Read the training table from CSV, then restore the two pickled lookup objects
# used further down when the JSON report is assembled.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts produced by a trusted run.
raw_data = pd.read_csv(
    '/ml/algorithmlabs.inference/train_data.csv'
)
missing_rate, name_dict = (
    joblib.load(pickle_path)
    for pickle_path in (
        '/ml/algorithmlabs.inference/pickle/missing_rate.pkl',
        '/ml/algorithmlabs.inference/pickle/name_dict.pkl',
    )
)

In [None]:
# Work on an independent (deep) copy so the frame read from disk stays untouched.
train_data = raw_data.copy(deep=True)

In [None]:
# Revision notes kept from the original: "0207 revised", "0208 revised"
# (presumably MMDD dates of the edits — confirm).
# Remove the columns that are not used as model inputs.
dropped_columns = ['STUDENT_CD', 'CHG_YEAR', 'CHG_SMT', 'BIRTH', '이름', 'CHG_DIV']
train_data = train_data.drop(columns=dropped_columns)

In [None]:
# target encoding
object_cols = train_data.select_dtypes('object').columns
te = TargetEncoder()
train_data[object_cols] = te.fit_transform(train_data[object_cols], train_data['LABEL'])

In [None]:
# split
X = train_data.drop(['LABEL'], axis=1)
y = train_data['LABEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify = y)

In [None]:
# scaling
# The scaler statistics come from the training split only; both splits are then
# transformed and wrapped back into DataFrames that keep their original index
# and column labels.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(
    data=scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(
    data=X_scaled,
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# smote
# Oversample the (scaled) training split only; X_test / y_test are not resampled.
sm = SMOTE(
    k_neighbors=7,
    random_state=0,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
# baseline model
# Four tree-based classifiers, each seeded with random_state=0; they are tuned
# and combined in the cells that follow.
model_rf = RandomForestClassifier(random_state=0)
model_ex = ExtraTreesClassifier(random_state=0)
model_gb = GradientBoostingClassifier(random_state=0)
model_xg = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    random_state=0,
)

In [None]:
# Shared 5-fold stratified splitter (shuffled, fixed seed) used by every grid search.
kfold = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)

In [None]:
# gridsearchcv
# The search runs on the un-resampled training split, with SMOTE applied inside
# an imblearn Pipeline. The pipeline resamples only the training part of each CV
# fit, so every validation fold consists of real rows. Searching on data that
# was oversampled beforehand would leak synthetic neighbours of validation rows
# into the training folds and inflate the recall used for model selection.
def _make_search(model, grid):
    """Build a recall-scored GridSearchCV over a SMOTE -> `model` pipeline.

    Parameters
    ----------
    model : estimator
        Unfitted classifier to tune.
    grid : list of dict
        Parameter grid written in terms of the classifier's own parameter
        names; the keys are prefixed with ``model__`` for the pipeline.

    Returns
    -------
    GridSearchCV
        Unfitted search object. After fitting, ``best_params_`` keys carry the
        ``model__`` prefix and ``best_estimator_`` is the whole pipeline.
    """
    # GridSearchCV clones the pipeline (and therefore `sm`) for every fit, so
    # the already-fitted `sm` object is only used as a parameter template here.
    pipeline = Pipeline([('smote', sm), ('model', model)])
    prefixed_grid = [
        {'model__' + key: values for key, values in candidate.items()}
        for candidate in grid
    ]
    return GridSearchCV(pipeline, param_grid=prefixed_grid, cv=kfold, scoring='recall', n_jobs=-1)


params = [{'n_estimators':[200, 300], 'max_depth':[5]}]
grid_rf = _make_search(model_rf, params)
grid_ex = _make_search(model_ex, params)

params_b = [{'n_estimators':[200, 300], 'max_depth':[5], 'learning_rate' : [.01, .001, .0001]}]
grid_xg = _make_search(model_xg, params_b)
grid_gb = _make_search(model_gb, params_b)

grid_rf.fit(X_train, y_train)
grid_xg.fit(X_train, y_train)
grid_ex.fit(X_train, y_train)
grid_gb.fit(X_train, y_train)

# Hand the tuned classifiers (not the pipelines) to the ensemble cell; the
# refit pipeline trained each of them on the SMOTE-resampled training split.
best_rf = grid_rf.best_estimator_.named_steps['model']
best_xg = grid_xg.best_estimator_.named_steps['model']
best_ex = grid_ex.best_estimator_.named_steps['model']
best_gb = grid_gb.best_estimator_.named_steps['model']

In [None]:
# Soft-voting ensemble of the four tuned classifiers, trained on the resampled
# training data and evaluated on the untouched test split.
voting = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xg), ('ex', best_ex), ('gb', best_gb)], voting='soft', n_jobs=-1)

voting.fit(X_res, y_res)
# Timestamp recorded as the training date in the JSON report.
now = datetime.now()

results_pred = voting.predict(X_test)
# ROC AUC is a ranking metric and needs continuous scores; feeding it the hard
# 0/1 predictions reduces it to a single-threshold value. Column 1 of
# predict_proba is the probability of voting.classes_[1]
# (assumes LABEL is binary 0/1 — confirm).
results_proba = voting.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, results_pred)
recall = recall_score(y_test, results_pred)
pre = precision_score(y_test, results_pred)
f1 = f1_score(y_test, results_pred)
auc = roc_auc_score(y_test, results_proba)

metrics_dict = {
    'accuracy_score': acc,
    'recall_score': recall,
    'precision_score': pre,
    'f1_score': f1,
    'auc': auc,
}

In [None]:
# One TreeExplainer per fitted member of the voting ensemble, in ensemble order,
# each built with X_res as background data and probability-scale output.
explainer_1, explainer_2, explainer_3, explainer_4 = [
    shap.TreeExplainer(fitted_member, X_res, check_additivity=False, model_output='probability')
    for fitted_member in voting.estimators_
]

# SHAP values of every explainer on the test split (additivity check disabled).
shap_values_1, shap_values_2, shap_values_3, shap_values_4 = [
    member_explainer.shap_values(X_test, check_additivity=False)
    for member_explainer in (explainer_1, explainer_2, explainer_3, explainer_4)
]

In [None]:
# Per-model importance = mean absolute SHAP value of each feature over the rows.
# Models 1 and 3 are indexed with [1] first, models 2 and 4 are used as returned
# (presumably a per-class list vs. a single array — confirm against the
# installed shap version).
per_model_values = (shap_values_1[1], shap_values_2, shap_values_3[1], shap_values_4)
importances_1, importances_2, importances_3, importances_4 = [
    np.absolute(values).sum(axis=0) / values.shape[0]
    for values in per_model_values
]

# Despite the name, this is the average of the four per-model importance vectors.
importances_sum = (importances_1 + importances_2 + importances_3 + importances_4)/4

# Normalise so the importances sum to 1, labelled by feature column name.
feature_importance = pd.Series(
    importances_sum / np.sum(importances_sum),
    index=X_test.columns,
)

fe_dict = feature_importance.to_dict()

In [None]:
# Re-key the feature importances from raw column names to the labels stored in
# name_dict; columns that have no entry in name_dict are left out.
new_fe_dict = {
    name_dict[column]: fe_dict[column]
    for column in fe_dict.keys()
    if column in name_dict.keys()
}

In [None]:
# Re-key the per-feature missing rates with the labels stored in name_dict;
# entries whose key is absent from name_dict are left out.
new_missing_dict = {
    name_dict[column]: missing_rate[column]
    for column in missing_rate.keys()
    if column in name_dict.keys()
}

In [None]:
# Report payload written to JSON by the next cell. Key order is kept as-is
# because it determines the order of the entries in the output file.
# NOTE(review): 'predict_semester' ("2023, 1st semester") and 'data_period' are
# hard-coded literals — confirm they match the data actually loaded.
main_dict = {
    'feature importance': new_fe_dict,
    'trainset_size': len(X_train),
    'testset_size': len(X_test),
    'total_size': len(X_train)+len(X_test),
    'last_train_date': str(now),
    'predict_semester': '2023년 1학기',
    'data_period': '2021-03-01 ~ 2021-08-31',
    'metrics': metrics_dict,
    'feature missing rate': new_missing_dict,
}

In [None]:
file_path = '/ml/algorithmlabs.inference/model_data_dropout.json'
# Serialize BEFORE opening the target: mode 'w' truncates the file on open, so
# if json.dumps raised inside the `with` block, the previous report would
# already be wiped. ensure_ascii=False keeps the non-ASCII labels readable in
# the UTF-8 output.
report_json = json.dumps(main_dict, ensure_ascii=False, indent=2)
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(report_json)

In [None]:
# Persist the fitted ensemble, the encoder, the scaler and the four SHAP
# explainers, one pickle file per object, in the same order as before.
artifacts = [
    (voting, '/ml/algorithmlabs.inference/pickle/model.pkl'),
    (te, '/ml/algorithmlabs.inference/pickle/encoder.pkl'),
    (scaler, '/ml/algorithmlabs.inference/pickle/scaler.pkl'),
    (explainer_1, '/ml/algorithmlabs.inference/pickle/shap_explainer_1.pkl'),
    (explainer_2, '/ml/algorithmlabs.inference/pickle/shap_explainer_2.pkl'),
    (explainer_3, '/ml/algorithmlabs.inference/pickle/shap_explainer_3.pkl'),
    (explainer_4, '/ml/algorithmlabs.inference/pickle/shap_explainer_4.pkl'),
]
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)