In [144]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold

from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, make_scorer, f1_score
from datetime import datetime
import shap
import json
import joblib
from pycaret.classification import *

In [145]:
# Base directory of the project checkout; the input and output paths used by
# the cells below are all built from this value.
# NOTE(review): absolute, machine-specific Windows path — it must be edited
# before the notebook can run on another machine.
path = 'c:\\Users\\wongi\\Desktop\\알고리즘랩스\\프로젝트\\현대산업개발\\develop'

In [146]:
# Resolve the three input artifacts first, then load them.
train_csv_path = f'{path}\\algorithmlabs.inference\\train_data_promotion.csv'
missing_rate_path = f'{path}\\algorithmlabs.inference\\pickle_promotion\\missing_rate.pkl'
name_dict_path = f'{path}\\algorithmlabs.inference\\pickle_promotion\\name_dict.pkl'

# Training table for the promotion model.
raw_data = pd.read_csv(train_csv_path)

# NOTE(review): joblib.load unpickles arbitrary objects — only load files from
# a trusted source.
# Both objects are consumed later as dicts keyed by raw column name
# (see the cells that build new_fe_dict / new_missing_dict).
missing_rate = joblib.load(missing_rate_path)
name_dict = joblib.load(name_dict_path)

In [147]:
# Keep one decimal place of NO_POINT_SUM on the raw table, then work on an
# independent copy so later preprocessing does not touch raw_data.
raw_data['NO_POINT_SUM'] = raw_data['NO_POINT_SUM'].round(1)
train = raw_data.copy()

In [148]:
# Encode the two-valued flag columns as 0/1.
#
# The result is assigned back to the frame instead of calling
# `train[col].replace(..., inplace=True)`: an in-place method on a column
# selection is a chained operation that raises a FutureWarning in pandas 2.x
# and, with Copy-on-Write enabled, leaves `train` unchanged.
binary_encodings = {
    'YN_HANDICAP': {'Y': 1, 'N': 0},
    'YN_SUPPORTED': {'Y': 1, 'N': 0},
    'YN_FOREIGNER': {'Y': 1, 'N': 0},
    'YN_GENDER': {'M': 1, 'W': 0},
}
for column, mapping in binary_encodings.items():
    # replace() (not map()) so any value outside the mapping is kept as-is.
    train[column] = train[column].replace(mapping)

In [149]:
# Fit a label encoder on the birthplace column and overwrite the column with
# the resulting integer codes. The fitted encoder `le` is persisted at the end
# of the notebook.
le = LabelEncoder()
birthplace_codes = le.fit_transform(train['DS_BIRTHPLACE'])
train['DS_BIRTHPLACE'] = birthplace_codes

In [150]:
# Descriptive DS_* columns that are removed before modelling.
unused_columns = [
    'DS_BONBU',
    'DS_DEPT',
    'DS_JOBFAMILY',
    'DS_ZONE',
    'DS_JOBTYPE',
    'DS_DUTY',
    'DS_ADOPTYPE',
    'DS_JOBFAMILY_JOIN',
]
train = train.drop(columns=unused_columns)

In [151]:
# Revision 0411
# Every column except the target is declared numeric; ID_SABUN is listed in
# ignore_features for the experiment.
numeric_columns = train.drop(['LABEL'], axis=1).columns.tolist()
params = setup(
    data=train,
    target='LABEL',
    train_size=0.75,
    fold=10,
    session_id=0,
    preprocess=True,
    fix_imbalance=True,
    numeric_features=numeric_columns,
    ignore_features=['ID_SABUN'],
)

Unnamed: 0,Description,Value
0,session_id,0
1,Target,LABEL
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(118, 29)"
5,Missing Values,False
6,Numeric Features,27
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [152]:
# Grab the summary grid produced by the last pycaret call (setup) and read the
# split sizes out of it.
results = pull()
setup_grid = results.data


def _first_component(description):
    """Return element [0] of the 'Value' cell on the first row whose
    'Description' equals `description`.

    NOTE(review): assumes the cell holds a (rows, columns) shape so that [0]
    is the row count — confirm against the installed pycaret version.
    """
    matching_rows = setup_grid[setup_grid['Description'] == description]
    return setup_grid['Value'][matching_rows.index[0]][0]


trainset_size = _first_component('Transformed Train Set')
testset_size = _first_component('Transformed Test Set')
total_size = trainset_size + testset_size

In [153]:
# Revision 0411
# Timestamp recorded as the last training date in the exported metadata.
today = datetime.today().strftime("%Y%m%d")

# Pick the single best candidate ranked by recall, then round-trip it through
# save/load so the object used below is the persisted pipeline.
candidate_models = ['lightgbm', 'rf', 'dt', 'gbc', 'et', 'catboost', 'xgboost']
top1 = compare_models(n_select=1, sort='Recall', include=candidate_models)
save_model(top1 , 'promotion_model')
saved_model = load_model('promotion_model')

# Score the final estimator; pull() then returns the metrics grid of that call.
predict = predict_model(saved_model.named_steps["trained_model"])
metrics = pull()

# Exported metric name -> column of the pycaret metrics grid.
metric_columns = {
    'accuracy_score': 'Accuracy',
    'recall_score': 'Recall',
    'precision_score': 'Prec.',
    'f1_score': 'F1',
    'auc': 'AUC',
}
first_row = metrics.head(1)
metrics_dict = {
    metric_name: first_row[column].values[0]
    for metric_name, column in metric_columns.items()
}

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8611,0.8512,0.6,0.65,0.6,0.5408,0.5591,1.346
xgboost,Extreme Gradient Boosting,0.8597,0.8226,0.55,0.6833,0.59,0.5326,0.5491,0.039
lightgbm,Light Gradient Boosting Machine,0.8139,0.8143,0.5,0.5833,0.5233,0.4232,0.4325,0.012
rf,Random Forest Classifier,0.8403,0.8101,0.5,0.5333,0.4933,0.4373,0.4538,0.101
gbc,Gradient Boosting Classifier,0.8514,0.8798,0.5,0.6,0.5167,0.4675,0.4912,0.032
dt,Decision Tree Classifier,0.7722,0.6583,0.45,0.3333,0.3733,0.2749,0.2889,0.007
et,Extra Trees Classifier,0.825,0.8464,0.45,0.5833,0.49,0.4126,0.4272,0.091


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.9,0.9423,0.75,0.6,0.6667,0.6087,0.6139


In [157]:
# Push the data through every pipeline step except the final estimator, then
# explain the fitted estimator on that transformed matrix.
train_pipe = saved_model[:-1].transform(train)
explainer = shap.TreeExplainer(saved_model.named_steps["trained_model"])
shap_values = explainer.shap_values(train_pipe)

# Select the SHAP matrix of the positive class.
# The container type is checked explicitly: testing `len(shap_values) == 2`
# would also be true for a plain 2-D array that happens to have two rows and
# would then pick a single sample instead of a class.
if isinstance(shap_values, list):
    # Per-class list for a binary target: index 1 is the positive class.
    positive_shap = np.asarray(shap_values[1])
else:
    positive_shap = np.asarray(shap_values)
    if positive_shap.ndim == 3:
        # NOTE(review): newer SHAP releases are understood to return
        # (samples, features, classes) for multi-output tree models — confirm
        # against the installed version.
        positive_shap = positive_shap[:, :, 1]

# Mean absolute SHAP value per feature ...
importances = np.absolute(positive_shap).sum(axis=0) / positive_shap.shape[0]

# ... normalised so the importances sum to 1.
feature_importance = pd.Series(importances / np.sum(importances))
# Labels come from the training frame minus the id and target columns; this
# relies on the transformed matrix keeping that column order and count.
feature_importance.index = train.drop(['ID_SABUN', 'LABEL'], axis=1).columns
fe_dict = feature_importance.to_dict()

In [159]:
# Re-key the feature importances by display name.
# Features without an entry in name_dict are skipped. A direct membership test
# replaces scanning every key of name_dict for each feature.
new_fe_dict = {
    name_dict[name]: importance
    for name, importance in fe_dict.items()
    if name in name_dict
}

In [160]:
# Re-key the per-feature missing rates by display name.
# Features without an entry in name_dict are skipped. A direct membership test
# replaces scanning every key of name_dict for each feature.
new_missing_dict = {
    name_dict[name]: rate
    for name, rate in missing_rate.items()
    if name in name_dict
}

In [161]:
# Metadata bundle exported next to the model: importances, split sizes,
# training date, evaluation metrics and per-feature missing rates.
main_dict = {
    'feature_importance': new_fe_dict,
    'trainset_size': trainset_size,
    'testset_size': testset_size,
    'total_size': total_size,
    'last_train_date': today,
    'predict_semester': '2023년',
    'data_period': '2020-01-01 ~ 2022-12-31',
    'metrics': metrics_dict,
    'feature_missing_rate': new_missing_dict,
}

In [162]:
file_path = f'{path}\\algorithmlabs.inference\\model_data_promotion.json'

# Serialize before opening the target: open(..., 'w') truncates the file
# immediately, so an error raised while encoding main_dict would otherwise
# leave an empty file in place of the previous metadata.
payload = json.dumps(main_dict, ensure_ascii=False, indent=2)
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(payload)

In [163]:
# Persist the fitted pipeline, the SHAP explainer and the label encoder.
artifacts = [
    (saved_model, f'{path}\\algorithmlabs.inference\\pickle_promotion\\model.pkl'),
    (explainer, f'{path}\\algorithmlabs.inference\\pickle_promotion\\explainer.pkl'),
    (le, f'{path}\\algorithmlabs.inference\\pickle_promotion\\label_encoder.pkl'),
]
written_paths = [joblib.dump(artifact, target) for artifact, target in artifacts]
# Keep the cell's displayed value: the return of the last dump call.
written_paths[-1]

['c:\\Users\\wongi\\Desktop\\알고리즘랩스\\프로젝트\\현대산업개발\\develop\\algorithmlabs.inference\\pickle_promotion\\label_encoder.pkl']