In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the files referenced
# through `data_path` further down are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Install

In [None]:
!pip install ftfy
!pip install pycaret
!pip install shap==0.42.1

# Package

In [None]:
import pandas as pd
import numpy as np
import re
import ftfy
from datetime import datetime
import joblib
import json
from pycaret.classification import *
import shap
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.simplefilter('ignore')

In [None]:
# Run-level constants.
# random_seed: handed to PyCaret's setup() as session_id.
# today: run date as a YYYYMMDD string, stored as 'last_train_date' in the summary dict.
random_seed = 42
today = datetime.now().strftime("%Y%m%d")

# Function

In [None]:
def fixEncoding(text:str)->str:
    """Clean a string: run it through ``ftfy.fix_text``, then blank out whitespace.

    Every single whitespace character (tab, newline, ...) is replaced by one
    plain space, one-for-one; runs of whitespace are not collapsed.
    """
    repaired = ftfy.fix_text(text)
    return re.sub(r'\s', ' ', repaired)

In [None]:
def read_selected(filePath:str,
                  dtypesMapper:dict=None,
                  textCols:list[str]=None)->pd.DataFrame:
    """Read only the columns named in ``dtypesMapper`` from a CSV file.

    Parameters
    ----------
    filePath : str
        Path of the CSV file to read.
    dtypesMapper : dict
        Maps column name -> dtype. The keys select which columns are loaded
        and their order fixes the column order of the result. Required, even
        though the signature defaults it to ``None``.
    textCols : list[str], optional
        Columns whose values are cleaned with ``fixEncoding``. Missing values
        in these columns are left untouched.

    Returns
    -------
    pd.DataFrame
        The selected columns, ordered like the keys of ``dtypesMapper``.

    Raises
    ------
    ValueError
        If ``dtypesMapper`` is ``None`` or empty.
    """
    if not dtypesMapper:
        # Raising a plain string is itself a TypeError in Python 3, so raise a
        # real exception that names the parameter that is actually missing.
        raise ValueError("You should specify parameter [dtypesMapper]")

    selected = list(dtypesMapper)
    df = pd.read_csv(filePath, usecols=selected, dtype=dtypesMapper)

    for col in textCols or []:
        # na_action='ignore' passes NaN through instead of handing a float to
        # fixEncoding, which expects a string.
        df[col] = df[col].map(fixEncoding, na_action='ignore')

    # read_csv keeps the file's column order regardless of `usecols` order;
    # reorder so the result follows the mapper's key order.
    return df.loc[:, selected]

# Dataset

In [None]:
# Root folder on the mounted Drive. The training CSV and pickles are read from
# `<data_path>/output/...`, and fitted artifacts are written under
# `<data_path>/result/model/` and `<data_path>/output/pypkl/`.
data_path = '/content/drive/My Drive/Colab Notebooks/KB캐피탈/2023'

In [None]:
# Column name -> dtype for every field loaded from the training CSV.
# Key order matters: read_selected() returns the columns in this order, and the
# preprocessing cells further down slice `train` by column position.
Mapper = {
    '사번' : str,
    '성별' : np.int64,
    '연령' : np.int64,
    '연차' : np.int32,
    '최종학력' : np.int64,
    '신입경력' : np.int64,
    '수상여부' : np.int32,
    '전년도평균교육점수' : np.float32,
    '전년도KPI통과여부' : np.float32,
    '전년도평가점수' : np.float32,
    '전전년도평가점수' : np.float32,
    '전전전년도평가점수' : np.float32,
    '전년도총교육시간' : np.float32,
    '전년도총이수교육횟수' : np.float32,
    '근속개월' : np.int64,
    '다면평가등급' : np.float64,
    '승진적합여부' : np.int64,
    # The 24 per-item average-rating columns share one dtype, so they are
    # generated (문항1평균평점 ... 문항24평균평점) instead of listed one by one.
    **{f'문항{item_no}평균평점' : np.float32 for item_no in range(1, 25)},
    '전체평균평점' : np.float32,
}

# Load the training table; the ID column is the only text column that gets cleaned.
train_data = read_selected(
    filePath=f'{data_path}/output/train_data_promotion.csv',
    dtypesMapper=Mapper,
    textCols=['사번'],
)

In [None]:
# Load the precomputed missing-rate object; it is passed through unchanged into
# the summary dict under 'feature_missing_rate'.
# NOTE(review): joblib.load unpickles the file — only load artifacts produced by this project.
missing_rate = joblib.load(f'{data_path}/output/pypkl/missing_rate_promotion.pkl')

# Preprocess

In [None]:
# Modeling frame: everything except the '사번' (employee ID) column.
train = train_data.drop(columns=['사번'])

In [None]:
# Survey-score columns: the 24 per-item averages (문항1 ... 문항24) followed by
# the overall average, in the same order as they appear in the data.
score = [f'문항{item_no}평균평점' for item_no in range(1, 25)] + ['전체평균평점']

In [None]:
# Standardize the features with two separate scalers: `scaler` for every
# column that is neither the target nor a survey score, `sc_scaler` for the
# survey-score columns. The target column is left as is.
base_cols = train.drop(columns=['승진적합여부'] + score).columns

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[base_cols])
train_ = pd.DataFrame(train_scaled, index=train.index, columns=base_cols)
train[base_cols] = train_

sc_scaler = StandardScaler()
train_scaled = sc_scaler.fit_transform(train[score])
train_ = pd.DataFrame(train_scaled, index=train.index, columns=score)
train[score] = train_

In [None]:
# Compress the 24 standardized per-item ratings into 5 principal components.
# Columns are addressed by name instead of by position so the cell does not
# depend on the exact column order of `train`: score[:-1] is the 24 item
# columns; the overall average (score[-1]) is not fed to the PCA.
item_cols = score[:-1]
pca = PCA(n_components=5)
train_pca = pca.fit_transform(train[item_cols].values)
pca_list = ['다면평가 주성분_1', '다면평가 주성분_2', '다면평가 주성분_3', '다면평가 주성분_4', '다면평가 주성분_5']
# Column assignment aligns on index labels, so the components frame must carry
# train's own index; a default RangeIndex would produce NaN for any row whose
# label does not match.
train[pca_list] = pd.DataFrame(data=train_pca, index=train.index, columns=pca_list)
# Remove every raw survey-score column (24 items + overall average); the
# components above replace them.
train.drop(columns=score, inplace=True)

# Modeling

In [None]:
# Initialize the PyCaret classification experiment on `train`:
# train_size=0.7, 10 folds, seeded with random_seed, no imbalance fixing, and
# every non-target column declared as a numeric feature.
params = setup(
    data=train,
    target='승진적합여부',
    train_size=0.7,
    fold=10,
    session_id=random_seed,
    preprocess=True,
    fix_imbalance=False,
    numeric_features=[col for col in train.columns if col != '승진적합여부'],
    html=False,
    verbose=False,
)

In [None]:
# Fetch PyCaret's latest result grid and look up the two
# "Transformed ... set shape" rows. The first element of each 'Value' entry is
# taken as the set size (presumably the row count of a (rows, cols) shape —
# confirm against the PyCaret version in use).
results = pull()
is_train_shape = results['Description'].eq('Transformed train set shape')
is_test_shape = results['Description'].eq('Transformed test set shape')
trainset_size = results.loc[is_train_shape, 'Value'].iloc[0][0]
testset_size = results.loc[is_test_shape, 'Value'].iloc[0][0]
total_size = trainset_size + testset_size

In [None]:
# Fit the only candidate (random forest), tune it for AUC, then call
# predict_model without explicit data and pull the resulting metrics grid.
top1 = compare_models(include=['rf'], sort='AUC', n_select=1, verbose=False)
# save_model(top1 , 'model')
# saved_model = load_model('model')
end_model = tune_model(top1, optimize='AUC', verbose=False)
predict = predict_model(end_model, verbose=False)
metrics = pull()

# Summary key -> column of the metrics grid; values come from the grid's first row.
metric_columns = {
    'accuracy_score': 'Accuracy',
    'recall_score': 'Recall',
    'precision_score': 'Prec.',
    'f1_score': 'F1',
    'auc': 'AUC',
}
metrics_dict = {key: metrics[column].iloc[0] for key, column in metric_columns.items()}

In [None]:
# train_pipe = end_model.transform(train)
# Remove the target before explaining. errors='ignore' keeps the cell
# re-runnable: a plain in-place drop raises KeyError on the second execution
# because the column is already gone.
train = train.drop(columns=['승진적합여부'], errors='ignore')
explainer = shap.TreeExplainer(end_model)
shap_values = explainer.shap_values(train)

# Select the SHAP values of class 1 by container type rather than by length
# (len() == 2 is also true for a plain array that happens to hold two rows):
#   list of per-class arrays                   -> element 1
#   3-D array, classes on the last axis
#   (layout of newer SHAP releases — confirm)  -> slice 1 of the last axis
#   2-D array (samples, features)              -> used as is
if isinstance(shap_values, list):
    class_shap = shap_values[1]
elif np.ndim(shap_values) == 3:
    class_shap = shap_values[:, :, 1]
else:
    class_shap = shap_values

# Mean absolute SHAP value per feature, normalized so the importances sum to 1.
importances = np.absolute(class_shap).sum(axis=0) / class_shap.shape[0]
feature_importance = pd.Series(importances / np.sum(importances), index=train.columns)

In [None]:
# Keep the first 14 features as individual entries and fold everything from
# position 14 onward into a single '다면평가' entry.
final_feature_imp = pd.concat([
    feature_importance.iloc[:14],
    pd.Series({'다면평가': sum(feature_importance.iloc[14:])}),
])
fe_dict = final_feature_imp.to_dict()

In [None]:
# Display the aggregated feature importances for a visual check.
final_feature_imp

In [None]:
# Summary of this training run: importances, split sizes, run date, the
# hard-coded period labels, hold-out metrics and the loaded missing rates.
main_dict = {
    'feature_importance': fe_dict,
    'trainset_size': trainset_size,
    'testset_size': testset_size,
    'total_size': total_size,
    'last_train_date': today,
    'predict_semester': '2023년',
    'data_period': '2021-01-01 ~ 2022-12-31',
    'metrics': metrics_dict,
    'feature_missing_rate': missing_rate,
}

In [None]:
# Display the collected metrics for a visual check.
metrics_dict

In [None]:
# Disabled export: the text below is a bare string literal, so no file is
# written when this cell runs; as the cell's only expression it is merely
# echoed as output. The wrapped code would dump `main_dict` as UTF-8 JSON.
'''
file_path = f'{data_path}/output/model_data_promotion.json'
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(json.dumps(main_dict, ensure_ascii=False, indent=2))
'''

In [None]:

# Persist the fitted non-score feature scaler and the tuned model under result/model.
# NOTE(review): `sc_scaler` and `pca` are also fitted in this notebook and shape
# the model's input features, but neither is saved here (the pca dump is
# commented out) — confirm that downstream scoring code can rebuild them.
joblib.dump(scaler, f'{data_path}/result/model/scaler_promotion.pkl')
# joblib.dump(pca, f'{data_path}/result/model/kb_promotion_pca.pkl')
joblib.dump(end_model, f'{data_path}/result/model/model_promotion.pkl')
# joblib.dump(explainer, f'{data_path}/output/pypkl/kb_promotion_explainer.pkl')


In [None]:
# Write a second copy of the same model and scaler objects under output/pypkl,
# using kb_-prefixed file names.
joblib.dump(end_model, f'{data_path}/output/pypkl/kb_promotion_model.pkl')
joblib.dump(scaler, f'{data_path}/output/pypkl/kb_promotion_scaler.pkl')