In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data paths
# used further down in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Install

In [None]:
!pip install ftfy
!pip install pycaret
!pip install shap==0.42.1
!pip install xgboost
!pip install lightgbm
!pip install catboost

# Packages

In [None]:
import pandas as pd
import numpy as np
import re
import ftfy
from datetime import datetime
import json
from pycaret.classification import *
import shap
import copy
import joblib
import warnings
warnings.simplefilter('ignore')

In [None]:
# Run-wide settings: the seed handed to the experiment setup, and today's date
# formatted as YYYYMMDD.
random_seed = 42
today = f"{datetime.today():%Y%m%d}"

# Functions

In [None]:
def fixEncoding(text:str)->str:
    """Repair mis-encoded text and turn every whitespace character into a space.

    Args:
        text: The value to clean. Strings are passed through ``ftfy.fix_text``
            and then each whitespace character (tab, newline, ...) is replaced
            by a single ``' '``.

    Returns:
        The cleaned string. Non-string values (``None``, float ``NaN`` from a
        missing CSV cell, numbers) are returned unchanged instead of raising.
    """
    # Missing cells surface as NaN/None even in str-typed columns; there is
    # nothing to repair, so hand them back untouched.
    if not isinstance(text, str):
        return text

    repaired = ftfy.fix_text(text)
    # One-for-one replacement: consecutive whitespace is NOT collapsed.
    return re.sub(r'\s', ' ', repaired)

In [None]:
def read_selected(filePath:str,
                  dtypesMapper:dict=None,
                  textCols:list[str]=None)->pd.DataFrame:
    """Read selected CSV columns with explicit dtypes.

    Args:
        filePath: Path of the CSV file to read.
        dtypesMapper: Mapping of column name -> dtype. Its keys select the
            columns to load and fix the column order of the result.
        textCols: Optional names of columns whose values are cleaned with
            ``fixEncoding`` after loading.

    Returns:
        A DataFrame holding exactly the columns of ``dtypesMapper``, in the
        mapping's key order.

    Raises:
        ValueError: If ``dtypesMapper`` is missing or empty.
    """
    # Validate before touching the file. The previous `raise("...")` tried to
    # raise a plain string, which surfaces as an unrelated TypeError.
    if not dtypesMapper:
        raise ValueError("You should specify parameter [dtypesMapper]")

    selected = list(dtypesMapper)
    df = pd.read_csv(filePath, usecols=selected, dtype=dtypesMapper)

    for col in textCols or []:
        df[col] = df[col].apply(fixEncoding)

    # read_csv keeps the file's column order; re-order to the mapper's order.
    return df.loc[:, selected]

# Dataset

In [None]:
# Root folder on the mounted Google Drive; the input CSV, the pickles and the
# exported JSON/model files are all addressed relative to it.
data_path = '/content/drive/My Drive/Colab Notebooks/KB캐피탈/2023'

In [None]:
# Column name -> dtype for every column loaded from the input CSV.
# read_selected() uses the keys as `usecols` and returns the columns in this key
# order, and later cells address the score columns by position (columns[1:62]),
# so the ORDER of the entries below matters.
# NOTE(review): columns typed np.int32 cannot hold missing values when parsed by
# pandas.read_csv — confirm these columns are always populated in the input file.
Mapper = {'사번' : str,  # employee number (position 0)
 # 61 six-digit-code columns read as float32 (positions 1..61)
 '010900' : np.float32,
 '011100' : np.float32,
 '011200' : np.float32,
 '011700' : np.float32,
 '012340' : np.float32,
 '012900' : np.float32,
 '013500' : np.float32,
 '013600' : np.float32,
 '013900' : np.float32,
 '013910' : np.float32,
 '014000' : np.float32,
 '014100' : np.float32,
 '014102' : np.float32,
 '014200' : np.float32,
 '014300' : np.float32,
 '014400' : np.float32,
 '015120' : np.float32,
 '015900' : np.float32,
 '016200' : np.float32,
 '017800' : np.float32,
 '018800' : np.float32,
 '019900' : np.float32,
 '035000' : np.float32,
 '035800' : np.float32,
 '035900' : np.float32,
 '036100' : np.float32,
 '037100' : np.float32,
 '037800' : np.float32,
 '038000' : np.float32,
 '038100' : np.float32,
 '038200' : np.float32,
 '038400' : np.float32,
 '038500' : np.float32,
 '038600' : np.float32,
 '038700' : np.float32,
 '039300' : np.float32,
 '039400' : np.float32,
 '039800' : np.float32,
 '040100' : np.float32,
 '040900' : np.float32,
 '041300' : np.float32,
 '042220' : np.float32,
 '042800' : np.float32,
 '046600' : np.float32,
 '046700' : np.float32,
 '046800' : np.float32,
 '050500' : np.float32,
 '051000' : np.float32,
 '051200' : np.float32,
 '051500' : np.float32,
 '051600' : np.float32,
 '052500' : np.float32,
 '052700' : np.float32,
 '052800' : np.float32,
 '053400' : np.float32,
 '053500' : np.float32,
 '054310' : np.float32,
 '055110' : np.float32,
 '055111' : np.float32,
 '056110' : np.float32,
 '077100' : np.float32,
 # remaining per-employee columns (text columns as str, counts/flags as int32)
 '성명' : str,
 '성별' : str,
 '연령' : np.int32,
 '근속기간' : str,
 '현근무지' : str,
 '현근무부서' : str,
 '학력' : str,
 '전공' : str,
 '거주지' : str,
 '거주지_우편번호' : str,
 '직무관련자격증' : np.int32,
 '수상여부' : np.int32,
 '근무부서코드' : str,
 '근무부서' : str,
 '근무부서고과평균' : str,
 '직책' : str,
 '이동희망부서1' : str,
 '이동희망부서2' : str,
 '이동희망부서3' : str,
 '이동희망시기' : str,
 '최종이동일' : str,
 'major_code' : str,
 'emp_address' : str,
 'emp_lat' : np.float32,
 'emp_long' : np.float32,
 '근속개월' : np.int32,
 '부서근속기간' : np.int32,
 '경영자역량강화' : np.int32,
 '디지털역량강화' : np.int32,
 '조직가치공유' : np.int32,
 '직무역량강화' : np.int32,
 '핵심인재육성' : np.int32}

# Load the columns listed in Mapper from the input CSV; the employee-number
# column ('사번') is additionally passed through the text clean-up.
raw_data = read_selected(
    filePath=f'{data_path}/input/pro_data.csv',
    dtypesMapper=Mapper,
    textCols=['사번'],
)

In [None]:
# Load the previously pickled `missing_rate` object; it is written unchanged into
# the exported JSON under 'feature_missing_rate' further down.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only
# load pickles that come from a trusted source.
missing_rate = joblib.load(f'{data_path}/output/pypkl/missing_rate_recom.pkl')

# Labeling

In [None]:
# Label every row with the NAME of the score column that holds its largest value.
# The 61 score columns sit at positions 1..61, right after the employee number.
score_columns = raw_data.columns[1:62]

# idxmax(axis=1) returns the first column attaining the row maximum, which is
# what the former per-row loop looked up. The loop recomputed the whole row-max
# Series and `raw_data.values` for every row (quadratic), shadowed the built-in
# max(), and mixed positional with label-based indexing.
# Rows without any valid score are filled with -inf first so that idxmax stays
# defined for them (the loop raised IndexError there); such rows have a score
# sum of 0 and are removed when the training frame is built.
label_list = (
    raw_data[score_columns]
    .fillna(-np.inf)
    .idxmax(axis=1)
    .tolist()
)

raw_data['LABEL'] = label_list
raw_data['LABEL'] = raw_data['LABEL'].astype('str')

# Model Preprocessing

In [None]:
# Work on an independent copy so raw_data stays as loaded.
result_data = raw_data.copy()

# Encode the categorical text columns as integers. The result is assigned back
# explicitly: `df[col].replace(..., inplace=True)` mutates a temporary Series and
# does not reach the DataFrame under pandas Copy-on-Write.
# Values that are not listed in a mapping are left unchanged by replace().
result_data['성별'] = result_data['성별'].replace({'남자' : 1, '여자' : 0})  # male / female
result_data['직책'] = result_data['직책'].replace({'팀원' : 1, '팀장' : 2, '소장' : 3})  # position, ordinal
result_data['학력'] = result_data['학력'].replace(
    {'고등학교' : 1, '전문대학' : 2, '대학교' : 3, '대학원(석사)' : 4}  # education level, ordinal
)

# Keep only the first two characters of the major code, as an integer.
# NOTE(review): this raises on a missing major_code — confirm the column is always populated.
result_data['major_code'] = result_data['major_code'].apply(lambda x : int(x[0:2]))

In [None]:
# Columns carried into modelling, in order: the employee number, the 61 score
# columns (positions 1..61), the encoded employee attributes, then the target.
fe_cols = [
    # employee number
    '사번',
    # score columns
    '010900', '011100', '011200', '011700', '012340', '012900', '013500', '013600',
    '013900', '013910', '014000', '014100', '014102', '014200', '014300', '014400',
    '015120', '015900', '016200', '017800', '018800', '019900', '035000', '035800',
    '035900', '036100', '037100', '037800', '038000', '038100', '038200', '038400',
    '038500', '038600', '038700', '039300', '039400', '039800', '040100', '040900',
    '041300', '042220', '042800', '046600', '046700', '046800', '050500', '051000',
    '051200', '051500', '051600', '052500', '052700', '052800', '053400', '053500',
    '054310', '055110', '055111', '056110', '077100',
    # employee attributes
    '성별', '연령', '학력', '직무관련자격증', '수상여부', '직책', 'major_code',
    '근속개월', '부서근속기간',
    '경영자역량강화', '디지털역량강화', '조직가치공유', '직무역량강화', '핵심인재육성',
    # target
    'LABEL',
]

# Modeling

In [None]:
# Build the training frame from the modelling columns. The explicit .copy()
# makes it independent of result_data, so the column assignment below is not a
# write into a selection of another frame (previously a SettingWithCopy
# situation hidden by the global warnings filter).
train = result_data[fe_cols].copy()

# Everything between the employee number (first) and LABEL (last) becomes float64.
feature_columns = train.columns[1:-1]
train[feature_columns] = train[feature_columns].astype('float64')

# Rows whose 61 score columns sum to 0 are removed.
score_total = train[train.columns[1:62]].sum(axis=1)
drop_idx = score_total[score_total == 0].index.tolist()
train = train.drop(index=drop_idx)

# The employee number is an identifier, not a feature.
train = train.drop(columns=['사번'])

In [None]:
# Remove, in place, every row whose LABEL is '052800' or '038500'.
# NOTE(review): the reason for excluding these two labels is not recorded here.
excluded_rows = train.index[train['LABEL'].isin(['052800', '038500'])]
train.drop(index=excluded_rows, inplace=True)

In [None]:
# Map each distinct label to a consecutive integer class id (0..k-1), following
# the sorted order of the labels. The sorted unique labels are computed once
# instead of on every loop iteration.
sorted_labels = np.sort(train['LABEL'].unique())
mapping = {label: class_id for class_id, label in enumerate(sorted_labels)}

In [None]:
# Swap each label for its integer class id; a label missing from `mapping`
# raises KeyError (so this cell cannot be run twice on the same frame).
train['LABEL'] = train['LABEL'].map(lambda label: mapping[label])

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# never executed. It would compress the first 61 columns of `train` into a single
# PCA component stored as '부서고과평균' and then drop those 61 columns.
'''
from sklearn.decomposition import PCA

pca = PCA(n_components=1)
train_pca = pca.fit_transform(train[train.columns[0:61]].values)
train['부서고과평균'] = pd.DataFrame(data=train_pca, columns=['부서고과평균'])['부서고과평균']
train.drop(train.columns[0:61], axis=1, inplace=True)
'''

In [None]:
# Initialise the PyCaret classification experiment on `train` with LABEL as the
# target. Every other column is declared numeric.
feature_names = [column for column in train.columns if column != 'LABEL']
params = setup(
    data=train,
    target='LABEL',
    train_size=0.75,
    fold=5,
    session_id=random_seed,
    preprocess=True,
    fix_imbalance=False,
    numeric_features=feature_names,
    html=False,
    verbose=False,
)

In [None]:
# Read the train/test set sizes from the most recent PyCaret output table.
# Presumably each 'Value' is a (rows, columns) shape, so element 0 is the row
# count — TODO confirm against the installed PyCaret version.
results = pull()
train_shape = results.loc[results['Description'] == 'Transformed train set shape', 'Value'].iloc[0]
test_shape = results.loc[results['Description'] == 'Transformed test set shape', 'Value'].iloc[0]
trainset_size = train_shape[0]
testset_size = test_shape[0]
total_size = trainset_size + testset_size

In [None]:
# Notes from earlier runs:
#   - knn, nb, ridge, ada: not possible
#   - lightgbm, rf, dt, gbc, et, xgboost, catboost: possible
#   - tuning -> blend took 55 minutes -> 0.7937
today = f"{datetime.today():%Y%m%d}"

# Rank the candidate models by Recall and keep the best four.
candidate_models = ['lightgbm', 'rf', 'dt', 'gbc', 'et', 'catboost', 'xgboost']
top1 = compare_models(n_select=4, sort='Recall', include=candidate_models, verbose=False)

# Disabled alternatives, kept for reference:
#   top1 = compare_models(n_select=3, sort='Accuracy', include=['rf', 'gbc', 'catboost'])
#   save_model(top1 , 'model')
#   saved_model = load_model('model')
#   tune_model = [tune_model(i) for i in top1]
#   end_model = tune_model(saved_model.named_steps["trained_model"], optimize='Accuracy', verbose=False)
#   end_model = saved_model.named_steps["trained_model"]

# Blend the selected models, score the blend, and fetch the resulting metrics table.
end_model = blend_models(top1)
predict = predict_model(end_model)
metrics = pull()

# Export name -> column of the metrics table; values come from its first row.
metric_columns = {
    'accuracy_score': 'Accuracy',
    'recall_score': 'Recall',
    'precision_score': 'Prec.',
    'f1_score': 'F1',
    # 'auc': 'AUC',  (disabled)
}
first_row = metrics.head(1)
metrics_dict = {
    export_name: first_row[column].tolist()[0]
    for export_name, column in metric_columns.items()
}

In [None]:
# Feature matrix (everything except the target) used for the SHAP computations.
train_pipe = train.drop(columns=['LABEL'])

# One TreeExplainer per blended estimator at positions 0, 1 and 3; position 2 is
# skipped (reason not recorded here).
# NOTE(review): shap documents check_additivity as an argument of shap_values();
# confirm the constructor actually honours it.
explainer_0, explainer_1, explainer_3 = [
    shap.TreeExplainer(end_model.estimators_[position], check_additivity=False)
    for position in (0, 1, 3)
]

# SHAP values of every training row, per explainer.
shap_values_0, shap_values_1, shap_values_3 = [
    explainer.shap_values(train_pipe)
    for explainer in (explainer_0, explainer_1, explainer_3)
]

In [None]:
# Average the blended estimators' feature importances on a common scale.
# Each importance vector is normalised to sum to 1 before averaging. The earlier
# formula divided only estimator 0 by 100, which is right only when estimator 0
# reports percentages and every other estimator reports fractions summing to 1.
# Which model ends up at which position depends on the compare_models ranking,
# and some libraries may report raw (unnormalised) importances.
# With that original assumption satisfied, the result is unchanged.
normalised_importances = []
for estimator in end_model.estimators_:
    raw_importance = np.asarray(estimator.feature_importances_, dtype='float64')
    importance_total = raw_importance.sum()
    # Guard against an all-zero vector to avoid dividing by zero.
    if importance_total > 0:
        raw_importance = raw_importance / importance_total
    normalised_importances.append(raw_importance)

fe_sum = np.mean(normalised_importances, axis=0)
feature_importance = pd.Series(fe_sum, index=train_pipe.columns)

In [None]:
# Keep the individual importances from position 61 onward, and fold the first
# 61 entries (the score columns) into a single '근무부서고과평균' entry.
score_importances = feature_importance.iloc[:61]
final_feature_imp = feature_importance.iloc[61:].copy()
final_feature_imp['근무부서고과평균'] = sum(score_importances)
fe_dict = final_feature_imp.to_dict()

In [None]:
# Everything that goes into the exported JSON report, in output order.
main_dict = {
    'feature_importance': fe_dict,
    'trainset_size': trainset_size,
    'testset_size': testset_size,
    'total_size': total_size,
    'last_train_date': today,
    'data_period': '2020-01-01 ~ 2023-01-01',
    'metrics': metrics_dict,
    'feature_missing_rate': missing_rate,
}

# Export

In [None]:
# Write the report as UTF-8 JSON (non-ASCII text kept readable, 2-space indent).
file_path = f'{data_path}/output/model_data_recommendation.json'

# Serialise BEFORE opening the target: open(..., 'w') empties the file at once,
# so a failure inside json.dumps would otherwise wipe the previous export.
report_json = json.dumps(main_dict, ensure_ascii=False, indent=2)
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(report_json)

In [None]:
# Persist the blended model, the label -> class-id mapping and the three SHAP
# explainers. Entries are written in the order listed.
artifacts = {
    f'{data_path}/result/model/model_recom.pkl': end_model,
    f'{data_path}/result/model/mapping_recom.pkl': mapping,
    f'{data_path}/output/pypkl/explainer_0_recom.pkl': explainer_0,
    f'{data_path}/output/pypkl/explainer_1_recom.pkl': explainer_1,
    f'{data_path}/output/pypkl/explainer_3_recom.pkl': explainer_3,
}
for target_path, artifact in artifacts.items():
    joblib.dump(artifact, target_path)