In [None]:
# Mount Google Drive (Colab only) so the dataset and the exported CSV files
# under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, make_scorer, get_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder # Label Encoding
from sklearn.preprocessing import StandardScaler, Binarizer, RobustScaler
from xgboost import XGBClassifier

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is encoded with its own freshly fitted
    ``LabelEncoder``.

    NOTE: the encoder is stateless -- ``fit`` learns nothing and ``transform``
    re-fits a new ``LabelEncoder`` on whatever frame it receives, so the
    integer codes produced for two different frames are independent of each
    other.

    Parameters
    ----------
    columns : sequence of str, optional
        Names of the columns to encode. ``None`` (the default) encodes every
        column of the frame.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode; None means all columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for ``fit(...).transform(...)`` chaining)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose selected columns hold integer label codes.

        ``X`` itself is left unmodified.
        """
        output = X.copy()
        # Iterate over column labels instead of DataFrame.iteritems(), which
        # was removed in pandas 2.0 and made the `columns=None` path crash.
        if self.columns is not None:
            target_columns = self.columns
        else:
            target_columns = list(output.columns)
        for col in target_columns:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

In [None]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall of ``pred`` against ``y_test``.

    Nothing is returned; the metrics are only written to stdout.
    """
    matrix = confusion_matrix(y_test, pred)
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
    )
    print('Confusion_matrix')
    print(matrix)
    print('Accuracy : {:.4f}\nPrecision : {:.4f}\nRecall : {:.4f}'.format(*scores))

In [None]:
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Print the classification metrics obtained at each cut-off in ``thresholds``.

    For every threshold, ``pred_proba_c1`` is binarized with ``Binarizer`` and
    the result is passed to ``get_clf_eval`` together with ``y_test``.
    The call in this notebook passes the probabilities reshaped to a single
    column (``pred.reshape(-1, 1)``).
    """
    for cutoff in thresholds:
        labels_at_cutoff = Binarizer(threshold=cutoff).fit(pred_proba_c1).transform(pred_proba_c1)
        print('\nThreshold: ', cutoff)
        get_clf_eval(y_test, labels_at_cutoff)

In [None]:
# Shared seed passed to SMOTE, XGBoost, the CV splitter and the train/test split.
random_seed = 12
# Presumably candidate probability cut-offs for get_eval_by_threshold; the
# cells visible in this notebook do not reference this list -- TODO confirm.
thresholds = [0.06, 0.3, 0.5, 0.6]

In [None]:
# ROC-AUC scorer handed to GridSearchCV. The built-in "roc_auc" scorer is looked
# up by name because make_scorer(..., needs_threshold=True) was deprecated in
# scikit-learn 1.4 and removed in 1.6; the named scorer works on old and new
# versions alike.
auc_scorer = get_scorer('roc_auc')

In [None]:
# Shuffled, seeded 5-fold stratified splitter; passed as `cv` to the grid search.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)

In [None]:
# Load the raw dataset from the mounted Google Drive.
raw_data=pd.read_csv('/content/drive/My Drive/Colab Notebooks/core employee prediction.csv')

In [None]:
# Split the target column off the raw data and label-encode the listed feature columns.
y = raw_data['핵심인재여부']
X = MultiColumnLabelEncoder(columns=['조직', 'AL', 'AP', 'AR', 'AT']).fit_transform(
    raw_data.drop(columns=['핵심인재여부'])
)

In [None]:
# Three-step pipeline: SMOTE oversampling -> standard scaling -> XGBoost classifier.
# Pipeline is imported from imblearn so that the SMOTE sampler can be a step.
pipeline_xgb = Pipeline(
    steps=[
        ('smote', SMOTE(random_state=random_seed)),
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier(random_state=random_seed, use_label_encoder=False)),
    ]
)

In [None]:
# Hold out 30% of the rows as a test set (seeded for reproducibility).
# NOTE(review): the split is not stratified (no `stratify=y`) although the CV
# splitter is -- confirm this is intended for this target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)

In [None]:
# Hyper-parameter grid; every key lists a single value, so exactly one
# configuration is cross-validated. Keys are `<pipeline step>__<parameter>`.
param = dict(
    smote__k_neighbors=[1],
    classifier__n_estimators=[200],
    classifier__max_depth=[3],
    classifier__learning_rate=[0.1],
    classifier__objective=['binary:logistic'],
    classifier__gamma=[5],  # minimum loss reduction required to split a node further; larger values reduce overfitting
    classifier__colsample_bytree=[1],  # per-tree sampling ratio, meant to prevent overfitting
)

grid_search = GridSearchCV(
    estimator=pipeline_xgb,
    param_grid=param,
    scoring=auc_scorer,
    cv=stratified_kfold,
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Second column of predict_proba on the hold-out set, then the selected
# parameters and the hold-out ROC-AUC (shown as the cell's output).
pred = grid_search.predict_proba(X_test)[:,1]
print(grid_search.best_params_)
roc_auc_score(y_test, pred)

In [None]:
# Report the hold-out metrics at a single probability cut-off of 0.29.
# The probabilities are reshaped to one column before being binarized.
get_eval_by_threshold(y_test, pred.reshape(-1,1), [0.29])

In [None]:
# Feature importances of the 'classifier' step of the best pipeline found by the grid search.
grid_search.best_estimator_.named_steps["classifier"].feature_importances_

In [None]:
# in_data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/performance forecasting(inference data).csv')

In [None]:
# in_data = MultiColumnLabelEncoder(columns = ['조직', 'AL', 'AP', 'AR', 'AT']).transform(X)

In [None]:
# Model used for the exported predictions and the Drive folder the CSVs go to.
real_model = grid_search
path = "/content/drive/MyDrive/Colab Notebooks/"
# Column labels used for every exported table. An empty list cannot be assigned
# to X.columns or used as the index/columns of the SHAP tables (pandas raises a
# length-mismatch error), so the labels are taken from the feature frame itself.
fixed_col_name = list(X.columns)
# Inference-data wiring, currently disabled. Kept as comments rather than a bare
# string literal so it is not echoed as this cell's output:
# X = in_data
# scaled_X = scaler.transform(in_data)
# y = []

In [None]:
!pip install shap

In [None]:
import shap
shap.initjs()
# Kernel SHAP evaluates the model on (background rows x perturbations) for every
# explained row, so passing the whole of X_train as background is prohibitively
# slow. A seeded sample of at most 100 rows is used as the background instead.
background = shap.sample(X_train, 100, random_state=random_seed)
explainer = shap.KernelExplainer(grid_search.predict, background)
shap_values = explainer.shap_values(X)

In [None]:
# Predict on the unscaled features: `real_model` is the fitted grid search over
# the whole pipeline, whose own 'scaler' step standardizes the input. The
# previously referenced `scaled_X` is never assigned anywhere in this notebook.
# Prediction runs before relabeling so the model receives the column names it
# was fitted with.
predicted = real_model.predict(X)
# Relabel only when labels are provided; assigning an empty list to X.columns
# raises a length-mismatch error.
if len(fixed_col_name) > 0:
    X.columns = fixed_col_name
X.index = pd.RangeIndex(len(X))

In [None]:
# Wrap the predicted labels in a Series named "결과" ("result"); note that this
# rebinds `pred`, which earlier held the hold-out probabilities.
pred = pd.Series(predicted, name="결과")

In [None]:
# Append the prediction column to the features and export the table to Drive.
inference_data = pd.concat([X, pred], axis=1)
inference_data.to_csv(path + "inference_data.csv", index=False)

In [None]:
# Change 3:
# shap_values is now a numpy.ndarray rather than a shap._explanation.Explanation,
# so the values are read from the array directly.
# Per-feature mean absolute SHAP value, rescaled so the importances sum to 1.
n_rows = shap_values.shape[0]
importances = np.absolute(shap_values).sum(axis=0) / n_rows
feature_importance = pd.Series(importances / sum(importances), index=fixed_col_name)
feature_importance.to_csv(path + "feature_importance.csv", index=True)

In [None]:
# Signed per-feature mean SHAP value, rescaled so the absolute weights sum to 1.
means = shap_values.sum(axis=0) / shap_values.shape[0]
means = pd.Series(means, index = fixed_col_name)
means = means / means.abs().sum()
# A Series has no `columns`; assigning to it only created a stray attribute and
# left the CSV header unnamed. Name the index and the values instead so the
# file is written with the intended "feature,mean" header.
means = means.rename_axis("feature").rename("mean")
means.to_csv(path + "mean_weight.csv", index=True)

In [None]:
# Rescale every SHAP value by its column's extreme of the same sign:
# non-negative values are divided by the column maximum, negative values by the
# magnitude of the column minimum.
pos_max = shap_values.max(axis=0)
neg_max = shap_values.min(axis=0)

# The previous zero guard compared the builtin `max` with 0 (always False), so a
# column whose maximum is 0 produced 0/0 = NaN plus a RuntimeWarning and was
# only rescued by fillna(0). The guard now tests the actual column extremes:
# np.divide leaves the pre-filled 0 wherever the divisor is 0.
shap_array = np.asarray(shap_values, dtype=float)
scaled_pos = np.divide(shap_array, pos_max, out=np.zeros_like(shap_array), where=pos_max != 0)
scaled_neg = np.divide(-shap_array, neg_max, out=np.zeros_like(shap_array), where=neg_max != 0)

feature_weight = pd.DataFrame(
    np.where(shap_array >= 0, scaled_pos, scaled_neg),
    index=pd.RangeIndex(len(shap_values)),
    columns=fixed_col_name,
)
# NaNs can still appear if shap_values itself contains NaN; map them to 0 as before.
feature_weight = feature_weight.fillna(0)
feature_weight.to_csv(path + "feature_weight.csv", index=False)
feature_weight.head()