In [None]:
# 모델 학습(Training Data)

In [None]:
# Mount Google Drive at /content/drive so the CSV files read and written by this
# notebook are reachable. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from math import sqrt

In [None]:
# Load the training table (features plus the target column) from the mounted Drive folder.
raw_data = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/performance forecasting(train data).csv'
)

In [None]:
# Seed shared by train_test_split and AdaBoostRegressor so runs are repeatable.
random_seed: int = 4

In [None]:
# Target: the '9블록(5등급)' column.
y = raw_data['9블록(5등급)']

# Features: everything except the target and the name column ('이름'), with the
# five categorical columns one-hot encoded (first level of each one dropped).
X = pd.get_dummies(
    raw_data.drop(columns=['9블록(5등급)', '이름']),
    columns=['직책', '직급', '성별', '학력', '입사경로(경력/신입)'],
    drop_first=True,
)

In [None]:
# Standardise every feature column and put the result back into a DataFrame that
# keeps the original row index and column labels.
# NOTE(review): the scaler is fitted on the full X here, before the train/test
# split in the following cell, so test-row statistics feed into the scaling.
# Consider splitting first and fitting on the training rows only.
# NOTE(review): this cell rebinds X to its scaled version, so running it twice
# re-scales already-scaled data.
scaler = StandardScaler().fit(X)  # fit() returns the fitted scaler itself
X_scaled = scaler.transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [None]:
# Hold out 30% of the rows for evaluation; the split is fixed by random_seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_seed,
)

In [None]:
# AdaBoost over depth-8 regression trees: 700 boosting rounds, learning rate 0.05.
# fit() returns the fitted estimator, so model_ada is the trained model.
model_ada = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=8),
    n_estimators=700,
    learning_rate=0.05,
    random_state=random_seed,
).fit(X_train, y_train)

# Predictions on the held-out rows, used by the metric cell.
pred_ada = model_ada.predict(X_test)

In [None]:
# 학습 모델 평가지표

In [None]:
# Hold-out metrics for the AdaBoost model.
# RMSE is computed as sqrt(MSE). mean_squared_error is called WITHOUT
# squared=False so the square root is applied exactly once (combining both would
# print the fourth root of the MSE), and because the `squared` argument is not
# available in recent scikit-learn releases.
mae = mean_absolute_error(y_test, pred_ada)
rmse = sqrt(mean_squared_error(y_test, pred_ada))
mape = mean_absolute_percentage_error(y_test, pred_ada)
r2 = r2_score(y_test, pred_ada)

print('MAE = {:.3f}'.format(mae))
print('RMSE = {:.3f}'.format(rmse))
print('MAPE = {:.3f}'.format(mape))
print('r2 score = {:.3f}'.format(r2))

In [None]:
# 모델 예측(Inference Data)

In [None]:
# Load the table to run inference on from the mounted Drive folder.
in_data = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/performance forecasting(inference data).csv'
)

In [None]:
# Remove the five HR record columns that are not fed to the model
# (employee number, name, division, headquarters, team).
rest_data = in_data.drop(columns=['사원 번호', '이름', '부문명', '본부명', '팀명'])

In [None]:
# The five columns dropped from in_data ('사원 번호', '이름', '부문명', '본부명', '팀명')
# are HR record fields that are not used for modelling.
#
# One-hot encode ALL category levels, then align to the training feature columns.
# With drop_first=True applied to the inference file on its own, the level that
# gets dropped is whichever sorts first in THIS file, so a file that lacks a
# level produces a column set that no longer matches what the scaler and the
# model were fitted on. Aligning to X_train.columns keeps the training baseline:
# levels absent here become all-zero columns, and levels never seen in training
# are discarded (those rows fall back to the baseline encoding).
categorical_cols = ['직책', '직급', '성별', '학력', '입사경로(경력/신입)']
inference_dummies = pd.get_dummies(rest_data, columns=categorical_cols, drop_first=False)

# Only dummy columns may legitimately be absent. A missing non-dummy training
# feature is a schema error and must fail loudly instead of being zero-filled.
dummy_prefixes = tuple(col + '_' for col in categorical_cols)
missing_features = [
    col for col in X_train.columns
    if col not in inference_dummies.columns and not col.startswith(dummy_prefixes)
]
if missing_features:
    raise KeyError(f"Inference data is missing training features: {missing_features}")

inference_data = inference_dummies.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Shift the goal-achievement score ('목표달성도') by a fixed offset before scaling.
# NOTE(review): the origin of 0.226 is not documented in this notebook — confirm.
# The value is recomputed from rest_data (get_dummies leaves this column
# untouched) so re-running the cell does not add the offset a second time.
goal_achievement_offset = 0.226
inference_data['목표달성도'] = rest_data['목표달성도'] + goal_achievement_offset

In [None]:
#csv 파일 저장 코드

In [None]:
# Output folder on the mounted Drive; the export cells append file names to it.
path = "/content/drive/MyDrive/Colab Notebooks/"

# Alias of the fitted AdaBoost model used for the inference predictions.
real_model = model_ada

# Display names for the 41 model features; a later cell assigns them as the
# column labels of X and they also index the SHAP summaries.
fixed_col_name = [
    # raw / numeric features
    '직급(과거)', '생년', '입사일', 'GC근속기간 월단위', '외부 경력 월단위', '총경력 월단위',
    '이직횟수', '포상', '사내MBA 선발대상', 'two_years_ago', 'one_years_ago',
    '목표(개수)', '목표달성도', '주기성', '체크인 횟수', '코멘트 횟수',
    '받은 배지(all)', '받은 배지(내부)', '받은 배지(외부)',
    '보낸 배지(all)', '보낸 배지(내부)', '보낸 배지(외부)',
    '빠르게_긍정응답율', '젊게_긍정응답율', '강하게_긍정응답율',
    '단도직입문항1_긍정응답율', '단도직입문항2_긍정응답율',
    'taskboard 개수', 'task 개수', 'task comment 개수',
    # one-hot encoded categorical features
    '직책[팀장]',
    '직급[G2]', '직급[G3]', '직급[S1]', '직급[S2]',
    '성별[여]',
    '학력[대학-졸업]', '학력[박사]', '학력[석사]', '학력[전문대-졸업]',
    '입사경로(경력/신입)[신입]',
]

# Scale the inference features with the scaler fitted on the training data.
scaled_X = scaler.transform(inference_data)

# X is the same object as inference_data (no copy is made).
# NOTE(review): X and y are rebound here, replacing the training X / y defined
# in earlier cells; re-running those cells out of order will not work.
X = inference_data
y = []

In [None]:
# Change 1
# Removed link='logit': with the link parameter the run aborted with an error
# reporting missing values, so the parameter was dropped:
#   Input contains NaN, infinity or a value too large for dtype('float64')
# The workaround (removing logit) was found on GitHub, but the source explained
# neither the cause nor why it works, so the reason is still unclear.
# "X does not have valid feature names, but SVC was fitted with feature names"
#   <- cause not identified; the code keeps running without interruption.
# The same code did not show that message in a local Jupyter notebook, so it was
# assumed to be a Colab environment issue.
# NOTE(review): scikit-learn emits that warning when an estimator fitted on a
# DataFrame is later given a plain array. Here the model was fitted on the
# X_train DataFrame while scaled_X comes straight from scaler.transform —
# presumably that mismatch, not Colab, is the source. Confirm.
# NOTE(review): the whole of X_train is passed as the KernelExplainer background
# set; this may be slow for large training sets — verify runtime is acceptable.
!pip install shap
import shap
shap.initjs()
explainer = shap.KernelExplainer(model_ada.predict, X_train)
shap_values = explainer.shap_values(scaled_X)

In [None]:
# Change 2
# Inference data has no label, so the y column was removed.
predicted = real_model.predict(scaled_X)

# X refers to the same frame as inference_data, so these two assignments relabel
# that frame in place: display names as columns, a fresh 0..n-1 row index.
X.columns = fixed_col_name
X.index = pd.RangeIndex(len(scaled_X))

In [None]:
# Wrap the predictions in a Series named "결과" (result) so it becomes a labelled
# column when concatenated with the features.
pred = pd.Series(predicted, name="결과")

In [None]:
# Put the prediction column next to the relabelled features and export the
# combined table, without the row index, to the Drive output folder.
inference_data = pd.concat([X, pred], axis="columns")
inference_data.to_csv(f"{path}inference_data.csv", index=False)

In [None]:
# Change 3
# shap_values changed type from shap._explanation.Explanation to numpy.ndarray,
# so the values are read directly from the array.
#
# Importance of a feature = mean absolute SHAP value over all rows, rescaled so
# that the importances sum to 1.
abs_shap = np.absolute(shap_values)
importances = abs_shap.sum(axis=0) / shap_values.shape[0]
feature_importance = pd.Series(importances / sum(importances), index=fixed_col_name)
feature_importance.to_csv(f"{path}feature_importance.csv", index=True)

In [None]:
# Signed mean SHAP value per feature, rescaled so the absolute weights sum to 1.
means = shap_values.sum(axis=0) / shap_values.shape[0]
means = pd.Series(means, index=fixed_col_name)
means = means / means.abs().sum()

# A Series has no `columns`; assigning to it only sets a stray attribute and the
# CSV header stays ",0". Name the values and the index instead so the exported
# header reads "feature,mean".
means = means.rename("mean").rename_axis("feature")
means.to_csv(path + "mean_weight.csv", index=True)

In [None]:
# Disabled alternative kept as a bare string literal, so none of it executes.
# It would divide each row of SHAP values by that row's total absolute value and
# write the result to feature_weight.csv.
'''
feature_weight = pd.DataFrame(np.array(
    [x / np.absolute(x).sum() for x in shap_values]
), index = pd.RangeIndex(len(shap_values)), columns = fixed_col_name)
feature_weight.to_csv(path + "feature_weight.csv", index=False)
'''

In [None]:
# Per-feature normalisation of the SHAP values into [-1, 1]:
#   * non-negative values are divided by the largest value of their feature,
#   * negative values are divided by the magnitude of the smallest (most
#     negative) value of their feature and keep their negative sign.
pos_max = shap_values.max(axis=0)
neg_max = shap_values.min(axis=0)

# Guarded division: where a feature's extreme is exactly 0 the result stays 0
# instead of producing 0/0. (The guard must test the per-feature maximum, not
# the builtin `max`, otherwise it never fires.)
pos_scaled = np.divide(
    shap_values, pos_max,
    out=np.zeros(shap_values.shape, dtype=float), where=pos_max != 0,
)
neg_scaled = np.divide(
    -shap_values, neg_max,
    out=np.zeros(shap_values.shape, dtype=float), where=neg_max != 0,
)
scaled_shap = np.where(shap_values >= 0, pos_scaled, neg_scaled)

feature_weight = pd.DataFrame(
    scaled_shap,
    index=pd.RangeIndex(len(shap_values)),
    columns=fixed_col_name,
)
# Any NaN carried over from the SHAP values themselves is exported as 0.
feature_weight = feature_weight.fillna(0)

feature_weight.to_csv(path + "feature_weight.csv", index=False)
feature_weight