# 앙상블 모델(Boosting 알고리즘)로 직원 이직 여부 예측하기

## 라이브러리 임포트

In [None]:
# 필요한 라이브러리 불러오기
import numpy as np  # 수치 연산을 위한 NumPy
import pandas as pd  # 데이터프레임 처리를 위한 Pandas

from sklearn.ensemble import GradientBoostingClassifier  # GBM 분류기
from sklearn.metrics import accuracy_score, log_loss, classification_report  # 평가 지표

## 데이터 로드 및 기초 확인

In [None]:
# Load the HR employee-attrition dataset straight from its hosted CSV.
attrition = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/aettikang/bigdata_analysis_basic/main/HR-Employee-Attrition.csv',
)

# Quick sanity checks on the loaded frame.
# NOTE(review): in a notebook only a cell's final expression is rendered, so
# this head() preview is presumably not shown; info() prints its own summary.
attrition.head()
attrition.info()

## 범주형 변수 → 더미 변수 변환

In [None]:
# One-hot encode the categorical predictors (prefer pd.get_dummies for this).
# The target column is kept out of the encoding: if 'Attrition' is stored as
# text (e.g. Yes/No -- TODO confirm against the CSV), encoding the whole frame
# would replace it with 'Attrition_*' indicator columns, so a later lookup of
# 'Attrition' would fail and those indicators would leak the label into the
# feature set.
attrition_cat_dummies = pd.get_dummies(attrition.drop(columns=['Attrition']))

# Re-attach the untouched target so downstream cells can still select it by name.
attrition_cat_dummies['Attrition'] = attrition['Attrition']

# Preview the first three encoded rows.
attrition_cat_dummies.head(3)

## 입력(X)·출력(y) 분리

In [5]:
# Predictor names: every column of the encoded frame other than the target.
feature_columns = attrition_cat_dummies.columns.difference(["Attrition"]).tolist()

# Design matrix restricted to the predictors, and the target cast to a
# categorical dtype.
X = attrition_cat_dummies.loc[:, feature_columns]
y = attrition_cat_dummies['Attrition'].astype('category')

## 학습용·테스트용 데이터 분할

In [7]:
# train_test_split performs the random hold-out partitioning.
from sklearn.model_selection import train_test_split

# Put 80% of the rows in the training set and leave the remainder for testing;
# the fixed seed makes the partition reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=0,
)

## 하이퍼파라미터 탐색

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each gradient-boosting hyperparameter; the grid search
# evaluates every combination of them.
gb_params = dict(
    n_estimators=[1000, 1500],
    max_depth=[4, 6],
    min_samples_leaf=[2, 4],
    max_features=[0.7, 0.9],
    learning_rate=[0.25, 0.3],
)

# Seeded base estimator, searched with 3-fold cross-validation; n_jobs=-1
# asks for all available processors.
gb_clf = GradientBoostingClassifier(random_state=0)
grid_cv = GridSearchCV(estimator=gb_clf, param_grid=gb_params, cv=3, n_jobs=-1)
grid_cv.fit(train_X, train_y)

# Report the winning combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 교차검증 정확도: {:.4f}'.format(grid_cv.best_score_))


## 최종 모델 학습·예측·평가

In [None]:
# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so the tuned model already exists as
# best_estimator_. Reuse it instead of training an identical model (same
# parameters, same random_state, same training data) a second time.
best_gb = grid_cv.best_estimator_

# Score the tuned model on the held-out test set.
gb_predictions = best_gb.predict(test_X)
print("Accuracy score:", accuracy_score(test_y, gb_predictions))
print(classification_report(test_y, gb_predictions))


## 변수 중요도 확인 및 시각화

In [None]:
imp = best_gb.feature_importances_  # or grid_cv.best_estimator_.feature_importances_
df = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': imp
})
df = df.sort_values('Importance', ascending=False)[:5]

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.barplot(x='Importance', y='Feature', data=df);
