<style>
    rd { color:red; }
    bl { color:blue; }
</style>

# Feature Engineering을 통한 성능 향상 시도
XGBoost의 plot_importance 결과로 얻은, 중요도가 높은 특성들만으로 성능이 향상되는지 확인해본다.

## 전처리
| 작업        | 대상                                                                                       |
|:------------|:-------------------------------------------------------------------------------------------|
| 컬럼 삭제   | "RowNumber", "CustomerId", "Surname"                                                       |
| 컬럼 인코딩 | "Geography", "Gender"                                                                      |
| 컬럼 스케일링 | "CreditScore", "Geography", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary" |

### 스케일링 : StandardScaler

## 남길 특성
1. IsActiveMember
2. NumOfProducts
3. Age
4. Balance

## 결론 : 기준 정확도 86% 대비 의미 있는 성능 변화 없음 (선택한 특성만 사용 시 약 85%)

In [1]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import seaborn           as sns

import matplotlib
import matplotlib.font_manager as fm

import re

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
def encoding(df: pd.DataFrame, columns: list[str]) -> tuple[pd.DataFrame, dict]:
    """Label-encode the given categorical columns.

    The input frame is not modified; a deep copy is encoded and returned.

    Args:
        df: Source DataFrame.
        columns: Names of the columns to encode, processed in order.

    Returns:
        A ``(encoded_df, encoders)`` pair, where ``encoders`` maps each
        column name to the ``LabelEncoder`` that was fitted on it.
    """
    encoded = df.copy(deep=True)
    fitted = {}

    for name in columns:
        # One independent encoder per column, kept so the caller can reuse it.
        fitted[name] = LabelEncoder()
        encoded[name] = fitted[name].fit_transform(encoded[name])

    return encoded, fitted


def scaling(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Standardize the given columns of a DataFrame with ``StandardScaler``.

    The input frame is not modified; a deep copy is scaled and returned.
    The fitted scaler itself is not returned.

    Args:
        df: Source DataFrame.
        columns: Names of the columns to scale together.

    Returns:
        A copy of ``df`` whose ``columns`` hold the scaler's output.
    """
    scaled = df.copy(deep=True)

    # Fit and transform in one pass on the selected columns only.
    standardized = StandardScaler().fit_transform(scaled[columns])
    scaled[columns] = standardized

    return scaled

## 데이터 로드 및 전처리

In [3]:
######################################### Load data
df     = pd.read_csv("../data/Churn_Modelling.csv")
inputs = df.drop(columns=["Exited"])    # features: every column except the target
labels = df["Exited"]                   # target column


######################################### Preprocessing
# Drop RowNumber, CustomerId and Surname (`columns=` already implies axis=1).
_input = inputs.drop(columns=["RowNumber", "CustomerId", "Surname"])
# Label-encode the categorical string columns; the fitted encoders are kept per column.
_input, encoders = encoding(_input, ["Geography", "Gender"])
# Standardize the listed columns.
# NOTE(review): "Geography" is a label-encoded category at this point, and the
# scaler is fit on all rows before the train/test split - confirm both are intended.
_input = scaling(_input, ["CreditScore", "Geography", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"])

In [8]:
# Render the raw dataset, then leave the preprocessed feature frame as the
# cell's last expression so both tables are shown for comparison.
# NOTE(review): `display` is not imported in the visible cells - presumably the
# IPython/Jupyter builtin.
display(df)
_input

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.326221,-0.901886,0,0.293517,-1.041760,-1.225848,-0.911583,1,1,0.021886
1,-0.440036,1.515067,0,0.198164,-1.387538,0.117350,-0.911583,0,1,0.216534
2,-1.536794,-0.901886,0,0.293517,1.032908,1.333053,2.527057,1,0,0.240687
3,0.501521,-0.901886,0,0.007457,-1.387538,-1.225848,0.807737,0,0,-0.108918
4,2.063884,1.515067,0,0.388871,-1.041760,0.785728,-0.911583,1,1,-0.365276
...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,-0.901886,1,0.007457,-0.004426,-1.225848,0.807737,1,0,-0.066419
9996,-1.391939,-0.901886,1,-0.373958,1.724464,-0.306379,-0.911583,1,1,0.027988
9997,0.604988,-0.901886,0,-0.278604,0.687130,-1.225848,-0.911583,0,1,-1.008643
9998,1.256835,0.306591,1,0.293517,-0.695982,-0.022608,0.807737,1,0,-0.125231


## TRIAL : XGBoost의 plot_importance 에서 중요도가 높은 항목들만 남겨서 성능 변화 확인
결과 : 성능이 미세하게 하락 (정확도 86% → 약 85%)

In [12]:
######################################### Feature selection: keep only the four high-importance features
# This drops CreditScore, Geography, Gender, Tenure, HasCrCard and EstimatedSalary
# (not only HasCrCard).
fe_input = _input[["IsActiveMember", "NumOfProducts", "Age", "Balance"]]


######################################### Train/test split
# Fixed seed so that runs are comparable while evaluating/tuning; set SEED = None
# to look at run-to-run variance once the setup is settled.
SEED = 42
train_x, test_x, train_y, test_y = train_test_split(
    fe_input, labels, stratify=labels, random_state=SEED
)


######################################### Model training
# The forest is seeded too; otherwise results still vary with a fixed split.
model = RandomForestClassifier(max_depth=7, random_state=SEED)
model.fit(train_x, train_y)


######################################### Model evaluation
predicted = model.predict(test_x)
print(classification_report(test_y, predicted, target_names=["Stayed", "Exited"]))

              precision    recall  f1-score   support

      Stayed       0.86      0.97      0.91      1991
      Exited       0.75      0.39      0.52       509

    accuracy                           0.85      2500
   macro avg       0.81      0.68      0.71      2500
weighted avg       0.84      0.85      0.83      2500

