<style>
    rd { color:red; }
    bl { color:blue; }
</style>

# min_samples_leaf 조정한 RandomForest
## 전처리
| 작업        | 대상                                                                                       |
|:------------|:-------------------------------------------------------------------------------------------|
| 컬럼 삭제   | "RowNumber", "CustomerId", "Surname"                                                       |
| 컬럼 인코딩 | "Geography", "Gender"                                                                      |
| 컬럼 라벨링 | "CreditScore", "Geography", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary" |

### 스케일링 : StandardScaler

## 하이퍼파라미터
- RandomForest
    - min_samples_leaf : 50, 80

## 결론 : 성능에 영향 없음.

In [10]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import seaborn           as sns

import matplotlib
import matplotlib.font_manager as fm

import re

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [11]:
def encoding(df:pd.DataFrame, columns:list[str]):
    """범주형 데이터를 인코딩"""

    encoder_list = {}
    result_df    = df.copy(deep=True)

    for col_nm in columns:
        encoder           = LabelEncoder()
        result_df[col_nm] = encoder.fit_transform(result_df[col_nm])

        encoder_list[col_nm] = encoder

    return result_df, encoder_list


def scaling(df:pd.DataFrame, columns:list[str]):
    """DataFrame 에서 컬럼들을 스케일링"""

    scaler    = StandardScaler()
    result_df = df.copy(deep=True)

    result_df[columns] = scaler.fit_transform(result_df[columns])

    return result_df

## 데이터 로드 및 전처리

In [12]:
######################################### 데이터 로드
df     = pd.read_csv("data/Churn_Modelling.csv")
inputs = df.drop(columns=["Exited"], axis=1)
labels = df["Exited"]


######################################### 데이터 전처리
_input = inputs.drop(columns=["RowNumber", "CustomerId", "Surname"], axis=1)     # 컬럼 삭제( Rownumber, CustomerId, Surname )
_input, encoders = encoding(_input, ["Geography", "Gender"])            # 범주형 문자열 데이터 인코딩
_input = scaling(_input, ["CreditScore", "Geography", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"])

In [13]:
print(_input.info(), "\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  float64
 1   Geography        10000 non-null  float64
 2   Gender           10000 non-null  int64  
 3   Age              10000 non-null  float64
 4   Tenure           10000 non-null  float64
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  float64
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
dtypes: float64(7), int64(3)
memory usage: 781.4 KB
None 



In [14]:
print(_input.value_counts(), "\n")

CreditScore  Geography  Gender  Age       Tenure  Balance   NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary
1.000        1.0        1       0.716216  1.0     0.277436  0.000000       1          0               0.489456           1
0.000        0.0        0       0.297297  0.0     0.442805  0.000000       1          1               0.861630           1
                                0.567568  0.3     0.000000  0.000000       0          0               0.568977           1
                        1       0.445946  1.0     0.000000  0.000000       1          1               0.629121           1
             0.5        1       0.283784  0.0     0.437362  0.333333       0          0               0.618012           1
                                                                                                                        ..
0.102        0.5        1       0.405405  0.8     0.510726  0.000000       1          0               0.878794           1
0.108        0.5     

## 데이터 분할

In [15]:
######################################### 데이터 분할. random_state 지정한 상태에서 성능 확인/개선해보고, state 풀었을 때도 보기.
train_x, test_x, train_y, test_y = train_test_split(_input, labels, stratify=labels)
print("학습 데이터 shape : ", train_x.shape, train_y.shape)
print("검증 데이터 shape : ",  test_x.shape,  test_y.shape, "\n")

학습 데이터 shape :  (7500, 10) (7500,)
검증 데이터 shape :  (2500, 10) (2500,) 



## 모델 학습 및 평가 - min_samples_leaf=20
순정과 차이 없음

In [26]:
######################################### 모델 학습
model = RandomForestClassifier(min_samples_leaf=20)
model.fit(train_x, train_y)


######################################### 모델 성능 평가
predicted = model.predict(test_x)
print(classification_report(test_y, predicted, target_names=["Stayed", "Exited"]))

              precision    recall  f1-score   support

      Stayed       0.87      0.97      0.92      1991
      Exited       0.79      0.44      0.57       509

    accuracy                           0.86      2500
   macro avg       0.83      0.71      0.74      2500
weighted avg       0.86      0.86      0.85      2500



## 모델 학습 및 평가 - min_samples_leaf=50
순정에 비해 성능이 다소 <bl>하락</bl>

In [23]:
######################################### 모델 학습
model = RandomForestClassifier(min_samples_leaf=50)
model.fit(train_x, train_y)


######################################### 모델 성능 평가
predicted = model.predict(test_x)
print(classification_report(test_y, predicted, target_names=["Stayed", "Exited"]))

              precision    recall  f1-score   support

      Stayed       0.86      0.97      0.91      1991
      Exited       0.80      0.38      0.52       509

    accuracy                           0.85      2500
   macro avg       0.83      0.68      0.72      2500
weighted avg       0.85      0.85      0.83      2500



## 모델 학습 및 평가 - min_samples_leaf=80
순정에 비해 성능이 다소 <bl>하락</bl>

In [24]:
######################################### 모델 학습
model = RandomForestClassifier(min_samples_leaf=80)
model.fit(train_x, train_y)


######################################### 모델 성능 평가
predicted = model.predict(test_x)
print(classification_report(test_y, predicted, target_names=["Stayed", "Exited"]))

              precision    recall  f1-score   support

      Stayed       0.85      0.98      0.91      1991
      Exited       0.83      0.31      0.45       509

    accuracy                           0.85      2500
   macro avg       0.84      0.65      0.68      2500
weighted avg       0.85      0.85      0.82      2500

