In [387]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Download Dataset

In [388]:
data = pd.read_csv('BankChurners.csv')

data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


# Preprocessing

In [389]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 23 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   CLIENTNUM                                                                                                                           10127 non-null  int64  
 1   Attrition_Flag                                                                                                                      10127 non-null  object 
 2   Customer_Age                                                                                                                        10127 non-null  int64  
 3   Gender                                                                           

In [390]:
data.isnull().sum()

CLIENTNUM                                                                                                                             0
Attrition_Flag                                                                                                                        0
Customer_Age                                                                                                                          0
Gender                                                                                                                                0
Dependent_count                                                                                                                       0
Education_Level                                                                                                                       0
Marital_Status                                                                                                                        0
Income_Category                                 

결측치 자체는 확인되지 않으나, 몇몇 범주형 변수에 'Unknown'이라는 값이 포함되어 있음. 이는 결측치로 간주할 수 있으므로 제거해야함.

더불어 분석에 불필요한 고객 번호 feature를 제거함.

In [391]:
data.drop('CLIENTNUM', axis=1, inplace=True)
data.rename(columns={'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1': 'Naive_mon1',
                     'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2': 'Naive_mon2'},
                     inplace=True)

In [392]:
data.replace({'Unknown': np.nan}, inplace=True)
data.isnull().sum()

Attrition_Flag                 0
Customer_Age                   0
Gender                         0
Dependent_count                0
Education_Level             1519
Marital_Status               749
Income_Category             1112
Card_Category                  0
Months_on_book                 0
Total_Relationship_Count       0
Months_Inactive_12_mon         0
Contacts_Count_12_mon          0
Credit_Limit                   0
Total_Revolving_Bal            0
Avg_Open_To_Buy                0
Total_Amt_Chng_Q4_Q1           0
Total_Trans_Amt                0
Total_Trans_Ct                 0
Total_Ct_Chng_Q4_Q1            0
Avg_Utilization_Ratio          0
Naive_mon1                     0
Naive_mon2                     0
dtype: int64

결측치 수가 다소 많으나, 일단 제거

In [393]:
data = data.dropna()

범주형 변수들에 대해 인코딩 진행

(1) 명목형 변수 -> 원-핫 인코딩 진행

(2) 순서형 변수 -> custom_encodings 딕셔너리를 통해 정수 인코딩 진행

In [394]:
custom_encodings = {'Education_Level': {'Uneducated': 0,
                                'High School': 1,
                                'College': 2,
                                'Graduate': 3,
                                'Post-Graduate': 4,
                                'Doctorate': 5},
            'Income_Category': {'Less than $40K': 0,
                                '$40K - $60K': 1,
                                '$60K - $80K': 2,
                                '$80K - $120K': 3,
                                '$120K +': 4},
            'Card_Category': {'Blue': 0,
                              'Silver': 1,
                              'Gold': 2,
                              'Platinum': 3}}

for feature in data.columns:
    if data[feature].dtype == object:
        if feature in custom_encodings.keys():
            data[feature] = data[feature].replace(custom_encodings[feature])
        else:
            data = pd.get_dummies(data, columns=[feature], drop_first=True) # 범주가 2개일 경우, 더미 생성 X

# Ensemble Model

In [395]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


X = data.iloc[:, 1:].drop(['Naive_mon1', 'Naive_mon2', 'Attrition_Flag_Existing Customer'], axis=1)
y = data['Attrition_Flag_Existing Customer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, shuffle=True, random_state=42)

rdf = RandomForestClassifier(random_state=42)

rdf.fit(X_train, y_train)
rdf_pred = rdf.predict(X_test)
print('accuracy: ', accuracy_score(y_test, rdf_pred), '\n')
print('Classification Report\n', classification_report(y_test, rdf_pred), '\n')
print('Confusion Matrix\n', pd.DataFrame(confusion_matrix(y_test, rdf_pred), 
                                         columns=['Predicted Negative', 'Predicted Positive'], 
                                         index=['Actual Negative', 'Actual Positive']))

accuracy:  0.9576570218772054 

Classification Report
               precision    recall  f1-score   support

           0       0.90      0.80      0.85       212
           1       0.97      0.99      0.98      1205

    accuracy                           0.96      1417
   macro avg       0.94      0.89      0.91      1417
weighted avg       0.96      0.96      0.96      1417
 

Confusion Matrix
                  Predicted Negative  Predicted Positive
Actual Negative                 170                  42
Actual Positive                  18                1187


In [396]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print('accuracy: ', accuracy_score(y_test, xgb_pred), '\n')
print('Classification Report\n', classification_report(y_test, xgb_pred), '\n')
print('Confusion Matrix\n', pd.DataFrame(confusion_matrix(y_test, xgb_pred), 
                                         columns=['Predicted Negative', 'Predicted Positive'], 
                                         index=['Actual Negative', 'Actual Positive']))

accuracy:  0.9675370501058574 

Classification Report
               precision    recall  f1-score   support

           0       0.93      0.84      0.89       212
           1       0.97      0.99      0.98      1205

    accuracy                           0.97      1417
   macro avg       0.95      0.92      0.93      1417
weighted avg       0.97      0.97      0.97      1417
 

Confusion Matrix
                  Predicted Negative  Predicted Positive
Actual Negative                 179                  33
Actual Positive                  13                1192


Accuray 값은 높게 도출되나, recall 점수가 상대적으로 낮게 도출됨을 확인할 수 있음.

이는 흔히 클래스 불균형 때문에 발생하는 문제임.

In [397]:
data['Attrition_Flag_Existing Customer'].value_counts()

1    5968
0    1113
Name: Attrition_Flag_Existing Customer, dtype: int64

타겟 데이터가 상당히 불균형하게 분포되어 있음을 확인할 수 있음.

이를 해결하기 위해 오버샘플링 기법을 적용한 훈련 데이터를 통해 모델을 재훈련함.

In [398]:
!pip install imblearn



DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [399]:
from imblearn.over_sampling import RandomOverSampler

over_sampler = RandomOverSampler(random_state=42)
X_over_sampled, y_over_sampled = over_sampler.fit_resample(X_train, y_train)

xgb.fit(X_over_sampled, y_over_sampled)
xgb_pred = xgb.predict(X_test)
print('accuracy: ', accuracy_score(y_test, xgb_pred), '\n')
print('Classification Report\n', classification_report(y_test, xgb_pred), '\n')
print('Confusion Matrix\n', pd.DataFrame(confusion_matrix(y_test, xgb_pred), 
                                         columns=['Predicted Negative', 'Predicted Positive'], 
                                         index=['Actual Negative', 'Actual Positive']))

accuracy:  0.9710656316160904 

Classification Report
               precision    recall  f1-score   support

           0       0.91      0.90      0.90       212
           1       0.98      0.98      0.98      1205

    accuracy                           0.97      1417
   macro avg       0.95      0.94      0.94      1417
weighted avg       0.97      0.97      0.97      1417
 

Confusion Matrix
                  Predicted Negative  Predicted Positive
Actual Negative                 190                  22
Actual Positive                  19                1186


recall 값이 개선되어 성능이 전반적으로 향상되었음을 알 수 있음. (f1-score 상승)