In [1]:
# 최종 구현 모델
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import shap
import matplotlib.pyplot as plt

# 데이터셋 불러오기
file_path = '/content/BankChurners.csv'
df = pd.read_csv(file_path)

# 데이터 전처리

# Naive Bayes 관련 열 제거
df.drop(['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
         'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],axis=1,inplace=True)

# 불필요한 열 제거 ('CLIENTNUM')
df.drop(columns=['CLIENTNUM'], inplace=True)



# 범주형 변수를 수치형으로 매핑
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})
df['Education_Level'] = df['Education_Level'].map({
    'Unknown':0, 'Uneducated': 1, 'High School': 2, 'College': 3,
    'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6
})
df['Marital_Status'] = df['Marital_Status'].map({'Unknown':0, 'Single': 1, 'Married': 2, 'Divorced': 3})
df['Income_Category'] = df['Income_Category'].map({
    'Unknown':0, 'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3,
    '$80K - $120K': 4, '$120K +': 5
})
df['Card_Category'] = df['Card_Category'].map({'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4})


# 목표 변수와 특성 변수 분리
target_column = 'Attrition_Flag'
X = df.drop(target_column, axis=1)
y = df[target_column]


# 학습/테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 클래스 불균형 처리 (SMOTE 사용)
smote = SMOTE(random_state=333)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 데이터 스케일링
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# 기본 모델 정의
catboost_model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=10, random_state=333, verbose=0)
xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=10, random_state=333)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=333)
gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=10, random_state=333)

# 스태킹 모델 정의
estimators = [
    ('catboost', catboost_model),
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('gb', gb_model)
]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5, n_jobs=-1)

# 스태킹 모델 학습
stacking_model.fit(X_train_resampled, y_train_resampled)

# 테스트 세트에 대한 예측 수행
y_pred = stacking_model.predict(X_test)
y_pred_proba = stacking_model.predict_proba(X_test)[:, 1]

# 모델 평가
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# 평가 지표 출력
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)




Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Confusion Matrix:
 [[1654   45]
 [  30  297]]
Accuracy: 0.9629812438302073
Precision: 0.868421052631579
Recall: 0.908256880733945
F1 Score: 0.8878923766816144
ROC AUC Score: 0.9910200819694261
