In [9]:
# Paper 4: Extra Trees and Random Forest classifiers on the BankChurners dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)

# Load the dataset
file_path = '/content/BankChurners.csv'
df = pd.read_csv(file_path)

# Preprocessing

# Drop the two Naive Bayes-related columns and the unneeded 'CLIENTNUM' column.
# A non-mutating drop is used so `df` is simply rebound to the reduced frame.
df = df.drop(columns=[
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'CLIENTNUM',
])

# Integer code assigned to every label of each categorical column.
category_codes = {
    'Attrition_Flag': {'Existing Customer': 0, 'Attrited Customer': 1},
    'Gender': {'M': 1, 'F': 0},
    'Education_Level': {
        'Unknown':0, 'Uneducated': 1, 'High School': 2, 'College': 3,
        'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6
    },
    'Marital_Status': {'Unknown':0, 'Single': 1, 'Married': 2, 'Divorced': 3},
    'Income_Category': {
        'Unknown':0, 'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3,
        '$80K - $120K': 4, '$120K +': 5
    },
    'Card_Category': {'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4},
}

# Replace the categorical labels with their numeric codes.
# Series.map() turns every value that is absent from the dict into NaN without
# any warning, so each column is checked and the run is stopped with an
# explicit message instead of letting NaN leak into the features or the target.
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    unmapped = df.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column {column!r} contains values with no numeric code: {unmapped}"
        )
    df[column] = encoded

# Separate the target column from the feature columns
target_column = 'Attrition_Flag'
y = df[target_column]
X = df.drop(columns=[target_column])

# Train/test split: 20% of the rows are held out, random_state pinned to 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create both ensembles with random_state pinned to 42 and train them on the
# training split; fit() returns the estimator, so construction and training
# are chained.
et_classifier = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Extra Trees Classifier: predicted labels and the probability taken from
# column index 1 of predict_proba
y_pred_et = et_classifier.predict(X_test)
y_pred_proba_et = et_classifier.predict_proba(X_test)[:, 1]

# Random Forest Classifier: predicted labels and the probability taken from
# column index 1 of predict_proba
y_pred_rf = rf_classifier.predict(X_test)
y_pred_proba_rf = rf_classifier.predict_proba(X_test)[:, 1]

# Evaluation helper
def evaluate_model(y_test, y_pred, y_pred_proba, model_name):
    """Print a metric summary for one fitted classifier.

    Args:
        y_test: True labels of the evaluation set.
        y_pred: Predicted labels for the same rows.
        y_pred_proba: Predicted scores passed to ``roc_auc_score``.
        model_name: Name shown in the report header line.

    Returns:
        None. Everything is written to standard output.
    """
    # (printed label, metric function, prediction array handed to the metric).
    # Only AUC-ROC consumes the scores; every other metric gets the labels.
    report_rows = (
        ("Accuracy:", accuracy_score, y_pred),
        ("Precision:", precision_score, y_pred),
        ("Recall:", recall_score, y_pred),
        ("F1-Score:", f1_score, y_pred),
        ("AUC-ROC:", roc_auc_score, y_pred_proba),
        ("Confusion Matrix:\n", confusion_matrix, y_pred),
        ("\nClassification Report:\n", classification_report, y_pred),
    )

    print(f"Evaluation Metrics for {model_name}:")
    # Each metric is computed right before it is printed, one row at a time.
    for label, metric, estimate in report_rows:
        print(label, metric(y_test, estimate))
    print("=" * 50)

# Report the Extra Trees results on the held-out test split
evaluate_model(
    y_test=y_test,
    y_pred=y_pred_et,
    y_pred_proba=y_pred_proba_et,
    model_name="Extra Trees Classifier",
)

Evaluation Metrics for Extra Trees Classifier:
Accuracy: 0.9348469891411648
Precision: 0.922077922077922
Recall: 0.6513761467889908
F1-Score: 0.7634408602150538
AUC-ROC: 0.9810411953064674
Confusion Matrix:
 [[1681   18]
 [ 114  213]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96      1699
           1       0.92      0.65      0.76       327

    accuracy                           0.93      2026
   macro avg       0.93      0.82      0.86      2026
weighted avg       0.93      0.93      0.93      2026

