In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# 데이터셋 불러오기
file_path = '/content/BankChurners.csv'
df = pd.read_csv(file_path)

# 데이터 전처리

# Naive Bayes 관련 열 제거
df.drop(['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
         'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis=1, inplace=True)

# 불필요한 열 제거 ('CLIENTNUM')
df.drop(columns=['CLIENTNUM'], inplace=True)

# 범주형 변수를 수치형으로 매핑
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})
df['Education_Level'] = df['Education_Level'].map({
    'Unknown': 0, 'Uneducated': 1, 'High School': 2, 'College': 3,
    'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6
})
df['Marital_Status'] = df['Marital_Status'].map({'Unknown': 0, 'Single': 1, 'Married': 2, 'Divorced': 3})
df['Income_Category'] = df['Income_Category'].map({
    'Unknown': 0, 'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3,
    '$80K - $120K': 4, '$120K +': 5
})
df['Card_Category'] = df['Card_Category'].map({'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4})

# 목표 변수와 특성 변수 분리
target_column = 'Attrition_Flag'
X = df.drop(target_column, axis=1)
y = df[target_column]

# 특징 표준화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 양적 로열티 세분화
quantitative_features = ['Total_Relationship_Count', 'Months_on_book', 'Total_Revolving_Bal']
X_quant = X[quantitative_features]
kmeans_quant = KMeans(n_clusters=4, random_state=42)
X['Quant_Loyalty_Segment'] = kmeans_quant.fit_predict(X_quant)

# 질적 로열티 세분화
qualitative_features = ['Contacts_Count_12_mon', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1']
X_qual = X[qualitative_features]
kmeans_qual = KMeans(n_clusters=3, random_state=42)
X['Qual_Loyalty_Segment'] = kmeans_qual.fit_predict(X_qual)

# 이차원 세분화
X['Loyalty_Segment'] = X['Quant_Loyalty_Segment'].astype(str) + '_' + X['Qual_Loyalty_Segment'].astype(str)

# 이탈 패턴 그룹화
segment_churn_rates = X.join(y).groupby('Loyalty_Segment')[target_column].mean()
segment_churn_groups = segment_churn_rates.apply(lambda x: 0 if x < 0.2 else (1 if x < 0.5 else 2))
X['Churn_Pattern_Group'] = X['Loyalty_Segment'].map(segment_churn_groups)

# 데이터 학습 및 테스트 분할 (불균형 처리를 생략하여 진행)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 각 이탈 패턴 그룹에 대해 모델 학습 및 평가 (Specificity 추가)
results = []
for group in X['Churn_Pattern_Group'].unique():
    # 현재 그룹에 대한 데이터 필터링
    X_train_group = X_train[X_train['Churn_Pattern_Group'] == group].drop(columns=['Quant_Loyalty_Segment', 'Qual_Loyalty_Segment', 'Loyalty_Segment', 'Churn_Pattern_Group'])
    y_train_group = y_train[X_train.index.isin(X_train_group.index)]
    X_test_group = X_test[X_test['Churn_Pattern_Group'] == group].drop(columns=['Quant_Loyalty_Segment', 'Qual_Loyalty_Segment', 'Loyalty_Segment', 'Churn_Pattern_Group'])
    y_test_group = y_test[X_test.index.isin(X_test_group.index)]

    # 그룹별 모델 선택 (랜덤 포레스트, Gradient Boosting, 로지스틱 회귀)
    if group == 0:
        model = RandomForestClassifier(random_state=42, max_depth=3, n_estimators=50, min_samples_split=5)
    elif group == 1:
        model = GradientBoostingClassifier(random_state=42, n_estimators=50, learning_rate=0.05, max_depth=2)
    else:
        model = LogisticRegression(random_state=42, max_iter=3000, solver='liblinear', penalty='l2', tol=1e-3)

    # 모델 학습
    model.fit(X_train_group, y_train_group)

    # 예측 수행
    y_pred = model.predict(X_test_group)

    # 모델 평가
    accuracy = accuracy_score(y_test_group, y_pred)
    precision = precision_score(y_test_group, y_pred)
    recall = recall_score(y_test_group, y_pred)

    # Specificity 계산
    tn, fp, fn, tp = confusion_matrix(y_test_group, y_pred).ravel()
    specificity = tn / (tn + fp)

    f1 = f1_score(y_test_group, y_pred)
    roc_auc = roc_auc_score(y_test_group, y_pred)

    # 결과 저장
    results.append({
        'Churn_Pattern_Group': group,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'Specificity': specificity,
        'F1-Score': f1,
        'ROC AUC Score': roc_auc
    })

# 결과 출력 (데이터프레임 출력)
results_df = pd.DataFrame(results)
print(results_df)


   Churn_Pattern_Group  Accuracy  Precision    Recall  Specificity  F1-Score  \
0                    0  0.913216   1.000000  0.043689     1.000000  0.083721   
1                    1  0.881664   0.867159  0.810345     0.924843  0.837790   

   ROC AUC Score  
0       0.521845  
1       0.867594  
