In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
data = pd.read_csv("BankChurners.csv", na_values=["NaN"])

In [3]:
data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [4]:
data.info()
# 결측치 없음 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 23 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   CLIENTNUM                                                                                                                           10127 non-null  int64  
 1   Attrition_Flag                                                                                                                      10127 non-null  object 
 2   Customer_Age                                                                                                                        10127 non-null  int64  
 3   Gender                                                                           

In [5]:
df=data.drop_duplicates()

In [6]:
object_columns_to_convert = ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

# Convert specified object columns to categorical
df[object_columns_to_convert] = df[object_columns_to_convert].astype('category')

# Display data types after conversion
print("Data Types after Conversion:")
print(df.dtypes)

Data Types after Conversion:
CLIENTNUM                                                                                                                                int64
Attrition_Flag                                                                                                                        category
Customer_Age                                                                                                                             int64
Gender                                                                                                                                category
Dependent_count                                                                                                                          int64
Education_Level                                                                                                                       category
Marital_Status                                                                                                   

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
# This library is to deal with the imbalance of dataset
from imblearn.over_sampling import SMOTE

In [8]:
# Split the data into features (X) and target variable (y)
X = df.drop("Attrition_Flag", axis=1)
y = df["Attrition_Flag"]

# Identify categorical columns with data type 'category'
categorical_columns = X.select_dtypes(include=['category']).columns.tolist()

# One-hot encode categorical columns
encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_columns]))
X_encoded.columns = encoder.get_feature_names_out(categorical_columns)
X = pd.concat([X.drop(categorical_columns, axis=1), X_encoded], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)



In [9]:
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_resampled, y_resampled)
dt_predictions = dt_model.predict(X_test)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)
rf_predictions = rf_model.predict(X_test)

# SVM (rbf kernel)
svm_model = SVC(random_state=42)
svm_model.fit(X_resampled, y_resampled)
svm_predictions = svm_model.predict(X_test)


# Evaluate the models (use appropriate metrics for your problem)
models = {
    'Decision Tree': dt_predictions,
    'Random Forest': rf_predictions,
    'SVM': svm_predictions,
}

for model_name, predictions in models.items():
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, pos_label='Attrited Customer')
    recall = recall_score(y_test, predictions, pos_label='Attrited Customer')
    f1 = f1_score(y_test, predictions, pos_label='Attrited Customer')

    # Display evaluation metrics
    print(f"Metrics for {model_name}:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("\n")

Metrics for Decision Tree:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


Metrics for Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


Metrics for SVM:
Accuracy: 0.38960184271141823
Precision: 0.17658257972394098
Recall: 0.7479838709677419
F1-score: 0.2857142857142857




데이터셋의 의사 결정 트리와 랜덤 포레스트 모델은 각각 정확도, 정밀도, 재현율, F1-점수에서 1.0을 달성했습니다. 이는 모델이 데이터에 완벽하게 적합되었음을 의미합니다. 그러나 완벽한 점수는 과적합, 데이터 유출, 단순하거나 불균형한 데이터셋, 훈련 데이터에 대한 평가, 혹은 평가 방법의 오류 등의 문제를 나타낼 수 있습니다.

반면에 SVM 모델은 낮은 성능을 보였으며, 이는 모델이 적절히 설정되거나 조정되지 않았거나 데이터에 적합하지 않을 수 있음을 나타냅니다. SVM의 성능을 향상시키기 위해선, 데이터를 재샘플링하거나, 클래스 가중치를 조정하고, 커널 선택을 실험하며, 특성 선택을 통해 데이터의 노이즈를 줄이고, 하이퍼파라미터를 최적화하는 등의 방법을 사용할 수 있습니다.

In [None]:
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_resampled, y_resampled)
dt_predictions = dt_model.predict(X_test)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)
rf_predictions = rf_model.predict(X_test)

# SVM
svm_model = SVC(random_state=42)
svm_model.fit(X_resampled, y_resampled)
svm_predictions = svm_model.predict(X_test)

# SVM - kernel variations (linear 런타임이 무한대기..)
svm_clf = SVC(C=1.0, kernel='linear', gamma='scale')
svm_clf.fit(X_resampled, y_resampled)
svm_clf_predictions = svm_model.predict(X_test)

# Evaluate the models (use appropriate metrics for your problem)
models = {
    'Decision Tree': dt_predictions,
    'Random Forest': rf_predictions,
    'SVM': svm_predictions,
    'SVM_clf':svm_clf_predictions
}

for model_name, predictions in models.items():
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, pos_label='Attrited Customer')
    recall = recall_score(y_test, predictions, pos_label='Attrited Customer')
    f1 = f1_score(y_test, predictions, pos_label='Attrited Customer')

    # Display evaluation metrics
    print(f"Metrics for {model_name}:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("\n")