# DecisionTree

In [32]:
import LinearRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

In [24]:
data = pd.read_csv('data/Churn_Modelling.csv')
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## 1. 데이터 전처리

### 1-1 오버샘플링

In [25]:
def perform_oversampling(data, label_column):
    # 다수 클래스와 소수 클래스 분리
    majority = data[data[label_column] == 0]
    minority = data[data[label_column] == 1]

    # 소수 클래스 오버샘플링
    minority_upsampled = resample(
        minority,
        replace=True, # 복원추출
        n_samples=len(majority), # 다수클래스 수만큼 소수클래스 늘리기
        random_state=42
    )

    # 병합 및 셔플
    upsampled_data = pd.concat([majority, minority_upsampled]) # 기존다수의 클래스와 소수 클래스를 합치기
    return upsampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

# 사용 예시
upsampled_df = perform_oversampling(data, label_column='Exited')
data = upsampled_df

### 1-2. feature engineering

In [26]:
# 지역 원핫인코딩 실시
data = pd.get_dummies(data, columns=['Geography', 'Gender'], dtype=int)

In [27]:
# 연속형 변수들을 로그 스케일링 실시 ('Age','Balance')
data['LogAge'] = data['Age'].apply(lambda x: np.log1p(x))
data['LogBalance'] = data['Balance'].apply(lambda x: np.log1p(x))

### 1-3 데이터분할 및 스케일링

In [28]:
# 안쓰는 칼럼 제거
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# 로그칼럼들을 적용
X = data.drop(columns=['Exited', 'Balance', 'Age'])
y = data['Exited']

Banking_df = pd.DataFrame(X, columns=X.columns) # 칼럼이름 보존

X_train, X_test, y_train, y_test = train_test_split(Banking_df, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 2. 머신러닝

In [34]:
# DecisionTree 생성
dt_clf = DecisionTreeClassifier(
    random_state=42,
    max_depth=5,
    min_samples_split=100,
    min_samples_leaf=5,

)
dt_clf.fit(X_train, y_train)

# 하이퍼파라미터 최적화
param_grid ={
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid,
    cv=3, # 교차검증 폴드 수
    n_jobs=-1, # 병렬 처리 (가능한 모든 코어 사용)
    scoring='accuracy') # n_jobs는 cpu개수 제한

grid_search.fit(X_train, y_train)

print('best_params_:', grid_search.best_params_)
print('best_score_:', grid_search.best_score_)

best_model = grid_search.best_estimator_

# 테스트셋 예측 및 성능 평가
y_pred = best_model.predict(X_test)
print("Accuracy (Test):", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

best_params_: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
best_score_: 0.7645212456453795
Accuracy (Test): 0.7655367231638418

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.77      1627
           1       0.75      0.78      0.76      1559

    accuracy                           0.77      3186
   macro avg       0.77      0.77      0.77      3186
weighted avg       0.77      0.77      0.77      3186


Confusion Matrix:
 [[1227  400]
 [ 347 1212]]


In [35]:
!pip install imbalanced-learn



In [36]:
import imblearn
print(imblearn.__version__)

0.13.0


In [37]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# SMOTE 오버샘플링 적용
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# 클래스 비율 확인
print("After SMOTE:", dict(zip(*np.unique(y_train_resampled, return_counts=True))))

# 모델 학습
rf_model_smote = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model_smote.fit(X_train_resampled, y_train_resampled)

# 예측 및 평가
y_pred_smote = rf_model_smote.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred_smote))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_smote))

NameError: name 'X_train_scaled' is not defined