In [1]:
# ==============================
# 1. 기본 라이브러리 로드
# ==============================
import pandas as pd  # 데이터프레임 처리 라이브러리
import numpy as np  # 수학 연산 및 배열 처리 라이브러리
import matplotlib.pyplot as plt  # 데이터 시각화 라이브러리
%matplotlib inline  
# ==============================
# 2. 머신러닝 모델 관련 라이브러리
# ==============================

# (1) 앙상블 학습 모델
from sklearn.ensemble import VotingClassifier  # 여러 개의 분류 모델을 조합하는 투표 기반 앙상블 학습

# (2) 개별 분류 모델
from sklearn.linear_model import LogisticRegression  # 로지스틱 회귀 분류기
from sklearn.neighbors import KNeighborsClassifier  # K-최근접 이웃(KNN) 분류기
from sklearn.ensemble import RandomForestClassifier  # 랜덤 포레스트 분류기
from sklearn.ensemble import GradientBoostingClassifier  # 그래디언트 부스팅 트리 분류기
from sklearn.tree import DecisionTreeClassifier  # 결정 트리 분류기
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost 분류기

# (3) 부스팅 기반 분류 모델 (XGBoost & LightGBM)
import xgboost as xgb  # XGBoost 라이브러리 (트리 기반 부스팅 기법)
from xgboost import XGBClassifier  # XGBoost 분류기
from lightgbm import LGBMClassifier  # LightGBM 분류기

# ==============================
# 3. 모델 평가 및 성능 지표
# ==============================

from sklearn.metrics import confusion_matrix  # 혼동 행렬
from sklearn.metrics import precision_score, recall_score  # 정밀도(precision) 및 재현율(recall)
from sklearn.metrics import f1_score, roc_auc_score  # F1-score 및 ROC-AUC 점수
from sklearn.metrics import accuracy_score  # 분류 모델의 정확도(accuracy)

# ==============================
# 4. 하이퍼파라미터 최적화
# ==============================

from hyperopt import hp  # 하이퍼파라미터 탐색 공간 정의
from hyperopt import STATUS_OK  # 최적화 과정에서 상태 반환
from hyperopt import fmin, tpe, Trials  # 최적의 하이퍼파라미터 탐색을 위한 함수들

# ==============================
# 5. 데이터셋 로드 및 데이터 분할
# ==============================

from sklearn.datasets import load_breast_cancer  # 유방암 데이터셋 (예제 데이터셋)
from sklearn.model_selection import train_test_split  # 데이터 분할 (학습/테스트 세트)
from sklearn.model_selection import cross_val_score  # 교차 검증을 통한 모델 성능 평가

# ==============================
# 6. 데이터 전처리
# ==============================

from sklearn.preprocessing import StandardScaler  # 데이터 표준화 (평균 0, 분산 1 변환)
from sklearn.preprocessing import MinMaxScaler  # 데이터 Min-Max 스케일링 (0~1 범위 변환)
from sklearn.impute import SimpleImputer  # 결측값 처리
from imblearn.under_sampling import RandomUnderSampler  # 불균형 데이터의 샘플링을 위한 언더샘플링
from imblearn.over_sampling import RandomOverSampler  # 불균형 데이터의 샘플링을 위한 오버샘플링
from imblearn.over_sampling import SMOTE  # SMOTE 기법 (Synthetic Minority Over-sampling Technique)
from scipy.stats import randint  # 범위 내에서 랜덤값 생성

# ==============================
# 7. 통계 분석 관련 라이브러리
# ==============================

from scipy.stats import shapiro, skew, kurtosis  # 데이터 분포 분석을 위한 함수들

from sklearn.model_selection import GridSearchCV  # 그리드 서치를 통한 하이퍼파라미터 최적화
from sklearn.model_selection import RandomizedSearchCV  # 랜덤 서치를 통한 하이퍼파라미터 최적화

import seaborn as sns  # 고급 데이터 시각화 라이브러리

# ==============================
# 8. 기타 유틸리티
# ==============================

import time  # 코드 실행 시간 측정
import warnings  # 경고 메시지 무시 설정
warnings.filterwarnings('ignore') 

## 이상치 미 제거 +불균형 미 처리

In [2]:
cust_train_df = pd.read_csv('./santander-customer-satisfaction/train.csv', encoding='latin-1')
cust_test_df = pd.read_csv('./santander-customer-satisfaction/test.csv', encoding='latin-1')

cust_train_copy_df = cust_train_df.copy()
cust_test_copy_df = cust_test_df.copy()


cust_train_copy_df.drop('ID', axis=1, inplace=True)
if 'ID' in cust_test_copy_df.columns:
    ids = cust_test_copy_df['ID']
    cust_test_copy_df.drop('ID', axis=1, inplace=True)
else:
    ids = cust_test_copy_df.index

X_features = cust_train_copy_df.drop('TARGET', axis=1)
y_labels = cust_train_copy_df['TARGET']

X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_labels,
    test_size=0.3,
    random_state=156,
    stratify=y_labels
)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=0
)


xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=156,
    eval_metric='auc',
    tree_method='gpu_hist',
    early_stopping_rounds=100,
    gpu_id=0
)

xgb_clf.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=False
)


roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC Score (이상치 미제거 + 불균형 미처리): {roc_score:.4f}")


test_preds = xgb_clf.predict_proba(cust_test_copy_df)[:, 1]

submission = pd.DataFrame({
    'ID': ids,
    'TARGET': test_preds
})

submission.to_csv('no_outlier_nosmote_submission.csv', index=False)


ROC AUC Score (이상치 미제거 + 불균형 미처리): 0.8485


##  이상치 미제거 + 언더샘플링 

In [3]:
cust_train_df = pd.read_csv('./santander-customer-satisfaction/train.csv', encoding='latin-1')
cust_test_df = pd.read_csv('./santander-customer-satisfaction/test.csv', encoding='latin-1')

cust_train_copy_df = cust_train_df.copy()
cust_test_copy_df = cust_test_df.copy()

cust_train_copy_df.drop('ID', axis=1, inplace=True)
if 'ID' in cust_test_copy_df.columns:
    ids = cust_test_copy_df['ID']
    cust_test_copy_df.drop('ID', axis=1, inplace=True)
else:
    ids = cust_test_copy_df.index

X_features = cust_train_copy_df.drop('TARGET', axis=1)
y_target = cust_train_copy_df['TARGET']

undersample = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_features, y_target)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.3,
    random_state=156,
    stratify=y_resampled
)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=0
)

xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=156,
    eval_metric='auc',
    tree_method='gpu_hist',
    early_stopping_rounds=100,
    gpu_id=0
)

xgb_clf.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=False
)

roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC Score (이상치 미제거 + 언더샘플링): {roc_score:.4f}")

test_preds = xgb_clf.predict_proba(cust_test_copy_df)[:, 1]

submission = pd.DataFrame({
    'ID': ids,
    'TARGET': test_preds
})

submission.to_csv('no_outlier_with_undersample_submission.csv', index=False)


ROC AUC Score (이상치 미제거 + 언더샘플링): 0.8362


## 이상치 미제거 + SMOTE 

In [4]:

cust_train_df = pd.read_csv('./santander-customer-satisfaction/train.csv', encoding='latin-1')
cust_test_df = pd.read_csv('./santander-customer-satisfaction/test.csv', encoding='latin-1')

cust_train_copy_df = cust_train_df.copy()
cust_test_copy_df = cust_test_df.copy()

cust_train_copy_df.drop('ID', axis=1, inplace=True)
if 'ID' in cust_test_copy_df.columns:
    ids = cust_test_copy_df['ID']
    cust_test_copy_df.drop('ID', axis=1, inplace=True)
else:
    ids = cust_test_copy_df.index

X_features = cust_train_copy_df.drop('TARGET', axis=1)
y_labels = cust_train_copy_df['TARGET']

X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_labels,
    test_size=0.3,
    random_state=156,
    stratify=y_labels
)

smote = SMOTE(random_state=156)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=156,
    eval_metric='auc',
    tree_method='gpu_hist',
    early_stopping_rounds=100,
    gpu_id=0
)

xgb_clf.fit(
    X_train_over, y_train_over,
    eval_set=[(X_test, y_test)],
    verbose=False
)

roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC Score (이상치 미제거 + SMOTE): {roc_score:.4f}")

test_preds = xgb_clf.predict_proba(cust_test_copy_df)[:, 1]

submission = pd.DataFrame({
    'ID': ids,
    'TARGET': test_preds
})

submission.to_csv('no_outlier_with_smote_submission.csv', index=False)


ROC AUC Score (이상치 미제거 + SMOTE): 0.8250


## 이상치 제거 + 불균형 미처리

In [5]:
cust_train_df = pd.read_csv('./santander-customer-satisfaction/train.csv', encoding='latin-1')
cust_test_df = pd.read_csv('./santander-customer-satisfaction/test.csv', encoding='latin-1')

cust_train_copy_df = cust_train_df.copy()
cust_test_copy_df = cust_test_df.copy()


numeric_cols = cust_train_copy_df.select_dtypes(include=['int64', 'float64']).columns.drop('TARGET')

for col in numeric_cols:
    Q1 = cust_train_copy_df[col].quantile(0.25)
    Q3 = cust_train_copy_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median = cust_train_copy_df[col].median()
    
    cust_train_copy_df.loc[
        (cust_train_copy_df[col] < lower_bound) | (cust_train_copy_df[col] > upper_bound),
        col
    ] = median


cust_train_copy_df.drop('ID', axis=1, inplace=True)
if 'ID' in cust_test_copy_df.columns:
    ids = cust_test_copy_df['ID']
    cust_test_copy_df.drop('ID', axis=1, inplace=True)
else:
    ids = cust_test_copy_df.index


X_features = cust_train_copy_df.drop('TARGET', axis=1)
y_labels = cust_train_copy_df['TARGET']

X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_labels,
    test_size=0.3,
    stratify=y_labels,
    random_state=156
)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=0
)


xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=156,
    eval_metric='auc',
    tree_method='gpu_hist',
    early_stopping_rounds=100,
    gpu_id=0
)

xgb_clf.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=False
)


roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC Score: {roc_score:.4f}")


test_preds = xgb_clf.predict_proba(cust_test_copy_df)[:, 1]

submission = pd.DataFrame({
    'ID': ids,
    'TARGET': test_preds
})

submission.to_csv('remove_outliers_with_median_submission.csv', index=False)


ROC AUC Score: 0.8297


## 이상치 제거 + 언더샘플링

In [6]:
cust_train_df = pd.read_csv('./santander-customer-satisfaction/train.csv', encoding='latin-1')
cust_test_df = pd.read_csv('./santander-customer-satisfaction/test.csv', encoding='latin-1')

cust_train_copy_df = cust_train_df.copy()
cust_test_copy_df = cust_test_df.copy()

numeric_cols = cust_train_copy_df.select_dtypes(include=['int64', 'float64']).columns.drop('TARGET')

for col in numeric_cols:
    Q1 = cust_train_copy_df[col].quantile(0.25)
    Q3 = cust_train_copy_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median = cust_train_copy_df[col].median()
    
    cust_train_copy_df.loc[
        (cust_train_copy_df[col] < lower_bound) | (cust_train_copy_df[col] > upper_bound),
        col
    ] = median


cust_train_copy_df.drop('ID', axis=1, inplace=True)
if 'ID' in cust_test_copy_df.columns:
    ids = cust_test_copy_df['ID']
    cust_test_copy_df.drop('ID', axis=1, inplace=True)
else:
    ids = cust_test_copy_df.index


X_features = cust_train_copy_df.drop('TARGET', axis=1)
y_target = cust_train_copy_df['TARGET']

undersample = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_features, y_target)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.3,
    random_state=156,
    stratify=y_resampled
)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=0
)


xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=156,
    eval_metric='auc',
    tree_method='gpu_hist',
    early_stopping_rounds=100,
    gpu_id=0
)

xgb_clf.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=False
)


roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC Score (이상치 제거 + 언더샘플링): {roc_score:.4f}")


test_preds = xgb_clf.predict_proba(cust_test_copy_df)[:, 1]

submission = pd.DataFrame({
    'ID': ids,
    'TARGET': test_preds
})

submission.to_csv('remove_outliers_with_undersample_submission.csv', index=False)


ROC AUC Score (이상치 제거 + 언더샘플링): 0.8198


## 이상치 제거 + SMOTE

In [7]:

cust_train_df = pd.read_csv('./santander-customer-satisfaction/train.csv', encoding='latin-1')
cust_test_df = pd.read_csv('./santander-customer-satisfaction/test.csv', encoding='latin-1')

cust_train_copy_df = cust_train_df.copy()
cust_test_copy_df = cust_test_df.copy()

numeric_cols = cust_train_copy_df.select_dtypes(include=['int64', 'float64']).columns.drop('TARGET')

for col in numeric_cols:
    Q1 = cust_train_copy_df[col].quantile(0.25)
    Q3 = cust_train_copy_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median = cust_train_copy_df[col].median()
    
    cust_train_copy_df.loc[
        (cust_train_copy_df[col] < lower_bound) | (cust_train_copy_df[col] > upper_bound),
        col
    ] = median


cust_train_copy_df.drop('ID', axis=1, inplace=True)
if 'ID' in cust_test_copy_df.columns:
    ids = cust_test_copy_df['ID']
    cust_test_copy_df.drop('ID', axis=1, inplace=True)
else:
    ids = cust_test_copy_df.index


X_features = cust_train_copy_df.drop('TARGET', axis=1)
y_target = cust_train_copy_df['TARGET']


smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_features, y_target)


X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.3,
    random_state=156,
    stratify=y_resampled
)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=0
)

xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=156,
    eval_metric='auc',
    tree_method='gpu_hist',
    early_stopping_rounds=100,
    gpu_id=0
)

xgb_clf.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=False
)

roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f"ROC AUC Score (이상치 제거 + SMOTE): {roc_score:.4f}")

test_preds = xgb_clf.predict_proba(cust_test_copy_df)[:, 1]

submission = pd.DataFrame({
    'ID': ids,
    'TARGET': test_preds
})

submission.to_csv('remove_outliers_with_smote_submission.csv', index=False)


ROC AUC Score (이상치 제거 + SMOTE): 0.9756
