<a href="https://colab.research.google.com/github/sunyoungmoon012/first-repository/blob/master/%EC%B1%95%ED%84%B0%EB%B6%84%EB%A5%984.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 필수 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import warnings

# Suppress every warning so library notices do not clutter the cell output.
warnings.simplefilter('ignore')

# 1. Load the training CSV; the path is relative to the current working directory.
file_path = './train_santander.csv'
cust_df = pd.read_csv(file_path, encoding='latin-1')

In [None]:
# 2. Basic inspection of the loaded frame.
# Original author's note: the shape printed here was (76020, 371).
print(f'Dataset Shape: {cust_df.shape}')
# Show the first three rows.
print(cust_df.iloc[:3])
# Column dtypes and non-null counts.
cust_df.info()

In [None]:
# 3. Distribution of the TARGET label.
target_col = cust_df['TARGET']
print("\n[Target 값 분포]")
print(target_col.value_counts())

# Rows with TARGET == 1 are counted as unsatisfied customers.
unsatisfied_cnt = (target_col == 1).sum()
total_cnt = target_col.count()
unsatisfied_pct = unsatisfied_cnt / total_cnt * 100
print('불만족 고객 비율: {:.2f}%'.format(unsatisfied_pct))

In [None]:
# 4. Summary statistics for the columns of cust_df.
print("\n[데이터 describe()]")
summary_stats = cust_df.describe()
print(summary_stats)

In [None]:
# 5. Look for outliers in var3 by listing its ten most frequent values.
print("\n[var3 컬럼 값 분포 (상위 10개)]")
var3_counts = cust_df['var3'].value_counts()
print(var3_counts.head(10))

In [None]:
# 6. Replace the outlier value -999999 in var3 with 2.
# NOTE(review): 2 is presumably the most frequent var3 value shown by the
# previous cell — confirm against that output.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected Series: the in-place form mutates
# an intermediate object, is deprecated, and leaves cust_df unchanged when
# pandas copy-on-write is active.
cust_df['var3'] = cust_df['var3'].replace(-999999, 2)
print("\n[var3 이상값 처리 후 분포 (상위 10개)]")
print(cust_df['var3'].value_counts()[:10])

In [None]:
# Remove the ID column from cust_df in place.
cust_df.drop(columns=['ID'], inplace=True)

In [None]:
# 7. Separate features from the label.
# Every column except the last is a feature; the last column is taken as the
# label (TARGET, per the original author's comment).
X_features, y_labels = cust_df.iloc[:, :-1], cust_df.iloc[:, -1]

print("\n[피처 데이터 shape]:", X_features.shape)

In [None]:
# 8. Split into training and test sets (80% / 20%, fixed seed).
# train_test_split is imported here because the only other import of it sits
# in a later cell; without this line a top-to-bottom run fails with NameError.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels,
                                                    test_size=0.2, random_state=0)

print("\n[학습 세트 Shape]:", X_train.shape, ", 테스트 세트 Shape:", X_test.shape)

In [None]:
# 9. Compare the label proportions in the train and test splits to see
# whether the classes are imbalanced.
for heading, labels in (("\n[학습 세트 레이블 분포 비율]", y_train),
                        ("\n[테스트 세트 레이블 분포 비율]", y_test)):
    print(heading)
    print(labels.value_counts() / labels.count())

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 1. Carve a validation set (30%) out of the training data; it is used for
# early stopping in the cells that follow.
validation_split = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
X_tr, X_val, y_tr, y_val = validation_split

In [None]:
# 2. Build the baseline XGBoost classifier.
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'random_state': 156,
}
xgb_clf = XGBClassifier(**xgb_params)

In [None]:
# 3. Train the model with early stopping.
# early_stopping_rounds and eval_metric are configured on the estimator:
# passing them to fit() was deprecated in XGBoost 1.6 and raises TypeError
# from 2.0 on.
xgb_clf.set_params(early_stopping_rounds=100, eval_metric="auc")
# When eval_set has several entries, XGBoost uses the last one (the
# validation split here) for the early-stopping decision.
xgb_clf.fit(X_tr, y_tr,
            eval_set=[(X_tr, y_tr), (X_val, y_val)],
            verbose=False)

In [None]:
# 4. Predict on the test set and evaluate with ROC-AUC.
test_proba = xgb_clf.predict_proba(X_test)
y_pred_proba = test_proba[:, 1]  # probability column for class 1
xgb_roc_score = roc_auc_score(y_test, y_pred_proba)

print('\nXGBoost ROC-AUC: {:.4f}'.format(xgb_roc_score))

In [None]:
from hyperopt import hp, tpe, Trials, fmin
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# 1. HyperOpt search space for XGBoost.
# quniform draws are returned as floats, so the integer-valued parameters
# are cast with int() where they are consumed.
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 15, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 0.95),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
)

In [None]:
# 2. Objective function (3-fold cross-validation)
def objective_func(search_space):
    """Return the negated mean 3-fold ROC-AUC of XGBoost for one parameter draw.

    The whole function lives in a single cell: a cell break between the model
    construction and the cross-validation loop would leave a function that
    returns None followed by an indented fragment that cannot run.

    Args:
        search_space: mapping with the keys 'max_depth', 'min_child_weight',
            'colsample_bytree' and 'learning_rate', as sampled by HyperOpt.

    Returns:
        -1 * mean ROC-AUC over the folds (HyperOpt minimizes its objective).
    """
    # early_stopping_rounds / eval_metric belong to the estimator; XGBoost
    # 2.0+ no longer accepts them as fit() keyword arguments.
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        colsample_bytree=search_space['colsample_bytree'],
        learning_rate=search_space['learning_rate'],
        early_stopping_rounds=30,
        eval_metric="auc",
        random_state=156,
    )

    # Cross-validation over the training data only; the test set is untouched.
    kf = KFold(n_splits=3, shuffle=True, random_state=156)
    auc_scores = []

    for train_index, val_index in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # The last eval_set entry (the fold's validation part) drives early stopping.
        xgb_clf.fit(X_tr, y_tr,
                    eval_set=[(X_tr, y_tr), (X_val, y_val)],
                    verbose=False)

        preds = xgb_clf.predict_proba(X_val)[:, 1]
        auc_scores.append(roc_auc_score(y_val, preds))

    # HyperOpt minimizes, so return the negative mean AUC.
    return -1 * np.mean(auc_scores)

In [None]:
# 3. Trials object that records every evaluation of the search.
trials = Trials()

# 4. Run the hyperparameter search: 50 TPE evaluations with a seeded generator.
search_rng = np.random.default_rng(156)
best = fmin(objective_func,
            xgb_search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            rstate=search_rng)

print('\nBest Hyperparameters:', best)

In [None]:
# 4. Train the final model with the best parameters and evaluate on the test set.
# early_stopping_rounds / eval_metric are set on the estimator because
# XGBoost 2.0+ rejects them as fit() keyword arguments.
best_xgb = XGBClassifier(
    n_estimators=500,
    max_depth=int(best['max_depth']),
    min_child_weight=int(best['min_child_weight']),
    colsample_bytree=best['colsample_bytree'],
    learning_rate=best['learning_rate'],
    early_stopping_rounds=100,
    eval_metric='auc',
    random_state=156
)

# Early stopping monitors the validation split (X_val, y_val), not the test
# set: choosing the number of boosting rounds on X_test would leak test data
# into the model and inflate the reported score. This matches how the
# LightGBM final model in this notebook is trained.
best_xgb.fit(X_tr, y_tr,
             eval_set=[(X_tr, y_tr), (X_val, y_val)],
             verbose=False)

final_pred_proba = best_xgb.predict_proba(X_test)[:, 1]
final_roc_auc = roc_auc_score(y_test, final_pred_proba)

print('\n최종 테스트 ROC-AUC: {:.4f}'.format(final_roc_auc))

In [None]:
# Same final model, but with the continuous HyperOpt values rounded to five
# decimals instead of being used verbatim.
# early_stopping_rounds / eval_metric are set on the estimator because
# XGBoost 2.0+ rejects them as fit() keyword arguments.
best_xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=round(best['learning_rate'], 5),
    max_depth=int(best['max_depth']),
    min_child_weight=int(best['min_child_weight']),
    colsample_bytree=round(best['colsample_bytree'], 5),
    early_stopping_rounds=100,
    eval_metric='auc',
    random_state=156
)

# Train + evaluate. Early stopping monitors the validation split rather than
# the test set, so the test ROC-AUC below is not influenced by test data.
best_xgb.fit(X_tr, y_tr,
             eval_set=[(X_tr, y_tr), (X_val, y_val)],
             verbose=False)

final_pred_proba = best_xgb.predict_proba(X_test)[:, 1]
final_roc_auc = roc_auc_score(y_test, final_pred_proba)

print('\n최종 HyperOpt 튜닝 후 XGBoost 테스트 ROC-AUC: {:.4f}'.format(final_roc_auc))

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Draw the 20 most important features of best_xgb on a 10x8 inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(best_xgb, ax=ax, max_num_features=20, height=0.4)
plt.show()

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# LightGBM baseline (n_estimators=500, same as the XGBoost baseline).
# Early stopping and log silencing go through callbacks: LightGBM 4.0 removed
# the early_stopping_rounds and verbose keyword arguments from fit().
from lightgbm import early_stopping, log_evaluation

lgbm_clf = LGBMClassifier(n_estimators=500)

# Evaluation sets: the training part and the validation part.
eval_set = [(X_tr, y_tr), (X_val, y_val)]

# Train with early stopping (100 rounds); log_evaluation(period=0) disables
# the per-iteration metric output.
lgbm_clf.fit(X_tr, y_tr,
             eval_metric='auc',
             eval_set=eval_set,
             callbacks=[early_stopping(stopping_rounds=100, verbose=False),
                        log_evaluation(period=0)])

# Evaluate on the test data.
lgbm_pred_proba = lgbm_clf.predict_proba(X_test)[:, 1]
lgbm_roc_score = roc_auc_score(y_test, lgbm_pred_proba)

print('\nLightGBM 테스트 ROC-AUC: {:.4f}'.format(lgbm_roc_score))

In [None]:
from lightgbm import LGBMClassifier
from hyperopt import hp, tpe, Trials, fmin
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 1. HyperOpt search space for LightGBM.
# quniform draws are floats; the consumer casts the integer-valued ones.
lgbm_search_space = dict(
    num_leaves=hp.quniform('num_leaves', 32, 64, 1),
    max_depth=hp.quniform('max_depth', 100, 160, 1),
    min_child_samples=hp.quniform('min_child_samples', 60, 100, 1),
    subsample=hp.uniform('subsample', 0.7, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
)

In [None]:
# 2. Objective function
def objective_func(search_space):
    """Return the negated mean 3-fold ROC-AUC of LightGBM for one parameter draw.

    Args:
        search_space: mapping with the keys 'num_leaves', 'max_depth',
            'min_child_samples', 'subsample' and 'learning_rate', as sampled
            by HyperOpt.

    Returns:
        The negative mean ROC-AUC over the folds (HyperOpt minimizes).
    """
    # Imported locally so this cell does not depend on another cell's import.
    # LightGBM 4.0 removed early_stopping_rounds / verbose from fit(); the
    # callbacks below are the replacement.
    from lightgbm import early_stopping, log_evaluation

    # NOTE(review): LightGBM only applies subsample when subsample_freq > 0;
    # it is left at its default here, so subsample may have no effect — confirm.
    lgbm_clf = LGBMClassifier(
        n_estimators=100,
        num_leaves=int(search_space['num_leaves']),
        max_depth=int(search_space['max_depth']),
        min_child_samples=int(search_space['min_child_samples']),
        subsample=search_space['subsample'],
        learning_rate=search_space['learning_rate']
    )

    kf = KFold(n_splits=3, shuffle=True, random_state=156)
    roc_auc_list = []

    for train_index, val_index in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Stop after 30 rounds without improvement; period=0 silences logging.
        lgbm_clf.fit(X_tr, y_tr,
                     eval_metric='auc',
                     eval_set=[(X_tr, y_tr), (X_val, y_val)],
                     callbacks=[early_stopping(stopping_rounds=30, verbose=False),
                                log_evaluation(period=0)])

        pred_proba = lgbm_clf.predict_proba(X_val)[:, 1]
        roc_auc_list.append(roc_auc_score(y_val, pred_proba))

    # Return the negative mean ROC-AUC.
    return -np.mean(roc_auc_list)

# 3. Run the search: 50 TPE evaluations, results recorded in a fresh Trials
# object, generator seeded with 30.
trials = Trials()
search_rng = np.random.default_rng(30)
best = fmin(objective_func,
            lgbm_search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            rstate=search_rng)

print('\nBest Hyperparameters for LightGBM:', best)

In [None]:
# Build the final LightGBM model from the best parameters found by HyperOpt.
# Early stopping and log silencing go through callbacks: LightGBM 4.0 removed
# the early_stopping_rounds and verbose keyword arguments from fit().
from lightgbm import early_stopping, log_evaluation

lgbm_clf = LGBMClassifier(
    n_estimators=500,
    num_leaves=int(best['num_leaves']),
    max_depth=int(best['max_depth']),
    min_child_samples=int(best['min_child_samples']),
    subsample=round(best['subsample'], 5),
    learning_rate=round(best['learning_rate'], 5),
    random_state=156
)

# Train with early stopping on the validation split (100 rounds);
# log_evaluation(period=0) disables the per-iteration metric output.
lgbm_clf.fit(X_tr, y_tr,
             eval_metric='auc',
             eval_set=[(X_tr, y_tr), (X_val, y_val)],
             callbacks=[early_stopping(stopping_rounds=100, verbose=False),
                        log_evaluation(period=0)])

# Evaluate on the test set.
lgbm_pred_proba = lgbm_clf.predict_proba(X_test)[:, 1]
lgbm_roc_score = roc_auc_score(y_test, lgbm_pred_proba)

print('\n최종 LightGBM 테스트 ROC-AUC: {:.4f}'.format(lgbm_roc_score))