# 앙상블 모델 - Optuna를 활용한 하이퍼파라미터 최적화

## 학습 목표
1. **Optuna**를 사용하여 앙상블 모델의 하이퍼파라미터를 자동으로 최적화할 수 있다
2. **RandomForest, GradientBoosting, XGBoost, LightGBM** 모델을 Optuna로 튜닝할 수 있다
3. **Pruning**을 활용하여 비효율적인 trial을 조기 종료할 수 있다
4. 최적화된 앙상블 모델들의 성능을 비교 분석할 수 있다

## 주요 내용
- California Housing 데이터셋 활용 (회귀)
- RandomForest, GradientBoosting, XGBoost, LightGBM 최적화
- Optuna Pruning 기능 활용
- 최적화 결과 비교 및 시각화

## 1. 환경 설정 및 데이터 로드

In [None]:
# 필요한 라이브러리 설치 (필요시)
# !pip install optuna xgboost lightgbm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Optuna
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Scikit-learn
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 앙상블 모델
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

# XGBoost, LightGBM
try:
    from xgboost import XGBRegressor
    print("XGBoost loaded successfully")
except ImportError:
    print("XGBoost not installed. Run: pip install xgboost")

try:
    from lightgbm import LGBMRegressor
    print("LightGBM loaded successfully")
except ImportError:
    print("LightGBM not installed. Run: pip install lightgbm")

print(f"\nOptuna version: {optuna.__version__}")

In [None]:
# Load the California Housing dataset (regression task)
housing = fetch_california_housing()

# Features go into a DataFrame labelled with the dataset's feature names;
# the target is used exactly as the loader returns it.
X, y = pd.DataFrame(housing.data, columns=housing.feature_names), housing.target

print(f"데이터 shape: {X.shape}", f"타겟 shape: {y.shape}", sep="\n")
print(f"\nFeature names: {housing.feature_names}")
X.head()

In [None]:
# Hold out 20% of the rows as the test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features with statistics learned from the training split only
# (tree-based models do not strictly need this; it is done for consistency).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_label, split_data in (("학습 데이터", X_train_scaled), ("테스트 데이터", X_test_scaled)):
    print(f"{split_label}: {split_data.shape}")

## 2. 평가 함수 정의

In [None]:
# DataFrame that accumulates one row of test metrics per evaluated model
results_df = pd.DataFrame(columns=['Model', 'MSE', 'RMSE', 'MAE', 'R2', 'Best_Params'])

def evaluate_model(model_name, y_true, y_pred, best_params=None):
    """Compute regression metrics, record them in ``results_df`` and print them.

    Args:
        model_name: Label stored in the 'Model' column and printed as a header.
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        best_params: Optional hyperparameters. Stored as ``str(best_params)``,
            or '-' when the argument is falsy (``None`` or empty).

    Returns:
        The mean squared error between ``y_true`` and ``y_pred``.
    """
    global results_df

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    new_row = pd.DataFrame({
        'Model': [model_name],
        'MSE': [mse],
        'RMSE': [rmse],
        'MAE': [mae],
        'R2': [r2],
        'Best_Params': [str(best_params) if best_params else '-']
    })

    # Concatenating onto an empty frame is deprecated in recent pandas
    # releases (the empty frame's object dtype will start to influence the
    # result dtype). Assign the first row directly so the metric columns
    # stay numeric, and only concatenate once there is existing data.
    if results_df.empty:
        results_df = new_row
    else:
        results_df = pd.concat([results_df, new_row], ignore_index=True)

    print(f"\n{model_name}:")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R2: {r2:.4f}")

    return mse

## 3. RandomForest + Optuna 최적화

In [None]:
def objective_rf(trial):
    """Optuna objective for RandomForest: mean 3-fold cross-validated MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean MSE over the 3 cross-validation folds (lower is better).
    """
    # Keyword arguments are evaluated top to bottom, so the suggest calls
    # happen in the order written here.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    )

    forest = RandomForestRegressor(random_state=42, n_jobs=-1, **search_space)

    # The scorer yields negated MSE per fold; flip the sign of the mean
    neg_mse_per_fold = cross_val_score(
        forest, X_train_scaled, y_train,
        scoring='neg_mean_squared_error', cv=3, n_jobs=-1,
    )
    return -neg_mse_per_fold.mean()

# Create and run the Optuna study for RandomForest
print("=" * 50, "RandomForest 최적화 시작...", "=" * 50, sep="\n")

# Seeded TPE sampler so the sequence of suggested parameters is reproducible
sampler = TPESampler(seed=42)
study_rf = optuna.create_study(sampler=sampler, direction='minimize')
study_rf.optimize(objective_rf, n_trials=50, show_progress_bar=True)

print(f"\n최적 파라미터: {study_rf.best_params}")
print(f"최적 MSE (CV): {study_rf.best_value:.4f}")

In [None]:
# Refit the best RandomForest configuration on the training split and
# evaluate it on the held-out test split.
best_rf = RandomForestRegressor(random_state=42, n_jobs=-1, **study_rf.best_params).fit(
    X_train_scaled, y_train
)
pred_rf = best_rf.predict(X_test_scaled)

evaluate_model('RandomForest (Optuna)', y_test, pred_rf, study_rf.best_params)

## 4. GradientBoosting + Optuna 최적화

In [None]:
def objective_gb(trial):
    """Optuna objective for GradientBoosting: mean 3-fold cross-validated MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean MSE over the 3 cross-validation folds (lower is better).
    """
    # Sample each hyperparameter in a fixed order
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)

    booster = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        random_state=42,
    )

    # The scorer yields negated MSE per fold; flip the sign of the mean
    neg_mse_per_fold = cross_val_score(
        booster, X_train_scaled, y_train,
        scoring='neg_mean_squared_error', cv=3,
    )
    return -neg_mse_per_fold.mean()

# Create and run the Optuna study for GradientBoosting
print("=" * 50, "GradientBoosting 최적화 시작...", "=" * 50, sep="\n")

# Seeded TPE sampler so the sequence of suggested parameters is reproducible
sampler = TPESampler(seed=42)
study_gb = optuna.create_study(sampler=sampler, direction='minimize')
study_gb.optimize(objective_gb, n_trials=50, show_progress_bar=True)

print(f"\n최적 파라미터: {study_gb.best_params}")
print(f"최적 MSE (CV): {study_gb.best_value:.4f}")

In [None]:
# Refit the best GradientBoosting configuration on the training split and
# evaluate it on the held-out test split.
best_gb = GradientBoostingRegressor(random_state=42, **study_gb.best_params).fit(
    X_train_scaled, y_train
)
pred_gb = best_gb.predict(X_test_scaled)

evaluate_model('GradientBoosting (Optuna)', y_test, pred_gb, study_gb.best_params)

## 5. XGBoost + Optuna 최적화 (with Pruning)

In [None]:
def objective_xgb(trial):
    """Optuna objective for XGBoost with per-fold reporting so trials can be pruned.

    The 3 cross-validation folds are evaluated one at a time. After each
    fold the running mean MSE is reported to the trial, and (except after
    the last fold, where no work would be saved) the pruner is asked
    whether the trial should stop early.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate values.

    Returns:
        The mean MSE over the 3 cross-validation folds (lower is better).

    Raises:
        optuna.TrialPruned: When the pruner decides the trial is unpromising.
    """
    # Local import: KFold is not part of the notebook's top-level imports.
    from sklearn.model_selection import KFold

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10, log=True),
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }

    features = np.asarray(X_train_scaled)
    targets = np.asarray(y_train)

    # An unshuffled 3-fold KFold matches what cross_val_score(cv=3) uses for
    # a regressor, so a completed trial yields the same mean MSE as before.
    n_splits = 3
    fold_mse = []
    for step, (train_idx, valid_idx) in enumerate(KFold(n_splits=n_splits).split(features)):
        # Fresh estimator per fold so no state leaks between folds
        model = XGBRegressor(**params)
        model.fit(features[train_idx], targets[train_idx])
        fold_pred = model.predict(features[valid_idx])
        fold_mse.append(mean_squared_error(targets[valid_idx], fold_pred))

        # Report the running mean so intermediate values stay on the same
        # scale as the final objective value.
        trial.report(float(np.mean(fold_mse)), step)

        # Without report()/should_prune() the pruner attached to the study
        # has nothing to act on and never stops a trial.
        if step < n_splits - 1 and trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_mse))

# Create and run the Optuna study (with pruning)
print("=" * 50)
print("XGBoost 최적화 시작 (with Pruning)...")
print("=" * 50)

sampler = TPESampler(seed=42)
# Each trial reports only 3 steps (one per fold). A warm-up of 10 steps would
# therefore disable pruning entirely, so pruning is allowed from step 0.
pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=0)

study_xgb = optuna.create_study(direction='minimize', sampler=sampler, pruner=pruner)
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

print(f"\n최적 파라미터: {study_xgb.best_params}")
print(f"최적 MSE (CV): {study_xgb.best_value:.4f}")

In [None]:
# Refit the best XGBoost configuration on the training split and
# evaluate it on the held-out test split.
best_xgb = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0, **study_xgb.best_params).fit(
    X_train_scaled, y_train
)
pred_xgb = best_xgb.predict(X_test_scaled)

evaluate_model('XGBoost (Optuna)', y_test, pred_xgb, study_xgb.best_params)

## 6. LightGBM + Optuna 최적화

In [None]:
def objective_lgbm(trial):
    """Optuna objective for LightGBM: mean 3-fold cross-validated MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean MSE over the 3 cross-validation folds (lower is better).
    """
    # Hyperparameters sampled by Optuna (evaluated in the order listed)
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10, log=True),
    }
    # Settings that stay constant across all trials
    fixed = {'random_state': 42, 'n_jobs': -1, 'verbosity': -1}

    regressor = LGBMRegressor(**tuned, **fixed)

    # The scorer yields negated MSE per fold; flip the sign of the mean
    neg_mse_per_fold = cross_val_score(
        regressor, X_train_scaled, y_train,
        scoring='neg_mean_squared_error', cv=3,
    )
    return -neg_mse_per_fold.mean()

# Create and run the Optuna study for LightGBM
print("=" * 50, "LightGBM 최적화 시작...", "=" * 50, sep="\n")

# Seeded TPE sampler so the sequence of suggested parameters is reproducible
sampler = TPESampler(seed=42)
study_lgbm = optuna.create_study(sampler=sampler, direction='minimize')
study_lgbm.optimize(objective_lgbm, n_trials=50, show_progress_bar=True)

print(f"\n최적 파라미터: {study_lgbm.best_params}")
print(f"최적 MSE (CV): {study_lgbm.best_value:.4f}")

In [None]:
# Refit the best LightGBM configuration on the training split and
# evaluate it on the held-out test split.
best_lgbm = LGBMRegressor(random_state=42, n_jobs=-1, verbosity=-1, **study_lgbm.best_params).fit(
    X_train_scaled, y_train
)
pred_lgbm = best_lgbm.predict(X_test_scaled)

evaluate_model('LightGBM (Optuna)', y_test, pred_lgbm, study_lgbm.best_params)

## 7. 모델 성능 비교

In [None]:
# Print all collected results, best (lowest MSE) first
print("=" * 70, "앙상블 모델 성능 비교 결과", "=" * 70, sep="\n")

# Drop the verbose Best_Params column, rank by MSE and renumber the rows
results_summary = (
    results_df[['Model', 'MSE', 'RMSE', 'MAE', 'R2']]
    .sort_values('MSE')
    .reset_index(drop=True)
)
display(results_summary)

In [None]:
# Visual comparison of the models: one horizontal bar chart per metric
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
models = results_summary['Model'].tolist()

# (results column, x-axis label, panel title), in row-major panel order
metric_panels = [
    ('MSE', 'MSE (낮을수록 좋음)', '모델별 MSE 비교'),
    ('RMSE', 'RMSE (낮을수록 좋음)', '모델별 RMSE 비교'),
    ('MAE', 'MAE (낮을수록 좋음)', '모델별 MAE 비교'),
    ('R2', 'R2 Score (높을수록 좋음)', '모델별 R2 Score 비교'),
]

for panel, (metric, x_label, panel_title) in zip(axes.flatten(), metric_panels):
    metric_values = results_summary[metric]
    panel.barh(models, metric_values, color=colors)
    panel.set_xlabel(x_label)
    panel.set_title(panel_title)
    # First row of results_summary is drawn at the top
    panel.invert_yaxis()
    # Write the numeric value just to the right of each bar
    for row, metric_value in enumerate(metric_values):
        panel.text(metric_value + 0.01, row, f'{metric_value:.4f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

## 8. Optuna 최적화 과정 시각화

In [None]:
# Compare the optimization history of every study
from itertools import accumulate

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

studies = [
    (study_rf, 'RandomForest'),
    (study_gb, 'GradientBoosting'),
    (study_xgb, 'XGBoost'),
    (study_lgbm, 'LightGBM')
]

for ax, (study, name) in zip(axes.flatten(), studies):
    # Only finished trials carry a final objective value; pruned or failed
    # trials are left out so every plotted point is a full CV score.
    completed = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    trial_numbers = [t.number for t in completed]
    trials = [t.value for t in completed]
    # Running minimum in a single pass (instead of re-scanning every prefix)
    best_values = list(accumulate(trials, min))

    # Plot against the real trial number so the x-axis stays aligned even
    # when some trials were skipped.
    ax.plot(trial_numbers, trials, 'o-', alpha=0.5, label='Trial Value', markersize=3)
    ax.plot(trial_numbers, best_values, 'r-', linewidth=2, label='Best Value')
    ax.set_xlabel('Trial')
    ax.set_ylabel('MSE')
    ax.set_title(f'{name} 최적화 히스토리')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Hyperparameter importances for the XGBoost study
fig = optuna.visualization.plot_param_importances(study_xgb).update_layout(
    title='XGBoost 파라미터 중요도'
)
fig.show()

In [None]:
# Hyperparameter importances for the LightGBM study
fig = optuna.visualization.plot_param_importances(study_lgbm).update_layout(
    title='LightGBM 파라미터 중요도'
)
fig.show()

## 9. Feature Importance 비교

In [None]:
# Compare the feature importances of the four tuned models
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

models_for_importance = [
    (best_rf, 'RandomForest'),
    (best_gb, 'GradientBoosting'),
    (best_xgb, 'XGBoost'),
    (best_lgbm, 'LightGBM')
]

feature_names = housing.feature_names

for panel, (fitted_model, label) in zip(axes.flatten(), models_for_importance):
    scores = fitted_model.feature_importances_
    # Feature indices ordered from most to least important
    ranking = np.argsort(scores)[::-1]
    bar_positions = range(len(ranking))
    ranked_names = [feature_names[idx] for idx in ranking]

    panel.barh(bar_positions, scores[ranking], color='steelblue')
    panel.set_yticks(bar_positions)
    panel.set_yticklabels(ranked_names)
    panel.set_xlabel('Feature Importance')
    panel.set_title(f'{label} Feature Importance')
    # Most important feature at the top
    panel.invert_yaxis()

plt.tight_layout()
plt.show()

## 10. 최적 파라미터 요약

In [None]:
# Summary of the best hyperparameters found by each study
print("=" * 70, "최적화된 하이퍼파라미터 요약", "=" * 70, sep="\n")

all_studies = [
    ('RandomForest', study_rf),
    ('GradientBoosting', study_gb),
    ('XGBoost', study_xgb),
    ('LightGBM', study_lgbm)
]

for model_label, tuned_study in all_studies:
    print(f"\n[{model_label}]")
    print(f"  최적 MSE (CV): {tuned_study.best_value:.4f}")
    for param_name, param_value in tuned_study.best_params.items():
        # Floats are shown with six decimals; everything else as-is
        shown = f"{param_value:.6f}" if isinstance(param_value, float) else f"{param_value}"
        print(f"  {param_name}: {shown}")

In [None]:
# Pick the best model: results_summary is sorted by MSE, so it is the first row
best_model_row = results_summary.iloc[0]

print("\n" + "=" * 70, "최고 성능 모델", "=" * 70, sep="\n")
print(f"모델: {best_model_row['Model']}")
# (printed label, results column) pairs, in output order
for metric_label, metric_column in (("MSE", "MSE"), ("RMSE", "RMSE"), ("MAE", "MAE"), ("R2 Score", "R2")):
    print(f"{metric_label}: {best_model_row[metric_column]:.4f}")

## 11. 결론

### Optuna를 사용한 앙상블 모델 최적화 요약

| 모델 | 주요 튜닝 파라미터 |
|------|--------------------|
| **RandomForest** | n_estimators, max_depth, min_samples_split, max_features |
| **GradientBoosting** | n_estimators, learning_rate, max_depth, subsample |
| **XGBoost** | n_estimators, learning_rate, max_depth, reg_alpha, reg_lambda |
| **LightGBM** | n_estimators, learning_rate, num_leaves, min_child_samples |

### Optuna 활용 팁

1. **Pruning 활용**: XGBoost, LightGBM처럼 학습 시간이 긴 모델에 특히 효과적 (단, objective 함수 안에서 `trial.report()`로 중간 값을 보고하고 `trial.should_prune()`을 확인해야 pruner가 실제로 동작함)
2. **로그 스케일 탐색**: learning_rate, reg_alpha 등은 `log=True`로 탐색
3. **적절한 trial 수**: 파라미터가 많을수록 더 많은 trial 필요 (50~200)
4. **시각화 활용**: 파라미터 중요도를 확인하여 불필요한 파라미터 제거 가능

### 참고 자료
- [Optuna 공식 문서](https://optuna.readthedocs.io/)
- [XGBoost 파라미터 튜닝 가이드](https://xgboost.readthedocs.io/en/latest/parameter.html)
- [LightGBM 파라미터 튜닝 가이드](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)