In [None]:
# 確保環境安裝必要的庫
# 請在 Jupyter/Colab 單元格中執行以下指令

%pip install optuna
%pip install xgboost==1.7.6  # 關鍵：鎖定版本以確保與訓練邏輯兼容
%pip install plotly
%pip install optuna-integration

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.17.2-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.44-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting tqdm (from optuna)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting PyYAML (from optuna)
  Downloading pyyaml-6.0.3-cp311-cp311-win_amd64.whl.metadata (2.4 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.4-cp311-cp311-win_amd64.whl.metadata (4.2 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
Downloading alembic-1.17.2-py3-none-any.whl (248 kB)
Downloading sqlalchemy-2.0.44-cp311-cp311-win_amd64.whl (2.1 MB)
  

In [None]:
import logging
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 確保在 Notebook 中顯示圖表
%matplotlib inline

from typing import Any, Callable, Tuple, Dict, List
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.base import clone
import optuna
# 導入但不使用，因為它需要新的 callbacks 參數
from optuna.integration import XGBoostPruningCallback 

import joblib
import os

ModuleNotFoundError: No module named 'seaborn'

In [None]:
class Config:
    """Shared experiment constants used as defaults elsewhere in this notebook."""
    # Name of the label column read from the training frame.
    TARGET_COL = 'Exited'
    # Default number of StratifiedKFold splits for ModelTrainer.
    N_SPLITS = 5
    # Seed for the mock-data generator and the default random_state of ModelTrainer.
    RANDOM_STATE = 42
    
# --- Data loading (with a mock fallback) ---
# When the real CSV files are not available, a synthetic dataset with the same
# columns is generated so the rest of the notebook can still be executed.
# For a real run, make sure train.csv / test.csv are present next to the notebook.

try:
    # Try to load the real files first.
    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    print(f"訓練數據大小: {df_train.shape}, 測試數據大小: {df_test.shape}")
except FileNotFoundError:
    print("⚠️ 警告：未找到 'train.csv' 和 'test.csv'。正在創建模擬數據。")

    # Mock training data (seeded so the fallback is reproducible).
    np.random.seed(Config.RANDOM_STATE)
    n_train = 165000
    df_train = pd.DataFrame({
        'RowNumber': range(1, n_train + 1),
        'CustomerId': range(10000000, 10000000 + n_train),
        'Surname': ['Surname' + str(i) for i in range(n_train)],
        'CreditScore': np.random.randint(300, 850, n_train),
        'Geography': np.random.choice(['France', 'Spain', 'Germany'], n_train, p=[0.5, 0.25, 0.25]),
        'Gender': np.random.choice(['Male', 'Female'], n_train),
        'Age': np.random.randint(18, 90, n_train),
        'Tenure': np.random.randint(0, 10, n_train),
        'Balance': np.abs(np.random.normal(loc=50000, scale=60000, size=n_train)),
        'NumOfProducts': np.random.randint(1, 5, n_train),
        'HasCrCard': np.random.randint(0, 2, n_train),
        'IsActiveMember': np.random.randint(0, 2, n_train),
        'EstimatedSalary': np.random.uniform(1000, 200000, n_train),
        # randint() has no `p` argument (it raised TypeError here); choice()
        # draws the 0/1 label with the intended 80/20 class imbalance.
        'Exited': np.random.choice([0, 1], n_train, p=[0.8, 0.2])
    })
    # Mock test data (same feature columns, plus an 'id' column and no target).
    n_test = 110000
    df_test = pd.DataFrame({
        'id': range(n_test),
        'RowNumber': range(n_train + 1, n_train + n_test + 1),
        'CustomerId': range(10000000 + n_train, 10000000 + n_train + n_test),
        'Surname': ['Surname' + str(i) for i in range(n_train, n_train + n_test)],
        'CreditScore': np.random.randint(300, 850, n_test),
        'Geography': np.random.choice(['France', 'Spain', 'Germany'], n_test, p=[0.5, 0.25, 0.25]),
        'Gender': np.random.choice(['Male', 'Female'], n_test),
        'Age': np.random.randint(18, 90, n_test),
        'Tenure': np.random.randint(0, 10, n_test),
        'Balance': np.abs(np.random.normal(loc=50000, scale=60000, size=n_test)),
        'NumOfProducts': np.random.randint(1, 5, n_test),
        'HasCrCard': np.random.randint(0, 2, n_test),
        'IsActiveMember': np.random.randint(0, 2, n_test),
        'EstimatedSalary': np.random.uniform(1000, 200000, n_test),
    })

    # Snap near-zero balances to exactly 0 so the "Has_Zero_Balance" feature
    # (which tests Balance == 0) has something to match in the mock data.
    df_train.loc[df_train['Balance'] < 1, 'Balance'] = 0
    df_test.loc[df_test['Balance'] < 1, 'Balance'] = 0

    print(f"訓練數據大小 (模擬): {df_train.shape}, 測試數據大小 (模擬): {df_test.shape}")

In [None]:
class FeatureEngineer:
    """
    Stateless collection of feature-engineering pipelines.

    Every method returns a new DataFrame; the frame passed in is not modified.
    """

    @staticmethod
    def _drop_existing(frame: pd.DataFrame, names: list) -> pd.DataFrame:
        """Return ``frame`` without those columns of ``names`` that it actually contains."""
        present = [name for name in names if name in frame.columns]
        return frame.drop(columns=present)

    @staticmethod
    def map_columns(df: pd.DataFrame, mappings: dict) -> pd.DataFrame:
        """
        Apply ``Series.map`` to selected columns.

        Args:
            df: input frame.
            mappings: ``{column name: mapping}``; columns absent from ``df`` are skipped.

        Returns:
            A copy of ``df`` with the listed columns mapped.
        """
        result = df.copy()
        for name in [key for key in mappings if key in result.columns]:
            result[name] = result[name].map(mappings[name])
        return result

    @staticmethod
    def cast_columns(df: pd.DataFrame, int_cols: Any = None,
                     cat_cols: Any = None) -> pd.DataFrame:
        """
        Cast columns to ``int`` and/or pandas ``category`` dtype.

        Args:
            df: input frame.
            int_cols: column names to cast to ``int`` (optional).
            cat_cols: column names to cast to ``'category'`` (optional).

        Returns:
            A copy of ``df`` with the casts applied; names not present in
            ``df`` are skipped. Integer casts run before category casts.
        """
        result = df.copy()
        for names, dtype in ((int_cols, int), (cat_cols, 'category')):
            if not names:
                continue
            for name in names:
                if name in result.columns:
                    result[name] = result[name].astype(dtype)
        return result

    @staticmethod
    def run_v0_baseline(df: pd.DataFrame, is_train: bool) -> pd.DataFrame:
        """
        Baseline pipeline: dtype casts plus removal of identifier columns.

        Drops 'CustomerId' and 'Surname', and additionally 'Exited' when
        ``is_train`` is true.
        """
        features = FeatureEngineer.cast_columns(
            df,
            int_cols=['HasCrCard', 'IsActiveMember'],
            cat_cols=['Geography', 'Gender'],
        )

        unwanted = ['CustomerId', 'Surname']
        if is_train and 'Exited' in features.columns:
            unwanted.append('Exited')
        return FeatureEngineer._drop_existing(features, unwanted)

    @staticmethod
    def run_v1_preprocessing(df: pd.DataFrame, is_train: bool) -> pd.DataFrame:
        """
        Version 1: gender encoding, age bins, indicator flags and log-tenure.

        Drops 'CustomerId', 'Tenure', 'Surname' and 'RowNumber', and
        additionally 'Exited' when ``is_train`` is true.
        """
        features = FeatureEngineer.map_columns(df, {'Gender': {'Male': 0, 'Female': 1}})

        # Age buckets (right-closed intervals).
        age_edges = [0, 25, 35, 45, 60, np.inf]
        age_labels = ['very_young', 'young', 'mid', 'mature', 'senior']
        features['Age_bin'] = pd.cut(features['Age'], bins=age_edges, labels=age_labels)

        # Indicator flags; Gender == 1 means 'Female' after the mapping above.
        in_germany = features['Geography'] == 'Germany'
        features['Is_two_products'] = (features['NumOfProducts'] == 2).astype(int)
        features['Germany_Female'] = (in_germany & (features['Gender'] == 1)).astype(int)
        features['Germany_Inactive'] = (in_germany & (features['IsActiveMember'] == 0)).astype(int)
        features['Has_Zero_Balance'] = (features['Balance'] == 0).astype(int)

        # log(1 + Tenure); the raw Tenure column is dropped below.
        features['Tenure_log'] = np.log1p(features['Tenure'])

        features = FeatureEngineer.cast_columns(
            features,
            int_cols=['HasCrCard', 'IsActiveMember', 'NumOfProducts', 'Is_two_products',
                      'Has_Zero_Balance', 'Germany_Female', 'Germany_Inactive'],
            cat_cols=['Geography', 'Age_bin'],
        )

        unwanted = ['CustomerId', 'Tenure', 'Surname', 'RowNumber']
        if is_train and 'Exited' in features.columns:
            unwanted.append('Exited')
        return FeatureEngineer._drop_existing(features, unwanted)

    @staticmethod
    def run_v2_preprocessing(df: pd.DataFrame, is_train: bool) -> pd.DataFrame:
        """Version 2: V1 plus the ``is_mature_inactive_transit`` flag."""
        # V1 is invoked with is_train=False, so the target (if any) is still
        # present here and is removed at the end of this method instead.
        features = FeatureEngineer.run_v1_preprocessing(df, is_train=False)

        zero_balance = features['Has_Zero_Balance'] == 1
        inactive = features['IsActiveMember'] == 0
        over_forty = features['Age'] > 40
        features['is_mature_inactive_transit'] = (zero_balance & inactive & over_forty).astype(int)

        # The presence check is made against the original input frame.
        if is_train and 'Exited' in df.columns:
            features = FeatureEngineer._drop_existing(features, ['Exited'])

        return features

    @staticmethod
    def run_v3_preprocessing(df: pd.DataFrame, is_train: bool) -> pd.DataFrame:
        """Version 3: V1 plus ratio / product interaction features."""
        features = FeatureEngineer.run_v1_preprocessing(df, is_train=False)

        # The small epsilon keeps the division defined if NumOfProducts is 0.
        features['Balance_per_product'] = features['Balance'] / (features['NumOfProducts'] + 1e-9)
        features['Age_x_Tenure'] = features['Age'] * features['Tenure_log']
        features['CreditScore_x_Age'] = features['CreditScore'] * features['Age']

        # The presence check is made against the original input frame.
        if is_train and 'Exited' in df.columns:
            features = FeatureEngineer._drop_existing(features, ['Exited'])
        return features

In [None]:
# Module-level logger; logging.basicConfig is invoked when this logger has no
# handlers attached directly to it.
logger = logging.getLogger('ModelTrainer')
if len(logger.handlers) == 0:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

In [None]:
class HyperparameterTuner:
    """
    Hyperparameter tuner built on Optuna.

    Only XGBoost (XGBClassifier) is tuned.
    """

    @staticmethod
    def _objective(trial: optuna.Trial, X: pd.DataFrame, y: pd.Series, cat_feature_names: list) -> float:
        """
        Optuna objective: cross-validated ROC AUC of one sampled configuration.

        Args:
            trial: Optuna trial used to sample the search space.
            X: feature frame (category-dtype columns are handled natively via
                ``enable_categorical=True``).
            y: binary target aligned with ``X``.
            cat_feature_names: accepted for interface compatibility; not used,
                because XGBoost reads categorical columns from the dtypes.

        Returns:
            Mean validation ROC AUC over the stratified folds (maximised by the study).
        """
        # 1. Search space for the tuned XGBoost parameters.
        params = {
            # Tree parameters
            'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            # Regularisation parameters
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            # Sampling parameters
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        }

        # 2. Fixed parameters. XGBoost's silence switch is `verbosity`;
        #    `verbose` is not an estimator parameter.
        fixed_params = {
            'random_state': Config.RANDOM_STATE,
            'verbosity': 0,
            'eval_metric': 'logloss',
            'n_jobs': -1,
            'early_stopping_rounds': 50,
            'enable_categorical': True,  # native categorical feature support
        }

        # 3. Build the model; fit() retrains it from scratch on every fold.
        model = XGBClassifier(**params, **fixed_params)

        # 4. Cross-validation.
        skf = StratifiedKFold(n_splits=Config.N_SPLITS, shuffle=True, random_state=Config.RANDOM_STATE)
        roc_auc_scores = []

        for train_idx, val_idx in skf.split(X, y):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # early_stopping_rounds comes from the constructor; only the
            # validation set is passed to fit(). No callbacks are passed.
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

            best_iteration = model.get_booster().best_iteration

            # best_iteration is a zero-based index and the end of
            # iteration_range is exclusive, so +1 is needed to include the
            # best tree itself.
            proba_val = model.predict_proba(X_val, iteration_range=(0, best_iteration + 1))[:, 1]
            roc_auc_scores.append(roc_auc_score(y_val, proba_val))

        # 5. Mean ROC AUC across folds.
        return float(np.mean(roc_auc_scores))

    @staticmethod
    def tune(X: pd.DataFrame, y: pd.Series, cat_feature_names: list, n_trials: int) -> dict:
        """
        Run an Optuna study and return the best sampled parameters.

        Args:
            X: feature frame.
            y: binary target aligned with ``X``.
            cat_feature_names: forwarded to the objective (currently unused there).
            n_trials: number of Optuna trials to run.

        Returns:
            ``study.best_params`` - only the sampled parameters, not the fixed
            ones used inside the objective.
        """
        study = optuna.create_study(direction='maximize')

        def objective_with_args(trial: optuna.Trial) -> float:
            # Bind the data so the callable matches Optuna's objective(trial) signature.
            return HyperparameterTuner._objective(trial, X, y, cat_feature_names)

        study.optimize(objective_with_args, n_trials=n_trials, show_progress_bar=True)

        print(f"調優完成。最佳 ROC AUC: {study.best_value:.5f}")
        print("最佳參數:")
        for key, value in study.best_params.items():
            print(f"  {key}: {value}")

        return study.best_params

In [None]:
class ModelTrainer:
    """
    Orchestrator that unifies feature engineering, cross-validated training,
    evaluation and submission writing.

    Error analysis is currently disabled: both experiment entry points return
    an empty DataFrame and an empty Figure as placeholders for it.
    """

    def __init__(self, n_splits: int = Config.N_SPLITS, random_state: int = Config.RANDOM_STATE):
        """
        Args:
            n_splits: number of stratified CV folds.
            random_state: seed for fold shuffling and for the tuned model.
        """
        self.n_splits = n_splits
        self.random_state = random_state
        self.logger = logging.getLogger(self.__class__.__name__)

        if not self.logger.handlers:
            logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def run_experiment_tune(self,
                       train_df: pd.DataFrame,
                       test_df: pd.DataFrame,
                       feature_engineering_pipeline: Callable,
                       models: dict,
                       target_col: str = Config.TARGET_COL,
                       tune_hyperparams: bool = False,
                       tune_model_name: str = 'XGBoost',
                       n_trials: int = 50) -> tuple[pd.DataFrame, dict, pd.DataFrame, plt.Figure]:
        """
        Run a full experiment cycle with optional hyperparameter tuning.

        Args:
            train_df: training frame containing ``target_col``.
            test_df: test frame containing an ``'id'`` column.
            feature_engineering_pipeline: callable ``(df, is_train=...) -> DataFrame``.
            models: ``{name: estimator}`` to train when tuning is off or unavailable.
            target_col: name of the label column.
            tune_hyperparams: when true, tune and train only ``<tune_model_name>_Tuned``.
            tune_model_name: key that must exist in ``models`` for tuning to run.
            n_trials: number of Optuna trials.

        Returns:
            ``(submission_df, all_results, error_df, dashboard_figure)``; the
            last two are empty placeholders.
        """
        self.logger.info(f"--- 啟動新實驗 (特徵工程 FE: {feature_engineering_pipeline.__name__}) ---")
        if tune_hyperparams:
            self.logger.info(f"!!! 已為模型 '{tune_model_name}' 啟用超參數調優模式 !!!")

        test_ids, y_train, X_train_processed, X_test_processed = self._prepare_data(
            train_df, test_df, feature_engineering_pipeline, target_col
        )

        models_to_train = models.copy()

        if tune_hyperparams:
            if tune_model_name not in models:
                # Tuning is skipped and every model in `models` is trained as-is.
                self.logger.error(
                    f"用於調優的模型 '{tune_model_name}' 未在 models 字典中找到。調優已取消。")
            else:
                self.logger.info(f"步驟 1.5: 為 '{tune_model_name}' 進行超參數調優...")

                cat_features = X_train_processed.select_dtypes(include=['category', 'object']).columns.tolist()

                best_params = HyperparameterTuner.tune(
                    X=X_train_processed,
                    y=y_train,
                    cat_feature_names=cat_features,
                    n_trials=n_trials
                )

                # The study only returns the sampled parameters, so the fixed
                # ones used during tuning are re-applied here.
                # enable_categorical must be included: the features contain
                # category-dtype columns and the tuner evaluated with it on.
                # `verbosity` (not `verbose`) is XGBoost's silence switch.
                tuned_params = {
                    **best_params,
                    'random_state': self.random_state,
                    'eval_metric': 'logloss',
                    'n_jobs': -1,
                    'verbosity': 0,
                    'enable_categorical': True,
                }
                # Use the default patience unless the study tuned it.
                tuned_params.setdefault('early_stopping_rounds', 50)

                # The tuner only produces XGBoost parameters.
                tuned_model = XGBClassifier(**tuned_params)

                tuned_model_name = f"{tune_model_name}_Tuned"
                models_to_train = {tuned_model_name: tuned_model}
                self.logger.info(f"調優完成。模型 '{tuned_model_name}' 將用於訓練。")

        # 2. Train and evaluate.
        self.logger.info("步驟 2: 在交叉驗證上訓練模型...")
        all_results = self._evaluate_models(models_to_train, X_train_processed, y_train, X_test_processed)

        # 3. Pick the best model by mean CV ROC AUC. With a single (tuned)
        #    model this is that model; when tuning was off or cancelled it
        #    chooses among all trained models rather than the first dict key.
        best_model_name = self._select_best_model(all_results)
        self.logger.info(f"步驟 3: 最佳模型名稱確定為: {best_model_name}")

        error_df, dashboard_figure = self._analysis_placeholders()

        # 4. Write the submission.
        self.logger.info("步驟 4: 生成提交文件...")
        submission_df = self._generate_submission(
            f"submission_{best_model_name}_{feature_engineering_pipeline.__name__}.csv",
            test_ids,
            all_results[best_model_name]['test_preds']
        )

        self.logger.info("--- 調優成功完成 ---")
        return submission_df, all_results, error_df, dashboard_figure

    def run_experiment(self,
                       train_df: pd.DataFrame,
                       test_df: pd.DataFrame,
                       feature_engineering_pipeline: Callable,
                       models: dict,
                       target_col: str = Config.TARGET_COL) -> tuple[pd.DataFrame, dict, pd.DataFrame, plt.Figure]:
        """
        Run a full experiment cycle: feature engineering, CV training,
        best-model selection and submission writing.

        Args:
            train_df: training frame containing ``target_col``.
            test_df: test frame containing an ``'id'`` column.
            feature_engineering_pipeline: callable ``(df, is_train=...) -> DataFrame``.
            models: ``{name: estimator}`` to train.
            target_col: name of the label column.

        Returns:
            ``(submission_df, all_results, error_df, dashboard_figure)``; the
            last two are empty placeholders.
        """
        self.logger.info(f"--- 啟動新實驗 (特徵工程 FE: {feature_engineering_pipeline.__name__}) ---")

        # 1. Feature engineering.
        test_ids, y_train, X_train_processed, X_test_processed = self._prepare_data(
            train_df, test_df, feature_engineering_pipeline, target_col
        )

        # 2. Train and evaluate.
        self.logger.info("步驟 2: 在交叉驗證上訓練模型...")
        all_results = self._evaluate_models(models, X_train_processed, y_train, X_test_processed)

        # 3. Pick the best model by mean CV ROC AUC.
        self.logger.info("步驟 3: 確定性能最佳的模型名稱...")
        best_model_name = self._select_best_model(all_results)

        error_df, dashboard_figure = self._analysis_placeholders()

        # 4. Write the submission.
        self.logger.info("步驟 4: 生成提交文件...")
        submission_df = self._generate_submission(
            f"submission_{best_model_name}_{feature_engineering_pipeline.__name__}.csv",
            test_ids,
            all_results[best_model_name]['test_preds']
        )

        self.logger.info("--- 實驗成功完成 ---")
        return submission_df, all_results, error_df, dashboard_figure

    def _prepare_data(self,
                      train_df: pd.DataFrame,
                      test_df: pd.DataFrame,
                      feature_engineering_pipeline: Callable,
                      target_col: str) -> tuple[pd.Series, pd.Series, pd.DataFrame, pd.DataFrame]:
        """Extract ids/target, run the FE pipeline on both frames and align their columns."""
        test_ids = test_df['id'].copy()
        y_train = train_df[target_col].astype(int)

        self.logger.info("步驟 1: 應用特徵工程...")
        X_train_processed = feature_engineering_pipeline(train_df, is_train=True)
        X_test_processed = feature_engineering_pipeline(test_df, is_train=False)

        train_cols = X_train_processed.columns
        test_cols = X_test_processed.columns
        if not train_cols.equals(test_cols):
            # Keep only the columns both frames share, in the same order.
            self.logger.warning("訓練集和測試集的欄位不一致! 正在對齊...")
            shared_cols = list(train_cols.intersection(test_cols))
            X_train_processed = X_train_processed[shared_cols]
            X_test_processed = X_test_processed[shared_cols]

        return test_ids, y_train, X_train_processed, X_test_processed

    @staticmethod
    def _select_best_model(all_results: dict) -> str:
        """Return the name of the model with the highest mean CV ROC AUC (first one wins ties)."""
        if not all_results:
            raise ValueError("沒有任何模型被評估，無法選出最佳模型。")

        def mean_auc(name: str) -> float:
            auc = all_results[name]['metrics_df']['ROC AUC'].mean()
            # A NaN score must never beat a real score.
            return float('-inf') if np.isnan(auc) else auc

        return max(all_results, key=mean_auc)

    @staticmethod
    def _analysis_placeholders() -> tuple[pd.DataFrame, plt.Figure]:
        """Return the empty (error_df, figure) pair that stands in for the disabled error analysis."""
        dashboard_figure = plt.figure()
        # Close it so empty figures are neither rendered nor accumulated in
        # pyplot's registry; the Figure object itself is still returned.
        plt.close(dashboard_figure)
        return pd.DataFrame(), dashboard_figure

    def _evaluate_models(self, models: dict, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame) -> dict:
        """
        Train and validate every model with stratified K-fold cross-validation.

        Args:
            models: ``{name: estimator}``; each estimator is cloned per fold.
            X_train: training features.
            y_train: training target.
            X_test: test features, predicted by every fold model.

        Returns:
            ``{name: {'oof_preds', 'test_preds', 'metrics_df',
            'feature_importances', 'feature_names'}}`` where ``test_preds`` is
            the mean of the per-fold test probabilities.
        """
        self.logger.info("啟動交叉驗證...")
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        results = {}

        # Logged for information only; no manual encoding is applied here.
        cat_feature_names = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
        if cat_feature_names:
            self.logger.info(f"偵測到類別特徵: {cat_feature_names}")

        for name, model in models.items():
            self.logger.info(f"正在訓練模型: {name}")
            oof_preds = np.zeros(len(X_train))
            test_preds_folds, fold_metrics_list, importances_folds = [], [], []

            for train_idx, val_idx in skf.split(X_train, y_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                current_model = clone(model)
                fit_params = {}

                # Only XGBClassifier gets a validation set (needed for early
                # stopping) and the silent flag. Any other estimator is fitted
                # with default fit() arguments and must be able to handle the
                # categorical columns itself.
                if isinstance(current_model, XGBClassifier):
                    fit_params['eval_set'] = [(X_val, y_val)]
                    fit_params['verbose'] = False

                current_model.fit(X_tr, y_tr, **fit_params)

                proba_val = current_model.predict_proba(X_val)[:, 1]
                proba_test = current_model.predict_proba(X_test)[:, 1]

                oof_preds[val_idx] = proba_val
                test_preds_folds.append(proba_test)

                fold_metrics_list.append(
                    {'ROC AUC': roc_auc_score(y_val, proba_val), 'PR AUC': average_precision_score(y_val, proba_val)})
                if hasattr(current_model, 'feature_importances_'):
                    importances_folds.append(current_model.feature_importances_)  # tree models
                elif hasattr(current_model, 'coef_'):
                    importances_folds.append(np.abs(current_model.coef_[0]))  # linear models

            results[name] = {
                'oof_preds': oof_preds,
                'test_preds': np.mean(test_preds_folds, axis=0),
                'metrics_df': pd.DataFrame(fold_metrics_list),
                'feature_importances': np.mean(importances_folds, axis=0) if importances_folds else None,
                'feature_names': X_train.columns
            }
            self.logger.info(
                f"  模型 {name} | CV ROC AUC: {results[name]['metrics_df']['ROC AUC'].mean():.4f} ± {results[name]['metrics_df']['ROC AUC'].std():.4f}")
        return results

    def _generate_submission(self, filename: str, df_test_id: pd.Series, test_preds: np.ndarray) -> pd.DataFrame:
        """
        Write the submission CSV (columns ``id`` and ``Exited``) and return it.

        Args:
            filename: output path; one specific CatBoost/v3 filename is
                redirected to ``submission.csv``.
            df_test_id: test ids.
            test_preds: predicted probabilities.
        """
        self.logger.debug(f'filename = {filename}')
        # Special-case kept from the original workflow.
        if filename == 'submission_CatBoost_final_run_v3_preprocessing.csv':
            filename = 'submission.csv'
        self.logger.debug(f'filename1 = {filename}')
        submission_df = pd.DataFrame({'id': df_test_id, 'Exited': test_preds})
        submission_df.to_csv(filename, index=False)
        self.logger.info(f"提交文件成功保存: {filename}")
        return submission_df

In [None]:
# Module-level logger used by the tuner below; logging.basicConfig is invoked
# when this logger has no handlers attached directly to it.
logger = logging.getLogger('ModelTrainer')
if len(logger.handlers) == 0:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

class HyperparameterTuner:
    """Hyperparameter tuner that optimises XGBoost with Optuna."""

    @staticmethod
    def _objective(trial: optuna.Trial, X: pd.DataFrame, y: pd.Series, cat_feature_names: List[str]) -> float:
        """
        Optuna objective: cross-validated ROC AUC of one sampled configuration.

        Uses the XGBoost 1.7.6 style of fitting (early stopping configured on
        the estimator, validation set passed to ``fit``).

        Args:
            trial: Optuna trial used to sample the search space.
            X: feature frame.
            y: binary target aligned with ``X``.
            cat_feature_names: accepted for interface compatibility; not used.

        Returns:
            Mean validation ROC AUC over the folds that trained successfully.

        Raises:
            optuna.TrialPruned: if every fold failed, so no score exists.
        """
        # 1. Search space for the tuned XGBoost parameters.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        }

        # 2. Fixed parameters. XGBoost's silence switch is `verbosity`;
        #    `verbose` is not an estimator parameter.
        fixed_params = {
            'random_state': Config.RANDOM_STATE,
            'verbosity': 0,
            'eval_metric': 'logloss',
            'n_jobs': -1,
            'early_stopping_rounds': 50,  # set on the estimator, not in fit()
            'enable_categorical': True,  # native categorical feature support
        }

        # 3. Build the model; fit() retrains it from scratch on every fold.
        model = XGBClassifier(**params, **fixed_params)

        # 4. Cross-validation.
        skf = StratifiedKFold(n_splits=Config.N_SPLITS, shuffle=True, random_state=fixed_params['random_state'])
        roc_auc_scores = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            try:
                # The validation set must be passed to fit(); no callbacks are passed.
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

                best_iteration = model.get_booster().best_iteration

                # best_iteration is a zero-based index and the end of
                # iteration_range is exclusive, so +1 is needed to include
                # the best tree itself.
                proba_val = model.predict_proba(X_val, iteration_range=(0, best_iteration + 1))[:, 1]
                roc_auc_scores.append(roc_auc_score(y_val, proba_val))
            except Exception as e:
                # Best effort: a failing fold is logged and skipped.
                logger.error(f"Optuna Fold {fold} 訓練失敗: {e}")

        # 5. Without any successful fold there is nothing to average
        #    (np.mean([]) would return NaN with a warning); prune the trial so
        #    Optuna records it explicitly and moves on.
        if not roc_auc_scores:
            raise optuna.TrialPruned("All cross-validation folds failed for this trial.")

        return float(np.mean(roc_auc_scores))

    @staticmethod
    def tune(X: pd.DataFrame, y: pd.Series, cat_feature_names: List[str], n_trials: int) -> dict:
        """
        Run an Optuna study and return the best sampled parameters.

        Args:
            X: feature frame.
            y: binary target aligned with ``X``.
            cat_feature_names: forwarded to the objective (currently unused there).
            n_trials: number of Optuna trials to run.

        Returns:
            ``study.best_params`` - only the sampled parameters, not the fixed
            ones used inside the objective.
        """
        study = optuna.create_study(direction='maximize')

        def objective_with_args(trial: optuna.Trial) -> float:
            # Bind the data so the callable matches Optuna's objective(trial) signature.
            return HyperparameterTuner._objective(trial, X, y, cat_feature_names)

        study.optimize(objective_with_args, n_trials=n_trials, show_progress_bar=True)

        print(f"調優完成。最佳 ROC AUC: {study.best_value:.5f}")
        print("最佳參數:")
        for key, value in study.best_params.items():
            print(f"  {key}: {value}")

        return study.best_params

In [None]:
class ModelTrainer:
    """Coordinates feature engineering, cross-validated training, model selection and submission export."""

    def __init__(self, n_splits: int = Config.N_SPLITS, random_state: int = Config.RANDOM_STATE):
        """Store the cross-validation settings and make sure logging is configured.

        Args:
            n_splits: Number of stratified folds used by ``_evaluate_models``.
            random_state: Seed used for fold shuffling and for the tuned model.
        """
        self.n_splits = n_splits
        self.random_state = random_state
        self.logger = logging.getLogger(self.__class__.__name__)

        # hasHandlers() also looks at ancestor loggers, so basicConfig is only
        # invoked when nothing in the hierarchy would emit the records.
        if not self.logger.hasHandlers():
            logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def run_experiment_tune(self,
                           train_df: pd.DataFrame,
                           test_df: pd.DataFrame,
                           feature_engineering_pipeline: Callable,
                           models: Dict[str, Any],
                           target_col: str = Config.TARGET_COL,
                           tune_hyperparams: bool = False,
                           tune_model_name: str = 'XGBoost',
                           n_trials: int = 50) -> Tuple[pd.DataFrame, Dict, pd.DataFrame, plt.Figure]:
        """Run a full experiment, optionally preceded by hyperparameter tuning.

        Args:
            train_df: Training data; must contain ``target_col``.
            test_df: Test data; must contain an ``id`` column.
            feature_engineering_pipeline: Callable invoked as
                ``pipeline(df, is_train=...)`` that returns the feature frame.
            models: Mapping of model name to an unfitted estimator.
            target_col: Name of the label column in ``train_df``.
            tune_hyperparams: When True, tune and train only the tuned model.
            tune_model_name: Key that must exist in ``models`` for tuning to run.
            n_trials: Number of Optuna trials.

        Returns:
            ``(submission_df, all_results, error_df, dashboard_figure)``. The
            first element is an empty frame when the training data is empty or
            no model produced any fold metrics.
        """
        pipeline_name = self._pipeline_name(feature_engineering_pipeline)
        self.logger.info(f"--- 啟動新實驗 (特徵工程 FE: {pipeline_name}) ---")

        if train_df.empty:
            self.logger.error("訓練數據為空，無法運行實驗。")
            return pd.DataFrame(), {}, pd.DataFrame(), plt.figure()

        # Ids and labels are captured before the pipeline is called.
        test_ids = test_df['id'].copy()
        y_train = train_df[target_col].astype(int)

        X_train_processed, X_test_processed = self._prepare_features(
            train_df, test_df, feature_engineering_pipeline)

        models_to_train = models.copy()

        # Step 1.5: hyperparameter tuning (optional).
        if tune_hyperparams:
            self.logger.info(f"!!! 已為模型 '{tune_model_name}' 啟用超參數調優模式 !!!")
            if tune_model_name not in models:
                self.logger.error(f"用於調優的模型 '{tune_model_name}' 未在 models 字典中找到。調優已取消。")
            else:
                self.logger.info(f"步驟 1.5: 為 '{tune_model_name}' 進行超參數調優...")

                cat_features = X_train_processed.select_dtypes(include=['category', 'object']).columns.tolist()

                best_params = HyperparameterTuner.tune(
                    X=X_train_processed,
                    y=y_train,
                    cat_feature_names=cat_features,
                    n_trials=n_trials
                )

                # Merge the searched values with the fixed settings.
                tuned_params = {
                    **best_params,
                    'random_state': self.random_state,
                    'eval_metric': 'logloss',
                    'n_jobs': -1,
                    # early_stopping_rounds is passed to the constructor, not to fit().
                    'early_stopping_rounds': best_params.get('early_stopping_rounds', 50),
                    'enable_categorical': True,
                    # 'verbosity' is the XGBoost parameter name; 'verbose' is not
                    # a constructor argument and would be forwarded as an unknown
                    # booster parameter.
                    'verbosity': 0,
                }

                tuned_model = XGBClassifier(**tuned_params)
                tuned_model_name = f"{tune_model_name}_Tuned"
                models_to_train = {tuned_model_name: tuned_model}
                self.logger.info(f"調優完成。模型 '{tuned_model_name}' 將用於訓練。")

        # Step 2: train and evaluate.
        self.logger.info("步驟 2: 在交叉驗證上訓練模型...")
        all_results = self._evaluate_models(models_to_train, X_train_processed, y_train, X_test_processed)

        # Step 3: pick the model with the best mean ROC AUC. When tuning ran
        # this is the single tuned model; otherwise several may compete.
        best_model_name = self._select_best_model(all_results)
        if best_model_name is None:
            self.logger.error("沒有模型成功訓練或評估。")
            return pd.DataFrame(), all_results, pd.DataFrame(), plt.figure()
        self.logger.info(f"步驟 3: 最佳模型名稱確定為: {best_model_name}")

        # Placeholders kept so the return shape stays stable.
        error_df = pd.DataFrame()
        dashboard_figure = plt.figure()

        # Step 4: write the submission file.
        self.logger.info("步驟 4: 生成提交文件...")
        submission_df = self._generate_submission(
            f"submission_{best_model_name}_{pipeline_name}.csv",
            test_ids,
            all_results[best_model_name]['test_preds'],
            target_col
        )

        self.logger.info("--- 調優成功完成 ---")
        return submission_df, all_results, error_df, dashboard_figure

    def run_experiment(self,
                       train_df: pd.DataFrame,
                       test_df: pd.DataFrame,
                       feature_engineering_pipeline: Callable,
                       models: Dict[str, Any],
                       target_col: str = Config.TARGET_COL) -> Tuple[pd.DataFrame, Dict, pd.DataFrame, plt.Figure]:
        """Run a full experiment: feature engineering, CV training, submission.

        Args:
            train_df: Training data; must contain ``target_col``.
            test_df: Test data; must contain an ``id`` column.
            feature_engineering_pipeline: Callable invoked as
                ``pipeline(df, is_train=...)`` that returns the feature frame.
            models: Mapping of model name to an unfitted estimator.
            target_col: Name of the label column in ``train_df``.

        Returns:
            ``(submission_df, all_results, error_df, dashboard_figure)``. The
            first element is an empty frame when the training data is empty or
            no model produced any fold metrics.
        """
        pipeline_name = self._pipeline_name(feature_engineering_pipeline)
        self.logger.info(f"--- 啟動新實驗 (特徵工程 FE: {pipeline_name}) ---")

        if train_df.empty:
            self.logger.error("訓練數據為空，無法運行實驗。")
            return pd.DataFrame(), {}, pd.DataFrame(), plt.figure()

        # Ids and labels are captured before the pipeline is called.
        test_ids = test_df['id'].copy()
        y_train = train_df[target_col].astype(int)

        # Step 1: feature engineering and column alignment.
        X_train_processed, X_test_processed = self._prepare_features(
            train_df, test_df, feature_engineering_pipeline)

        # Step 2: train and evaluate.
        self.logger.info("步驟 2: 在交叉驗證上訓練模型...")
        all_results = self._evaluate_models(models, X_train_processed, y_train, X_test_processed)

        # Step 3: pick the model with the best mean ROC AUC.
        self.logger.info("步驟 3: 確定性能最佳的模型名稱...")
        best_model_name = self._select_best_model(all_results)
        if best_model_name is None:
            self.logger.error("沒有模型成功訓練或評估。")
            return pd.DataFrame(), all_results, pd.DataFrame(), plt.figure()

        # Placeholders kept so the return shape stays stable.
        error_df = pd.DataFrame()
        dashboard_figure = plt.figure()

        # Step 4: write the submission file.
        self.logger.info("步驟 4: 生成提交文件...")
        submission_df = self._generate_submission(
            f"submission_{best_model_name}_{pipeline_name}.csv",
            test_ids,
            all_results[best_model_name]['test_preds'],
            target_col
        )

        self.logger.info("--- 實驗成功完成 ---")
        return submission_df, all_results, error_df, dashboard_figure

    @staticmethod
    def _pipeline_name(feature_engineering_pipeline: Callable) -> str:
        """Return a printable name for the pipeline, even if it has no ``__name__``."""
        return getattr(feature_engineering_pipeline, '__name__', type(feature_engineering_pipeline).__name__)

    def _prepare_features(self,
                          train_df: pd.DataFrame,
                          test_df: pd.DataFrame,
                          feature_engineering_pipeline: Callable) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Apply the pipeline to both frames and restrict them to their shared columns."""
        self.logger.info("步驟 1: 應用特徵工程...")
        X_train_processed = feature_engineering_pipeline(train_df, is_train=True)
        X_test_processed = feature_engineering_pipeline(test_df, is_train=False)

        train_cols = X_train_processed.columns
        test_cols = X_test_processed.columns
        if not train_cols.equals(test_cols):
            self.logger.warning("訓練集和測試集的欄位不一致! 正在對齊...")
            shared_cols = list(train_cols.intersection(test_cols))
            X_train_processed = X_train_processed[shared_cols]
            X_test_processed = X_test_processed[shared_cols]

        return X_train_processed, X_test_processed

    @staticmethod
    def _select_best_model(all_results: Dict[str, Dict]):
        """Return the name of the model with the highest mean CV ROC AUC.

        Models whose ``metrics_df`` is empty (every fold failed) or whose mean
        is NaN are ignored. Returns ``None`` when no model qualifies.
        """
        best_model_name = None
        best_roc_auc = -np.inf
        for name, result in all_results.items():
            metrics_df = result['metrics_df']
            # An empty frame has no 'ROC AUC' column; indexing it would raise.
            if metrics_df.empty or 'ROC AUC' not in metrics_df.columns:
                continue
            current_auc = metrics_df['ROC AUC'].mean()
            # NaN compares False, so a NaN mean never wins.
            if current_auc > best_roc_auc:
                best_roc_auc = current_auc
                best_model_name = name
        return best_model_name

    @staticmethod
    def _best_iteration_kwargs(model: Any) -> Dict[str, Any]:
        """Build ``predict_proba`` kwargs that stop at the booster's best round.

        ``iteration_range`` is half-open, so the end bound is
        ``best_iteration + 1``. Returns an empty dict when the booster exposes
        no ``best_iteration``.
        """
        best_iteration = getattr(model.get_booster(), 'best_iteration', None)
        if best_iteration is None:
            return {}
        return {'iteration_range': (0, int(best_iteration) + 1)}

    def _evaluate_models(self, models: Dict[str, Any], X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame) -> Dict:
        """Train and validate every model with stratified K-fold cross-validation.

        ``XGBClassifier`` instances receive the validation fold as ``eval_set``
        and are scored at their best iteration. Any other estimator is fitted
        and scored through plain ``fit`` / ``predict_proba``.

        Args:
            models: Mapping of model name to an unfitted estimator; each is cloned per fold.
            X_train: Training features.
            y_train: Training labels.
            X_test: Test features, predicted by every successfully fitted fold model.

        Returns:
            A dict keyed by model name. Each value holds ``oof_preds``,
            ``test_preds`` (mean over successful folds, zeros if none),
            ``metrics_df`` (one row per successful fold),
            ``feature_importances`` (mean over folds or ``None``) and
            ``feature_names``.
        """
        self.logger.info("啟動交叉驗證...")
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        # The fold indices do not depend on the model, so compute them once.
        folds = list(skf.split(X_train, y_train))
        results = {}

        cat_feature_names = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
        if cat_feature_names:
            self.logger.info(f"偵測到類別特徵: {cat_feature_names}")

        for name, model in models.items():
            self.logger.info(f"正在訓練模型: {name}")
            oof_preds = np.zeros(len(X_train))
            test_preds_folds, fold_metrics_list, importances_folds = [], [], []

            for fold, (train_idx, val_idx) in enumerate(folds):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                current_model = clone(model)
                is_xgb = isinstance(current_model, XGBClassifier)

                fit_params = {}
                if is_xgb:
                    # eval_set goes to fit(); early_stopping_rounds is expected
                    # to have been supplied when the model was constructed.
                    fit_params['eval_set'] = [(X_val, y_val)]
                    fit_params['verbose'] = False

                try:
                    current_model.fit(X_tr, y_tr, **fit_params)

                    # Only XGBoost models have a booster / iteration_range.
                    predict_kwargs = self._best_iteration_kwargs(current_model) if is_xgb else {}

                    proba_val = current_model.predict_proba(X_val, **predict_kwargs)[:, 1]
                    proba_test = current_model.predict_proba(X_test, **predict_kwargs)[:, 1]

                    oof_preds[val_idx] = proba_val
                    test_preds_folds.append(proba_test)

                    fold_metrics_list.append(
                        {'ROC AUC': roc_auc_score(y_val, proba_val), 'PR AUC': average_precision_score(y_val, proba_val)})

                    if hasattr(current_model, 'feature_importances_'):
                        importances_folds.append(current_model.feature_importances_)
                except Exception as e:
                    # Best effort: a failed fold contributes no predictions or
                    # metrics, and the remaining folds still run.
                    self.logger.error(f"模型 {name} 在折疊 {fold} 訓練時發生錯誤: {e}")
                    continue

            metrics_df = pd.DataFrame(fold_metrics_list)
            results[name] = {
                'oof_preds': oof_preds,
                'test_preds': np.mean(test_preds_folds, axis=0) if test_preds_folds else np.zeros(len(X_test)),
                'metrics_df': metrics_df,
                'feature_importances': np.mean(importances_folds, axis=0) if importances_folds else None,
                'feature_names': X_train.columns
            }
            if not metrics_df.empty:
                self.logger.info(
                    f" 模型 {name} | CV ROC AUC: {metrics_df['ROC AUC'].mean():.4f} ± {metrics_df['ROC AUC'].std():.4f}")
            else:
                self.logger.warning(f"模型 {name} 訓練失敗，無法計算 CV ROC AUC。")
        return results

    def _generate_submission(self, filename: str, df_test_id: pd.Series, test_preds: np.ndarray,
                             target_col: str = Config.TARGET_COL) -> pd.DataFrame:
        """Write the predictions to ``filename`` as CSV and return the frame.

        Args:
            filename: Output path, written relative to the working directory.
            df_test_id: Test-set identifiers, stored in the ``id`` column.
            test_preds: Predicted probabilities, one per id.
            target_col: Name of the prediction column; defaults to ``Config.TARGET_COL``.

        Returns:
            The two-column submission frame that was written.
        """
        submission_df = pd.DataFrame({'id': df_test_id, target_col: test_preds})
        submission_df.to_csv(filename, index=False)
        self.logger.info(f"提交文件成功保存: {filename}")
        return submission_df

In [None]:
# Train the final XGBoost model with a fixed parameter set and write the submission.
if not df_train.empty:
    trainer = ModelTrainer()

    # Example best parameters from an Optuna run, merged with the fixed
    # XGBoost settings.
    final_best_params = {
        # Optuna best parameters (example values)
        'n_estimators': 2692,
        'learning_rate': 0.05786197845936901,
        'max_depth': 3,
        'reg_lambda': 1.0628185137032307e-08,
        'reg_alpha': 3.255737505871401,
        'subsample': 0.8409191153520594,
        'colsample_bytree': 0.7834673458794292,

        # Fixed parameters
        'random_state': Config.RANDOM_STATE,
        'eval_metric': 'logloss',
        'n_jobs': -1,
        'early_stopping_rounds': 50,
        'enable_categorical': True,
        # 'verbosity' is the XGBoost parameter name; 'verbose' is not a
        # constructor argument and would be forwarded as an unknown booster
        # parameter.
        'verbosity': 0
    }

    final_tuned_model = XGBClassifier(**final_best_params)

    models_final = {
        'XGBoost_Final_Tuned': final_tuned_model
    }

    # Feature-engineering version used for the final run.
    best_fe_pipeline = FeatureEngineer.run_v2_preprocessing

    submission_final, results_final, errors_final, dashboard_final = trainer.run_experiment(
        train_df=df_train,
        test_df=df_test,
        feature_engineering_pipeline=best_fe_pipeline,
        models=models_final
    )

    print("\n--- 最終結果摘要 ---")
    if 'XGBoost_Final_Tuned' in results_final:
        metrics_df = results_final['XGBoost_Final_Tuned']['metrics_df']
        # An empty frame means every fold failed and there is no 'ROC AUC' column.
        if not metrics_df.empty:
            print(f"模型: XGBoost_Final_Tuned (FE: {best_fe_pipeline.__name__})")
            print(f"交叉驗證 ROC AUC: {metrics_df['ROC AUC'].mean():.5f} ± {metrics_df['ROC AUC'].std():.5f}")

    # run_experiment returns an empty frame when nothing was written, so only
    # announce the file when a submission actually exists.
    if not submission_final.empty:
        print(f"提交文件已保存為: submission_XGBoost_Final_Tuned_{best_fe_pipeline.__name__}.csv")
else:
    # Define the result names anyway so later cells can test them instead of
    # failing with NameError on a fresh kernel.
    submission_final, results_final, errors_final, dashboard_final = pd.DataFrame(), {}, pd.DataFrame(), None

In [None]:
# Per-fold cross-validation scores for the final model.
if 'XGBoost_Final_Tuned' in results_final:
    metrics_df = results_final['XGBoost_Final_Tuned']['metrics_df']

    fold_fig, fold_ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=metrics_df.index, y='ROC AUC', data=metrics_df, palette='viridis', ax=fold_ax)
    fold_ax.set_title('Fold ROC AUC Scores')
    fold_ax.set_xlabel('Fold')
    fold_ax.set_ylabel('ROC AUC Score')
    plt.show()

# Feature importances, shown only when the model reported them.
has_importances = (
    'XGBoost_Final_Tuned' in results_final
    and results_final['XGBoost_Final_Tuned']['feature_importances'] is not None
)
if has_importances:
    feature_importances = results_final['XGBoost_Final_Tuned']['feature_importances']
    feature_names = results_final['XGBoost_Final_Tuned']['feature_names']

    # Full ranking, highest importance first; only the top 20 rows are drawn.
    importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': feature_importances}
    ).sort_values(by='Importance', ascending=False)

    importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20), palette='magma', ax=importance_ax)
    importance_ax.set_title('Top 20 Feature Importances (Mean across Folds)')
    plt.show()