In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, label_binarize, OrdinalEncoder, QuantileTransformer, TargetEncoder
from category_encoders import CatBoostEncoder, MEstimateEncoder

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import RidgeClassifier, LogisticRegression, LinearRegression, BayesianRidge, Ridge

from sklearn import set_config
import os

import optuna
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, root_mean_squared_error, mean_squared_error, precision_recall_curve, make_scorer, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, matthews_corrcoef
from scipy.stats import norm, skew

from colorama import Fore, Style, init
from copy import deepcopy
from sklearn.base import BaseEstimator, TransformerMixin
from pprint import pprint
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold, KFold, RepeatedKFold, cross_val_score, StratifiedGroupKFold
import xgboost as xgb
from xgboost import DMatrix, XGBClassifier, XGBRegressor

from lightgbm import log_evaluation, early_stopping, LGBMClassifier, LGBMRegressor, Dataset
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from tqdm.notebook import tqdm
from optuna.samplers import TPESampler, CmaEsSampler
from optuna.pruners import HyperbandPruner
from functools import partial
from IPython.display import display_html, clear_output
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import gc
import re
from typing import Literal, NamedTuple
from itertools import combinations

import keras
from keras.models import Sequential
from keras import layers
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

import warnings
warnings.filterwarnings("ignore")

In [2]:
#为预处理数据
class Config:

        state = 42
        n_splits = 10
        early_stop = 100
        
        target = 'y'
        train = pd.read_csv("data/train.csv")
        test = pd.read_csv("data/test.csv")
        submission =pd.read_csv("data/sample_submission.csv")
    
        original_data = False
        outliers = False
        log_trf = False
        feature_eng = True
        missing = False
        labels = list(train[target].unique())
        topk_interactions = 20
    


In [3]:
import time
class Transform(Config):
    
    def __init__(self):
        # 调用父类 Config 的 __init__ 方法（如果存在）
        super().__init__()
        t0 = time.time()
        # 如果启用 original_data，则将原始数据合并到训练集
        if self.original_data:
            start = time.time()
            # 将目标列转换为 0/1（假设值为 "yes" 和 "no"）
            self.train_org[self.target] = (self.train_org[self.target] == "yes").astype(int)
            # 合并并去重
            self.train = pd.concat([self.train, self.train_org], ignore_index=True).drop_duplicates()
            self.train.reset_index(drop=True, inplace=True)
            print(f"[合并原始数据] {time.time()-start:.2f}s")

        
        # 获取数值型特征列名（排除 object/bool/category/string）
        self.num_features = self.train.drop(self.target, axis=1)\
            .select_dtypes(exclude=['object', 'bool', 'category', 'string']).columns.tolist()
        
        # 获取类别特征列名（只保留 object/bool/category/string）
        self.cat_features = self.train.drop(self.target, axis=1)\
            .select_dtypes(include=['object', 'bool', 'category', 'string']).columns.tolist()

        if self.missing:
            self.missing_values()

        if self.outliers:
            self.remove_outliers()

        if self.log_trf:
            self.log_transformation()


        start = time.time()
        self.important_features = self.select_important_features(top_k=20)
        print(f"[特征重要度] {time.time()-start:.2f}s")

        if self.feature_eng and self.important_features:
            self.train = self.new_features(self.train, self.important_features)
            self.test  = self.new_features(self.test, self.important_features)
            self.num_features = self.train.drop(self.target, axis=1)\
                .select_dtypes(exclude=['object', 'bool', 'string', 'category']).columns.tolist()
            print(f"[交互特征] {time.time()-start:.2f}s")

        start = time.time()
        self.encode()

        print(f"[总耗时] {time.time()-t0:.2f}s")
        
    def __call__(self):
        # 保存目标列
        self.y = self.train[self.target]
        # 保存原始特征
        self.X = self.train.drop(self.target, axis=1)
        # 保存编码后的特征
        self.X_enc = self.train_enc.drop(self.target, axis=1)
        return self.X, self.X_enc, self.y, self.test, self.test_enc, self.cat_features, self.num_features
    
    def encode(self):
        self.train_enc = self.train.copy()
        self.test_enc = self.test.copy()
        
        self.cat_features_card = []
        for f in self.cat_features:
            self.cat_features_card.append(self.train[f].nunique())
        
        # 创建编码器并仅用训练集类别特征进行 fit
        oe = OrdinalEncoder()
        oe.fit(self.train_enc[self.cat_features])
        
        # 分别对训练集和测试集进行 transform
        self.train_enc[self.cat_features] = oe.transform(self.train_enc[self.cat_features]).astype(int)
        self.test_enc[self.cat_features] = oe.transform(self.test_enc[self.cat_features]).astype(int)
        
        # 创建标准化器并仅用训练集数值特征进行 fit
        scaler = StandardScaler()
        scaler.fit(self.train_enc[self.num_features])

        
        # 分别对训练集和测试集进行 transform
        self.train_enc[self.num_features] = scaler.transform(self.train_enc[self.num_features])
        self.test_enc[self.num_features] = scaler.transform(self.test_enc[self.num_features])

    def select_important_features(self, top_k=20, task='auto'):
        """
        基于树模型的特征重要度筛选，并记录耗时。
        功能：返回用于生成交互项的前 top_k 个重要数值特征。
        """
        start_time = time.time()

        # Step 1: 构造候选列
        feat_cols = [c for c in self.num_features if c in self.train.columns and c != self.target]
        if not feat_cols:
            print("[select_important_features] 无可用数值特征，返回空。")
            return []


        # Step 2: 准备训练数据
        X_train_imp = self.train[feat_cols]
        y_train_imp = self.train[self.target]
        is_class = (y_train_imp.nunique() <= 10)

        # 轻量模型（示例：ExtraTrees；也可 mutual_info_*）
        from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
        model = (ExtraTreesClassifier(n_estimators=200, max_features='sqrt', n_jobs=-1, random_state=self.state)
                 if is_class else
                 ExtraTreesRegressor(n_estimators=200, max_features='sqrt', n_jobs=-1, random_state=self.state))
        model.fit(X_train_imp, y_train_imp)
    
        importances = model.feature_importances_
        top_num_feats = pd.Series(importances, index=feat_cols).sort_values(ascending=False).head(top_k).index.tolist()
    
        print(f"[select_important_features] 运行耗时: {time.time()-start_time:.2f} 秒（未编码）")
        return top_num_feats


            
    def new_features(self, data, top_num_feats=None):
        # 创建所有数值特征两两组合的乘积特征
        feats = top_num_feats if top_num_feats else self.num_features
        for c1, c2 in list(combinations(feats, 2)):
            data[f"{c1}_{c2}"] = data[c1] * data[c2]
        # 将类别特征转换为 category 类型
        data[self.cat_features] = data[self.cat_features].astype('category')
        return data

    def log_transformation(self):
        # 对目标列做 log1p 变换
        self.train[self.target] = np.log1p(self.train[self.target]) 
        return self
        
    def remove_outliers(self):
        # 基于 IQR 的异常值去除方法
        Q1 = self.train[self.target].quantile(0.25)
        Q3 = self.train[self.target].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        self.train = self.train[(self.train[self.target] >= lower_limit) & (self.train[self.target] <= upper_limit)]
        self.train.reset_index(drop=True, inplace=True)
    
    def missing_values(self):
        # 将类别特征中的缺失值替换为字符串 'NaN'
        self.train[self.cat_features] = self.train[self.cat_features].fillna('NaN')
        self.test[self.cat_features] = self.test[self.cat_features].fillna('NaN')
        return self

    def reduce_mem(self, df):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', "uint16", "uint32", "uint64"]
        for col in df.columns:
        # 修正④：将 dtype 转为字符串再比较
            col_type = str(df[col].dtype)
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if "int" in col_type:
                    if c_min >= np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    else:
                        df[col] = df[col].astype(np.int64)
                else:
                # 修正⑤：第二个分支用 elif，避免覆盖
                    if c_min >= np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
        return df

In [4]:
t = Transform()

[select_important_features] 运行耗时: 46.19 秒（未编码）
[特征重要度] 46.37s
[交互特征] 47.13s
[总耗时] 49.15s


In [5]:
X, X_enc, y, test, test_enc, cat_features, num_features = t()

In [6]:
def build_model(cat_features, num_features):
    
    x_input_cats = layers.Input(shape=(len(cat_features),))
    embs = []
    for j in range(len(cat_features)):
        e = layers.Embedding(t.cat_features_card[j], int(np.ceil(np.sqrt(t.cat_features_card[j]))))
        x = e(x_input_cats[:,j])
        x = layers.Flatten()(x)
        embs.append(x)
        
    x_input_nums = layers.Input(shape=(len(num_features),))
    
    x = layers.Concatenate(axis=-1)(embs+[x_input_nums]) 
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=[x_input_cats,x_input_nums], outputs=x)
    return model

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
import contextlib, io
import ydf; ydf.verbose(2)
from ydf import RandomForestLearner

def YDFClassification(learner_class):

    class YDFXClassifier(BaseEstimator, ClassifierMixin):

        def __init__(self, params=None):
            self.params = {} if params is None else params.copy()

        def fit(self, X: pd.DataFrame, y: pd.Series):
            assert isinstance(X, pd.DataFrame)
            assert isinstance(y, pd.Series)

            self.classes_ = list(y.unique())
            self.n_classes_ = len(self.classes_)

            target = y.name
            params = self.params.copy()
            params['label'] = target
            params['task'] = ydf.Task.CLASSIFICATION

            df = pd.concat([X, y], axis=1)

            with contextlib.redirect_stdout(io.StringIO()), \
                 contextlib.redirect_stderr(io.StringIO()):
                self.model = learner_class(**params).train(df)

            return self

        def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
            assert isinstance(X, pd.DataFrame)

            with contextlib.redirect_stdout(io.StringIO()), \
                 contextlib.redirect_stderr(io.StringIO()):
                raw = self.model.predict(X)

            proba = np.asarray(raw)
            if proba.ndim == 1:
                proba = np.vstack([1 - proba, proba]).T

            return proba

        def predict(self, X: pd.DataFrame) -> np.ndarray:
            proba = self.predict_proba(X)
            idx = proba.argmax(axis=1)
            return np.array(self.classes_)[idx]

    return YDFXClassifier

In [8]:
models = {
    'XGB': XGBClassifier(**{'tree_method': 'hist',
                            'n_estimators': 10000,
                            'objective': 'binary:logistic',
                            'random_state': Config.state,
                            'eval_metric': 'auc',
                            'booster': 'gbtree',
                            'n_jobs': -1,
                            'reg_lambda': 4.510522889747622,
                            'reg_alpha': 5.007953193043952, 
                            'colsample_bytree': 0.5831655543160346,
                            'subsample': 0.9808690492838653,
                            'learning_rate': 0.008247101477015132,
                            'max_depth': 11,
                            'min_child_weight': 1,
                            'device': 'cuda',
                            }),
    'LGBM': LGBMClassifier(**{'random_state': Config.state,
                              'verbose': -1,
                              'n_estimators': 10000,
                              'metric': 'AUC',
                              'objective': 'binary',
                              'max_depth': 16,
                              'learning_rate': 0.007366917567300051,
                              'min_child_samples': 164,
                              'subsample': 0.9022880020285295,
                              'colsample_bytree': 0.4213201532077694,
                              'num_leaves': 122, 
                              'reg_alpha': 1.083996192298843,
                              'reg_lambda': 0.0700057221912873,
                              'device_type': 'gpu', 
                              }),
    'LGBM2': LGBMClassifier(**{'random_state': Config.state,
                               'verbose': -1,
                               'n_estimators': 10000,
                               'metric': 'AUC',
                               'objective': 'binary',
                               'max_depth': 19,
                               'learning_rate': 0.010196940756517232,
                               'min_child_samples': 40,
                               'subsample': 0.5388367974706456,
                               'colsample_bytree': 0.24506890759293215,
                               'num_leaves': 360, 
                               'reg_alpha': 0.11493527242956506,
                               'reg_lambda': 0.8048854866109955,
                               'device_type': 'gpu', 
                              }),
    'CAT': CatBoostClassifier(**{'random_state': Config.state,
                                 'eval_metric': "Logloss",
                                 'n_estimators' : 5000,
                                 'learning_rate': 0.06524873965257823,
                                 'l2_leaf_reg': 0.8867612905712001,
                                 'bagging_temperature': 0.1317347791955057,
                                 'random_strength': 0.9922857768340815,
                                 'depth': 7,
                                 'min_data_in_leaf': 8,
                                 'task_type': "GPU",
                                 }),
    'CAT2': CatBoostClassifier(**{'random_state': Config.state,
                                  'eval_metric': "Logloss",
                                  'n_estimators' : 5000,
                                  'learning_rate': 0.034582298874165696,
                                  'l2_leaf_reg': 0.9838795180512044,
                                  'bagging_temperature': 0.22069473702418926,
                                  'random_strength': 1.0557491242401338,
                                  'depth': 9,
                                  'min_data_in_leaf': 166,
                                  'task_type': "GPU"
                                 }),
    'NN': _,
    'YDF': YDFClassification(RandomForestLearner)({'num_trees': 1000,
                                                   'max_depth': 6,
                                                   'random_seed': Config.state,
                                                   'growing_strategy': 'BEST_FIRST_GLOBAL'
                                               })
}

In [9]:
class Trainer(Config):
    def __init__(self, X, X_enc, y, test, test_enc, models, training=True):  
        super().__init__()
        self.X = X
        self.X_enc = X_enc
        self.test = test
        self.test_enc = test_enc
        self.y = y
        self.models = models  # 传入的模型字典
        self.training = training  # 是否进行训练，False 时读取已保存的预测结果
        self.scores = pd.DataFrame(columns=['Score'])  # 保存各模型的得分
        self.OOF_preds = pd.DataFrame(dtype=float)  # 保存 Out-Of-Fold 预测结果
        self.TEST_preds = pd.DataFrame(dtype=float)  # 保存测试集预测结果
        # 定义交叉验证方式：分层 K 折，保证类别分布一致
        self.folds = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.state)

    def train(self, model, X, y, test, model_name):
        # 初始化 OOF 预测和测试集预测
        oof_pred = np.zeros(X.shape[0], dtype=float)
        test_pred = np.zeros(test.shape[0], dtype=float)

        print('='*20)
        print(model_name)

        # 进行 K 折交叉验证
        for n_fold, (train_id, valid_id) in enumerate(self.folds.split(X, y)):
            # 按照索引切分训练集和验证集
            X_train = X.iloc[train_id].copy() if isinstance(X, pd.DataFrame) else X[train_id]
            y_train = y.iloc[train_id]
            X_val   = X.iloc[valid_id].copy() if isinstance(X, pd.DataFrame) else X[valid_id]
            y_val   = y.iloc[valid_id]
            X_test  = test.copy()



            # 如果是神经网络（NN），需要单独处理类别特征和数值特征
            if 'NN' in model_name:
                X_train_cats = X_train[cat_features]
                X_train_nums = X_train[num_features]

                X_val_cats = X_val[cat_features]
                X_val_nums = X_val[num_features]

                X_test_cats = X_test[cat_features]
                X_test_nums = X_test[num_features]

                # 构建神经网络模型
                model = build_model(cat_features, num_features)
                keras.utils.set_random_seed(self.state)
                optimizer = keras.optimizers.AdamW(learning_rate=1e-2, weight_decay=1e-3)
                model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['auc'])

                # 训练神经网络
                model.fit([X_train_cats, X_train_nums], y_train,
                          validation_data=([X_val_cats, X_val_nums], y_val),
                          epochs=20,
                          batch_size=1000,
                          callbacks=[keras.callbacks.ReduceLROnPlateau(patience=1),
                                     keras.callbacks.EarlyStopping(patience=3)])

                # 验证集预测
                y_pred_val = model.predict([X_val_cats, X_val_nums]).squeeze()
                # 测试集预测（取 K 折平均）
                test_pred += model.predict([X_test_cats, X_test_nums]).squeeze() / self.n_splits

            else:
                # XGBoost 模型
                if "XGB" in model_name:
                    # 1) 保证 X_train/X_val/X_test 是数值型（run() 已经给 XGB 走 self.X_enc/self.test_enc）
                    dtrain = xgb.DMatrix(X_train, label=y_train)
                    dvalid = xgb.DMatrix(X_val,   label=y_val)
                    dtest  = xgb.DMatrix(X_test)
                
                    # 2) 从 sklearn 的 XGBClassifier 取出超参，映射到原生 params
                    p = model.get_params()
                    params = {
                        "objective":          "binary:logistic",
                        "eval_metric":        "auc",
                        "tree_method":        p.get("tree_method", "hist"),
                        "device":             p.get("device", "cpu"),       # Kaggle GPU 则 "cuda"
                        "max_depth":          p.get("max_depth", 6),
                        "learning_rate":      p.get("learning_rate", 0.1),
                        "subsample":          p.get("subsample", 1.0),
                        "colsample_bytree":   p.get("colsample_bytree", 1.0),
                        "reg_lambda":         p.get("reg_lambda", 1.0),
                        "reg_alpha":          p.get("reg_alpha", 0.0),
                        "min_child_weight":   p.get("min_child_weight", 1),
                        # 如需其它超参，同理从 p 中取并加入
                    }
                    num_boost_round = p.get("n_estimators", 1000)
                
                    # 3) 训练 + 早停（以验证集为监控，AUC 越大越好）
                    booster = xgb.train(
                        params=params,
                        dtrain=dtrain,
                        num_boost_round=num_boost_round,
                        evals=[(dvalid, "valid")],
                        early_stopping_rounds=self.early_stop,
                        verbose_eval=False
                    )
                
                    # 4) 用 best_iteration 做预测（与你原先 y_pred_val / test_pred 的接口保持一致）
                    y_pred_val = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1))
                    test_pred += booster.predict(dtest,  iteration_range=(0, booster.best_iteration + 1)) / self.n_splits

                # CatBoost 模型
                elif "CAT" in model_name:
                    try:
                        from catboost import Pool
                        train_pool = Pool(X_train, y_train, cat_features=cat_features)
                        valid_pool = Pool(X_val,   y_val,   cat_features=cat_features)
                        model.fit(train_pool,
                                  eval_set=valid_pool,
                                  early_stopping_rounds=self.early_stop,
                                  verbose=False)
                    except Exception:
                        model.fit(X_train, y_train,
                                  eval_set=[(X_val, y_val)],
                                  early_stopping_rounds=self.early_stop,
                                  verbose=False)

                # LightGBM 模型
                elif "LGBM" in model_name:
                    model.fit(X_train, y_train,
                              eval_set=[(X_val, y_val)],
                              eval_metric='auc',
                              early_stopping_rounds=self.early_stop,
                              verbose=False)
                # 其他模型（如逻辑回归）
                else:
                    model.fit(X_train, y_train)

                # 验证集预测（取预测概率的正类部分）
                y_pred_val = model.predict_proba(X_val)[:, 1]
                # 测试集预测（取 K 折平均）
                test_pred += model.predict_proba(X_test)[:, 1] / self.n_splits

            # 保存 OOF 预测结果
            oof_pred[valid_id] = y_pred_val
            # 计算本折 ROC AUC 分数
            score = roc_auc_score(y_val, y_pred_val)
            print(score)
            self.scores.loc[f'{model_name}', f'Fold {n_fold+1}'] = score

        # 计算该模型的平均分
        fold_cols = [c for c in self.scores.columns if c.startswith('Fold ')]
        self.scores.loc[f'{model_name}', 'Score'] = self.scores.loc[f'{model_name}', fold_cols].astype(float).mean()

        return oof_pred, test_pred

    def run(self):
        # 遍历每个模型，进行训练或读取结果
        for model_name, model in tqdm(self.models.items()):

            if self.training:                
                if 'CAT' in model_name:
                  X = self.X.copy()
                  test = self.test.copy()
                else:
                  X = self.X_enc.copy()
                  test = self.test_enc.copy() 

                # 训练模型并保存预测结果
                oof_pred, test_pred = self.train(model, X, self.y, test, model_name)
                pd.DataFrame(oof_pred, columns=[f'{model_name}']).to_csv(f'{model_name}_oof.csv', index=False)
                pd.DataFrame(test_pred, columns=[f'{model_name}']).to_csv(f'{model_name}_test.csv', index=False)

            else:
                # 如果不训练，则从文件读取 OOF 和测试预测结果
                oof_pred  = pd.read_csv(f'/kaggle/input/bank-class-models/{model_name}_oof.csv')[f'{model_name}'].values
                test_pred = pd.read_csv(f'/kaggle/input/bank-class-models/{model_name}_test.csv')[f'{model_name}'].values
                for n_fold, (train_id, valid_id) in enumerate(self.folds.split(oof_pred, self.y)):
                    y_pred_val = oof_pred[valid_id]
                    y_val      = self.y.iloc[valid_id]
                    self.scores.loc[f'{model_name}', f'Fold {n_fold+1}'] = roc_auc_score(y_val, y_pred_val)
                fold_cols = [c for c in self.scores.columns if c.startswith('Fold ')]
                self.scores.loc[f'{model_name}', 'Score'] = self.scores.loc[f'{model_name}', fold_cols].astype(float).mean()

            # 将 OOF 和测试预测结果保存到类属性中
            self.OOF_preds[f'{model_name}']  = oof_pred
            self.TEST_preds[f'{model_name}'] = test_pred

        # 如果有多个模型，则进行二层 stacking（元模型逻辑回归）
        if len(self.models) > 1:
            meta_model = LogisticRegression(C=0.1, random_state=self.state, max_iter=1000)
            self.OOF_preds["Ensemble"], self.TEST_preds["Ensemble"] = self.train(
                meta_model, self.OOF_preds, self.y, self.TEST_preds, 'Ensemble'
            )
            # 排序模型分数并绘图
            self.scores = self.scores.sort_values('Score')
            self.score_bar()
            self.plot_result(self.OOF_preds["Ensemble"])
            return self.TEST_preds["Ensemble"]
        else:
            # 只有一个模型时，直接输出分数和绘图
            only = list(self.models.keys())[0]
            print(f'{only} score {self.scores.loc[f"{only}", "Score"]:.5f}\n')
            self.plot_result(self.OOF_preds[f'{only}'])
            return self.TEST_preds[f'{only}']

    def score_bar(self):
        # 绘制各模型分数的柱状图
        plt.figure(figsize=(18, 6))
        colors = ['#3cb371' if i != 'Ensemble' else 'r' for i in self.scores.Score.index]
        hbars = plt.barh(self.scores.index, self.scores.Score.astype(float), color=colors, height=0.8)
        plt.bar_label(hbars, fmt='%.6f')
        plt.xlim(0.8, 1)
        plt.ylabel('Models')
        plt.xlabel('Score')
        plt.show()

    def plot_result(self, oof):
        # 绘制 ROC 曲线和混淆矩阵
        fig, axes = plt.subplots(1, 2, figsize=(14, 7))

        for col in self.OOF_preds:
            RocCurveDisplay.from_predictions(self.y.sort_index(), self.OOF_preds[col], name=f"{col}", ax=axes[0])
        axes[0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
        axes[0].set_xlabel('False Positive Rate')
        axes[0].set_ylabel('True Positive Rate')
        axes[0].set_title('ROC')
        axes[0].legend(loc="lower right")

        ConfusionMatrixDisplay.from_predictions(self.y.sort_index(), (oof>=0.5).astype(int),
                                                display_labels=self.labels, colorbar=False, ax=axes[1], cmap='Greens')
        axes[1].set_title('Confusion Matrix')

        plt.tight_layout()
        plt.show()


In [None]:
trainer = Trainer(X, X_enc, y, test, test_enc, models, training = True)
TEST_preds = trainer.run()

  0%|          | 0/7 [00:00<?, ?it/s]

XGB
