In [13]:
!pip install --upgrade scikit-learn scikit-learn==1.7.1 xgboost==3.0.3 lightgbm==4.6.0 numpy==1.26.4 scipy==1.14.1

Collecting xgboost==3.0.3
  Using cached xgboost-3.0.3-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (114 kB)
Collecting scipy==1.14.1
  Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Using cached xgboost-3.0.3-py3-none-macosx_12_0_arm64.whl (2.0 MB)
Downloading numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl (23.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy, scipy, xgboost
[2K  Attempting uninstall: numpy
[2K    Found existing installation: numpy 2.3.2
[2K    Uninstalling numpy-2.3.2:
[2K      Successfully unins

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, label_binarize, OrdinalEncoder, QuantileTransformer, TargetEncoder
from category_encoders import CatBoostEncoder, MEstimateEncoder

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import RidgeClassifier, LogisticRegression, LinearRegression, BayesianRidge, Ridge

from sklearn import set_config
import os

import optuna
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, root_mean_squared_error, mean_squared_error, precision_recall_curve, make_scorer, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, matthews_corrcoef
from scipy.stats import norm, skew

from colorama import Fore, Style, init
from copy import deepcopy
from sklearn.base import BaseEstimator, TransformerMixin
from pprint import pprint
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold, KFold, RepeatedKFold, cross_val_score, StratifiedGroupKFold
import xgboost as xgb
from xgboost import DMatrix, XGBClassifier, XGBRegressor

from lightgbm import log_evaluation, early_stopping, LGBMClassifier, LGBMRegressor, Dataset
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from tqdm.notebook import tqdm
from optuna.samplers import TPESampler, CmaEsSampler
from optuna.pruners import HyperbandPruner
from functools import partial
from IPython.display import display_html, clear_output
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import gc
import re
from typing import Literal, NamedTuple
from itertools import combinations

import keras
from keras.models import Sequential
from keras import layers
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

import warnings
warnings.filterwarnings("ignore")

In [2]:
#为预处理数据
class Config:

        state = 42
        n_splits = 10
        early_stop = 100
        
        target = 'y'
        train = pd.read_csv("data/train.csv")
        test = pd.read_csv("data/test.csv")
        submission =pd.read_csv("data/sample_submission.csv")
    
        original_data = False
        outliers = False
        log_trf = False
        feature_eng = True
        missing = False
        labels = list(train[target].unique())
        topk_interactions = 20
    


In [3]:
import time
class Transform(Config):
    
    def __init__(self):
        # 调用父类 Config 的 __init__ 方法（如果存在）
        super().__init__()
        t0 = time.time()
        # 如果启用 original_data，则将原始数据合并到训练集
        if self.original_data:
            start = time.time()
            # 将目标列转换为 0/1（假设值为 "yes" 和 "no"）
            self.train_org[self.target] = (self.train_org[self.target] == "yes").astype(int)
            # 合并并去重
            self.train = pd.concat([self.train, self.train_org], ignore_index=True).drop_duplicates()
            self.train.reset_index(drop=True, inplace=True)
            print(f"[合并原始数据] {time.time()-start:.2f}s")

        
        # 获取数值型特征列名（排除 object/bool/category/string）
        self.num_features = self.train.drop(self.target, axis=1)\
            .select_dtypes(exclude=['object', 'bool', 'category', 'string']).columns.tolist()
        
        # 获取类别特征列名（只保留 object/bool/category/string）
        self.cat_features = self.train.drop(self.target, axis=1)\
            .select_dtypes(include=['object', 'bool', 'category', 'string']).columns.tolist()

        if self.missing:
            self.missing_values()

        if self.outliers:
            self.remove_outliers()

        if self.log_trf:
            self.log_transformation()


        start = time.time()
        self.important_features = self.select_important_features(top_k=20)
        print(f"[特征重要度] {time.time()-start:.2f}s")

        if self.feature_eng and self.important_features:
            self.train = self.new_features(self.train, self.important_features)
            self.test  = self.new_features(self.test, self.important_features)
            self.num_features = self.train.drop(self.target, axis=1)\
                .select_dtypes(exclude=['object', 'bool', 'string', 'category']).columns.tolist()
            print(f"[交互特征] {time.time()-start:.2f}s")

        start = time.time()
        self.encode()

        print(f"[总耗时] {time.time()-t0:.2f}s")
        
    def __call__(self):
        # 保存目标列
        self.y = self.train[self.target]
        # 保存原始特征
        self.X = self.train.drop(self.target, axis=1)
        # 保存编码后的特征
        self.X_enc = self.train_enc.drop(self.target, axis=1)
        return self.X, self.X_enc, self.y, self.test, self.test_enc, self.cat_features, self.num_features
    
    def encode(self):
        self.train_enc = self.train.copy()
        self.test_enc = self.test.copy()
        
        self.cat_features_card = []
        for f in self.cat_features:
            self.cat_features_card.append(self.train[f].nunique())
        
        # 创建编码器并仅用训练集类别特征进行 fit
        oe = OrdinalEncoder()
        oe.fit(self.train_enc[self.cat_features])
        
        # 分别对训练集和测试集进行 transform
        self.train_enc[self.cat_features] = oe.transform(self.train_enc[self.cat_features]).astype(int)
        self.test_enc[self.cat_features] = oe.transform(self.test_enc[self.cat_features]).astype(int)
        
        # 创建标准化器并仅用训练集数值特征进行 fit
        scaler = StandardScaler()
        scaler.fit(self.train_enc[self.num_features])

        
        # 分别对训练集和测试集进行 transform
        self.train_enc[self.num_features] = scaler.transform(self.train_enc[self.num_features])
        self.test_enc[self.num_features] = scaler.transform(self.test_enc[self.num_features])

    def select_important_features(self, top_k=20, task='auto'):
        """
        基于树模型的特征重要度筛选，并记录耗时。
        功能：返回用于生成交互项的前 top_k 个重要数值特征。
        """
        start_time = time.time()

        # Step 1: 构造候选列
        feat_cols = [c for c in self.num_features if c in self.train.columns and c != self.target]
        if not feat_cols:
            print("[select_important_features] 无可用数值特征，返回空。")
            return []


        # Step 2: 准备训练数据
        X_train_imp = self.train[feat_cols]
        y_train_imp = self.train[self.target]
        is_class = (y_train_imp.nunique() <= 10)

        # 轻量模型（示例：ExtraTrees；也可 mutual_info_*）
        from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
        model = (ExtraTreesClassifier(n_estimators=200, max_features='sqrt', n_jobs=-1, random_state=self.state)
                 if is_class else
                 ExtraTreesRegressor(n_estimators=200, max_features='sqrt', n_jobs=-1, random_state=self.state))
        model.fit(X_train_imp, y_train_imp)
    
        importances = model.feature_importances_
        top_num_feats = pd.Series(importances, index=feat_cols).sort_values(ascending=False).head(top_k).index.tolist()
    
        print(f"[select_important_features] 运行耗时: {time.time()-start_time:.2f} 秒（未编码）")
        return top_num_feats


            
    def new_features(self, data, top_num_feats=None):
        # 创建所有数值特征两两组合的乘积特征
        feats = top_num_feats if top_num_feats else self.num_features
        for c1, c2 in list(combinations(feats, 2)):
            data[f"{c1}_{c2}"] = data[c1] * data[c2]
        # 将类别特征转换为 category 类型
        data[self.cat_features] = data[self.cat_features].astype('category')
        return data

    def log_transformation(self):
        # 对目标列做 log1p 变换
        self.train[self.target] = np.log1p(self.train[self.target]) 
        return self
        
    def remove_outliers(self):
        # 基于 IQR 的异常值去除方法
        Q1 = self.train[self.target].quantile(0.25)
        Q3 = self.train[self.target].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        self.train = self.train[(self.train[self.target] >= lower_limit) & (self.train[self.target] <= upper_limit)]
        self.train.reset_index(drop=True, inplace=True)
    
    def missing_values(self):
        # 将类别特征中的缺失值替换为字符串 'NaN'
        self.train[self.cat_features] = self.train[self.cat_features].fillna('NaN')
        self.test[self.cat_features] = self.test[self.cat_features].fillna('NaN')
        return self

    def reduce_mem(self, df):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', "uint16", "uint32", "uint64"]
        for col in df.columns:
        # 修正④：将 dtype 转为字符串再比较
            col_type = str(df[col].dtype)
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if "int" in col_type:
                    if c_min >= np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    else:
                        df[col] = df[col].astype(np.int64)
                else:
                # 修正⑤：第二个分支用 elif，避免覆盖
                    if c_min >= np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
        return df

In [4]:
t = Transform()

[select_important_features] 运行耗时: 46.19 秒（未编码）
[特征重要度] 46.37s
[交互特征] 47.13s
[总耗时] 49.15s


In [5]:
X, X_enc, y, test, test_enc, cat_features, num_features = t()

In [14]:
def build_model(cat_features, num_features):
    
    x_input_cats = layers.Input(shape=(n_cat,), dtype="int32", name="cats")
    embs = []
    for j in range(n_cat):
        # 取出第 j 列（稳妥写法：Lambda + gather）
        col_j = layers.Lambda(lambda x, idx=j: tf.gather(x, indices=idx, axis=1))(x_input_cats)
        # 将编码整体 +1，把 0 留作 unknown；input_dim 也 +1 做安全缓冲
        col_j = layers.Lambda(lambda z: z + 1)(col_j)

        vocab = int(np.ceil(t.cat_features_card[j]))  # 已在 Transform 里统计
        emb_dim = int(np.ceil(np.sqrt(max(2, vocab))))  # 简单经验公式，至少 2 维
        e = layers.Embedding(input_dim=vocab + 1, output_dim=emb_dim, name=f"emb_{cat_features[j]}")
        x = e(col_j)                     # (batch, 1, emb_dim)
        x = layers.Flatten()(x)          # (batch, emb_dim)
        embs.append(x)
        
    # 拼接：所有 embedding + 数值向量
    x = layers.Concatenate(axis=-1)(embs + [x_input_nums])

    # MLP
    for _ in range(3):
        x = layers.Dense(512, activation='relu')(x)
        x = layers.Dropout(0.3)(x)
        x = layers.BatchNormalization()(x)

    out = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs=[x_input_cats, x_input_nums], outputs=out)

    # 指标用显式 AUC
    opt = keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-4)
    model.compile(optimizer=opt, loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(name='auc')])
    return model

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
import contextlib, io
import ydf; ydf.verbose(2)
from ydf import RandomForestLearner

def YDFClassification(learner_class):

    class YDFXClassifier(BaseEstimator, ClassifierMixin):

        def __init__(self, params=None):
            self.params = {} if params is None else params.copy()

        def fit(self, X: pd.DataFrame, y: pd.Series):
            assert isinstance(X, pd.DataFrame)
            assert isinstance(y, pd.Series)

            self.classes_ = list(y.unique())
            self.n_classes_ = len(self.classes_)

            target = y.name
            params = self.params.copy()
            params['label'] = target
            params['task'] = ydf.Task.CLASSIFICATION

            df = pd.concat([X, y], axis=1)

            with contextlib.redirect_stdout(io.StringIO()), \
                 contextlib.redirect_stderr(io.StringIO()):
                self.model = learner_class(**params).train(df)

            return self

        def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
            assert isinstance(X, pd.DataFrame)

            with contextlib.redirect_stdout(io.StringIO()), \
                 contextlib.redirect_stderr(io.StringIO()):
                raw = self.model.predict(X)

            proba = np.asarray(raw)
            if proba.ndim == 1:
                proba = np.vstack([1 - proba, proba]).T

            return proba

        def predict(self, X: pd.DataFrame) -> np.ndarray:
            proba = self.predict_proba(X)
            idx = proba.argmax(axis=1)
            return np.array(self.classes_)[idx]

    return YDFXClassifier

In [8]:
models = {
    'XGB': XGBClassifier(**{
        'tree_method': 'hist',             # 本地CPU
        'n_estimators': 10000,
        'objective': 'binary:logistic',
        'random_state': Config.state,
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'n_jobs': -1,
        'reg_lambda': 4.510522889747622,
        'reg_alpha': 5.007953193043952,
        'colsample_bytree': 0.5831655543160346,
        'subsample': 0.9808690492838653,
        'learning_rate': 0.008247101477015132,
        'max_depth': 11,
        'min_child_weight': 1,
    }),
    'LGBM': LGBMClassifier(**{
        'random_state': Config.state,
        'verbose': -1,
        'n_estimators': 10000,
        # 'metric': 'auc',   # 建议在 fit 的 callbacks 里管控早停与评估
        'objective': 'binary',
        'max_depth': 16,
        'learning_rate': 0.007366917567300051,
        'min_child_samples': 164,
        'subsample': 0.9022880020285295,
        'colsample_bytree': 0.4213201532077694,
        'num_leaves': 122,
        'reg_alpha': 1.083996192298843,
        'reg_lambda': 0.0700057221912873,
        # 本地CPU：不要 device_type
    }),
    'LGBM2': LGBMClassifier(**{
        'random_state': Config.state,
        'verbose': -1,
        'n_estimators': 10000,
        'objective': 'binary',
        'max_depth': 19,
        'learning_rate': 0.010196940756517232,
        'min_child_samples': 40,
        'subsample': 0.5388367974706456,
        'colsample_bytree': 0.24506890759293215,
        'num_leaves': 360,
        'reg_alpha': 0.11493527242956506,
        'reg_lambda': 0.8048854866109955,
    }),
    'CAT': CatBoostClassifier(**{
        'random_state': Config.state,
        'loss_function': "Logloss",
        'eval_metric': "AUC",
        'n_estimators': 5000,
        'learning_rate': 0.06524873965257823,
        'l2_leaf_reg': 0.8867612905712001,
        'bagging_temperature': 0.1317347791955057,
        'random_strength': 0.9922857768340815,
        'depth': 7,
        'min_data_in_leaf': 8,
        'task_type': "CPU",     # 本地CPU
        'verbose': 0,
    }),
    'CAT2': CatBoostClassifier(**{
        'random_state': Config.state,
        'loss_function': "Logloss",
        'eval_metric': "AUC",
        'n_estimators': 5000,
        'learning_rate': 0.034582298874165696,
        'l2_leaf_reg': 0.9838795180512044,
        'bagging_temperature': 0.22069473702418926,
        'random_strength': 1.0557491242401338,
        'depth': 9,
        'min_data_in_leaf': 166,
        'task_type': "CPU",     # 本地CPU
        'verbose': 0,
    }),
    'NN': None,  # 占位即可，Trainer 里会 build
    'YDF': YDFClassification(RandomForestLearner)({
        'num_trees': 1000,
        'max_depth': 6,
        'random_seed': Config.state,
        'growing_strategy': 'BEST_FIRST_GLOBAL'
    })
}

In [29]:
class Trainer(Config):
    
    def __init__(self, X, X_enc, y, test, test_enc, models, training=True):
        self.X = X
        self.X_enc = X_enc
        self.test = test
        self.test_enc = test_enc
        self.y = y
        self.models = models
        self.training = training
        self.scores = pd.DataFrame(columns=['Score'])
        self.OOF_preds = pd.DataFrame(dtype=float)
        self.TEST_preds = pd.DataFrame(dtype=float)
        self.folds = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.state)
    
    def train(self, model, X, y, test, model_name):
        oof_pred = np.zeros(X.shape[0], dtype=float)
        test_pred = np.zeros(test.shape[0], dtype=float)

        print('='*20)
        print(model_name)
                
        for n_fold, (train_id, valid_id) in enumerate(self.folds.split(X, y)):
            X_train = X.iloc[train_id].copy()
            y_train = y.iloc[train_id]
            X_val = X.iloc[valid_id].copy()
            y_val = y.iloc[valid_id]
            X_test = test.copy()
            
            
            if 'NN' in model_name:
                X_train_cats = X_train[cat_features].astype('int32')
                X_train_nums = X_train[num_features].astype('float32')
                X_val_cats   = X_val[cat_features].astype('int32')
                X_val_nums   = X_val[num_features].astype('float32')
                X_test_cats  = X_test[cat_features].astype('int32')
                X_test_nums  = X_test[num_features].astype('float32')
                
                model = build_model(cat_features, num_features)                        
                keras.utils.set_random_seed(self.state)
                optimizer = keras.optimizers.AdamW(learning_rate=1e-2, weight_decay=1e-3)
                model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['auc'])
                
                model.fit([X_train_cats,X_train_nums], y_train, 
                          validation_data=([X_val_cats, X_val_nums], y_val),
                          epochs=20,
                          batch_size=1000,
                          callbacks=[keras.callbacks.ReduceLROnPlateau(patience=1),
                                     keras.callbacks.EarlyStopping(patience=3)
                                    ])
                y_pred_val = model.predict([X_val_cats, X_val_nums]).squeeze()                      
                test_pred += model.predict([X_test_cats, X_test_nums]).squeeze() / self.n_splits             
                                      
            else:
                # XGBoost：用 early_stopping_rounds
                if "XGB" in model_name:
                    model.set_params(early_stopping_rounds=50)  # ← 放到参数里
                    model.fit(
                        X_train, y_train,
                        eval_set=[(X_val, y_val)],
                        verbose=False
                    )
            
                # CatBoost：用过拟合检测参数（早停）
                elif "CAT" in model_name:
                    model.set_params(od_type='Iter', od_wait=50)  # ← 50轮不提升就停
                    model.fit(
                        X_train, y_train,
                        eval_set=(X_val, y_val),   # 注意：CatBoost 用 tuple
                        verbose=False
                    )
            
                # LightGBM：你原来就有早停（保持不变）
                elif "LGBM" in model_name:
                    model.fit(
                        X_train, y_train,
                        eval_set=[(X_val, y_val)],
                        categorical_feature=cat_features,
                        feature_name='auto',
                        callbacks=[log_evaluation(0), early_stopping(self.early_stop, verbose=False)]
                    )
            
                # 其他模型：正常拟合
                else:
                    model.fit(X_train, y_train)
                    
                y_pred_val = model.predict_proba(X_val)[:, 1]            
                test_pred += model.predict_proba(X_test)[:, 1] / self.n_splits
                
            oof_pred[valid_id] = y_pred_val
            score = roc_auc_score(y_val, y_pred_val)
            print(score)
            self.scores.loc[f'{model_name}', f'Fold {n_fold+1}'] = score                                      

        self.scores.loc[f'{model_name}', 'Score'] = self.scores.loc[f'{model_name}'][1:].mean()

        return oof_pred, test_pred

    def run(self):
        for model_name, model in tqdm(self.models.items()):

            if self.training:                
                use_raw = any(k in model_name for k in ['LGBM', 'CAT', 'HGB', 'YDF'])  # 删掉'XGB'
                if use_raw:
                    X = self.X.copy()
                    test = self.test.copy()
                else:
                    X = self.X_enc.copy()
                    test = self.test_enc.copy()
                print(model_name, X.dtypes.value_counts())

                oof_pred, test_pred = self.train(model, X, self.y, test, model_name)
                pd.DataFrame(oof_pred, columns=[f'{model_name}']).to_csv(f'{model_name}_oof.csv', index=False)
                pd.DataFrame(test_pred, columns=[f'{model_name}']).to_csv(f'{model_name}_test.csv', index=False)
            
            else:
                oof_pred = pd.read_csv(f'/kaggle/input/bank-class-models/{model_name}_oof.csv')
                test_pred = pd.read_csv(f'/kaggle/input/bank-class-models/{model_name}_test.csv')
                for n_fold, (train_id, valid_id) in enumerate(self.folds.split(oof_pred, self.y)):
                    y_pred_val, y_val = oof_pred.loc[valid_id], self.y.loc[valid_id]
                    self.scores.loc[f'{model_name}', f'Fold {n_fold+1}'] = roc_auc_score(y_val, y_pred_val)
                self.scores.loc[f'{model_name}', 'Score'] = self.scores.loc[f'{model_name}'][1:].mean()

            self.OOF_preds[f'{model_name}'] = oof_pred
            self.TEST_preds[f'{model_name}'] = test_pred
            
        if len(self.models)>1:
            meta_model = LogisticRegression(C = 0.1, random_state = self.state, max_iter = 1000)
            self.OOF_preds["Ensemble"], self.TEST_preds["Ensemble"] = self.train(meta_model, self.OOF_preds, y, self.TEST_preds, 'Ensemble')
            self.scores = self.scores.sort_values('Score')
            self.score_bar()
            self.plot_result(self.OOF_preds["Ensemble"])
            return self.TEST_preds["Ensemble"]
        else:
            print(Style.BRIGHT+Fore.GREEN+f'{model_name} score {self.scores.loc[f"{model_name}", "Score"]:.5f}\n')
            self.plot_result(self.OOF_preds[f'{model_name}'])
            return self.TEST_preds[f'{model_name}']

    def score_bar(self):
        plt.figure(figsize=(18, 6))      
        colors = ['#3cb371' if i != 'Ensemble' else 'r' for i in self.scores.Score.index]
        hbars = plt.barh(self.scores.index, self.scores.Score, color=colors, height=0.8)
        plt.bar_label(hbars, fmt='%.6f')
        plt.xlim(0.8, 1)
        plt.ylabel('Models')
        plt.xlabel('Score')
        plt.show()
        
    def plot_result(self, oof):           
        fig, axes = plt.subplots(1, 2, figsize=(14, 7))

        for col in self.OOF_preds:
            RocCurveDisplay.from_predictions(self.y.sort_index(), self.OOF_preds[col], name=f"{col}", ax=axes[0])            
        axes[0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
        axes[0].set_xlabel('False Positive Rate')
        axes[0].set_ylabel('True Positive Rate')
        axes[0].set_title('ROC')
        axes[0].legend(loc="lower right")
        
        ConfusionMatrixDisplay.from_predictions(self.y.sort_index(), (oof>=0.5).astype(int), display_labels=self.labels, colorbar=False, ax=axes[1], cmap = 'Greens')
        axes[1].set_title('Confusion Matrix')
        
        plt.tight_layout()
        plt.show()

In [30]:
trainer = Trainer(X, X_enc, y, test, test_enc, models, training = True)
TEST_preds = trainer.run()

  0%|          | 0/7 [00:00<?, ?it/s]

XGB float64    36
int64       9
Name: count, dtype: int64
XGB


KeyboardInterrupt: 