In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, label_binarize, OrdinalEncoder, QuantileTransformer, TargetEncoder
from category_encoders import CatBoostEncoder, MEstimateEncoder

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import RidgeClassifier, LogisticRegression, LinearRegression, BayesianRidge, Ridge

from sklearn import set_config
import os

import optuna
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, root_mean_squared_error, mean_squared_error, precision_recall_curve, make_scorer, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, matthews_corrcoef
from scipy.stats import norm, skew

from colorama import Fore, Style, init
from copy import deepcopy
from sklearn.base import BaseEstimator, TransformerMixin
from pprint import pprint
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold, KFold, RepeatedKFold, cross_val_score, StratifiedGroupKFold
from xgboost import DMatrix, XGBClassifier, XGBRegressor
from lightgbm import log_evaluation, early_stopping, LGBMClassifier, LGBMRegressor, Dataset
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from tqdm.notebook import tqdm
from optuna.samplers import TPESampler, CmaEsSampler
from optuna.pruners import HyperbandPruner
from functools import partial
from IPython.display import display_html, clear_output
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import gc
import re
from typing import Literal, NamedTuple
from itertools import combinations

import keras
from keras.models import Sequential
from keras import layers
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

import warnings
warnings.filterwarnings("ignore")

In [2]:
#为预处理数据
class Config:

        state = 42
        n_splits = 10
        early_stop = 100
        
        target = 'y'
        train = pd.read_csv("data/train.csv")
        test = pd.read_csv("data/test.csv")
        submission =pd.read_csv("data/sample_submission.csv")
    
        original_data = False
        outliers = False
        log_trf = False
        feature_eng = True
        missing = False
        labels = list(train[target].unique())
        topk_interactions = 20
    


In [4]:
class EDA(Config):
    
    def __init__(self):
        super().__init__()

        self.cat_features = self.train.drop(self.target, axis=1).select_dtypes(include=['object']).columns.tolist()
        self.num_features = self.train.drop(self.target, axis=1).select_dtypes(exclude=['object']).columns.tolist()
        self.data_info()
        self.heatmap()
        self.dist_plots()
        self.cat_feature_plots()
        self.target_pie()
                
    def data_info(self):
        data_list = [self.train,self.test]
        label_list = ['train','test']
        for i in range(2):
            data = data_list[i]
            label = label_list[i]
        
            table_style = [{'selector': 'th:not(.index_name)',
                            'props': [('background-color', '#3cb371'),
                                      ('color', '#FFFFFF'),
                                      ('font-weight', 'bold'),
                                      ('border', '1px solid #DCDCDC'),
                                      ('text-align', 'center')]
                            }, 
                            {'selector': 'tbody td',
                             'props': [('border', '1px solid #DCDCDC'),
                                       ('font-weight', 'normal')]
                            }]
            print(Style.BRIGHT+Fore.GREEN+f'\n{label} head\n')
            display(data.head().style.set_table_styles(table_style))
                           
            print(Style.BRIGHT+Fore.GREEN+f'\n{label} info\n'+Style.RESET_ALL)               
            display(data.info())
                           
            print(Style.BRIGHT+Fore.GREEN+f'\n{label} describe\n')
            display(data.describe().drop(index='count', columns=self.target, errors = 'ignore').T
                    .style.set_table_styles(table_style).format('{:.3f}'))
            
            print(Style.BRIGHT+Fore.GREEN+f'\n{label} missing values\n'+Style.RESET_ALL)               
            display(data.isna().sum())
        return self
    
    def heatmap(self):
        print(Style.BRIGHT+Fore.GREEN+f'\nCorrelation Heatmap\n')
        plt.figure(figsize=(7,7))
        corr = self.train.select_dtypes(exclude='object').corr(method='pearson')
        sns.heatmap(corr, fmt = '0.4f', cmap = 'Greens', annot=True, cbar=False)
        plt.show()
        
    def dist_plots(self):
        print(Style.BRIGHT+Fore.GREEN+f"\nDistribution analysis\n")
        df = pd.concat([self.train[self.num_features].assign(Source = 'Train'), 
                        self.test[self.num_features].assign(Source = 'Test'),], 
                        axis=0, ignore_index = True)

        fig, axes = plt.subplots(len(self.num_features), 2 ,figsize = (18, len(self.num_features) * 6), 
                                 gridspec_kw = {'hspace': 0.3, 
                                                'wspace': 0.2, 
                                                'width_ratios': [0.70, 0.30]
                                               }
                                )
        for i,col in enumerate(self.num_features):
            ax = axes[i,0]
            sns.kdeplot(data = df[[col, 'Source']], x = col, hue = 'Source', 
                        palette = ['#3cb371', 'r'], ax = ax, linewidth = 2
                       )
            ax.set(xlabel = '', ylabel = '')
            ax.set_title(f"\n{col}")
            ax.grid()

            ax = axes[i,1]
            sns.boxplot(data = df, y = col, x=df.Source, width = 0.5,
                        linewidth = 1, fliersize= 1,
                        ax = ax, palette=['#3cb371', 'r']
                       )
            ax.set_title(f"\n{col}")
            ax.set(xlabel = '', ylabel = '')
            ax.tick_params(axis='both', which='major')
            ax.set_xticklabels(['Train', 'Test'])

        plt.tight_layout()
        plt.show()
               
    def cat_feature_plots(self):
        fig, axes = plt.subplots(max(len(self.cat_features), 1), 2 ,figsize = (18, len(self.cat_features) * 6), 
                                 gridspec_kw = {'hspace': 0.5, 
                                                'wspace': 0.2,
                                               }
                                )
        if len(self.cat_features) == 1:
            axes = np.array([axes])
            
        for i, col in enumerate(self.cat_features):
            ax = axes[i,0]
            sns.barplot(data=self.train[col].value_counts().nlargest(10).reset_index(), x=col, y='count', ax=ax, color='#3cb371')
            ax.set(xlabel = '', ylabel = '')
            ax.set_title(f"\n{col} Train")
            
            ax = axes[i,1]
            sns.barplot(data=self.train[col].value_counts().nlargest(10).reset_index(), x=col, y='count', ax=ax, color='r')
            ax.set(xlabel = '', ylabel = '')
            ax.set_title(f"\n{col} Test")

        plt.tight_layout()
        plt.show()

    def target_pie(self):
        print(Style.BRIGHT+Fore.GREEN+f"\nTarget feature distribution\n")
        targets = self.train[self.target]
        plt.figure(figsize=(6, 6))
        plt.pie(targets.value_counts(), labels=targets.value_counts().index, autopct='%1.2f%%', colors=sns.color_palette('viridis', len(targets.value_counts())))
        plt.show()
        

In [5]:
import time
class Transform(Config):
    
    def __init__(self):
        # 调用父类 Config 的 __init__ 方法（如果存在）
        super().__init__()
        t0 = time.time()
        # 如果启用 original_data，则将原始数据合并到训练集
        if self.original_data:
            start = time.time()
            # 将目标列转换为 0/1（假设值为 "yes" 和 "no"）
            self.train_org[self.target] = (self.train_org[self.target] == "yes").astype(int)
            # 合并并去重
            self.train = pd.concat([self.train, self.train_org], ignore_index=True).drop_duplicates()
            self.train.reset_index(drop=True, inplace=True)
            print(f"[合并原始数据] {time.time()-start:.2f}s")

        
        # 获取数值型特征列名（排除 object/bool/category/string）
        self.num_features = self.train.drop(self.target, axis=1)\
            .select_dtypes(exclude=['object', 'bool', 'category', 'string']).columns.tolist()
        
        # 获取类别特征列名（只保留 object/bool/category/string）
        self.cat_features = self.train.drop(self.target, axis=1)\
            .select_dtypes(include=['object', 'bool', 'category', 'string']).columns.tolist()

        if self.missing:
            self.missing_values()

        if self.outliers:
            self.remove_outliers()

        if self.log_trf:
            self.log_transformation()


        start = time.time()
        self.important_features = self.select_important_features(top_k=20)
        print(f"[特征重要度] {time.time()-start:.2f}s")

        if self.feature_eng and self.important_features:
            self.train = self.new_features(self.train, self.important_features)
            self.test  = self.new_features(self.test, self.important_features)
            self.num_features = self.train.drop(self.target, axis=1)\
                .select_dtypes(exclude=['object', 'bool', 'string', 'category']).columns.tolist()
            print(f"[交互特征] {time.time()-start:.2f}s")

        start = time.time()
        self.encode()

        print(f"[总耗时] {time.time()-t0:.2f}s")
        
    def __call__(self):
        # 保存目标列
        self.y = self.train[self.target]
        # 保存原始特征
        self.X = self.train.drop(self.target, axis=1)
        # 保存编码后的特征
        self.X_enc = self.train_enc.drop(self.target, axis=1)
        return self.X, self.X_enc, self.y, self.test, self.test_enc, self.cat_features, self.num_features
    
    def encode(self):
        self.train_enc = self.train.copy()
        self.test_enc = self.test.copy()
        
        self.cat_features_card = []
        for f in self.cat_features:
            self.cat_features_card.append(self.train[f].nunique())
        
        # 创建编码器并仅用训练集类别特征进行 fit
        oe = OrdinalEncoder()
        oe.fit(self.train_enc[self.cat_features])
        
        # 分别对训练集和测试集进行 transform
        self.train_enc[self.cat_features] = oe.transform(self.train_enc[self.cat_features]).astype(int)
        self.test_enc[self.cat_features] = oe.transform(self.test_enc[self.cat_features]).astype(int)
        
        # 创建标准化器并仅用训练集数值特征进行 fit
        scaler = StandardScaler()
        scaler.fit(self.train_enc[self.num_features])

        
        # 分别对训练集和测试集进行 transform
        self.train_enc[self.num_features] = scaler.transform(self.train_enc[self.num_features])
        self.test_enc[self.num_features] = scaler.transform(self.test_enc[self.num_features])

    def select_important_features(self, top_k=20, task='auto'):
        """
        基于树模型的特征重要度筛选，并记录耗时。
        功能：返回用于生成交互项的前 top_k 个重要数值特征。
        """
        start_time = time.time()

        # Step 1: 构造候选列
        feat_cols = [c for c in self.num_features if c in self.train.columns and c != self.target]
        if not feat_cols:
            print("[select_important_features] 无可用数值特征，返回空。")
            return []


        # Step 2: 准备训练数据
        X_train_imp = self.train[feat_cols]
        y_train_imp = self.train[self.target]
        is_class = (y_train_imp.nunique() <= 10)

        # 轻量模型（示例：ExtraTrees；也可 mutual_info_*）
        from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
        model = (ExtraTreesClassifier(n_estimators=200, max_features='sqrt', n_jobs=-1, random_state=self.state)
                 if is_class else
                 ExtraTreesRegressor(n_estimators=200, max_features='sqrt', n_jobs=-1, random_state=self.state))
        model.fit(X_train_imp, y_train_imp)
    
        importances = model.feature_importances_
        top_num_feats = pd.Series(importances, index=feat_cols).sort_values(ascending=False).head(top_k).index.tolist()
    
        print(f"[select_important_features] 运行耗时: {time.time()-start_time:.2f} 秒（未编码）")
        return top_num_feats


            
    def new_features(self, data, top_num_feats=None):
        # 创建所有数值特征两两组合的乘积特征
        feats = top_num_feats if top_num_feats else self.num_features
        for c1, c2 in list(combinations(feats, 2)):
            data[f"{c1}_{c2}"] = data[c1] * data[c2]
        # 将类别特征转换为 category 类型
        data[self.cat_features] = data[self.cat_features].astype('category')
        return data

    def log_transformation(self):
        # 对目标列做 log1p 变换
        self.train[self.target] = np.log1p(self.train[self.target]) 
        return self
        
    def remove_outliers(self):
        # 基于 IQR 的异常值去除方法
        Q1 = self.train[self.target].quantile(0.25)
        Q3 = self.train[self.target].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        self.train = self.train[(self.train[self.target] >= lower_limit) & (self.train[self.target] <= upper_limit)]
        self.train.reset_index(drop=True, inplace=True)
    
    def missing_values(self):
        # 将类别特征中的缺失值替换为字符串 'NaN'
        self.train[self.cat_features] = self.train[self.cat_features].fillna('NaN')
        self.test[self.cat_features] = self.test[self.cat_features].fillna('NaN')
        return self

    def reduce_mem(self, df):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', "uint16", "uint32", "uint64"]
        for col in df.columns:
        # 修正④：将 dtype 转为字符串再比较
            col_type = str(df[col].dtype)
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if "int" in col_type:
                    if c_min >= np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    else:
                        df[col] = df[col].astype(np.int64)
                else:
                # 修正⑤：第二个分支用 elif，避免覆盖
                    if c_min >= np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
        return df

In [6]:
t = Transform()

[select_important_features] 运行耗时: 45.22 秒（未编码）
[特征重要度] 45.50s
[交互特征] 46.23s
[总耗时] 48.20s


In [3]:
def build_model(cat_features, num_features):
    
    x_input_cats = layers.Input(shape=(len(cat_features),))
    embs = []
    for j in range(len(cat_features)):
        e = layers.Embedding(t.cat_features_card[j], int(np.ceil(np.sqrt(t.cat_features_card[j]))))
        x = e(x_input_cats[:,j])
        x = layers.Flatten()(x)
        embs.append(x)
        
    x_input_nums = layers.Input(shape=(len(num_features),))
    
    x = layers.Concatenate(axis=-1)(embs+[x_input_nums]) 
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=[x_input_cats,x_input_nums], outputs=x)
    return model

In [None]:
git add *.ipynb
git commit -m "增加了一个模型"
git push