In [39]:
import optuna
from pathlib import Path
import optuna
from fastai.tabular.core import cont_cat_split
import pandas as pd
from sklearn import compose, impute, pipeline, preprocessing, model_selection
import numpy as np
from sklearn import metrics
from catboost import CatBoostClassifier
from fastai.tabular.all import Tensor, torch, store_attr

import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Loss metrics

In [40]:
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)


def balanced_log_loss_tensor(output: Tensor, target: Tensor):
    y_true = target.flatten().cpu().detach().numpy()
    y_pred = output.cpu().detach().numpy()

    try:
        loss_value = balanced_log_loss(y_true, y_pred)
    except Exception:
        return torch.tensor(0.0, dtype=torch.float32, device=output.device, requires_grad=True)
    return torch.tensor(loss_value, dtype=torch.float32, device=output.device, requires_grad=True)


class BalancedLogLossMetric:
    def get_final_error(self, error, weight):
        return error / weight

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        y_true = np.array(target).astype(int)
        y_pred = np.array(approxes[0])
        nc = np.bincount(y_true)
        balanced_logloss = metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)
        return balanced_logloss, 1.0


class BalancedLogLoss:
    y_int = True

    def __init__(self, *args, **kwargs):
        store_attr()

    def __call__(self, inp, targ, **kwargs):
        return balanced_log_loss_tensor(inp, targ)

    def activation(self, out: Tensor) -> Tensor:
        return F.softmax(out, dim=-1)

    def decodes(self, out: Tensor) -> Tensor:
        return out.argmax(dim=-1)

# Data preprocessing

In [41]:
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import compose, ensemble, impute, pipeline, preprocessing, tree


def get_preprocess_pipeline(df, cont_cols, cat_cols, drop_cols):
    """
    Returns a pipeline that performs the following transformations:
    * Standard scaling
    * Log transformation
    * Reciprocal transformation
    * Box-Cox transformation
    * Yeo-Johnson transformation
    * Categorical imputing
    * Semi-constant feature binarization

    Based on the EDA from https://www.kaggle.com/code/mateuszk013/icr-eda-balanced-learning-with-lgbm-xgb/notebook

    :param df: The dataframe to be transformed.
    :type df: pandas.DataFrame
    :param cont_names: The names of the continuous variables.
    :type cont_names: list of str
    :param dep_vars: The names of the dependent variables.
    :type dep_vars: list of str
    """

    # Identify columns that doesn't follow a normal distribution
    # find an appropriate transformation for them to follow a normal distribution
    r2_scores = defaultdict(tuple)

    for feature in cont_cols:
        orig = df[feature].dropna()
        _, (*_, R_orig) = stats.probplot(orig, rvalue=True)
        _, (*_, R_log) = stats.probplot(np.log(orig), rvalue=True)
        _, (*_, R_sqrt) = stats.probplot(np.sqrt(orig), rvalue=True)
        _, (*_, R_reci) = stats.probplot(np.reciprocal(orig), rvalue=True)
        _, (*_, R_boxcox) = stats.probplot(stats.boxcox(orig)[0], rvalue=True)
        _, (*_, R_yeojohn) = stats.probplot(stats.yeojohnson(orig)[0], rvalue=True)
        r2_scores[feature] = (
            R_orig * R_orig,
            R_log * R_log,
            R_sqrt * R_sqrt,
            R_reci * R_reci,
            R_boxcox * R_boxcox,
            R_yeojohn * R_yeojohn,
        )

    r2_scores = pd.DataFrame(
        r2_scores,
        index=("Original", "Log", "Sqrt", "Reciprocal", "BoxCox", "YeoJohnson"),
    ).T

    r2_scores["Winner"] = r2_scores.idxmax(axis=1)

    # Identify columns to be transformed
    no_transform_cols = r2_scores.query("Winner == 'Original'").index
    log_transform_cols = r2_scores.query("Winner == 'Log'").index
    reciprocal_transform_cols = r2_scores.query("Winner == 'Reciprocal'").index
    boxcox_transform_cols = r2_scores.query("Winner == 'BoxCox'").index
    yeojohnson_transform_cols = r2_scores.query("Winner == 'YeoJohnson'").index

    # Identify columns that are constant or semi-constant
    numeric_descr = df.drop(columns=drop_cols).describe().T
    semi_constant_mask = np.isclose(numeric_descr["min"], numeric_descr["50%"])
    semi_constant_descr = numeric_descr[semi_constant_mask]
    semi_const_cols_thresholds = semi_constant_descr["50%"].to_dict()

    # List of columns to be transformed
    semi_const_cols = semi_const_cols_thresholds.keys()
    no_transform_cols = no_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    log_transform_cols = log_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    reciprocal_transform_cols = reciprocal_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    boxcox_transform_cols = boxcox_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    yeojohnson_transform_cols = yeojohnson_transform_cols.drop(semi_const_cols, errors="ignore").to_list()

    # Transformations
    standard_scaling = (
        preprocessing.StandardScaler(),
        no_transform_cols,
    )
    log_transform = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
            preprocessing.StandardScaler(),
        ),
        log_transform_cols,
    )
    reciprocal_transform = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(func=np.reciprocal, feature_names_out="one-to-one"),
            preprocessing.StandardScaler(),
        ),
        reciprocal_transform_cols,
    )
    boxcox_transform = (
        preprocessing.PowerTransformer(method="box-cox", standardize=True),
        boxcox_transform_cols,
    )
    yeojohnson_transform = (
        preprocessing.PowerTransformer(method="yeo-johnson", standardize=True),
        yeojohnson_transform_cols,
    )

    # Other transformations
    categorical_imputing = (
        pipeline.make_pipeline(
            impute.SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
        cat_cols,  # type: ignore
    )
    semi_const_transforms = [
        (
            pipeline.make_pipeline(
                impute.SimpleImputer(strategy="median"),
                preprocessing.Binarizer(threshold=thresh),
            ),
            [col],
        )
        for col, thresh in semi_const_cols_thresholds.items()
    ]

    return pipeline.make_pipeline(
        compose.make_column_transformer(
            standard_scaling,
            log_transform,
            reciprocal_transform,
            boxcox_transform,
            yeojohnson_transform,
            categorical_imputing,
            *semi_const_transforms,
            remainder="drop",
            verbose_feature_names_out=False,
        ),
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")


def get_tree_preprocess_pipeline():
    return pipeline.make_pipeline(
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")

In [42]:
from pathlib import Path

path = Path('./data')
output = Path('./output')
output.mkdir(exist_ok=True)

df = pd.read_csv(path / "train.csv", index_col="Id")
dep_vars = ["Class"]

drop_vars = ["EJ"]
df.drop(columns=drop_vars, inplace=True)

train_df, test_df = model_selection.train_test_split(df, test_size=0.4, stratify=df[dep_vars], random_state=33)


def get_preprocessed_data(train_df, test_df, dep_vars):
    cont_names, cat_names = cont_cat_split(df, dep_var=dep_vars)

    preprocessor = get_preprocess_pipeline(df, cont_names, cat_names, dep_vars)

    # Preprocess training data
    X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
    train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)
    X = train_df.drop(columns=dep_vars, errors="ignore")
    y = train_df[dep_vars]

    # Preprocess test data
    X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
    test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)
    X_test = test_df.drop(columns=dep_vars, errors="ignore")
    y_test = test_df[dep_vars]

    return X, y, X_test, y_test


def get_tree_preprocessed_data(train_df, test_df, dep_vars):
    preprocessor = get_tree_preprocess_pipeline()

    # Preprocess training data
    X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
    train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)
    X = train_df.drop(columns=dep_vars, errors="ignore")
    y = train_df[dep_vars]

    # Preprocess test data
    X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
    test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)
    X_test = test_df.drop(columns=dep_vars, errors="ignore")
    y_test = test_df[dep_vars]

    # Calculate scale_pos_weight
    scale_pos_weight = df['Class'].value_counts()[0] / df['Class'].value_counts()[1]

    return X, y, X_test, y_test, scale_pos_weight

In [43]:
# Resample
from imblearn.over_sampling import SMOTE


def resample(X, y):
    sampler = SMOTE()
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

# Model param getters from Optuna

In [76]:
best_trials = {
    "lightgbm_with_resampling": 248,
    "lightgbm_without_resampling": 220,
    "svc_without_resampling": 203,
    "svc_with_resampling": 161,
    "catboost_with_resampling": 234,
    "catboost_without_resampling": 223,
    "fastai_with_resampling": 123,
    "fastai_without_resampling": 101,
    "tabpfn_without_resampling": 30,
    "tabpfn_with_resampling": 65,
    "xgboost_without_resampling": 210,
    "xgboost_with_resampling": 179,
}

In [77]:
optuna_storage = "sqlite:///icr-ensemble-experiments.db"

In [78]:
estimator_params = {}
for study_name, trial_id in best_trials.items():
    study = optuna.load_study(study_name=study_name, storage=optuna_storage)
    best_trial = best_trials[study_name]
    estimator_params[study_name] = study.trials[trial_id].params

In [79]:
estimator_params["svc_with_resampling"]

{'C': 0.10494454342481872,
 'degree': 8,
 'gamma': 3.4748581836487165,
 'kernel': 'linear'}

In [48]:
from xgboost import XGBClassifier


def get_xgboost_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test, scale_pos_weight = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    params = dict(
        booster="gbtree",
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        enable_categorical=True,
        scale_pos_weight=scale_pos_weight,
    ) | optuna_params

    model = XGBClassifier(
        **params,
        eval_metric=balanced_log_loss,
    )

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    test_loss = balanced_log_loss(y_test.values.ravel(), model.predict_proba(X_test))
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

In [49]:
xgboost_with_resampling = get_xgboost_estimator("xgboost_with_resampling")
xgboose_without_resampling = get_xgboost_estimator("xgboost_without_resampling")

Validation loss: 0.2717630661008243
Test loss: 0.25831930235898776
Validation loss: 0.3568297170940195
Test loss: 0.31751080701618967


In [54]:
from lightgbm import LGBMClassifier


def get_lightgbm_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test, scale_pos_weight = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    params = dict(
        boosting_type="gbdt",
        # device="gpu",
        scale_pos_weight=scale_pos_weight,
        data_sample_strategy="bagging",
        n_jobs=-1,
    ) | optuna_params

    model = LGBMClassifier(**params, objective="binary", verbosity=-1)

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit, eval_metric=balanced_log_loss)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    test_loss = balanced_log_loss(y_test.values.ravel(), model.predict_proba(X_test))
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

In [55]:
lightgbm_with_resampling = get_lightgbm_estimator("lightgbm_with_resampling")
lightgbm_without_resampling = get_lightgbm_estimator("lightgbm_without_resampling")

Validation loss: 0.27788775297800195
Test loss: 0.2913405820647997
Validation loss: 0.3391997182685824
Test loss: 0.34534098102084476


In [58]:
from sklearn import svm


def get_svc_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test = get_preprocessed_data(train_df, test_df, dep_vars)

    params = dict(
        probability=True,
    ) | optuna_params

    model = svm.SVC(**params)

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    test_loss = balanced_log_loss(y_test.values.ravel(), model.predict_proba(X_test))
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

In [59]:
svc_with_resampling = get_svc_estimator("svc_with_resampling")
svc_without_resampling = get_svc_estimator("svc_without_resampling")

Validation loss: 0.5354314436217044
Test loss: 0.5497892853161306
Validation loss: 0.4217511061671817
Test loss: 0.3766329745870105


In [62]:
from catboost import CatBoostClassifier


def get_catboost_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test, scale_pos_weight = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    params = dict(
        grow_policy="SymmetricTree",
        verbose=0,
        scale_pos_weight=scale_pos_weight,
    ) | optuna_params

    model = CatBoostClassifier(**params, eval_metric=BalancedLogLossMetric())

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    test_loss = balanced_log_loss(y_test.values.ravel(), model.predict_proba(X_test))
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

In [63]:
catboost_with_resampling = get_catboost_estimator("catboost_with_resampling")
catboost_without_resampling = get_catboost_estimator("catboost_without_resampling")

Validation loss: 0.29200997680502094
Test loss: 0.3182577856802042
Validation loss: 0.3422437693802506
Test loss: 0.3090137379347216


In [70]:
from tabpfn import TabPFNClassifier


def get_tabpfn_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    params = estimator_params[name]
    X, y, X_test, y_test, _ = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    model = TabPFNClassifier(
        device="cuda",
        only_inference=False,
        **params,
    )

    X_fit, y_fit = X, y
    if should_resample:
        X_fit, y_fit = resample(X, y)

    model.fit(X_fit, y_fit.values.ravel())

    test_loss = balanced_log_loss(y_test.values.ravel(), model.predict_proba(X_test))
    print(f"Test loss: {test_loss}")

In [71]:
tabpfn_with_resampling = get_tabpfn_estimator("tabpfn_with_resampling")
tabpfn_without_resampling = get_tabpfn_estimator("tabpfn_without_resampling")

Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Test loss: 0.4888267602549823
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Test loss: 0.4586789624230296


In [86]:
from fastai.tabular.all import (tabular_learner, tabular_config, TabularDataLoaders, CategoryBlock, FocalLossFlat,
                                TrainTestSplitter, EarlyStoppingCallback)


def get_fastai_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test = get_preprocessed_data(train_df, test_df, dep_vars)

    training_data = pd.merge(X, y, left_index=True, right_index=True)

    if should_resample:
        X_res, y_res = resample(X, y)
        training_data = pd.merge(X_res, y_res, left_index=True, right_index=True)

    testing_data = pd.merge(X_test, y_test, left_index=True, right_index=True)

    layers_map = {
        0: [2048, 1024, 512],
        1: [2048, 1024, 512, 256],
        2: [2048, 1024, 512, 256, 128],
        3: [2048, 1024, 512, 256, 128],
        4: [2048, 1024, 512, 256, 128, 64],
        5: [2048, 1024, 512, 256, 128, 64, 32],
        6: [2048, 1024, 512, 256, 128, 64, 32, 16],
        7: [512, 256, 128, 64, 32, 16],
        8: [256, 128, 64, 32, 16],
        9: [128, 64, 32, 16],
        10: [64, 32, 32, 16, 8],
    }

    bs = optuna_params.get("bs")
    epochs = optuna_params.get("epochs")
    layers_choice = optuna_params.get("layers_choice")
    layers = layers_map[layers_choice]

    config = tabular_config(
        ps=optuna_params.get("ps"),
        use_bn=optuna_params.get("use_bn"),
        bn_final=optuna_params.get("bn_final"),
        bn_cont=optuna_params.get("bn_cont"),
        lin_first=optuna_params.get("lin_first"),
    )

    dls = TabularDataLoaders.from_df(
        training_data,
        y_names=dep_vars,
        y_block=CategoryBlock,
        bs=bs,
        splits=TrainTestSplitter(
            test_size=0.2,
            stratify=train_df[dep_vars],
        ),
    )

    model = tabular_learner(
        dls,
        loss_func=FocalLossFlat(gamma=optuna_params.get("gamma")),
        layers=layers,
        config=config,
        cbs=[
            EarlyStoppingCallback(min_delta=0.1, patience=5),
        ],
        wd=optuna_params.get("wd"),
        wd_bn_bias=optuna_params.get("wd_bn_bias"),
    )

    with model.no_logging(), model.no_bar():
        model.fit_one_cycle(epochs)

    ypreds, targs = model.get_preds()
    val_loss = balanced_log_loss_tensor(ypreds, targs).item()

    test_dl = dls.test_dl(testing_data)
    ypreds, targs = model.get_preds(dl=test_dl)
    test_loss = balanced_log_loss_tensor(ypreds, targs).item()

    print(f"Validation loss: {val_loss}")
    print(f"Test loss: {test_loss}")

In [87]:
fastai_with_resampling = get_fastai_estimator("fastai_with_resampling")
fastai_without_resampling = get_fastai_estimator("fastai_without_resampling")

No improvement since epoch 2: early stopping


Validation loss: 0.23553065955638885
Test loss: 0.2887076139450073
No improvement since epoch 6: early stopping


Validation loss: 0.3589724898338318
Test loss: 0.34441179037094116
