In [28]:
import optuna
from pathlib import Path
import optuna
from fastai.tabular.core import cont_cat_split
import pandas as pd
from sklearn import compose, impute, pipeline, preprocessing, model_selection
import numpy as np
from sklearn import metrics
from catboost import CatBoostClassifier
from fastai.tabular.all import Tensor, torch, store_attr

import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Loss metrics

In [29]:
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)


def balanced_log_loss_tensor(output: Tensor, target: Tensor):
    y_true = target.flatten().cpu().detach().numpy()
    y_pred = output.cpu().detach().numpy()

    try:
        loss_value = balanced_log_loss(y_true, y_pred)
    except Exception:
        return torch.tensor(0.0, dtype=torch.float32, device=output.device, requires_grad=True)
    return torch.tensor(loss_value, dtype=torch.float32, device=output.device, requires_grad=True)


class BalancedLogLossMetric:
    def get_final_error(self, error, weight):
        return error / weight

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        y_true = np.array(target).astype(int)
        y_pred = np.array(approxes[0])
        nc = np.bincount(y_true)
        balanced_logloss = metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)
        return balanced_logloss, 1.0


class BalancedLogLoss:
    y_int = True

    def __init__(self, *args, **kwargs):
        store_attr()

    def __call__(self, inp, targ, **kwargs):
        return balanced_log_loss_tensor(inp, targ)

    def activation(self, out: Tensor) -> Tensor:
        return F.softmax(out, dim=-1)

    def decodes(self, out: Tensor) -> Tensor:
        return out.argmax(dim=-1)

# Data preprocessing

In [30]:
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import compose, ensemble, impute, pipeline, preprocessing, tree


def get_preprocess_pipeline(df, cont_cols, cat_cols, drop_cols):
    """
    Returns a pipeline that performs the following transformations:
    * Standard scaling
    * Log transformation
    * Reciprocal transformation
    * Box-Cox transformation
    * Yeo-Johnson transformation
    * Categorical imputing
    * Semi-constant feature binarization

    Based on the EDA from https://www.kaggle.com/code/mateuszk013/icr-eda-balanced-learning-with-lgbm-xgb/notebook

    :param df: The dataframe to be transformed.
    :type df: pandas.DataFrame
    :param cont_names: The names of the continuous variables.
    :type cont_names: list of str
    :param dep_vars: The names of the dependent variables.
    :type dep_vars: list of str
    """

    # Identify columns that doesn't follow a normal distribution
    # find an appropriate transformation for them to follow a normal distribution
    r2_scores = defaultdict(tuple)

    for feature in cont_cols:
        orig = df[feature].dropna()
        _, (*_, R_orig) = stats.probplot(orig, rvalue=True)
        _, (*_, R_log) = stats.probplot(np.log(orig), rvalue=True)
        _, (*_, R_sqrt) = stats.probplot(np.sqrt(orig), rvalue=True)
        _, (*_, R_reci) = stats.probplot(np.reciprocal(orig), rvalue=True)
        _, (*_, R_boxcox) = stats.probplot(stats.boxcox(orig)[0], rvalue=True)
        _, (*_, R_yeojohn) = stats.probplot(stats.yeojohnson(orig)[0], rvalue=True)
        r2_scores[feature] = (
            R_orig * R_orig,
            R_log * R_log,
            R_sqrt * R_sqrt,
            R_reci * R_reci,
            R_boxcox * R_boxcox,
            R_yeojohn * R_yeojohn,
        )

    r2_scores = pd.DataFrame(
        r2_scores,
        index=("Original", "Log", "Sqrt", "Reciprocal", "BoxCox", "YeoJohnson"),
    ).T

    r2_scores["Winner"] = r2_scores.idxmax(axis=1)

    # Identify columns to be transformed
    no_transform_cols = r2_scores.query("Winner == 'Original'").index
    log_transform_cols = r2_scores.query("Winner == 'Log'").index
    reciprocal_transform_cols = r2_scores.query("Winner == 'Reciprocal'").index
    boxcox_transform_cols = r2_scores.query("Winner == 'BoxCox'").index
    yeojohnson_transform_cols = r2_scores.query("Winner == 'YeoJohnson'").index

    # Identify columns that are constant or semi-constant
    numeric_descr = df.drop(columns=drop_cols).describe().T
    semi_constant_mask = np.isclose(numeric_descr["min"], numeric_descr["50%"])
    semi_constant_descr = numeric_descr[semi_constant_mask]
    semi_const_cols_thresholds = semi_constant_descr["50%"].to_dict()

    # List of columns to be transformed
    semi_const_cols = semi_const_cols_thresholds.keys()
    no_transform_cols = no_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    log_transform_cols = log_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    reciprocal_transform_cols = reciprocal_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    boxcox_transform_cols = boxcox_transform_cols.drop(semi_const_cols, errors="ignore").to_list()
    yeojohnson_transform_cols = yeojohnson_transform_cols.drop(semi_const_cols, errors="ignore").to_list()

    # Transformations
    standard_scaling = (
        preprocessing.StandardScaler(),
        no_transform_cols,
    )
    log_transform = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
            preprocessing.StandardScaler(),
        ),
        log_transform_cols,
    )
    reciprocal_transform = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(func=np.reciprocal, feature_names_out="one-to-one"),
            preprocessing.StandardScaler(),
        ),
        reciprocal_transform_cols,
    )
    boxcox_transform = (
        preprocessing.PowerTransformer(method="box-cox", standardize=True),
        boxcox_transform_cols,
    )
    yeojohnson_transform = (
        preprocessing.PowerTransformer(method="yeo-johnson", standardize=True),
        yeojohnson_transform_cols,
    )

    # Other transformations
    categorical_imputing = (
        pipeline.make_pipeline(
            impute.SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
        cat_cols,  # type: ignore
    )
    semi_const_transforms = [
        (
            pipeline.make_pipeline(
                impute.SimpleImputer(strategy="median"),
                preprocessing.Binarizer(threshold=thresh),
            ),
            [col],
        )
        for col, thresh in semi_const_cols_thresholds.items()
    ]

    return pipeline.make_pipeline(
        compose.make_column_transformer(
            standard_scaling,
            log_transform,
            reciprocal_transform,
            boxcox_transform,
            yeojohnson_transform,
            categorical_imputing,
            *semi_const_transforms,
            remainder="drop",
            verbose_feature_names_out=False,
        ),
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")


def get_tree_preprocess_pipeline():
    return pipeline.make_pipeline(
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")

In [31]:
from pathlib import Path

path = Path("./data")
output = Path("./output")
output.mkdir(exist_ok=True)

df = pd.read_csv(path / "train.csv", index_col="Id")
dep_vars = ["Class"]

drop_vars = ["EJ"]
df.drop(columns=drop_vars, inplace=True)

train_df, test_df = model_selection.train_test_split(df, test_size=0.4, stratify=df[dep_vars], random_state=33)


def get_preprocessed_data(train_df, test_df, dep_vars):
    cont_names, cat_names = cont_cat_split(df, dep_var=dep_vars)

    preprocessor = get_preprocess_pipeline(df, cont_names, cat_names, dep_vars)

    # Preprocess training data
    X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
    train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)
    X = train_df.drop(columns=dep_vars, errors="ignore")
    y = train_df[dep_vars]

    # Preprocess test data
    X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
    test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)
    X_test = test_df.drop(columns=dep_vars, errors="ignore")
    y_test = test_df[dep_vars]

    return X, y, X_test, y_test


def get_tree_preprocessed_data(train_df, test_df, dep_vars):
    preprocessor = get_tree_preprocess_pipeline()

    # Preprocess training data
    X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
    train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)
    X = train_df.drop(columns=dep_vars, errors="ignore")
    y = train_df[dep_vars]

    # Preprocess test data
    X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
    test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)
    X_test = test_df.drop(columns=dep_vars, errors="ignore")
    y_test = test_df[dep_vars]

    # Calculate scale_pos_weight
    scale_pos_weight = df["Class"].value_counts()[0] / df["Class"].value_counts()[1]

    return X, y, X_test, y_test, scale_pos_weight

In [32]:
# Resample
from imblearn.over_sampling import SMOTE


def resample(X, y):
    sampler = SMOTE()
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

# Model param getters from Optuna

In [33]:
best_trials = {
    "lightgbm_with_resampling": 248,
    "lightgbm_without_resampling": 220,
    "svc_without_resampling": 203,
    "svc_with_resampling": 161,
    "catboost_with_resampling": 234,
    "catboost_without_resampling": 223,
    "fastai_with_resampling": 123,
    "fastai_without_resampling": 101,
    "tabpfn_without_resampling": 30,
    "tabpfn_with_resampling": 65,
    "xgboost_without_resampling": 210,
    "xgboost_with_resampling": 179,
}

In [34]:
optuna_storage = "sqlite:///icr-ensemble-experiments.db"

In [35]:
estimator_params = {}
for study_name, trial_id in best_trials.items():
    study = optuna.load_study(study_name=study_name, storage=optuna_storage)
    best_trial = best_trials[study_name]
    estimator_params[study_name] = study.trials[trial_id].params

In [36]:
estimator_params["svc_with_resampling"]

{'C': 0.10494454342481872,
 'degree': 8,
 'gamma': 3.4748581836487165,
 'kernel': 'linear'}

In [37]:
from xgboost import XGBClassifier


def get_xgboost_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test, scale_pos_weight = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    params = (
            dict(
                booster="gbtree",
                tree_method="gpu_hist",
                gpu_id=0,
                predictor="gpu_predictor",
                enable_categorical=True,
                scale_pos_weight=scale_pos_weight,
            )
            | optuna_params
    )

    model = XGBClassifier(
        **params,
        eval_metric=balanced_log_loss,
    )

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    targs = y_test.values.ravel()
    ypreds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(targs, ypreds)
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

    return ypreds, targs, model

In [38]:
xgboost_with_resampling = get_xgboost_estimator("xgboost_with_resampling")
xgboose_without_resampling = get_xgboost_estimator("xgboost_without_resampling")

Validation loss: 0.26450599056169405
Test loss: 0.3078743331221907
Validation loss: 0.3600682567715471
Test loss: 0.34729932550867476


In [39]:
from lightgbm import LGBMClassifier


def get_lightgbm_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test, scale_pos_weight = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    params = (
            dict(
                boosting_type="gbdt",
                device="gpu",
                scale_pos_weight=scale_pos_weight,
                data_sample_strategy="bagging",
                n_jobs=-1,
            )
            | optuna_params
    )

    model = LGBMClassifier(**params, objective="binary", verbosity=-1)

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit, eval_metric=balanced_log_loss)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    targs = y_test.values.ravel()
    ypreds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(targs, ypreds)
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

    return ypreds, targs, model

In [40]:
lightgbm_with_resampling = get_lightgbm_estimator("lightgbm_with_resampling")
lightgbm_without_resampling = get_lightgbm_estimator("lightgbm_without_resampling")

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtyp

Validation loss: 0.25527170120571846
Test loss: 0.31979205993610177


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtyp

Validation loss: 0.35623672866926354
Test loss: 0.2947965841951847


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  target_dtype = np.find_common_type(df_dtypes, [])


In [41]:
from sklearn import svm


def get_svc_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test = get_preprocessed_data(train_df, test_df, dep_vars)

    params = (
            dict(
                probability=True,
            )
            | optuna_params
    )

    model = svm.SVC(**params)

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    targs = y_test.values.ravel()
    ypreds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(targs, ypreds)
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

    return ypreds, targs, model

In [42]:
svc_with_resampling = get_svc_estimator("svc_with_resampling")
svc_without_resampling = get_svc_estimator("svc_without_resampling")

Validation loss: 0.5400077008152435
Test loss: 0.48264124635908023
Validation loss: 0.4074203932210792
Test loss: 0.4268884874310695


In [43]:
from catboost import CatBoostClassifier


def get_catboost_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test, scale_pos_weight = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    params = (
            dict(
                grow_policy="SymmetricTree",
                verbose=0,
                scale_pos_weight=scale_pos_weight,
            )
            | optuna_params
    )

    model = CatBoostClassifier(**params, eval_metric=BalancedLogLossMetric())

    skf = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=3)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        X_fit, y_fit = X_train, y_train
        if should_resample:
            X_fit, y_fit = resample(X_train, y_train)

        model.fit(X_fit, y_fit)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    targs = y_test.values.ravel()
    ypreds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(targs, ypreds)
    print(f"Validation loss: {np.mean(val_loss_list)}")
    print(f"Test loss: {test_loss}")

    return ypreds, targs, model

In [44]:
catboost_with_resampling = get_catboost_estimator("catboost_with_resampling")
catboost_without_resampling = get_catboost_estimator("catboost_without_resampling")

Validation loss: 0.3022235686776513
Test loss: 0.2690990850101488
Validation loss: 0.33222046493102847
Test loss: 0.3595578561334823


In [45]:
from tabpfn import TabPFNClassifier


def get_tabpfn_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    params = estimator_params[name]
    X, y, X_test, y_test, _ = get_tree_preprocessed_data(train_df, test_df, dep_vars)

    model = TabPFNClassifier(
        device="cuda",
        only_inference=False,
        **params,
    )

    X_fit, y_fit = X, y
    if should_resample:
        X_fit, y_fit = resample(X, y)

    model.fit(X_fit, y_fit.values.ravel())

    targs = y_test.values.ravel()
    ypreds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(targs, ypreds)
    print(f"Test loss: {test_loss}")

    return ypreds, targs, model

In [46]:
tabpfn_with_resampling = get_tabpfn_estimator("tabpfn_with_resampling")
tabpfn_without_resampling = get_tabpfn_estimator("tabpfn_without_resampling")

Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Test loss: 0.4872132304897355
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Test loss: 0.4586789758074815


In [47]:
from fastai.tabular.all import tabular_learner, tabular_config, TabularDataLoaders, CategoryBlock, FocalLossFlat, \
    TrainTestSplitter, EarlyStoppingCallback


def get_fastai_estimator(name: str):
    should_resample = name.endswith("with_resampling")

    optuna_params = estimator_params[name]
    X, y, X_test, y_test = get_preprocessed_data(train_df, test_df, dep_vars)

    training_data = pd.merge(X, y, left_index=True, right_index=True)

    if should_resample:
        X_res, y_res = resample(X, y)
        training_data = pd.merge(X_res, y_res, left_index=True, right_index=True)

    testing_data = pd.merge(X_test, y_test, left_index=True, right_index=True)

    layers_map = {
        0: [2048, 1024, 512],
        1: [2048, 1024, 512, 256],
        2: [2048, 1024, 512, 256, 128],
        3: [2048, 1024, 512, 256, 128],
        4: [2048, 1024, 512, 256, 128, 64],
        5: [2048, 1024, 512, 256, 128, 64, 32],
        6: [2048, 1024, 512, 256, 128, 64, 32, 16],
        7: [512, 256, 128, 64, 32, 16],
        8: [256, 128, 64, 32, 16],
        9: [128, 64, 32, 16],
        10: [64, 32, 32, 16, 8],
    }

    bs = optuna_params.get("bs")
    epochs = optuna_params.get("epochs")
    layers_choice = optuna_params.get("layers_choice")
    layers = layers_map[layers_choice]

    config = tabular_config(
        ps=optuna_params.get("ps"),
        use_bn=optuna_params.get("use_bn"),
        bn_final=optuna_params.get("bn_final"),
        bn_cont=optuna_params.get("bn_cont"),
        lin_first=optuna_params.get("lin_first"),
    )

    dls = TabularDataLoaders.from_df(
        training_data,
        y_names=dep_vars,
        y_block=CategoryBlock,
        bs=bs,
        splits=TrainTestSplitter(
            test_size=0.2,
            stratify=train_df[dep_vars],
        ),
    )

    model = tabular_learner(
        dls,
        loss_func=FocalLossFlat(gamma=optuna_params.get("gamma")),
        layers=layers,
        config=config,
        cbs=[
            EarlyStoppingCallback(min_delta=0.1, patience=8),
        ],
        wd=optuna_params.get("wd"),
        wd_bn_bias=optuna_params.get("wd_bn_bias"),
    )

    with model.no_logging(), model.no_bar():
        model.fit_one_cycle(epochs)

    ypreds, targs = model.get_preds()
    val_loss = balanced_log_loss_tensor(ypreds, targs).item()

    test_dl = dls.test_dl(testing_data)
    ypreds, targs = model.get_preds(dl=test_dl)
    test_loss = balanced_log_loss_tensor(ypreds, targs).item()

    print(f"Validation loss: {val_loss}")
    print(f"Test loss: {test_loss}")

    return ypreds, targs, model

In [48]:
fastai_with_resampling = get_fastai_estimator("fastai_with_resampling")
fastai_without_resampling = get_fastai_estimator("fastai_without_resampling")

Validation loss: 0.19319112598896027
Test loss: 0.33733823895454407
No improvement since epoch 7: early stopping


Validation loss: 0.4285264015197754
Test loss: 0.3670297861099243


In [49]:
X_normalized, y_normalized, X_test_normalized, y_test_normalized = get_preprocessed_data(train_df, test_df, dep_vars)
X_tree, y_tree, X_test_tree, y_test_tree, _ = get_tree_preprocessed_data(train_df, test_df, dep_vars)

In [211]:
estimators = {
    "xgboost_with_resampling": xgboost_with_resampling,
    "xgboose_without_resampling": xgboose_without_resampling,
    "lightgbm_with_resampling": lightgbm_with_resampling,
    "lightgbm_without_resampling": lightgbm_without_resampling,
    "catboost_with_resampling": catboost_with_resampling,
    "catboost_without_resampling": catboost_without_resampling,
    "tabpfn_with_resampling": tabpfn_with_resampling,
    "tabpfn_without_resampling": tabpfn_without_resampling,
    "fastai_with_resampling": fastai_with_resampling,
    "fastai_without_resampling": fastai_without_resampling,
    "svc_with_resampling": svc_with_resampling,
    "svc_without_resampling": svc_without_resampling,
}

In [216]:
predictions = {name: ypreds[:, 1] for name, (ypreds, targs, model) in estimators.items()}

scores = {}

for name, (ypreds, targs, model) in estimators.items():
    if name.startswith("fastai"):
        y_true = targs.cpu().detach().numpy().flatten()
        y_pred = ypreds.cpu().detach().numpy()

        scores[name] = [
            balanced_log_loss(y_true, y_pred),
            metrics.accuracy_score(y_true, np.argmax(y_pred, axis=1)),
            metrics.precision_score(y_true, np.argmax(y_pred, axis=1)),
            metrics.recall_score(y_true, np.argmax(y_pred, axis=1)),
            metrics.f1_score(targs, np.argmax(ypreds, axis=1)),
        ]
    else:
        scores[name] = [
            balanced_log_loss(targs, ypreds),
            metrics.accuracy_score(targs, np.argmax(ypreds, axis=1)),
            metrics.precision_score(targs, np.argmax(ypreds, axis=1)),
            metrics.recall_score(targs, np.argmax(ypreds, axis=1)),
            metrics.f1_score(targs, np.argmax(ypreds, axis=1)),
        ]

In [217]:
scores_df = pd.DataFrame(scores, index=["balanced_log_loss", "accuracy", "precision", "recall", "f1"]).T

In [220]:
scores_df

Unnamed: 0,balanced_log_loss,accuracy,precision,recall,f1
xgboost_with_resampling,0.307874,0.894737,0.666667,0.790698,0.723404
xgboose_without_resampling,0.347299,0.91498,0.761905,0.744186,0.752941
lightgbm_with_resampling,0.319792,0.878543,0.610169,0.837209,0.705882
lightgbm_without_resampling,0.294797,0.919028,0.767442,0.767442,0.767442
catboost_with_resampling,0.269099,0.927126,0.735849,0.906977,0.8125
catboost_without_resampling,0.359558,0.91498,0.761905,0.744186,0.752941
tabpfn_with_resampling,0.487213,0.910931,0.8,0.651163,0.717949
tabpfn_without_resampling,0.458679,0.894737,0.742857,0.604651,0.666667
fastai_with_resampling,0.337338,0.882591,0.634615,0.767442,0.694737
fastai_without_resampling,0.36703,0.878543,0.622642,0.767442,0.6875


In [56]:
pred_df = pd.DataFrame(predictions)

In [81]:
correlation_matrix = pred_df.corr(method="pearson")

In [88]:
model_matrix = pd.DataFrame(scores, index=["score"]).T

In [90]:
model_matrix.sort_values(by="score")

Unnamed: 0,score
catboost_with_resampling,0.269099
lightgbm_without_resampling,0.294797
xgboost_with_resampling,0.307874
lightgbm_with_resampling,0.319792
fastai_with_resampling,0.337338
xgboose_without_resampling,0.347299
catboost_without_resampling,0.359558
fastai_without_resampling,0.36703
svc_without_resampling,0.426888
tabpfn_without_resampling,0.458679


In [92]:
correlation_matrix["catboost_with_resampling"].sort_values(ascending=False)

catboost_with_resampling       1.000000
catboost_without_resampling    0.912707
xgboost_with_resampling        0.907788
lightgbm_without_resampling    0.898697
lightgbm_with_resampling       0.892832
xgboose_without_resampling     0.881215
tabpfn_without_resampling      0.845918
tabpfn_with_resampling         0.822164
fastai_without_resampling      0.813217
fastai_with_resampling         0.805797
svc_without_resampling         0.803571
svc_with_resampling            0.756543
Name: catboost_with_resampling, dtype: float64

In [104]:
from visualize.results import plot_results

In [107]:
pred_df

Unnamed: 0,xgboost_with_resampling,xgboose_without_resampling,lightgbm_with_resampling,lightgbm_without_resampling,catboost_with_resampling,catboost_without_resampling,tabpfn_with_resampling,tabpfn_without_resampling,fastai_with_resampling,fastai_without_resampling,svc_with_resampling,svc_without_resampling
0,0.001234,0.019256,0.002083,0.015092,0.019096,0.008726,0.000014,0.000330,0.060260,0.258211,0.008254,0.025210
1,0.009289,0.012315,0.009936,0.004724,0.009896,0.019106,0.000010,0.000156,0.061067,0.292828,0.003090,0.025625
2,0.002172,0.037606,0.037696,0.012962,0.002489,0.007963,0.000008,0.000327,0.004849,0.250330,0.000193,0.012320
3,0.004758,0.024295,0.009589,0.010544,0.003979,0.007102,0.000808,0.004187,0.108994,0.342620,0.034630,0.031328
4,0.755470,0.617068,0.790937,0.473647,0.962509,0.457188,0.839631,0.299171,0.665503,0.277318,0.551801,0.269875
...,...,...,...,...,...,...,...,...,...,...,...,...
242,0.240579,0.623691,0.853304,0.816227,0.707542,0.867993,0.893942,0.909409,0.917254,0.998526,0.961842,0.989333
243,0.168885,0.029538,0.543029,0.083749,0.013250,0.160640,0.014701,0.032452,0.549531,0.285469,0.285505,0.112997
244,0.004150,0.008932,0.007517,0.003012,0.004397,0.002916,0.000088,0.000501,0.011839,0.221354,0.000545,0.014042
245,0.433591,0.413470,0.611553,0.402315,0.895062,0.628181,0.260091,0.405778,0.880548,0.994314,0.522385,0.318611


In [112]:
plot_results(targs, pred_df['catboost_with_resampling'])

In [235]:
plot_results(targs, pred_df['tabpfn_with_resampling'])

In [200]:
plot_results(targs, pred_df['fastai_without_resampling'])

In [223]:
def objective(trial):
    w1 = trial.suggest_float("w1", 0.1, 1.0, step=0.1)
    w2 = trial.suggest_float("w2", 0.1, 1.0, step=0.1)
    w3 = trial.suggest_float("w2", 0.1, 1.0, step=0.1)
    w4 = trial.suggest_float("w2", 0.1, 1.0, step=0.1)
    w5 = trial.suggest_float("w2", 0.1, 1.0, step=0.1)

    total_weight = w1 + w2 + w3 + w4 + w5
    w1 = w1 / total_weight
    w2 = w2 / total_weight
    w2 = w2 / total_weight
    w2 = w2 / total_weight
    w2 = w2 / total_weight

    predictions = [
        pred_df['catboost_with_resampling'].values,
        pred_df['lightgbm_without_resampling'].values,
        pred_df['xgboose_without_resampling'].values,
        pred_df['catboost_without_resampling'].values,
        pred_df['xgboost_with_resampling'].values,
    ]



    ensemble_pred = (w1 * predictions[0] + w2 * predictions[1] + w2 * predictions[2]) / total_weight

    return balanced_log_loss(targs, ensemble_pred)


In [224]:
# optuna.delete_study(
#     study_name="ensemble_weights",
#     storage=optuna_storage,
# )

In [225]:
import optuna

study = optuna.create_study(
    direction="minimize",
    study_name="ensemble_weights",
    storage=optuna_storage,
    load_if_exists=True,
)
study.optimize(objective, n_trials=250)

[I 2023-08-07 13:50:03,133] Using an existing study with name 'ensemble_weights' instead of creating a new one.
[I 2023-08-07 13:50:03,246] Trial 250 finished with value: 0.27218123132705246 and parameters: {'w1': 0.4, 'w2': 0.30000000000000004}. Best is trial 250 with value: 0.27218123132705246.
[I 2023-08-07 13:50:03,308] Trial 251 finished with value: 0.73636065200028 and parameters: {'w1': 0.4, 'w2': 0.2}. Best is trial 250 with value: 0.27218123132705246.
[I 2023-08-07 13:50:03,369] Trial 252 finished with value: 0.41897978684174153 and parameters: {'w1': 0.30000000000000004, 'w2': 0.30000000000000004}. Best is trial 250 with value: 0.27218123132705246.
[I 2023-08-07 13:50:03,429] Trial 253 finished with value: 0.8232912776935746 and parameters: {'w1': 0.2, 'w2': 0.30000000000000004}. Best is trial 250 with value: 0.27218123132705246.
[I 2023-08-07 13:50:03,490] Trial 254 finished with value: 0.27218123132705246 and parameters: {'w1': 0.4, 'w2': 0.30000000000000004}. Best is trial

In [176]:
for trial in study.best_trials:
    print(trial.values)
    print(trial.params)

[0.2579267934168578, 0.8124999999999999]
{'catboost_with_resampling_weight': 0.6000000000000001, 'fastai_without_resampling_weight': 0.1, 'tabpfn_with_resampling_weight': 0.0}
[0.29672064041398166, 0.8314606741573034]
{'catboost_with_resampling_weight': 0.5, 'fastai_without_resampling_weight': 0.5, 'tabpfn_with_resampling_weight': 0.8}
[0.27139644879020103, 0.826086956521739]
{'catboost_with_resampling_weight': 0.7000000000000001, 'fastai_without_resampling_weight': 0.1, 'tabpfn_with_resampling_weight': 0.30000000000000004}
[0.2579267934168578, 0.8124999999999999]
{'catboost_with_resampling_weight': 0.6000000000000001, 'fastai_without_resampling_weight': 0.1, 'tabpfn_with_resampling_weight': 0.0}


In [238]:
def get_weighted_preds(w1, w2):
    predictions = [
        pred_df['fastai_without_resampling'].values,
        pred_df['catboost_with_resampling'].values,
    ]
    
    ensemble_pred = (w1 * predictions[0] + w2 * predictions[1]) / (w1 + w2 )

    return ensemble_pred

In [246]:
en_preds = get_weighted_preds(0.7, 0.3)
print(balanced_log_loss(targs, en_preds))
plot_results(targs, en_preds)

0.3053150559685556


In [247]:
plot_results(targs, pred_df['catboost_with_resampling'])
