In [1]:
from pathlib import Path

import catboost as cb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Base random seed for the notebook.
seed = 42
# Directory (relative to the notebook) that the input files are read from.
data_path = Path("../data/")

In [3]:
def balanced_log_loss(y_true, y_pred):
    """Class-weighted binary log loss.

    The mean negative log-likelihood of each class is weighted by the inverse
    of that class's prevalence in ``y_true``; the weighted sum is then divided
    by the total weight.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Binary labels (0 or 1). Lists, NumPy arrays and pandas Series are
        accepted; values are cast to ``int``.
    y_pred : array-like of shape (n,)
        Predicted probability of class 1 for each sample.

    Returns
    -------
    float
        The weighted log loss.

    Raises
    ------
    ValueError
        If ``y_true`` does not contain at least one sample of each class, or
        contains labels other than 0 and 1.
    """
    y_true = np.asarray(y_true).astype(int)
    # Clip so that log(0) can never occur for over-confident predictions.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)

    # minlength=2 guarantees a count for class 1 even when it is absent, so
    # the degenerate case is reported explicitly instead of failing on the
    # unpack below or silently dividing by zero.
    counts = np.bincount(y_true, minlength=2)
    if counts.size != 2 or not counts.all():
        raise ValueError(
            "balanced_log_loss requires binary labels with at least one "
            "sample of each class (0 and 1)"
        )
    n0, n1 = counts

    # Inverse-prevalence class weights.
    w0 = 1 / (n0 / len(y_true))
    w1 = 1 / (n1 / len(y_true))

    l0 = -w0 / n0 * np.sum(np.where(y_true == 0, 1, 0) * np.log(1 - y_pred))
    l1 = -w1 / n1 * np.sum(np.where(y_true != 0, 1, 0) * np.log(y_pred))

    return (l0 + l1) / (w0 + w1)


In [4]:
# Load the three input tables from the data directory.
train_df, test_df, greeks_df = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ("train.csv", "test.csv", "greeks.csv")
)

In [5]:
# Some header names carry surrounding whitespace; normalise them in both
# frames so the same column names can be used for train and test.
train_df.columns = train_df.columns.map(str.strip)
test_df.columns = test_df.columns.map(str.strip)

# Features are every training column except the first and the last
# (presumably the row Id and the "Class" target -- verify against the CSV).
feature_cols = train_df.columns[1:-1].tolist()

In [6]:
# Hold out 20% of the labelled rows, stratified on the target, as a
# validation set. The notebook-level `seed` is used instead of a repeated
# literal so the split follows the configured seed.
train_df, val_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df["Class"], random_state=seed
)

# Select the metadata rows matching the training split. This must happen
# before the index is reset, while train_df still carries its original row
# labels (assumes greeks.csv rows are aligned with train.csv -- confirm).
greeks_df = greeks_df.loc[train_df.index].reset_index(drop=True)

# Renumber the training rows and drop the identifier column. Rebinding
# (rather than inplace=True) avoids mutating the frame returned by the split.
train_df = train_df.reset_index(drop=True).drop(columns=["Id"])

In [7]:
# Train `num_ensembles` independent ensemble members. Each member
# under-samples the training data with its own seed, fits one CatBoost model
# per CV fold, and predicts the hold-out validation set and the test set with
# every fold model.
num_ensembles = 25

all_val_preds = []   # per member: list of per-fold P(class 1) on val_df
all_test_preds = []  # per member: list of per-fold class probabilities on test_df

for ens in range(num_ensembles):
    print(f"Ensemble-{ens + 1}".center(150, "*"))

    # Balance the classes by random under-sampling; seeding with `ens` gives
    # every member a different subsample.
    under_sampler = RandomUnderSampler(random_state=ens)
    df, _ = under_sampler.fit_resample(train_df, train_df["Class"])
    X_all, y_all = df[feature_cols], df["Class"]

    # The hyper-parameters only depend on the member, not on the fold, so
    # they are built once per member.
    params = {
        "iterations": 10000,
        "early_stopping_rounds": 1000,
        "use_best_model": True,
        "random_seed": ens,
    }

    oof_preds = np.zeros(len(df))
    val_preds = []
    test_preds = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=ens)
    for train_idx, val_idx in skf.split(X_all, y_all):
        # split() yields positional indices, so rows are selected with .iloc;
        # this does not rely on the resampled frame having a 0..n-1 index.
        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_val, y_val = X_all.iloc[val_idx], y_all.iloc[val_idx]

        model = cb.CatBoostClassifier(**params)
        # NOTE(review): the fold's validation split doubles as the
        # early-stopping eval set, so the OOF score below is optimistic.
        model.fit(
            X_train, y_train, eval_set=[(X_val, y_val)], cat_features=["EJ"], verbose=False
        )

        # out-of-fold predictions, written at the fold's positional indices
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

        # hold-out validation predictions (probability of class 1)
        val_preds.append(model.predict_proba(val_df[feature_cols])[:, 1])

        # test predictions: probabilities for both classes are kept, and the
        # features are selected by name, consistent with the validation set
        test_preds.append(model.predict_proba(test_df[feature_cols]))

    print(f"OOF score: {balanced_log_loss(y_all, oof_preds):.4f}")
    print(f"CV score: {balanced_log_loss(val_df['Class'], np.mean(val_preds, axis=0)):.4f}")

    # save predictions
    all_val_preds.append(val_preds.copy())
    all_test_preds.append(test_preds.copy())


**********************************************************************Ensemble-1**********************************************************************
OOF score: 0.2565
CV score: 0.3018
**********************************************************************Ensemble-2**********************************************************************
OOF score: 0.2843
CV score: 0.3720
**********************************************************************Ensemble-3**********************************************************************
OOF score: 0.1918
CV score: 0.2743
**********************************************************************Ensemble-4**********************************************************************
OOF score: 0.2412
CV score: 0.2693
**********************************************************************Ensemble-5**********************************************************************
OOF score: 0.2895
CV score: 0.3170
**********************************************************************

In [8]:
# Average the hold-out predictions over the folds of each member (axis 1),
# then over the ensemble members (axis 0), and score the blended result.
mean_val_preds = np.asarray(all_val_preds).mean(axis=1).mean(axis=0)
cv_score = balanced_log_loss(val_df["Class"], mean_val_preds)
print(f"CV score: {cv_score:.4f}")

CV score: 0.3073


In [9]:
# Build the submission: average the class probabilities over the folds of
# each member, then over the members, and write one row per test Id.
test_probs = np.asarray(all_test_preds).mean(axis=1).mean(axis=0)

sub_df = pd.DataFrame({"Id": test_df["Id"]})
sub_df["Class_0"] = test_probs[:, 0]
sub_df["Class_1"] = test_probs[:, 1]

sub_df.to_csv("submission.csv", index=False)