In [None]:
from pathlib import Path

import catboost as cb
import numpy as np
import optuna
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# Notebook-wide settings.
# `seed` is reused wherever a random state is needed; `data_path` is a
# relative directory, so it resolves against the current working directory.
seed = 42
data_path = Path("../data/")

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    The log loss is averaged separately over the negative and the positive
    samples. The two per-class means are then combined with weights inversely
    proportional to class frequency (``w_k = len(y_true) / n_k``) and the
    result is normalised by ``w0 + w1``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list, tuple, ndarray or Series). Samples equal to
        0 form the negative class, every other sample the positive class.
        Labels must cast to the integers 0 and 1, and both classes must occur.
    y_pred : array-like or float
        Predicted probability of the positive class. Values are clipped to
        ``[1e-15, 1 - 1e-15]`` so the logarithms stay finite.

    Returns
    -------
    float
        The balanced log loss; lower is better.

    Raises
    ------
    ValueError
        If ``y_true`` is empty, lacks one of the two classes, or contains
        labels outside {0, 1}.
    """
    # np.asarray lets callers pass plain sequences, not only objects that
    # already provide ``.astype``.
    truth = np.asarray(y_true)
    proba = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # minlength=2 keeps the count vector two-wide even when class 1 is absent,
    # so the validation below can report the problem explicitly.
    counts = np.bincount(truth.astype(int), minlength=2)
    if len(counts) != 2 or not counts.all():
        raise ValueError(
            "balanced_log_loss needs binary labels with at least one sample "
            "of each class; got class counts {}".format(counts.tolist())
        )
    n0, n1 = counts

    total = len(truth)
    w0 = 1 / (n0 / total)
    w1 = 1 / (n1 / total)

    # Each class only accumulates the log-probability of its own samples.
    is_negative = truth == 0
    l0 = -w0 / n0 * np.sum(np.where(is_negative, np.log(1 - proba), 0.0))
    l1 = -w1 / n1 * np.sum(np.where(is_negative, 0.0, np.log(proba)))

    return (l0 + l1) / (w0 + w1)


In [None]:
# Load the three CSV inputs from the data directory: train, test, then greeks.
train_df, test_df, greeks_df = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ("train.csv", "test.csv", "greeks.csv")
)

In [None]:
# Some column headers carry surrounding whitespace; strip it on both frames
# so columns can be addressed by their clean names.
train_df.columns, test_df.columns = (
    frame.columns.str.strip() for frame in (train_df, test_df)
)

# Features are every train column except the first and the last one.
feature_cols = list(train_df.columns[1:-1])

In [None]:
def objective(trial):
    """Optuna objective: out-of-fold balanced log loss of a CatBoost classifier.

    Samples one hyper-parameter set from ``trial``, trains one model per fold
    of a 5-fold multilabel-stratified split, collects the positive-class
    probabilities predicted for each held-out fold, and scores the assembled
    out-of-fold vector with ``balanced_log_loss``.

    Relies on the notebook globals ``train_df``, ``greeks_df``,
    ``feature_cols`` and ``seed``. ``greeks_df`` is assumed to be row-aligned
    with ``train_df`` (TODO confirm), because its columns drive the
    stratification of the train rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyper-parameters.

    Returns
    -------
    float
        Balanced log loss over all out-of-fold predictions (lower is better).
    """
    params = {
        "iterations": trial.suggest_int("iterations", 1000, 10000),
        "use_best_model": True,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_int("random_strength", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 100.00),
        "grow_policy": "Lossguide",
        "auto_class_weights": "Balanced",
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 50),
    }

    # Loop-invariant selections, done once instead of once per fold.
    features = train_df[feature_cols]
    target = train_df["Class"]
    # Stratify on every greeks column except the first and the last one.
    strat_labels = greeks_df.iloc[:, 1:-1]

    skf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros(len(train_df))

    for train_idx, val_idx in skf.split(train_df, strat_labels):
        # The splitter yields positional indices, so select rows with .iloc;
        # label-based .loc is only equivalent while train_df keeps its default
        # RangeIndex. The write into `oof` below is positional as well.
        X_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
        X_val, y_val = features.iloc[val_idx], target.iloc[val_idx]

        # NOTE(review): the validation fold is passed as eval_set while
        # use_best_model is on, and the same fold is scored below, so the
        # returned loss is presumably optimistic - confirm this is intended.
        model = cb.CatBoostClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            cat_features=["EJ"],
            verbose=0,
        )

        # Column 1 of predict_proba holds the positive-class probability.
        oof[val_idx] = model.predict_proba(X_val)[:, 1]

    return balanced_log_loss(target, oof)


In [None]:
# Seed the sampler with the notebook-wide seed so the sequence of suggested
# hyper-parameters is repeatable after a kernel restart. TPESampler is the
# sampler Optuna uses by default for single-objective studies, so only the
# seeding changes, not the search algorithm.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=3)

# Report the outcome of the search.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))