In [1]:
from pathlib import Path

import catboost as cb
import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
data_path = Path("../data/")
seed = 42

In [3]:
def balanced_log_loss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    n0, n1 = np.bincount(y_true.astype(int))
    w0 = 1 / (n0 / len(y_true))
    w1 = 1 / (n1 / len(y_true))

    l0 = -w0 / n0 * np.sum(np.where(y_true == 0, 1, 0) * np.log(1 - y_pred))
    l1 = -w1 / n1 * np.sum(np.where(y_true != 0, 1, 0) * np.log(y_pred))

    return (l0 + l1) / (w0 + w1)


In [4]:
# read data
train_df = pd.read_csv(data_path / "train.csv")
test_df = pd.read_csv(data_path / "test.csv")
greeks_df = pd.read_csv(data_path / "greeks.csv")

In [5]:
# some columns have trailing spaces
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()
feature_cols = train_df.columns.tolist()[1:-1]

In [6]:
train_df, val_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df["Class"], random_state=42
)
greeks_df = greeks_df.loc[train_df.index]

train_df = train_df.reset_index()
greeks_df = greeks_df.reset_index()

In [7]:
# training
oof = np.zeros(len(train_df))
skf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
test_probs = []
val_preds = []

In [8]:
fold = 1
for train_idx, val_idx in skf.split(train_df, greeks_df.iloc[:, 1:-1]):

    print(f"Fold-{fold}".center(110, "-"))
    fold += 1

    X_train, y_train = (
        train_df.loc[train_idx, feature_cols],
        train_df.loc[train_idx, "Class"],
    )

    X_val, y_val = (
        train_df.loc[val_idx, feature_cols],
        train_df.loc[val_idx, "Class"],
    )

    params = {
        "iterations": 1178,
        "use_best_model": True,
        "learning_rate": 0.02307884135288726,
        "l2_leaf_reg": 8,
        "random_strength": 2,
        "bagging_temperature": 7.043228545140888,
        "grow_policy": "Lossguide",
        "auto_class_weights": "Balanced",
        "od_type": "IncToDec",
        "od_wait": 36,
    }
    # params = {
    #     "iterations": 10000,
    #     "learning_rate": 0.005,
    #     "early_stopping_rounds": 1000,
    #     "auto_class_weights": "Balanced",
    #     "loss_function": "MultiClass",
    #     "eval_metric": "MultiClass:use_weights=False",
    #     "random_seed": 42,
    #     "use_best_model": True,
    #     "l2_leaf_reg": 1,
    #     "max_ctr_complexity": 15,
    #     "max_depth": 10,
    #     "grow_policy": "Lossguide",
    #     "max_leaves": 64,
    #     "min_data_in_leaf": 40,
    # }

    model = cb.CatBoostClassifier(**params)
    model.fit(
        X_train, y_train, eval_set=[(X_val, y_val)], cat_features=["EJ"], verbose=1000
    )

    # make oof preds
    oof[val_idx] = model.predict_proba(X_val)[:, 1]

    # make val preds
    val_preds.append(model.predict_proba(val_df[feature_cols])[:, 1])

    # make test predictions
    test_preds = model.predict_proba(test_df.iloc[:, 1:])
    test_probs.append(test_preds)

print("-" * 110)
print(f"OOF score: {balanced_log_loss(train_df['Class'], oof):.4f}")
print(f"CV score: {balanced_log_loss(val_df['Class'], np.mean(val_preds, axis=0)):.4f}")


----------------------------------------------------Fold-1----------------------------------------------------
0:	learn: 0.6791237	test: 0.6840397	best: 0.6840397 (0)	total: 64.7ms	remaining: 1m 16s
1000:	learn: 0.0056753	test: 0.3520920	best: 0.2878196 (243)	total: 4.72s	remaining: 834ms
1177:	learn: 0.0048033	test: 0.3578826	best: 0.2878196 (243)	total: 5.47s	remaining: 0us

bestTest = 0.2878196194
bestIteration = 243

Shrink model to first 244 iterations.
----------------------------------------------------Fold-2----------------------------------------------------
0:	learn: 0.6804877	test: 0.6831767	best: 0.6831767 (0)	total: 3.57ms	remaining: 4.2s
1000:	learn: 0.0049614	test: 0.4457416	best: 0.3444903 (273)	total: 5.09s	remaining: 901ms
1177:	learn: 0.0043805	test: 0.4606609	best: 0.3444903 (273)	total: 6.09s	remaining: 0us

bestTest = 0.344490309
bestIteration = 273

Shrink model to first 274 iterations.
----------------------------------------------------Fold-3-------------------

In [9]:
# OOF score: 0.4016
# CV score: 0.4071

In [10]:
# generate a submission file
test_probs = np.mean(test_probs, axis=0)
sub_df = pd.DataFrame(
    {"Id": test_df.Id, "Class_0": test_probs[:, 0], "Class_1": test_probs[:, 1]}
)
sub_df.to_csv("submission.csv", index=False)
