In [1]:
from pathlib import Path

import catboost as cb
from catboost import Pool, cv
import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
# Directory the CSV inputs are read from; a relative path, so it resolves
# against the notebook's current working directory.
data_path = Path("../data/")
# Random seed passed as `random_state` to the fold splitter.
seed = 42

In [3]:
def balanced_log_loss(y_true, y_pred):
    """Class-balanced binary log loss.

    Each class contributes its mean negative log-likelihood scaled by the
    inverse of its frequency (``w_c = N / n_c``); the two weighted terms are
    summed and divided by ``w_0 + w_1``.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Binary labels. Anything ``np.asarray`` accepts (list, ndarray,
        pandas Series). A sample is treated as class 0 when its label equals
        0 and as class 1 otherwise.
    y_pred : array-like of shape (n_samples,)
        Predicted probability of class 1. Converted to float64 and clipped to
        ``[1e-15, 1 - 1e-15]`` before logs are taken.

    Returns
    -------
    float
        The balanced log loss (lower is better).

    Raises
    ------
    ValueError
        If ``y_true`` does not contain at least one sample of each of the two
        classes, or contains integer labels other than 0 and 1.
    """
    y_true = np.asarray(y_true)
    # Force float64 before clipping: in float32 the bound ``1 - 1e-15`` rounds
    # to exactly 1.0, so a prediction of 1.0 would survive the clip and
    # produce log(0) = -inf below.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)

    # minlength=2 keeps the result two-element even when class 1 is absent,
    # so the problem can be reported clearly instead of failing on unpacking.
    counts = np.bincount(y_true.astype(int), minlength=2)
    if counts.size != 2 or counts.min() == 0:
        raise ValueError(
            "balanced_log_loss requires binary labels with at least one "
            f"sample of each class; got per-label counts {counts.tolist()}"
        )
    n0, n1 = counts
    n_total = len(y_true)

    # Inverse class-frequency weights.
    w0 = 1 / (n0 / n_total)
    w1 = 1 / (n1 / n_total)

    is_class0 = y_true == 0
    l0 = -w0 / n0 * np.sum(np.where(is_class0, np.log(1 - y_pred), 0.0))
    l1 = -w1 / n1 * np.sum(np.where(is_class0, 0.0, np.log(y_pred)))

    return (l0 + l1) / (w0 + w1)


def lgb_metric(y_true, y_pred):
    """Wrap ``balanced_log_loss`` as a ``(name, value, flag)`` triple.

    Returns the metric name, the balanced log loss of ``y_pred`` against
    ``y_true``, and ``False``.

    NOTE(review): the function name and the tuple shape suggest this is meant
    as a LightGBM custom eval metric, where the third element would be
    "is higher better" — confirm against the caller.
    """
    metric_name = "balanced_log_loss"
    metric_value = balanced_log_loss(y_true, y_pred)
    return metric_name, metric_value, False

In [4]:
# Load the three CSV inputs from `data_path` into DataFrames.
train_df = pd.read_csv(data_path.joinpath("train.csv"))
test_df = pd.read_csv(data_path.joinpath("test.csv"))
greeks_df = pd.read_csv(data_path.joinpath("greeks.csv"))

In [5]:
# Some header names carry surrounding whitespace; strip it in both frames so
# columns can be addressed by their clean names.
train_df.columns, test_df.columns = (
    frame.columns.str.strip() for frame in (train_df, test_df)
)
# Feature columns are selected by position: everything except the first and
# the last column of the training frame.
feature_cols = train_df.columns[1:-1].tolist()

In [6]:
# Training state shared by the cross-validation code.
# Out-of-fold prediction buffer: one float slot per training row.
oof = np.zeros(train_df.shape[0])
# 5-fold shuffled multilabel-stratified splitter, seeded with `seed`.
skf = MultilabelStratifiedKFold(random_state=seed, shuffle=True, n_splits=5)
# Accumulates one entry of test-set predictions per fold.
test_probs = list()

In [7]:
# Parameters for CatBoost's built-in cross-validation. The commented-out
# entries are alternative settings kept for experimentation.
cv_params = {
    "iterations": 10000,
    "learning_rate": 0.005,
    # "early_stopping_rounds": 1000,
    # "auto_class_weights": "Balanced",
    "loss_function": "MultiClass",
    # "eval_metric": "MultiClass:use_weights=False",
    # "random_seed": 42,
    # "use_best_model": True,
    # "l2_leaf_reg": 1,
    # "max_ctr_complexity": 15,
    # "max_depth": 10,
    # "grow_policy": "Lossguide",
    # "max_leaves": 64,
    # "min_data_in_leaf": 40,
    "logging_level": "Silent",
}
# Full training set as a CatBoost Pool; "EJ" is declared as a categorical
# feature and "Class" is the label.
cv_dataset = Pool(
    data=train_df[feature_cols], label=train_df["Class"], cat_features=["EJ"]
)
# `plot` is a boolean flag. The string "True" only worked because any
# non-empty string is truthy (so would "False"); pass a real bool instead.
scores = cv(cv_dataset, cv_params, fold_count=5, plot=True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [9]:
# Preview the first rows of the per-iteration metrics table returned by `cv`.
scores.head()

Unnamed: 0,iterations,test-MultiClass-mean,test-MultiClass-std,train-MultiClass-mean,train-MultiClass-std
0,0,0.690798,0.000212,0.690612,0.000157
1,1,0.688434,0.000548,0.688061,0.000473
2,2,0.685976,0.000775,0.685481,0.000724
3,3,0.683862,0.000765,0.683088,0.00048
4,4,0.681669,0.000639,0.680663,0.000315


In [8]:
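# NOTE: manual per-fold training loop, currently disabled (every line below is
# commented out). The training log shown under this cell comes from an earlier
# run, made before the loop was commented out.
# for train_idx, val_idx in skf.split(train_df, greeks_df.iloc[:, 1:-1]):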

#     X_train, y_train = (
#         train_df.loc[train_idx, feature_cols],
#         train_df.loc[train_idx, "Class"],
#     )

#     X_val, y_val = (
#         train_df.loc[val_idx, feature_cols],
#         train_df.loc[val_idx, "Class"],
#     )

#     params = {
#         "iterations": 10000,
#         "learning_rate": 0.005,
#         "early_stopping_rounds": 1000,
#         "auto_class_weights": "Balanced",
#         "loss_function": "MultiClass",
#         "eval_metric": "MultiClass:use_weights=False",
#         "random_seed": 42,
#         "use_best_model": True,
#         "l2_leaf_reg": 1,
#         "max_ctr_complexity": 15,
#         "max_depth": 10,
#         "grow_policy": "Lossguide",
#         "max_leaves": 64,
#         "min_data_in_leaf": 40,
#     }
#     model = cb.CatBoostClassifier(**params)
#     model.fit(
#         X_train, y_train, eval_set=[(X_val, y_val)], cat_features=["EJ"], verbose=1000
#     )
#     preds = model.predict_proba(X_val)
#     oof[val_idx] = model.predict_proba(X_val)[:, 1]
#     test_probs.append(model.predict_proba(test_df.iloc[:, 1:]))

# print(f"OOF score: {balanced_log_loss(train_df['Class'], oof):.4f}")


0:	learn: 0.6906482	test: 0.6912274	best: 0.6912274 (0)	total: 174ms	remaining: 29m 3s
1000:	learn: 0.1061947	test: 0.2128374	best: 0.2128374 (1000)	total: 9.65s	remaining: 1m 26s
2000:	learn: 0.0252801	test: 0.1843363	best: 0.1838559 (1943)	total: 18.6s	remaining: 1m 14s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.1838559036
bestIteration = 1943

Shrink model to first 1944 iterations.
0:	learn: 0.6908401	test: 0.6908988	best: 0.6908988 (0)	total: 7.75ms	remaining: 1m 17s
1000:	learn: 0.1083615	test: 0.2026649	best: 0.2026576 (999)	total: 9.22s	remaining: 1m 22s
2000:	learn: 0.0247290	test: 0.1568647	best: 0.1567780 (1989)	total: 18.3s	remaining: 1m 13s
3000:	learn: 0.0092830	test: 0.1598822	best: 0.1564179 (2187)	total: 27.2s	remaining: 1m 3s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.1564179385
bestIteration = 2187

Shrink model to first 2188 iterations.
0:	learn: 0.6906621	test: 0.6909199	best: 0.6909199 (0)	total: 20.1ms	remainin

In [9]:
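# # generate a submission file
# # (disabled: `test_probs` is only filled by the manual CV loop, which is
# # itself commented out)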
# test_probs = np.mean(test_probs, axis=0)
# sub_df = pd.DataFrame(
#     {"Id": test_df.Id, "Class_0": test_probs[:, 0], "Class_1": test_probs[:, 1]}
# )
# sub_df.to_csv("submission.csv", index=False)
