In [8]:
from pathlib import Path
import optuna
from fastai.tabular.core import cont_cat_split
import pandas as pd
from sklearn import compose, impute, pipeline, preprocessing, model_selection
import numpy as np
from sklearn import metrics
from catboost import CatBoostClassifier

# Directories (relative to the notebook's working directory): `path` is where
# the input CSVs are read from, `output_path` is the companion output location.
path, output_path = Path("./data"), Path("./output")

In [9]:
import warnings
# Suppress every UserWarning (any message, any module) from this point on.
warnings.simplefilter("ignore", UserWarning)

In [10]:
def balanced_log_loss(y_true, y_pred):
    """Log loss with every sample weighted by the inverse of its class size.

    Args:
        y_true: 1-D array of non-negative integer class labels (it is passed
            to ``np.bincount`` and used to index the resulting counts).
        y_pred: Predictions forwarded unchanged to ``metrics.log_loss``.

    Returns:
        The weighted log loss computed by ``metrics.log_loss``.
    """
    # Number of samples observed for each class label.
    class_counts = np.bincount(y_true)
    # One weight per sample: the reciprocal of the size of that sample's class.
    inverse_class_size = 1 / class_counts[y_true]
    return metrics.log_loss(
        y_true,
        y_pred,
        sample_weight=inverse_class_size,
        eps=1e-15,
    )


class BalancedLogLossMetric:
    """Custom CatBoost eval metric: class-balanced binary log loss.

    CatBoost calls ``evaluate`` with *raw* model scores (log-odds), not
    probabilities, so the scores are passed through a sigmoid before the loss
    is computed. Each sample is weighted by the reciprocal of its class count,
    and probabilities are clipped to ``[1e-15, 1 - 1e-15]``.
    """

    def get_final_error(self, error, weight):
        """Combine the accumulated error and weight returned by ``evaluate``."""
        return error / weight

    def is_max_optimal(self):
        """Return False: this is a loss, so smaller values are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Compute the balanced log loss for one set of predictions.

        Args:
            approxes: Sequence of containers of raw scores; only the first
                container (the binary-classification score) is used.
            target: Container of true labels; cast to ``int`` and expected to
                be 0/1.
            weight: Per-sample weights supplied by CatBoost. Ignored: the
                metric applies its own inverse-class-count weights.

        Returns:
            ``(balanced_logloss, 1.0)`` - the error and a unit weight, so that
            ``get_final_error`` returns the error unchanged.
        """
        y_true = np.array(target).astype(int)
        raw_scores = np.array(approxes[0], dtype=float)

        # Numerically stable sigmoid: 1 / (1 + exp(-x)) == exp(-log(1 + exp(-x))).
        proba_positive = np.exp(-np.logaddexp(0.0, -raw_scores))
        # Clip so that log() below never sees exactly 0.
        proba_positive = np.clip(proba_positive, 1e-15, 1 - 1e-15)

        # Probability the model assigned to each sample's true class.
        proba_of_truth = np.where(y_true == 1, proba_positive, 1.0 - proba_positive)

        # Inverse class-count weights make both classes contribute equally.
        class_counts = np.bincount(y_true)
        sample_weight = 1 / class_counts[y_true]

        weighted_loss = np.sum(-np.log(proba_of_truth) * sample_weight)
        balanced_logloss = float(weighted_loss / np.sum(sample_weight))
        return balanced_logloss, 1.0

In [11]:
def get_tree_preprocess_pipeline():
    """Build the preprocessing pipeline used before the tree model.

    The pipeline has a single step, a distance-weighted KNN imputer with 10
    neighbours, and is configured to emit pandas DataFrames from ``transform``.

    Returns:
        The unfitted scikit-learn pipeline.
    """
    knn_imputer = impute.KNNImputer(n_neighbors=10, weights="distance")
    preprocess = pipeline.make_pipeline(knn_imputer)
    # Keep column names and index by returning DataFrames instead of ndarrays.
    return preprocess.set_output(transform="pandas")


# Load data

dep_vars = ["Class"]  # target column(s)
drop_vars = ["EJ"]  # column(s) removed before modelling

df = pd.read_csv(path / "train.csv", index_col="Id").drop(columns=drop_vars)

# Stratified 60/40 split; the 40% part is the held-out set scored in `objective`.
train_df, test_df = model_selection.train_test_split(
    df,
    test_size=0.4,
    stratify=df[dep_vars],
    random_state=33,
)

preprocessor = get_tree_preprocess_pipeline()

# Preprocess training data: fit the imputer on the training features only,
# then re-attach the target by index.
X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
train_df = X_pre.merge(train_df[dep_vars], left_index=True, right_index=True)
X = train_df.drop(columns=dep_vars, errors="ignore")
y = train_df[dep_vars]

# Preprocess test data with the imputer fitted above (transform only, no refit).
X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
test_df = X_test_pre.merge(test_df[dep_vars], left_index=True, right_index=True)
X_test = test_df.drop(columns=dep_vars, errors="ignore")
y_test = test_df[dep_vars]

# Calculate scale_pos_weight: count of class 0 divided by count of class 1.
# NOTE: the counts come from the full frame `df`, i.e. they include the
# held-out rows as well as the training rows.
class_counts = df['Class'].value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

In [12]:
# Resample
from imblearn.over_sampling import SMOTE


def resample(X, y, random_state=None):
    """Oversample ``(X, y)`` with SMOTE using its default sampling strategy.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.
        random_state: Seed (or RandomState) forwarded to ``SMOTE`` so the
            synthetic samples can be reproduced. The default ``None`` keeps
            the previous unseeded behaviour.

    Returns:
        Tuple ``(X_res, y_res)`` with the resampled features and labels.
    """
    sampler = SMOTE(random_state=random_state)
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res


# Optimize

In [13]:
def objective(trial):
    """Optuna objective: cross-validate one CatBoost configuration.

    Reads the module-level ``X``, ``y``, ``X_test``, ``y_test`` and
    ``scale_pos_weight``.

    A single ``CatBoostClassifier`` is re-fitted on every split of a repeated
    stratified 5-fold CV (2 repeats, 10 fits in total) and scored on the
    matching validation rows with ``balanced_log_loss``. No resampling is
    applied to the training rows.

    Notes:
        * The held-out score is produced by the model as it stands after the
          loop, i.e. the one fitted on the last CV split.
        * The CV splitter is created without a ``random_state``, so the folds
          differ from trial to trial.

    Args:
        trial: The ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Tuple ``(test_loss, test_loss - mean_val_loss)``: the balanced log
        loss on the held-out set, and its difference from the mean
        validation loss over all CV splits.
    """
    # Sampled hyper-parameters; the suggest_* calls keep their original order.
    sampled = {
        "iterations": trial.suggest_int("iterations", 550, 650, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.05, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 5.0, 6.0, step=0.1),
        "depth": trial.suggest_int("depth", 3, 4),
        "random_strength": trial.suggest_float("random_strength", 30.0, 75.0, step=5.0),
        "border_count": trial.suggest_int("border_count", 1, 75),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 0.5, step=0.1),
    }
    # Settings that are the same for every trial.
    fixed = {
        "grow_policy": "SymmetricTree",
        "verbose": 0,
        "scale_pos_weight": scale_pos_weight,
    }

    model = CatBoostClassifier(**sampled, **fixed, eval_metric=BalancedLogLossMetric())
    cv = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

    fold_losses = []
    for fit_rows, holdout_rows in cv.split(X, y):
        fit_features = X.iloc[fit_rows]
        fit_labels = y.iloc[fit_rows].values.ravel()
        holdout_features = X.iloc[holdout_rows]
        holdout_labels = y.iloc[holdout_rows].values.ravel()

        model.fit(fit_features, fit_labels)

        holdout_proba = model.predict_proba(holdout_features)
        fold_losses.append(balanced_log_loss(holdout_labels, holdout_proba))

    # `model` is now the estimator fitted on the final CV split.
    test_proba = model.predict_proba(X_test)
    test_loss = balanced_log_loss(y_test.values.ravel(), test_proba)
    mean_val_loss = np.mean(fold_losses)

    return test_loss, test_loss - mean_val_loss

In [16]:
# optuna.delete_study(
#     study_name="catboost_no_categorical_no_resample",
#     storage="sqlite:////storage/optuna-final.db",
# ) 

In [None]:
# Two-objective study (both values returned by `objective` are minimised),
# stored in SQLite. `load_if_exists=True` reuses a stored study with the
# same name instead of failing.
# NOTE: `objective` never reports intermediate values, so the pruner
# configured here has nothing to act on.
pruner = optuna.pruners.SuccessiveHalvingPruner()
study_settings = {
    "study_name": "catboost_no_categorical_no_resample",
    "storage": "sqlite:////storage/optuna-final.db",
    "load_if_exists": True,
    "directions": ["minimize", "minimize"],
    "pruner": pruner,
}
study = optuna.create_study(**study_settings)
study.optimize(objective, n_trials=250)

[I 2023-08-06 21:55:15,451] A new study created in RDB with name: catboost_no_categorical_no_resample
[I 2023-08-06 21:55:32,193] Trial 0 finished with values: [0.3938893783929825, -0.009228330292742337] and parameters: {'bagging_temperature': 0.2, 'border_count': 66, 'depth': 4, 'iterations': 650, 'l2_leaf_reg': 5.8, 'learning_rate': 0.03495018647309725, 'random_strength': 75.0}. 
[I 2023-08-06 21:55:46,015] Trial 1 finished with values: [0.3230229015473937, -0.03820190500082438] and parameters: {'bagging_temperature': 0.5, 'border_count': 32, 'depth': 3, 'iterations': 600, 'l2_leaf_reg': 5.1, 'learning_rate': 0.04228807098883717, 'random_strength': 45.0}. 
[I 2023-08-06 21:55:59,528] Trial 2 finished with values: [0.3668901703025941, 0.004399347385294072] and parameters: {'bagging_temperature': 0.1, 'border_count': 28, 'depth': 3, 'iterations': 600, 'l2_leaf_reg': 5.5, 'learning_rate': 0.026712283671114233, 'random_strength': 35.0}. 
[I 2023-08-06 21:56:12,011] Trial 3 finished with 