In [None]:
from pathlib import Path

from fastai.tabular.core import cont_cat_split

import optuna

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import compose, impute, pipeline, preprocessing, model_selection
import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier

path = Path("./data")
output_path = Path("./output")

In [2]:
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)

In [3]:
def add_binned_age(df):
    df["Age_binned"] = pd.cut(df["BN"], bins=20, labels=False)
    return df

In [4]:
def get_tree_preprocess_pipeline():
    feature_engineering = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(add_binned_age, validate=False),
        ),
        compose.make_column_selector(dtype_include='float64'),
    )

    categorical_imputing = (
        pipeline.make_pipeline(
            impute.SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
        compose.make_column_selector(dtype_include='object'),
    )

    return pipeline.make_pipeline(
        compose.make_column_transformer(
            feature_engineering,
            categorical_imputing,
            remainder='passthrough',
            verbose_feature_names_out=True
        ),
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")

In [5]:
df = pd.read_csv(path / "train.csv", index_col="Id")
dep_vars = ["Class"]

train_df, test_df = model_selection.train_test_split(df, test_size=0.4, stratify=df[dep_vars], random_state=33)

In [6]:
preprocessor = get_tree_preprocess_pipeline()
X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)



In [7]:
X = train_df.drop(columns=dep_vars, errors="ignore")
y = train_df[dep_vars]
train_df.shape

(370, 58)

In [8]:
X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)

In [9]:
X_test = test_df.drop(columns=dep_vars, errors="ignore")
y_test = test_df[dep_vars]
test_df.shape

(247, 58)

In [10]:
scale_pos_weight = df['Class'].value_counts()[0] / df['Class'].value_counts()[1]


# Optimize

In [11]:
def objective(trial):
    params = dict(
        booster="gbtree",
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        enable_categorical=True,
        scale_pos_weight=scale_pos_weight,
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=50),
        max_depth=trial.suggest_int("max_depth", 3, 15, step=1),
        subsample=trial.suggest_float("subsample", 0.0, 1.0, step=0.1),
        gamma=trial.suggest_float("gamma", 0.0, 1.0, step=0.1),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 10.0, step=0.5),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 10.0, step=0.5),
        min_child_weight=trial.suggest_float("min_child_weight", 0.0, 20.0, step=0.5),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.0, 1.0, step=0.1),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.0, 1.0, step=0.1),
        colsample_bynode=trial.suggest_float("colsample_bynode", 0.0, 1.0, step=0.1),
    )

    model = XGBClassifier(
        **params,
        eval_metric=balanced_log_loss,
    )

    skf = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        model.fit(X_train, y_train)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)

    test_preds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(y_test.values.ravel(), test_preds)
    mean_val_loss = np.mean(val_loss_list)
    
    return test_loss, test_loss - mean_val_loss

In [12]:
# optuna.delete_study(
#     study_name="xgboost_bin_age",
#     storage="sqlite:////storage/optuna.db",
# )

In [None]:
pruner = optuna.pruners.SuccessiveHalvingPruner()
study = optuna.create_study(
    pruner=pruner,
    directions=["minimize", "minimize"],
    study_name="xgboost_bin_age",
    storage="sqlite:////storage/optuna.db",
    load_if_exists=True,
)
study.optimize(objective, n_trials=250)

[I 2023-08-06 19:16:50,552] A new study created in RDB with name: xgboost_bin_age
[I 2023-08-06 19:17:03,617] Trial 0 finished with values: [0.4081645784316923, -0.007397330130071433] and parameters: {'colsample_bylevel': 0.8, 'colsample_bynode': 0.8, 'colsample_bytree': 0.30000000000000004, 'gamma': 0.4, 'learning_rate': 0.004129165935735537, 'max_depth': 13, 'min_child_weight': 7.0, 'n_estimators': 950, 'reg_alpha': 0.0, 'reg_lambda': 6.0, 'subsample': 0.5}. 
[I 2023-08-06 19:17:05,836] Trial 1 finished with values: [0.45617113386036545, -0.013390874933755792] and parameters: {'colsample_bylevel': 0.5, 'colsample_bynode': 0.5, 'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.2377255116922506, 'max_depth': 3, 'min_child_weight': 13.0, 'n_estimators': 150, 'reg_alpha': 10.0, 'reg_lambda': 2.0, 'subsample': 0.30000000000000004}. 
[I 2023-08-06 19:17:11,243] Trial 2 finished with values: [0.30265503097725066, -0.05181826209174806] and parameters: {'colsample_bylevel': 0.70000000