In [1]:
from pathlib import Path

from fastai.tabular.core import cont_cat_split

import optuna

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import compose, impute, pipeline, preprocessing, model_selection
import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier

# Directory holding the input data; train.csv is read from here.
path = Path("./data")
# Directory intended for outputs (not referenced in the cells visible here).
output_path = Path("./output")

In [2]:
def balanced_log_loss(y_true, y_pred):
    """Log loss in which every class contributes equally, regardless of its size.

    Each sample is weighted by the inverse of its class count, so the weighted
    mean computed by ``metrics.log_loss`` equals the average of the per-class
    mean log losses.

    Parameters
    ----------
    y_true : array-like
        Non-negative class labels. Integer-valued floats and column-shaped
        inputs are accepted; they are cast to ``int`` and flattened.
    y_pred : array-like
        Predicted probabilities in any layout accepted by
        ``sklearn.metrics.log_loss``.

    Returns
    -------
    float
        The balanced log loss (lower is better).
    """
    # np.bincount and the fancy indexing below both need 1-D integer labels.
    labels = np.asarray(y_true).astype(int).ravel()
    class_counts = np.bincount(labels)

    # Clip explicitly instead of passing ``eps`` to log_loss: that argument was
    # deprecated in scikit-learn 1.3 and removed in 1.5. The 1e-15 bound is the
    # same one previously supplied through ``eps``.
    probs = np.clip(np.asarray(y_pred, dtype=np.float64), 1e-15, 1 - 1e-15)

    return metrics.log_loss(labels, probs, sample_weight=1 / class_counts[labels])

In [3]:
def get_tree_preprocess_pipeline():
    """Build the feature-preprocessing pipeline used ahead of the tree model.

    The pipeline consists of a single step: a KNN imputer that uses 10
    neighbours weighted by distance. Its transform output is configured to be
    returned as pandas objects.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    knn_imputer = impute.KNNImputer(weights="distance", n_neighbors=10)
    tree_pipeline = pipeline.make_pipeline(knn_imputer)
    # set_output configures the pipeline in place and hands the same object back.
    tree_pipeline.set_output(transform="pandas")
    return tree_pipeline
    
# Load data

df = pd.read_csv(path / "train.csv", index_col="Id")
dep_vars = ["Class"]

# Columns excluded from the feature set before any modelling.
drop_vars = ["EJ"]
# Reassign instead of mutating in place so the frame's lineage stays explicit.
df = df.drop(columns=drop_vars)

# Stratified 60/40 hold-out split with a fixed seed.
train_df, test_df = model_selection.train_test_split(
    df, test_size=0.4, stratify=df[dep_vars], random_state=33
)

preprocessor = get_tree_preprocess_pipeline()

# Preprocess training data: the pipeline is fitted on the training features only.
X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)
X = train_df.drop(columns=dep_vars, errors="ignore")
y = train_df[dep_vars]

# Preprocess test data: reuse the pipeline fitted above (transform only, no refit).
X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)
X_test = test_df.drop(columns=dep_vars, errors="ignore")
y_test = test_df[dep_vars]

# Calculate scale_pos_weight (negatives / positives) from the TRAINING labels only,
# so the class distribution of the held-out rows does not leak into the model
# configuration.
train_class_counts = y["Class"].value_counts()
scale_pos_weight = train_class_counts[0] / train_class_counts[1]

In [4]:
# Resample
from imblearn.over_sampling import SMOTE


def resample(X, y, random_state=33):
    """Oversample the data with SMOTE and return the resampled features and labels.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix to resample.
    y : array-like
        Labels aligned with ``X``.
    random_state : int, RandomState instance or None, default=33
        Seed forwarded to ``SMOTE`` so the synthetic samples are reproducible
        between runs (33 is the same seed value the train/test split uses).
        Pass ``None`` to get a different random draw on every call.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as produced by ``SMOTE.fit_resample``.
    """
    sampler = SMOTE(random_state=random_state)
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res


# Optimize

In [5]:
def objective(trial):
    """Optuna objective: cross-validate an XGBoost classifier and score it on the hold-out set.

    Hyper-parameters are sampled from ``trial``. The model is then fitted once
    per fold of a repeated stratified 5-fold CV (2 repeats) on the module-level
    ``X`` / ``y``; each training fold is oversampled with ``resample`` before
    fitting, while validation folds are left untouched.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    tuple[float, float]
        ``(test_loss, test_loss - mean_val_loss)``: the balanced log loss on the
        module-level ``X_test`` / ``y_test`` and its gap to the mean
        cross-validation loss.
    """
    params = dict(
        booster="gbtree",
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        enable_categorical=True,
        scale_pos_weight=scale_pos_weight,
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=50),
        max_depth=trial.suggest_int("max_depth", 3, 15, step=1),
        # XGBoost documents the valid range of subsample and colsample_by* as
        # (0, 1]; 0.0 is not a legal value, so these searches start at 0.1.
        subsample=trial.suggest_float("subsample", 0.1, 1.0, step=0.1),
        gamma=trial.suggest_float("gamma", 0.0, 1.0, step=0.1),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 10.0, step=0.5),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 10.0, step=0.5),
        min_child_weight=trial.suggest_float("min_child_weight", 0.0, 20.0, step=0.5),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0, step=0.1),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.1, 1.0, step=0.1),
        colsample_bynode=trial.suggest_float("colsample_bynode", 0.1, 1.0, step=0.1),
    )

    model = XGBClassifier(
        **params,
        eval_metric=balanced_log_loss,
    )

    # Fixed seed: every trial is scored on the same folds, so differences between
    # trials come from the hyper-parameters rather than from the fold assignment.
    skf = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=33)

    val_loss_list = []

    for train_idx, val_idx in skf.split(X, y):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        # Oversample the training fold only; validation keeps the real class mix.
        X_res, y_res = resample(X_train, y_train)

        model.fit(X_res, y_res)

        val_preds = model.predict_proba(X_val)
        val_loss_list.append(balanced_log_loss(y_val, val_preds))

    # NOTE(review): `model` here is whatever the final CV fold fitted; it is not
    # refitted on the full training set before the hold-out evaluation.
    test_preds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(y_test.values.ravel(), test_preds)
    mean_val_loss = np.mean(val_loss_list)

    # NOTE(review): the first objective is the hold-out loss itself, so the
    # hyper-parameter search is tuned against the test split -- confirm intended.
    return test_loss, test_loss - mean_val_loss

In [6]:
# optuna.delete_study(
#     study_name="xgboost_bin_age_no_categorical",
#     storage="sqlite:////storage/optuna.db",
# )

In [7]:
# Two-objective study: both values returned by `objective` are minimised.
# The study is stored in SQLite and reused when one with this name already exists.
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so
# the pruner is presumably never consulted -- confirm whether it is needed.
pruner = optuna.pruners.SuccessiveHalvingPruner()
study = optuna.create_study(
    study_name="xgboost_no_categorical_with_resample",
    storage="sqlite:////storage/optuna-final.db",
    load_if_exists=True,
    directions=["minimize", "minimize"],
    pruner=pruner,
)
study.optimize(func=objective, n_trials=250)

[I 2023-08-06 21:06:22,706] A new study created in RDB with name: xgboost_no_categorical_with_resample
[I 2023-08-06 21:06:30,847] Trial 0 finished with values: [0.47771575405788785, -0.01596796122948624] and parameters: {'colsample_bylevel': 1.0, 'colsample_bynode': 0.2, 'colsample_bytree': 0.9, 'gamma': 1.0, 'learning_rate': 0.00528000154286599, 'max_depth': 11, 'min_child_weight': 19.0, 'n_estimators': 650, 'reg_alpha': 9.5, 'reg_lambda': 5.5, 'subsample': 0.7000000000000001}. 
[I 2023-08-06 21:06:36,993] Trial 1 finished with values: [0.31884632471428426, -0.00973987117083902] and parameters: {'colsample_bylevel': 1.0, 'colsample_bynode': 0.1, 'colsample_bytree': 1.0, 'gamma': 0.7000000000000001, 'learning_rate': 0.058486833848859165, 'max_depth': 14, 'min_child_weight': 4.5, 'n_estimators': 700, 'reg_alpha': 8.0, 'reg_lambda': 7.5, 'subsample': 0.8}. 
[I 2023-08-06 21:06:41,665] Trial 2 finished with values: [0.31327739641111113, -0.04616561541019887] and parameters: {'colsample_b