In [1]:
from pathlib import Path
import optuna

import pandas as pd
from sklearn import compose, impute, pipeline, preprocessing, model_selection
import numpy as np
from sklearn import metrics
from lightgbm import LGBMClassifier

path = Path("./data")
output_path = Path("./output")

In [2]:
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)

In [3]:
def add_binned_age(df):
    df["Age_binned"] = pd.cut(df["BN"], bins=20, labels=False)
    return df

In [4]:
def get_tree_preprocess_pipeline():
    feature_engineering = (
        pipeline.make_pipeline(
            preprocessing.FunctionTransformer(add_binned_age, validate=False),
        ),
        compose.make_column_selector(dtype_include='float64'),
    )

    categorical_imputing = (
        pipeline.make_pipeline(
            impute.SimpleImputer(strategy="most_frequent"),
            preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
        compose.make_column_selector(dtype_include='object'),
    )

    return pipeline.make_pipeline(
        compose.make_column_transformer(
            feature_engineering,
            categorical_imputing,
            remainder='passthrough',
            verbose_feature_names_out=True
        ),
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")

In [5]:
df = pd.read_csv(path / "train.csv", index_col="Id")
dep_vars = ["Class"]

train_df, test_df = model_selection.train_test_split(df, test_size=0.4, stratify=df[dep_vars], random_state=33)

In [6]:
preprocessor = get_tree_preprocess_pipeline()
X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)



In [7]:
X = train_df.drop(columns=dep_vars, errors="ignore")
y = train_df[dep_vars]
train_df.shape

(370, 58)

In [8]:
X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)

In [9]:
X_test = test_df.drop(columns=dep_vars, errors="ignore")
y_test = test_df[dep_vars]
test_df.shape

(247, 58)

In [10]:
scale_pos_weight = df['Class'].value_counts()[0] / df['Class'].value_counts()[1]


# Optimize

In [14]:
def objective(trial):
    params = dict(
        boosting_type="gbdt",
        device="gpu",
        scale_pos_weight=scale_pos_weight,
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=50),
        max_depth=trial.suggest_int("max_depth", 3, 15, step=1),
        subsample=trial.suggest_float("subsample", 0.1, 1.0, step=0.1),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 10.0, step=0.5),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 10.0, step=0.5),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0, step=0.1),
        colsample_bynode=trial.suggest_float("colsample_bynode", 0.1, 1.0, step=0.1),
        data_sample_strategy="bagging",
        num_leaves=trial.suggest_int("num_leaves", 1, 131072),
        max_bin=trial.suggest_int("max_bin", 2, 1000),
        n_jobs=-1,
    )

    model = LGBMClassifier(**params, objective="binary", verbosity=-1)

    skf = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()

        model.fit(X_train, y_train, eval_metric=balanced_log_loss)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)
        
    
    test_preds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(y_test.values.ravel(), test_preds)
    mean_val_loss = np.mean(val_loss_list)

    return test_loss, test_loss - mean_val_loss

In [15]:
# optuna.delete_study(
#     study_name="lightgbm_bin_age",
#     storage="sqlite:///optuna.db",
# )

In [16]:
pruner = optuna.pruners.SuccessiveHalvingPruner()
study = optuna.create_study(
    pruner=pruner,
    directions=["minimize", "minimize"],
    study_name="lightgbm_bin_age",
    storage="sqlite:///optuna.db",
    load_if_exists=True,
)
study.optimize(objective, n_trials=250)

[I 2023-08-06 15:21:16,419] A new study created in RDB with name: lightgbm_bin_age
[I 2023-08-06 15:21:22,079] Trial 0 finished with values: [0.37290371391466787, 0.014559690552709448] and parameters: {'colsample_bynode': 0.30000000000000004, 'colsample_bytree': 0.2, 'learning_rate': 0.017537464694188272, 'max_bin': 231, 'max_depth': 6, 'n_estimators': 1000, 'num_leaves': 94273, 'reg_alpha': 5.0, 'reg_lambda': 9.0, 'subsample': 0.8}. 
[I 2023-08-06 15:21:25,648] Trial 1 finished with values: [0.4238780487329025, 0.003330404722431235] and parameters: {'colsample_bynode': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.026306641057335316, 'max_bin': 139, 'max_depth': 14, 'n_estimators': 100, 'num_leaves': 90177, 'reg_alpha': 3.5, 'reg_lambda': 8.5, 'subsample': 0.1}. 
[I 2023-08-06 15:21:32,714] Trial 2 finished with values: [0.3683269309592376, -0.02466956130352338] and parameters: {'colsample_bynode': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.052194574934478946, 'max_bin': 416,