In [8]:
from pathlib import Path
import optuna
from fastai.tabular.core import cont_cat_split
import pandas as pd
from sklearn import compose, impute, pipeline, preprocessing, model_selection
import numpy as np
from sklearn import metrics
from lightgbm import LGBMClassifier

path = Path("./data")
output_path = Path("./output")

In [9]:
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return metrics.log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], eps=1e-15)

In [10]:
def get_tree_preprocess_pipeline():

    return pipeline.make_pipeline(
        impute.KNNImputer(n_neighbors=10, weights="distance"),
    ).set_output(transform="pandas")
    
# Load data
    
df = pd.read_csv(path / "train.csv", index_col="Id")
dep_vars = ["Class"]

drop_vars = ["EJ"]
df.drop(columns=drop_vars, inplace=True)

train_df, test_df = model_selection.train_test_split(df, test_size=0.4, stratify=df[dep_vars], random_state=33)

preprocessor = get_tree_preprocess_pipeline()

# Preprocess training data
X_pre = preprocessor.fit_transform(train_df.drop(columns=dep_vars))
train_df = pd.merge(X_pre, train_df[dep_vars], left_index=True, right_index=True)
X = train_df.drop(columns=dep_vars, errors="ignore")
y = train_df[dep_vars]

# Preprocess test data
X_test_pre = preprocessor.transform(test_df.drop(columns=dep_vars))
test_df = pd.merge(X_test_pre, test_df[dep_vars], left_index=True, right_index=True)
X_test = test_df.drop(columns=dep_vars, errors="ignore")
y_test = test_df[dep_vars]

# Calculate scale_pos_weight
scale_pos_weight = df['Class'].value_counts()[0] / df['Class'].value_counts()[1]

In [11]:
# Resample
from imblearn.over_sampling import SMOTE


def resample(X, y):
    sampler = SMOTE()
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res


# Optimize

In [12]:
def objective(trial):
    params = dict(
        boosting_type="gbdt",
        device="gpu",
        scale_pos_weight=scale_pos_weight,
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=50),
        max_depth=trial.suggest_int("max_depth", 3, 15, step=1),
        subsample=trial.suggest_float("subsample", 0.1, 1.0, step=0.1),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 10.0, step=0.5),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 10.0, step=0.5),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0, step=0.1),
        colsample_bynode=trial.suggest_float("colsample_bynode", 0.1, 1.0, step=0.1),
        data_sample_strategy="bagging",
        num_leaves=trial.suggest_int("num_leaves", 1, 131072),
        max_bin=trial.suggest_int("max_bin", 2, 1000),
        n_jobs=-1,
    )

    model = LGBMClassifier(**params, objective="binary", verbosity=-1)

    skf = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

    val_loss_list = []

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx].values.ravel()

        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx].values.ravel()
        
        # X_res, y_res = resample(X_train, y_train)

        model.fit(X_train, y_train, eval_metric=balanced_log_loss)

        val_preds = model.predict_proba(X_val)
        val_loss = balanced_log_loss(y_val, val_preds)

        val_loss_list.append(val_loss)
        
    
    test_preds = model.predict_proba(X_test)
    test_loss = balanced_log_loss(y_test.values.ravel(), test_preds)
    mean_val_loss = np.mean(val_loss_list)

    return test_loss, test_loss - mean_val_loss

In [13]:
# optuna.delete_study(
#     study_name="lightgbm_bin_age_no_categorical",
#     storage="sqlite:///optuna.db",
# )

In [14]:
pruner = optuna.pruners.SuccessiveHalvingPruner()
study = optuna.create_study(
    pruner=pruner,
    directions=["minimize", "minimize"],
    study_name="lightgbm_no_categorical_no_resample",
    storage="sqlite:///optuna.db",
    load_if_exists=True,
)
study.optimize(objective, n_trials=250)

[I 2023-08-06 17:05:52,112] A new study created in RDB with name: lightgbm_no_categorical_no_resample
[I 2023-08-06 17:05:56,971] Trial 0 finished with values: [0.33770779912613885, -0.10555533232165748] and parameters: {'colsample_bynode': 0.6, 'colsample_bytree': 1.0, 'learning_rate': 0.04967007503148081, 'max_bin': 435, 'max_depth': 9, 'n_estimators': 550, 'num_leaves': 121804, 'reg_alpha': 0.0, 'reg_lambda': 3.5, 'subsample': 0.30000000000000004}. 
[I 2023-08-06 17:05:59,606] Trial 1 finished with values: [0.418228334531823, 0.05125330509453152] and parameters: {'colsample_bynode': 0.2, 'colsample_bytree': 0.4, 'learning_rate': 0.09413682300746015, 'max_bin': 195, 'max_depth': 10, 'n_estimators': 300, 'num_leaves': 123679, 'reg_alpha': 3.0, 'reg_lambda': 7.5, 'subsample': 0.2}. 
[I 2023-08-06 17:06:22,134] Trial 2 finished with values: [0.5023277024075637, 0.0054372997313497096] and parameters: {'colsample_bynode': 0.5, 'colsample_bytree': 0.2, 'learning_rate': 0.003357123119437469