In [10]:
import pandas as pd
import optuna

import sklearn.linear_model as sklm
import sklearn.model_selection as skms
import sklearn.metrics as metrics
import sklearn.pipeline as skpl
import sklearn.preprocessing as skpp
import sklearn.compose as skcmp
import category_encoders as ce

import xgboost as xgb

from preprocessing import *
from ucimlrepo import fetch_ucirepo

import gc

In [11]:
# Load the train/test CSVs (indexed by "id") and the original UCI dataset
# (repository id 848), which is passed below as the reference frame.
train_df = pd.read_csv("data/train.csv", index_col="id")
test_df = pd.read_csv("data/test.csv", index_col="id")
orig_df = fetch_ucirepo(id=848)['data']['original']

# Column roles. Every train column that is not continuous is treated as
# categorical. NOTE(review): nothing filters out the "class" response column
# here, so it ends up in CAT_FEATS as well — confirm the helpers below expect
# that (test_df is passed through them with the same list).
RESPONSE_COL = "class"
CONT_FEATS = ["cap-diameter", "stem-height", "stem-width"]
CAT_FEATS = [col for col in train_df.columns if col not in CONT_FEATS]

# Run all three frames through convert_cols (presumably provided by the
# `preprocessing` star import), in the order train, test, original.
train_df, test_df, orig_df = [
    convert_cols(frame, CONT_FEATS, CAT_FEATS)
    for frame in (train_df, test_df, orig_df)
]

# Run train and test through null_all_non_original_categories with the
# original data as reference. Judging by its name it blanks category values
# that do not occur in the original data — TODO confirm in `preprocessing`.
train_df, test_df = [
    null_all_non_original_categories(frame, orig_df, CAT_FEATS)
    for frame in (train_df, test_df)
]



In [12]:
# Recode the categorical target labels as integers: "e" -> 0, "p" -> 1.
# Only the train and original frames are touched; the test frame is not part
# of this loop.
CLASS_CODES = {"e": 0, "p": 1}
for df in (train_df, orig_df):
    df["class"] = df["class"].cat.rename_categories(CLASS_CODES)

In [13]:
# Hold out a stratified validation split for hyperparameter tuning.
# The split is seeded: the Optuna study used for tuning is persisted in SQLite
# and resumed across sessions, so every trial has to be scored on the same
# validation rows for trial values (and "best trial") to be comparable, and for
# Restart & Run All to reproduce them. Trials stored before this seed was
# introduced were scored on other, unseeded splits.
x_train, x_val, y_train, y_val = skms.train_test_split(
    train_df.drop(columns=[RESPONSE_COL]),
    train_df[RESPONSE_COL],
    stratify=train_df[RESPONSE_COL],
    random_state=0,
)


def objective(trial):
    """Optuna objective: fit one XGBoost candidate and score it on the validation split.

    Draws a hyperparameter set from ``trial``, trains an ``XGBClassifier`` on
    the module-level ``x_train``/``y_train`` and evaluates its predictions on
    the module-level ``x_val``/``y_val``.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float`` /
            ``suggest_categorical`` (an Optuna trial when invoked through
            ``study.optimize``).

    Returns:
        The Matthews correlation coefficient between ``y_val`` and the model's
        predictions for ``x_val``.
    """
    # Search space. Parameter names, types and ranges are unchanged so a study
    # that already holds trials can keep being resumed; the only change is that
    # the integer ranges now use integer bounds (10_000) instead of the float
    # literal 1e4, matching suggest_int's integer signature.
    search_params = {
        "n_estimators": trial.suggest_int("n_estimators", 1, 10_000),
        "eta": trial.suggest_float("eta", 0.01, 1),
        "gamma": trial.suggest_float("gamma", 0, 20),
        "max_depth": trial.suggest_int("max_depth", 3, 50),
        "max_leaves": trial.suggest_int("max_leaves", 0, 10_000),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.05, 1),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 20),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 20),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        ),
        "min_child_weight": trial.suggest_float("min_child_weight", 0, 1e3),
        "subsample": trial.suggest_float("subsample", 0.01, 1),
        "max_delta_step": trial.suggest_float("max_delta_step", 0, 1e2),
    }

    # Fixed settings shared by every candidate: native categorical handling,
    # CUDA device, fixed model seed.
    model = xgb.XGBClassifier(
        enable_categorical=True,
        device="cuda",
        random_state=0,
        n_jobs=-1,
        **search_params,
    )
    model.fit(x_train, y_train)

    val_preds = model.predict(x_val)
    return metrics.matthews_corrcoef(y_val, val_preds)


In [15]:
# Uncomment to discard the persisted study and restart the search from scratch:
# optuna.delete_study(study_name="xgb_tuning", storage="sqlite:///optuna.sqlite3")

# The study is stored in a local SQLite file. With load_if_exists=True a re-run
# of this cell resumes the stored "xgb_tuning" study instead of creating a new
# one, so trials accumulate across executions.
study_settings = {
    "direction": "maximize",
    "storage": "sqlite:///optuna.sqlite3",
    "study_name": "xgb_tuning",
    "load_if_exists": True,
}
study = optuna.create_study(**study_settings)

# Each execution of this cell requests another 100 trials of `objective`.
study.optimize(objective, n_trials=100)

# As the cell's last expression, the return value of gc.collect() is what the
# cell displays as its output.
gc.collect()

[I 2024-08-08 01:11:34,603] Using an existing study with name 'xgb_tuning' instead of creating a new one.
[I 2024-08-08 01:15:46,004] Trial 107 finished with value: 0.9845350422504081 and parameters: {'n_estimators': 9417, 'eta': 0.1644306850346588, 'gamma': 0.6244892733642694, 'max_depth': 35, 'max_leaves': 427, 'colsample_bytree': 0.5710259485267294, 'colsample_bylevel': 0.45032004590709246, 'colsample_bynode': 0.8314401929649404, 'reg_lambda': 2, 'reg_alpha': 15, 'grow_policy': 'depthwise', 'min_child_weight': 28.370140788038043, 'subsample': 0.7060332218079458, 'max_delta_step': 99.65469643219718}. Best is trial 101 with value: 0.984799205075381.
[I 2024-08-08 01:19:58,499] Trial 108 finished with value: 0.9846748910427767 and parameters: {'n_estimators': 9657, 'eta': 0.15655814493405476, 'gamma': 0.5130094794681466, 'max_depth': 35, 'max_leaves': 426, 'colsample_bytree': 0.5714285416671221, 'colsample_bylevel': 0.44957552938226086, 'colsample_bynode': 0.7693250221139433, 'reg_lamb

6280

In [16]:
# Refit on the complete training frame with the study's best hyperparameters
# layered on top of the same fixed settings used during tuning, then predict
# the test frame.
fixed_params = {
    "enable_categorical": True,
    "device": "cuda",
    "random_state": 0,
    "n_jobs": -1,
}
best_model = xgb.XGBClassifier(**fixed_params, **study.best_params)

full_train_x = train_df.drop(columns=["class"])
full_train_y = train_df["class"]
best_model.fit(full_train_x, full_train_y)

# Predictions are stored under "class", aligned to the test frame's index.
preds = best_model.predict(test_df)
out_df = pd.DataFrame({"class": preds}, index=test_df.index)

In [17]:
# Translate the integer predictions back to the "e"/"p" labels and write the
# frame to CSV (its index is written alongside the "class" column).
label_names = {0: "e", 1: "p"}
out_df["class"] = out_df["class"].replace(label_names)
out_df.to_csv("tuned_xgb_200_v1.csv")