In [1]:
import pandas as pd
import optuna

import sklearn.linear_model as sklm
import sklearn.model_selection as skms
import sklearn.metrics as metrics
import sklearn.pipeline as skpl
import sklearn.preprocessing as skpp
import sklearn.compose as skcmp
import category_encoders as ce

import xgboost as xgb

from preprocessing import *
from ucimlrepo import fetch_ucirepo

import gc

In [2]:
# Competition train/test tables, both indexed by their "id" column.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train.csv", "data/test.csv")
)
# Original dataset pulled from the UCI ML Repository (dataset id 848).
orig_df = fetch_ucirepo(id=848)['data']['original']

# Continuous features; every other column of train_df is treated as categorical.
CONT_FEATS = ["cap-diameter", "stem-height", "stem-width"]
# NOTE: this list also contains the response column, because it is a column of
# train_df that is not listed in CONT_FEATS.
CAT_FEATS = [col for col in train_df.columns if col not in CONT_FEATS]
RESPONSE_COL = "class"

# convert_cols comes from the local `preprocessing` module; presumably it casts
# the listed columns to numeric / categorical dtypes — confirm against that module.
train_df, test_df, orig_df = (
    convert_cols(frame, CONT_FEATS, CAT_FEATS)
    for frame in (train_df, test_df, orig_df)
)

# Presumably nulls out category values that never occur in the original
# dataset — confirm against `preprocessing`.
train_df, test_df = (
    null_all_non_original_categories(frame, orig_df, CAT_FEATS)
    for frame in (train_df, test_df)
)



In [3]:
# Recode the response categories as integers ("e" -> 0, "p" -> 1).
# RESPONSE_COL is used instead of a repeated "class" literal so the target
# column is named in exactly one place (consistent with the train/val split).
# Re-running this cell is a no-op: categories missing from the mapping are
# passed through unchanged by rename_categories.
for df in [train_df, orig_df]:
    df[RESPONSE_COL] = df[RESPONSE_COL].cat.rename_categories({"e": 0, "p": 1})

In [4]:
# Stratified hold-out split used to score every Optuna trial.
# random_state is pinned: the study is persisted in SQLite and resumed across
# kernel restarts, so trial scores are only comparable if every run evaluates
# on the same validation rows. Trials stored before this seed was added were
# scored on a different split.
x_train, x_val, y_train, y_val = skms.train_test_split(
    train_df.drop(columns=[RESPONSE_COL]),
    train_df[RESPONSE_COL],
    stratify=train_df[RESPONSE_COL],
    random_state=0,
)


def objective(trial):
    """Optuna objective: fit one XGBoost classifier and score it on the hold-out set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.

    Returns
    -------
    float
        Matthews correlation coefficient of the predictions on ``x_val``
        against ``y_val`` (the study maximizes this value).

    Notes
    -----
    Reads the module-level ``x_train``, ``y_train``, ``x_val`` and ``y_val``.
    Parameter names and distribution kinds are kept unchanged so the function
    stays compatible with trials already stored in the persisted study.
    """
    model = xgb.XGBClassifier(
        # Fixed settings shared by every trial.
        enable_categorical=True,
        device="cuda",
        random_state=0,
        n_jobs=-1,

        # Search space. Integer bounds are written as ints (10_000, not 1e4)
        # because suggest_int expects integer limits.
        n_estimators=trial.suggest_int("n_estimators", 1, 10_000),
        eta=trial.suggest_float("eta", 0.01, 1),
        gamma=trial.suggest_float("gamma", 0, 20),
        max_depth=trial.suggest_int("max_depth", 3, 10, step=1),
        max_leaves=trial.suggest_int("max_leaves", 0, 10_000),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.05, 1),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1),
        colsample_bynode=trial.suggest_float("colsample_bynode", 0.05, 1),
        reg_lambda=trial.suggest_int("reg_lambda", 0, 20),
        reg_alpha=trial.suggest_int("reg_alpha", 0, 20),
        grow_policy=trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        ),
        min_child_weight=trial.suggest_float("min_child_weight", 0, 1e3),
        # The comma after this argument was missing, which made the whole
        # cell a SyntaxError.
        subsample=trial.suggest_float("subsample", 0.1, 1),
        max_delta_step=trial.suggest_float("max_delta_step", 0, 1e2),
    )

    model.fit(x_train, y_train)

    preds = model.predict(x_val)

    return metrics.matthews_corrcoef(y_val, preds)


In [5]:
# Uncomment to discard all stored trials and start the search from scratch:
# optuna.delete_study(study_name="xgb_exploration", storage="sqlite:///optuna.sqlite3")

# The study lives in a local SQLite file; with load_if_exists=True an existing
# study of the same name is resumed instead of raising, so each execution of
# this cell appends another batch of trials to the stored history.
study_settings = dict(
    study_name="xgb_exploration",
    storage="sqlite:///optuna.sqlite3",
    direction="maximize",
    load_if_exists=True,
)
study = optuna.create_study(**study_settings)
study.optimize(objective, n_trials=100)
# Explicit garbage-collection pass after the search; as the last expression in
# the cell, its return value is displayed as the cell output.
gc.collect()

[I 2024-08-05 08:47:52,256] Using an existing study with name 'xgb_exploration' instead of creating a new one.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2024-08-05 08:48:42,323] Trial 10 finished with value: 0.981949041206204 and parameters: {'n_estimators': 4616, 'eta': 0.38043050364088726, 'gamma': 0.6677166996644122, 'max_depth': 5, 'max_leaves': 6039, 'colsample_bytree': 0.6982376322297125, 'colsample_bylevel': 0.0750578483507347, 'colsample_bynode': 0.9967706726097811, 'reg_lambda': 0, 'reg_alpha': 20, 'grow_policy': 'depthwise', 'min_child_weight': 678.5183485048985, 'max_delta_step': 66.93440467573659}. Best is trial 3 with value: 0.9824590281367335.
[I 2024-08-05 08:49:24,197] Trial 11 finished with value: 0.9821449377510819 and parameters: {'n_estimators': 4014, 'eta': 0.3699315024370275, 'gamma': 0.612290714470924, 'max_depth': 5, 'max_leaves': 6569, 'colsample_

210

In [6]:
# Refit on the full training table using the best hyperparameters found by the
# study. The fixed keyword arguments mirror the ones used inside `objective`.
best_model = xgb.XGBClassifier(
    enable_categorical=True,
    device="cuda",
    random_state=0,
    n_jobs=-1,
    **study.best_params
)
# RESPONSE_COL replaces the repeated "class" literal so the target column is
# named consistently with the train/validation split.
best_model.fit(train_df.drop(columns=[RESPONSE_COL]), train_df[RESPONSE_COL])
preds = best_model.predict(test_df)
# Submission frame: one predicted label per test id. The labels are still the
# integer codes at this point.
out_df = pd.DataFrame({RESPONSE_COL: preds}, index=test_df.index)


In [7]:
# Translate the integer predictions back to the original letter labels before
# writing the submission file. `replace` leaves unmatched values untouched, so
# re-running this cell is harmless.
code_to_label = {0:"e", 1:"p"}
decoded_labels = out_df["class"].replace(code_to_label)
out_df["class"] = decoded_labels
out_df.to_csv("simple_xgb_100.csv")