In [1]:
import sys
from importlib import reload

import pandas as pd

import fraud_detection as fd

# Add the src directory to the system path to import modules
reload(fd)
sys.path.append('../src')

datapath = "../data/transformed_label_and_damage.parquet"

In [2]:
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from fraud_detection import data_loader, metrics

In [None]:

def objective(trial, X_train, X_test, y_train, y_test):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 1, 30),
        "eval_metric": "logloss",
    }

    clf = XGBClassifier(**params)

    clf.fit(X_train, y_train[:, 0])
    preds = clf.predict(X_test)
    bew = metrics.bewertung(preds, y_test[:, 0], y_test[:, 1])
    return -bew["Bewertung"]


In [13]:
def optimize(path, seed=42):
    X, targets = data_loader.load_data_np(path)

    X_train, X_test, y_train, y_test = map(
        np.asarray, train_test_split(X, targets, test_size=0.2, random_state=seed)
    )

    def wrapped_objective(trial):
        return objective(trial, X_train, X_test, y_train, y_test)

    study = optuna.create_study(direction="minimize")
    study.optimize(wrapped_objective, show_progress_bar=True, n_trials=100)

    return study

In [14]:
study = optimize(datapath)

[I 2025-06-11 19:29:53,164] A new study created in memory with name: no-name-eaa1049e-f276-4d63-9706-38f96e53a7f3


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-11 19:29:54,477] Trial 0 finished with value: 16140.89 and parameters: {'n_estimators': 95, 'max_depth': 4, 'learning_rate': 0.09166516349524984, 'subsample': 0.6019533290401176, 'colsample_bytree': 0.7167605408603062, 'gamma': 2.707808693892426, 'eta': 0.018106224019577406, 'reg_alpha': 3.5530235012383633, 'reg_lambda': 1.151909966876941, 'min_child_weight': 1, 'scale_pos_weight': 29}. Best is trial 0 with value: 16140.89.
[I 2025-06-11 19:29:56,967] Trial 1 finished with value: 2119.87 and parameters: {'n_estimators': 165, 'max_depth': 6, 'learning_rate': 0.0015290427371926092, 'subsample': 0.714311000338882, 'colsample_bytree': 0.6051097831594114, 'gamma': 4.430458323401042, 'eta': 0.032928947914759, 'reg_alpha': 4.0334112438704, 'reg_lambda': 3.6477850200969604, 'min_child_weight': 2, 'scale_pos_weight': 22}. Best is trial 1 with value: 2119.87.
[I 2025-06-11 19:29:58,231] Trial 2 finished with value: 8071.449999999999 and parameters: {'n_estimators': 79, 'max_depth': 5,

In [15]:
study.best_value

1198.15

In [None]:
study.best_params

{'n_estimators': 204,
 'max_depth': 14,
 'learning_rate': 0.029639904051825004,
 'subsample': 0.9890671048478101,
 'colsample_bytree': 0.64967629023458,
 'gamma': 2.144478553480802,
 'eta': 0.04388928683651236,
 'reg_alpha': 0.5171768584127643,
 'reg_lambda': 1.990636988067381,
 'min_child_weight': 2,
 'scale_pos_weight': 1}

In [8]:
import optuna.visualization as vis

In [17]:
vis.plot_optimization_history(study)

In [18]:
vis.plot_param_importances(study)

In [19]:
vis.plot_slice(study)