In [88]:
import sys
from importlib import reload

import pandas as pd

# Add the src directory to the system path *before* importing the project
# package; appending it after the import (as this cell did previously) cannot
# help that import on a fresh kernel. The membership check keeps re-runs of
# this cell from piling up duplicate entries.
if '../src' not in sys.path:
    sys.path.append('../src')

import fraud_detection as fd  # noqa: E402  (must follow the sys.path tweak)

# Re-execute the already-imported package so edits made since the first import
# are picked up. NOTE: importlib.reload() only reloads the module object it is
# given (here the top-level package), not its submodules.
reload(fd)

datapath = "../data/transformed_label_and_damage.parquet"

In [89]:
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from fraud_detection import data_loader, metrics

In [192]:

def objective(trial, X_train, X_test, y_train, y_test):
    """Score a single Optuna trial with an XGBoost classifier.

    Args:
        trial: Optuna trial; used to draw ``min_child_weight`` from 1..10.
        X_train: Feature matrix the classifier is fitted on.
        X_test: Feature matrix the classifier is evaluated on.
        y_train: 2-D target array; only column 0 is used (as the class label).
        y_test: 2-D target array; column 0 and column 1 are handed to
            ``metrics.bewertung`` as its second and third argument
            (column 1 is presumably the damage amount -- confirm in ``metrics``).

    Returns:
        The ``"Bewertung"`` entry of the dict returned by ``metrics.bewertung``.
    """
    # Settings that are held constant across all trials.
    fixed_settings = {
        "n_estimators": 100,
        "max_depth": 5,
        "learning_rate": 0.1,
        "objective": "binary:logistic",
    }

    # The only dimension that is currently searched.
    searched_settings = {
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }
    # Disabled search dimensions, kept for reference:
    #   n_estimators      int    70 .. 150
    #   max_depth         int    4 .. 10
    #   learning_rate     float  1e-2 .. 0.2   (log)
    #   subsample         float  0.5 .. 1.0
    #   colsample_bytree  float  0.5 .. 1.0
    #   gamma             float  0 .. 5
    #   reg_alpha         float  1e-4 .. 10.0  (log)  [or fixed at 0.0]
    #   reg_lambda        float  1e-4 .. 10.0  (log)
    #   scale_pos_weight  float  1 .. 2

    classifier = XGBClassifier(**searched_settings, **fixed_settings)
    train_labels = y_train[:, 0]
    classifier.fit(X_train, train_labels)

    scores = metrics.bewertung(
        classifier.predict(X_test), y_test[:, 0], y_test[:, 1]
    )
    # Single-objective result; a two-objective variant would instead return
    # (scores["recall"], scores["precision"]).
    return scores["Bewertung"]


In [193]:
def optimize(path, seed=42, n_trials=10):
    """Create a new Optuna study and run a hyperparameter search on it.

    The data at ``path`` is loaded via ``data_loader.load_data_np`` and split
    80/20 into train and test parts; every trial is scored by ``objective`` on
    that same split.

    Args:
        path: Location of the dataset, passed to ``data_loader.load_data_np``.
        seed: Seeds both the train/test split and the Optuna sampler, so two
            calls with the same arguments propose the same trials.
        n_trials: Number of trials to run (defaults to 10, the previous
            hard-coded value).

    Returns:
        The ``optuna`` study (in-memory, maximizing) after ``n_trials`` trials.
    """
    X, targets = data_loader.load_data_np(path)

    X_train, X_test, y_train, y_test = map(
        np.asarray, train_test_split(X, targets, test_size=0.2, random_state=seed)
    )

    def wrapped_objective(trial):
        # Bind the fixed data split so Optuna only has to supply the trial.
        return objective(trial, X_train, X_test, y_train, y_test)

    # NOTE: a pruner only acts on trials that report intermediate values; the
    # objective used here does not report any, so this is inert for now.
    pruner = optuna.pruners.MedianPruner()

    # TPE is Optuna's default single-objective sampler; passing it explicitly
    # lets us seed it, which the default construction does not do.
    sampler = optuna.samplers.TPESampler(seed=seed)

    study = optuna.create_study(
        directions=["maximize"], pruner=pruner, sampler=sampler
    )
    study.optimize(wrapped_objective, show_progress_bar=True, n_trials=n_trials)

    return study

In [None]:
def continue_optimize(study, path, seed=42, n_trials=10):
    """Run additional trials on an existing Optuna study.

    The dataset is loaded and split again inside this function, so ``path``
    and ``seed`` must match the values used when the study was first run;
    otherwise the new trials are scored on a different train/test split and
    are not comparable with the earlier ones.

    Args:
        study: Study to extend; it is modified in place.
        path: Location of the dataset, passed to ``data_loader.load_data_np``.
        seed: ``random_state`` for the 80/20 train/test split.
        n_trials: Number of further trials to run (defaults to 10, the
            previous hard-coded value).

    Returns:
        The same ``study`` object that was passed in.
    """
    X, targets = data_loader.load_data_np(path)

    X_train, X_test, y_train, y_test = map(
        np.asarray, train_test_split(X, targets, test_size=0.2, random_state=seed)
    )

    def wrapped_objective(trial):
        # Bind the fixed data split so Optuna only has to supply the trial.
        return objective(trial, X_train, X_test, y_train, y_test)

    study.optimize(wrapped_objective, show_progress_bar=True, n_trials=n_trials)

    return study

In [196]:
# Start a fresh in-memory study on the dataset at `datapath`, using the
# defaults of optimize(); the result is kept in `study` for the cells below.
study = optimize(datapath)

[I 2025-06-15 16:28:19,509] A new study created in memory with name: no-name-e3ac98e9-aaa1-4574-847c-c34636bd61d7


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-06-15 16:28:20,739] Trial 0 finished with value: -1229.54 and parameters: {'min_child_weight': 9}. Best is trial 0 with value: -1229.54.
[I 2025-06-15 16:28:21,977] Trial 1 finished with value: -1209.98 and parameters: {'min_child_weight': 1}. Best is trial 1 with value: -1209.98.
[I 2025-06-15 16:28:23,109] Trial 2 finished with value: -1117.26 and parameters: {'min_child_weight': 4}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:24,244] Trial 3 finished with value: -1140.69 and parameters: {'min_child_weight': 5}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:25,459] Trial 4 finished with value: -1192.6399999999999 and parameters: {'min_child_weight': 10}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:26,649] Trial 5 finished with value: -1129.84 and parameters: {'min_child_weight': 7}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:27,776] Trial 6 finished with value: -1140.69 and parameters: {'min_child_weight': 5}. Best is tri

In [199]:
# Run further trials on the existing `study` (extended in place). The function
# returns that same study, which is why the cell output ends with its repr.
continue_optimize(study, datapath)

  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-06-15 16:28:52,083] Trial 20 finished with value: -1117.26 and parameters: {'min_child_weight': 4}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:53,429] Trial 21 finished with value: -1117.26 and parameters: {'min_child_weight': 4}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:54,626] Trial 22 finished with value: -1175.6299999999999 and parameters: {'min_child_weight': 2}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:55,812] Trial 23 finished with value: -1140.69 and parameters: {'min_child_weight': 5}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:57,001] Trial 24 finished with value: -1117.26 and parameters: {'min_child_weight': 4}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:58,106] Trial 25 finished with value: -1175.6299999999999 and parameters: {'min_child_weight': 2}. Best is trial 2 with value: -1117.26.
[I 2025-06-15 16:28:59,187] Trial 26 finished with value: -1180.5 and parameters: {'min_child_weight':

<optuna.study.study.Study at 0x7373d5857950>

In [61]:
# Hyperparameters of the best trial, used below to build the comparison model.
# The study created by optimize() has a single objective, so `best_trials`
# only contains the trials tied for the best value: sorting them by value is a
# no-op, and the former hard-coded `[6]` raised IndexError whenever fewer than
# seven trials happened to tie. Taking the best trial directly is equivalent
# and does not depend on the number of ties.
p = study.best_trial.params

In [203]:
# Display the full record (value, timestamps, params, distributions) of the
# best trial found so far.
study.best_trial

FrozenTrial(number=2, state=1, values=[-1117.26], datetime_start=datetime.datetime(2025, 6, 15, 16, 28, 21, 979018), datetime_complete=datetime.datetime(2025, 6, 15, 16, 28, 23, 109471), params={'min_child_weight': 4}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1)}, trial_id=2, value=None)

In [204]:
# Display only the hyperparameter values of the best trial.
study.best_trial.params

{'min_child_weight': 4}

In [122]:
import optuna.visualization as vis

In [189]:
# Plot the optimization history of `study` (objective value over the trials).
vis.plot_optimization_history(study)

In [190]:
# Plot hyperparameter importances for `study`.
# NOTE(review): the objective currently searches only `min_child_weight`, so
# this plot has a single bar and is only informative once more dimensions are
# enabled.
vis.plot_param_importances(study)

In [191]:
# Slice plot: objective value against each searched hyperparameter.
vis.plot_slice(study)

In [62]:
# Build a classifier from the selected hyperparameters `p` (set in an earlier
# cell) using the project's helper.
model = fd.models.models.get_xgb_clf_with_reg_from_params(p)
# compare_models takes a list of (display name, model) pairs.
models = [("new xgboost", model)]
# Evaluate on the dataset at `datapath` with n_splits=5; the printed
# "Round 0" .. "Round 4" lines correspond to these five rounds.
model_metrics = fd.model_comparison.compare_models(models, datapath, n_splits=5)

# Tabulate the returned metrics; transposed so each model becomes one row.
df = pd.DataFrame(model_metrics)
df.T



Start trainings for new xgboost
Round 0
Round 1
Round 2
Round 3
Round 4


Unnamed: 0,precision,recall,f1,damage_total,damage_prevented,damage_missed,detected bonus,fp penalty,Bewertung,cm
new xgboost,0.798289,0.575503,0.668116,6015.164,3710.634,2304.53,2462.0,1258.0,-1100.53,"[[28548.0, 125.8], [363.2, 492.4]]"
