In [4]:
import sys
from importlib import reload

import numpy as np
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Add the src directory to the system path to import modules.
# This must happen BEFORE the fraud_detection imports below; appending it
# afterwards (as this cell used to do) only works when the path is still
# present from an earlier run of the same kernel.
# The membership check keeps re-running the cell from adding duplicates.
if "../src" not in sys.path:
    sys.path.append("../src")

import fraud_detection as fd  # noqa: E402  (import after sys.path setup)
from fraud_detection import data_loader, metrics  # noqa: E402

# Re-execute the top-level package so edits made during the session are
# picked up. importlib.reload only reloads the module it is given, so
# `data_loader` and `metrics` are not re-imported by this call.
reload(fd)

# Parquet file handed to the data loader and to the model comparison.
datapath = "../data/transformed_label_and_damage.parquet"

In [5]:

def objective(trial, X_train, X_test, y_train, y_test):
    """Score one KNN configuration proposed by an Optuna trial.

    Two hyperparameters are sampled from ``trial``: ``n_neighbors`` in
    [1, 5] and the Minkowski exponent ``p`` in [1, 4]. Everything else is
    fixed (minkowski metric, ``algorithm="auto"``, ``leaf_size=30``,
    single job).

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        X_train: Feature matrix the classifier is fitted on.
        X_test: Feature matrix the classifier predicts on.
        y_train: 2-D targets; column 0 is used as the class label.
        y_test: 2-D targets; columns 0 and 1 are both forwarded to
            ``metrics.bewertung`` (column 1 is presumably the damage
            amount -- confirm against ``metrics.bewertung``).

    Returns:
        The ``"Bewertung"`` entry of the dict returned by
        ``metrics.bewertung`` for the predictions on ``X_test``.
    """
    # Sample in the same order as before: n_neighbors first, then p.
    neighbor_count = trial.suggest_int("n_neighbors", 1, 5)
    minkowski_power = trial.suggest_int("p", 1, 4)

    knn = KNeighborsClassifier(
        metric="minkowski",
        n_jobs=1,
        algorithm="auto",
        leaf_size=30,
        n_neighbors=neighbor_count,
        p=minkowski_power,
    )
    knn.fit(X_train, y_train[:, 0])

    predictions = knn.predict(X_test)
    scores = metrics.bewertung(predictions, y_test[:, 0], y_test[:, 1])
    return scores["Bewertung"]


In [6]:
def optimize(study, path, seed=42, n_trials=50, test_size=0.2):
    """Run the KNN hyperparameter search on the dataset stored at ``path``.

    The data is loaded with ``data_loader.load_data_np``, split once into a
    train and a hold-out part, and standardised with a ``StandardScaler``
    fitted on the training part only. Every trial is then scored by
    ``objective`` on that same hold-out part.

    Args:
        study: Optuna study whose ``optimize`` method drives the search.
        path: Location passed to ``data_loader.load_data_np``.
        seed: ``random_state`` of the train/test split. It does NOT seed the
            study's sampler; that is configured where the study is created.
        n_trials: Number of trials to run. Defaults to 50, the value that
            was previously hard-coded.
        test_size: Share of the data held out for scoring. Defaults to 0.2,
            the value that was previously hard-coded.

    Returns:
        The ``study`` object that was passed in, after optimisation.
    """
    X, targets = data_loader.load_data_np(path)

    X_train, X_test, y_train, y_test = map(
        np.asarray,
        train_test_split(X, targets, test_size=test_size, random_state=seed),
    )

    # Fit the scaler on the training split only so no statistics of the
    # hold-out split leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    def wrapped_objective(trial):
        # Optuna calls the objective with the trial only; bind the data here.
        return objective(trial, X_train, X_test, y_train, y_test)

    study.optimize(wrapped_objective, show_progress_bar=True, n_trials=n_trials)

    return study

In [7]:
# NOTE: `objective` never reports intermediate values, so this pruner has
# nothing to act on; it is kept so the study configuration stays explicit.
pruner = optuna.pruners.MedianPruner()

# Seed the sampler so the sequence of suggested hyperparameters is the same
# on every Restart & Run All. TPESampler is what Optuna uses by default for
# a single-objective study, so only the seed changes here.
sampler = optuna.samplers.TPESampler(seed=42)

study = optuna.create_study(
    directions=["maximize"], sampler=sampler, pruner=pruner
)

[I 2025-06-15 23:32:01,536] A new study created in memory with name: no-name-8f48cdd4-48f8-4730-ac5c-44c7a6136cce


In [None]:
# Run the search; `optimize` hands back the study object it was given.
study = optimize(study=study, path=datapath)

  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-06-15 23:32:48,936] Trial 0 finished with value: -5356.620000000001 and parameters: {'n_neighbors': 4, 'p': 1}. Best is trial 0 with value: -5356.620000000001.


In [49]:
# Show the best trial of the study.
# NOTE(review): the output saved with this cell lists `max_depth` and
# `num_leaves`, which the `objective` in this notebook never suggests (it
# samples `n_neighbors` and `p`). The output looks stale -- re-run this cell
# after the KNN search has finished.
study.best_trial

FrozenTrial(number=46, state=1, values=[-1074.3600000000001], datetime_start=datetime.datetime(2025, 6, 15, 16, 57, 20, 391858), datetime_complete=datetime.datetime(2025, 6, 15, 16, 57, 21, 74884), params={'max_depth': 6, 'num_leaves': 43}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=4, step=1), 'num_leaves': IntDistribution(high=100, log=False, low=20, step=1)}, trial_id=46, value=None)

In [50]:
# Show only the hyperparameters of the best trial.
# NOTE(review): the saved output shows `max_depth` / `num_leaves`, not the
# `n_neighbors` / `p` sampled by this notebook's `objective` -- re-run
# before relying on it.
study.best_trial.params

{'max_depth': 6, 'num_leaves': 43}

In [11]:
import optuna.visualization as vis

In [51]:
# Render Optuna's optimization-history plot for `study`.
vis.plot_optimization_history(study)

In [52]:
# Render Optuna's hyperparameter-importance plot for `study`.
vis.plot_param_importances(study)

In [53]:
# Render Optuna's slice plot for `study`.
vis.plot_slice(study)

In [62]:
# Build a model from the parameter set `p` and evaluate it with the
# project's model-comparison helper.
# NOTE(review): `p` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel unless `p` comes from
# somewhere not shown here -- TODO define it explicitly before this cell.
# NOTE(review): this builds an XGBoost model although the search above tunes
# a KNeighborsClassifier -- confirm this is the intended model.
model = fd.models.models.get_xgb_clf_with_reg_from_params(p)
models = [("new xgboost", model)]
model_metrics = fd.model_comparison.compare_models(models, datapath, n_splits=5)

# Transpose for display (the saved output shows one row per model).
df = pd.DataFrame(model_metrics)
df.T



Start trainings for new xgboost
Round 0
Round 1
Round 2
Round 3
Round 4


Unnamed: 0,precision,recall,f1,damage_total,damage_prevented,damage_missed,detected bonus,fp penalty,Bewertung,cm
new xgboost,0.798289,0.575503,0.668116,6015.164,3710.634,2304.53,2462.0,1258.0,-1100.53,"[[28548.0, 125.8], [363.2, 492.4]]"
