In [None]:
import sys
from importlib import reload

import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

import fraud_detection as fd
from fraud_detection import data_loader, metrics

# Add the src directory to the system path to import modules
reload(fd)
sys.path.append('../src')

datapath = "../data/transformed_label_and_damage.parquet"

In [42]:

def objective(trial, X_train, X_test, y_train, y_test):
    params = {
        # "n_estimators": trial.suggest_int("n_estimators", 70, 150),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        # "learning_rate": trial.suggest_float("learning_rate", 0.08, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "n_estimators": 100,
        #"max_depth": 5,
        "learning_rate": 0.12,
        "verbosity":-1,
    }

    clf = LGBMClassifier(**params)

    clf.fit(X_train, y_train[:, 0])
    preds = clf.predict(X_test)
    bew = metrics.bewertung(preds, y_test[:, 0], y_test[:, 1])
    return bew["Bewertung"]
    # return bew["recall"], bew["precision"]


In [43]:
def optimize(path, seed=42):
    X, targets = data_loader.load_data_np(path)

    X_train, X_test, y_train, y_test = map(
        np.asarray, train_test_split(X, targets, test_size=0.2, random_state=seed)
    )

    def wrapped_objective(trial):
        return objective(trial, X_train, X_test, y_train, y_test)

    pruner = optuna.pruners.MedianPruner()

    study = optuna.create_study(directions=["maximize"], pruner=pruner)
    study.optimize(wrapped_objective, show_progress_bar=True, n_trials=10)

    return study

In [44]:
def continue_optimize(study, path,seed=42):
    X, targets = data_loader.load_data_np(path)

    X_train, X_test, y_train, y_test = map(
        np.asarray, train_test_split(X, targets, test_size=0.2, random_state=seed)
    )

    def wrapped_objective(trial):
        return objective(trial, X_train, X_test, y_train, y_test)

    study.optimize(wrapped_objective, show_progress_bar=True, n_trials=50)

    return study

In [45]:
study = optimize(datapath)

[I 2025-06-15 16:56:44,839] A new study created in memory with name: no-name-e3d40e52-8328-4917-9a64-89797b61ff73


  0%|          | 0/10 [00:00<?, ?it/s]


X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:45,612] Trial 0 finished with value: -1239.96 and parameters: {'max_depth': 6, 'num_leaves': 73}. Best is trial 0 with value: -1239.96.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:46,256] Trial 1 finished with value: -1341.93 and parameters: {'max_depth': 10, 'num_leaves': 32}. Best is trial 0 with value: -1239.96.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:46,912] Trial 2 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 93}. Best is trial 2 with value: -1103.35.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:47,518] Trial 3 finished with value: -1124.8899999999999 and parameters: {'max_depth': 5, 'num_leaves': 26}. Best is trial 2 with value: -1103.35.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:48,181] Trial 4 finished with value: -1168.15 and parameters: {'max_depth': 6, 'num_leaves': 33}. Best is trial 2 with value: -1103.35.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:48,940] Trial 5 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 33}. Best is trial 2 with value: -1103.35.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:49,725] Trial 6 finished with value: -1241.3600000000001 and parameters: {'max_depth': 8, 'num_leaves': 45}. Best is trial 2 with value: -1103.35.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:50,468] Trial 7 finished with value: -1096.81 and parameters: {'max_depth': 6, 'num_leaves': 47}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:51,309] Trial 8 finished with value: -1316.5 and parameters: {'max_depth': 8, 'num_leaves': 74}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:52,184] Trial 9 finished with value: -1421.24 and parameters: {'max_depth': 7, 'num_leaves': 92}. Best is trial 7 with value: -1096.81.


In [46]:
continue_optimize(study, datapath)

  0%|          | 0/50 [00:00<?, ?it/s]


X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:55,708] Trial 10 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 54}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:56,348] Trial 11 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 98}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:57,099] Trial 12 finished with value: -1239.96 and parameters: {'max_depth': 6, 'num_leaves': 72}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:57,874] Trial 13 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 57}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:58,776] Trial 14 finished with value: -1220.18 and parameters: {'max_depth': 7, 'num_leaves': 85}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:56:59,570] Trial 15 finished with value: -1295.3999999999999 and parameters: {'max_depth': 7, 'num_leaves': 46}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:00,419] Trial 16 finished with value: -1464.73 and parameters: {'max_depth': 10, 'num_leaves': 63}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:01,087] Trial 17 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 45}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:02,002] Trial 18 finished with value: -1239.03 and parameters: {'max_depth': 8, 'num_leaves': 83}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:02,605] Trial 19 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 64}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:03,350] Trial 20 finished with value: -1399.43 and parameters: {'max_depth': 9, 'num_leaves': 51}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:04,118] Trial 21 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 38}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:04,782] Trial 22 finished with value: -1217.72 and parameters: {'max_depth': 6, 'num_leaves': 22}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:05,439] Trial 23 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 40}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:06,214] Trial 24 finished with value: -1214.33 and parameters: {'max_depth': 6, 'num_leaves': 28}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:06,898] Trial 25 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 37}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:07,505] Trial 26 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 20}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:08,260] Trial 27 finished with value: -1204.81 and parameters: {'max_depth': 6, 'num_leaves': 50}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:08,903] Trial 28 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 68}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:09,608] Trial 29 finished with value: -1239.96 and parameters: {'max_depth': 6, 'num_leaves': 79}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:10,323] Trial 30 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 98}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:10,987] Trial 31 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 54}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:11,627] Trial 32 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 60}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:12,303] Trial 33 finished with value: -1227.38 and parameters: {'max_depth': 6, 'num_leaves': 32}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:12,928] Trial 34 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 57}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:13,692] Trial 35 finished with value: -1274.46 and parameters: {'max_depth': 7, 'num_leaves': 40}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:14,355] Trial 36 finished with value: -1303.6 and parameters: {'max_depth': 6, 'num_leaves': 31}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:14,941] Trial 37 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 46}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:15,611] Trial 38 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 91}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:16,550] Trial 39 finished with value: -1109.77 and parameters: {'max_depth': 7, 'num_leaves': 69}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:17,165] Trial 40 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 25}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:17,787] Trial 41 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 59}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:18,449] Trial 42 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 79}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:19,185] Trial 43 finished with value: -1239.96 and parameters: {'max_depth': 6, 'num_leaves': 63}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:19,814] Trial 44 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 65}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:20,390] Trial 45 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 93}. Best is trial 7 with value: -1096.81.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:21,075] Trial 46 finished with value: -1074.3600000000001 and parameters: {'max_depth': 6, 'num_leaves': 43}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:21,826] Trial 47 finished with value: -1111.35 and parameters: {'max_depth': 7, 'num_leaves': 49}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:22,534] Trial 48 finished with value: -1141.9 and parameters: {'max_depth': 6, 'num_leaves': 54}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:23,304] Trial 49 finished with value: -1246.19 and parameters: {'max_depth': 7, 'num_leaves': 44}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:23,989] Trial 50 finished with value: -1341.87 and parameters: {'max_depth': 8, 'num_leaves': 34}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:24,618] Trial 51 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 41}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:25,301] Trial 52 finished with value: -1239.96 and parameters: {'max_depth': 6, 'num_leaves': 74}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:26,017] Trial 53 finished with value: -1176.0 and parameters: {'max_depth': 6, 'num_leaves': 56}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:26,648] Trial 54 finished with value: -1103.35 and parameters: {'max_depth': 5, 'num_leaves': 36}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:27,210] Trial 55 finished with value: -1182.7 and parameters: {'max_depth': 4, 'num_leaves': 43}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:27,886] Trial 56 finished with value: -1251.27 and parameters: {'max_depth': 6, 'num_leaves': 49}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:28,542] Trial 57 finished with value: -1184.74 and parameters: {'max_depth': 5, 'num_leaves': 28}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:29,379] Trial 58 finished with value: -1265.31 and parameters: {'max_depth': 10, 'num_leaves': 52}. Best is trial 46 with value: -1074.3600000000001.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



[I 2025-06-15 16:57:30,157] Trial 59 finished with value: -1096.81 and parameters: {'max_depth': 6, 'num_leaves': 47}. Best is trial 46 with value: -1074.3600000000001.


<optuna.study.study.Study at 0x7bafcc1e1c70>

In [61]:
p = sorted(study.best_trials, key=lambda t: t.values[0], reverse=True)[6].params

In [49]:
study.best_trial

FrozenTrial(number=46, state=1, values=[-1074.3600000000001], datetime_start=datetime.datetime(2025, 6, 15, 16, 57, 20, 391858), datetime_complete=datetime.datetime(2025, 6, 15, 16, 57, 21, 74884), params={'max_depth': 6, 'num_leaves': 43}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=4, step=1), 'num_leaves': IntDistribution(high=100, log=False, low=20, step=1)}, trial_id=46, value=None)

In [50]:
study.best_trial.params

{'max_depth': 6, 'num_leaves': 43}

In [11]:
import optuna.visualization as vis

In [51]:
vis.plot_optimization_history(study)

In [52]:
vis.plot_param_importances(study)

In [53]:
vis.plot_slice(study)

In [62]:
model = fd.models.models.get_xgb_clf_with_reg_from_params(p)
models = [("new xgboost", model)]
model_metrics = fd.model_comparison.compare_models(models, datapath, n_splits=5)

df = pd.DataFrame(model_metrics)
df.T



Start trainings for new xgboost
Round 0
Round 1
Round 2
Round 3
Round 4


Unnamed: 0,precision,recall,f1,damage_total,damage_prevented,damage_missed,detected bonus,fp penalty,Bewertung,cm
new xgboost,0.798289,0.575503,0.668116,6015.164,3710.634,2304.53,2462.0,1258.0,-1100.53,"[[28548.0, 125.8], [363.2, 492.4]]"
