In [None]:
import sys
from importlib import reload

# Make ../src importable BEFORE the project imports below; appending it after
# `import fraud_detection` (as this cell used to) cannot help that import on a
# fresh kernel. The membership check keeps re-runs of this cell from stacking
# duplicate entries on sys.path. The path is relative, so it is resolved
# against the kernel's working directory.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import numpy as np
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

import fraud_detection as fd
from fraud_detection import data_loader, metrics

# Re-execute the package module so edits made to it since the kernel started
# are picked up without a restart.
# NOTE(review): reload() only re-runs `fraud_detection` itself; confirm whether
# `data_loader` / `metrics` also need reloading during development.
reload(fd)

datapath = "../data/transformed_label_and_damage.parquet"

In [47]:

def objective(trial, X_train, X_test, y_train, y_test, positive_weight=5.0):
    """Optuna objective: fit a weighted XGBoost regressor and score it by R2.

    Only ``learning_rate`` is sampled from the trial (range [0.08, 0.12]);
    ``n_estimators`` and ``max_depth`` are held fixed, and every other XGBoost
    hyperparameter is left at the library default.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train: Training feature matrix.
        X_test: Hold-out feature matrix the score is computed on.
        y_train: Training targets; must support an elementwise ``> 0`` test.
        y_test: Hold-out targets.
        positive_weight: Sample weight given to training rows whose target is
            greater than zero; all remaining rows get weight 1.0. The default
            of 5.0 reproduces the previously hard-coded behaviour.

    Returns:
        The ``"R2"`` entry of ``metrics.regression(preds, y_test)``.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.08, 0.12),
        "n_estimators": 100,
        "max_depth": 5,
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
    }

    # Assign weights: rows with a non-zero (positive) target count
    # `positive_weight` times as much as the rest during fitting.
    sample_weights = np.where(y_train > 0, positive_weight, 1.0)

    model = XGBRegressor(**params)
    model.fit(X_train, y_train, sample_weight=sample_weights)
    preds = model.predict(X_test)

    # NOTE(review): arguments are passed as (predictions, targets); confirm
    # this matches the parameter order metrics.regression expects.
    scores = metrics.regression(preds, y_test)
    return scores["R2"]

In [46]:
def optimize(study, path, seed=42, n_trials=10, test_size=0.2):
    """Load the dataset, split it once, and run an Optuna search on ``study``.

    Args:
        study: Optuna study to run the trials on; it is updated in place.
        path: Location handed to ``data_loader.load_data_np``.
        seed: ``random_state`` for the train/test split, so every trial is
            scored on the same hold-out rows.
        n_trials: Number of trials to run. Defaults to 10, the previously
            hard-coded value.
        test_size: Fraction of rows held out for scoring. Defaults to 0.2, the
            previously hard-coded value.

    Returns:
        The same ``study`` object that was passed in.
    """
    X, targets = data_loader.load_data_np(path)
    # Column 1 of the loaded target array is the regression target.
    # NOTE(review): presumably the damage amount (going by the data file
    # name) - confirm against data_loader.
    y = targets[:, 1]

    X_train, X_test, y_train, y_test = map(
        np.asarray,
        train_test_split(X, y, test_size=test_size, random_state=seed),
    )

    def wrapped_objective(trial):
        # Bind the fixed split so Optuna only has to supply the trial.
        return objective(trial, X_train, X_test, y_train, y_test)

    study.optimize(wrapped_objective, show_progress_bar=True, n_trials=n_trials)

    return study

In [48]:
# "maximize" because the objective returns R2. The sampler is seeded so that
# re-running the notebook proposes the same sequence of learning rates; TPE is
# Optuna's default sampler, so only the seed changes, not the algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2025-06-15 21:13:02,420] A new study created in memory with name: no-name-33fc59d5-700f-4148-a9dd-8387fc967211


In [49]:
# Run the search on the data at `datapath`; `optimize` returns the same study
# object it was given, so this rebinding keeps the accumulated trials.
study = optimize(study, datapath)

  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-06-15 21:13:05,366] Trial 0 finished with value: 0.3361972208975269 and parameters: {'learning_rate': 0.11072557042239867}. Best is trial 0 with value: 0.3361972208975269.
[I 2025-06-15 21:13:06,608] Trial 1 finished with value: 0.32665556091894143 and parameters: {'learning_rate': 0.11181038715836536}. Best is trial 0 with value: 0.3361972208975269.
[I 2025-06-15 21:13:07,823] Trial 2 finished with value: 0.3265678024722224 and parameters: {'learning_rate': 0.11867341917931473}. Best is trial 0 with value: 0.3361972208975269.
[I 2025-06-15 21:13:08,967] Trial 3 finished with value: 0.33262504772584656 and parameters: {'learning_rate': 0.09348416423277241}. Best is trial 0 with value: 0.3361972208975269.
[I 2025-06-15 21:13:10,202] Trial 4 finished with value: 0.3307429612670304 and parameters: {'learning_rate': 0.10621247110685342}. Best is trial 0 with value: 0.3361972208975269.
[I 2025-06-15 21:13:11,422] Trial 5 finished with value: 0.33275022613962235 and parameters: {'lea

In [22]:
# Best objective value over the study's trials (the objective returns R2 and
# the study was created with direction="maximize").
study.best_value

0.48066360285358123

In [23]:
# Sampled hyperparameters of the best trial; `objective` only samples
# learning_rate, so that is the only key expected here.
study.best_params

{'learning_rate': 0.084375262515291}

In [24]:
import optuna.visualization as vis

In [25]:
# Objective value per trial, to see how the search progressed.
vis.plot_optimization_history(study)

In [26]:
# Importance of each sampled hyperparameter for the objective value.
# NOTE(review): `objective` samples only learning_rate, so this plot carries
# little information until more parameters are tuned.
vis.plot_param_importances(study)

In [27]:
# Objective value plotted against each sampled hyperparameter value.
vis.plot_slice(study)