In [113]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import minimize_scalar
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
from xgboost import XGBClassifier, XGBRegressor

import fraud_detection as fd
from fraud_detection import data_loader, metrics

datapath = "../data/transformed_label_and_damage.parquet"

In [114]:
seed = 42

In [116]:
def cost_fn(probs, damage, cost_fp, tain_tp=5):
    return probs > cost_fp / (tain_tp + cost_fp + damage)


def bewertung(yhat, y, damage):
    res = np.zeros(yhat.shape)
    # Case 1: FRAUD caught
    res += ((y == 1) & (yhat == 1)) * 5
    # Case 2: False positive
    res -= ((y == 0) & (yhat == 1)) * 10
    # Case 3: FRAUD missed
    res -= ((y == 1) & (yhat == 0)) * damage
    return res.sum()


def value_fn(probs, damage_preds, cost_fp, label_true, damage_true):
    yhat = cost_fn(probs, damage_preds, cost_fp)
    return bewertung(yhat, label_true, damage_true)

In [118]:
def get_balanced_data(X_train, y_train, seed=42):
    n_damage = np.sum(y_train[:, 1] > 0)
    n_no_damage = np.sum(y_train[:, 1] == 0)
    n_samples = min(n_damage, n_no_damage)
    idx_balanced = np.concatenate([
            np.random.choice(np.where(y_train[:, 1] > 0)[0], n_samples, replace=False),
            np.random.choice(np.where(y_train[:, 1] == 0)[0], n_samples, replace=False),
    ])
    X_train = X_train[idx_balanced]
    y_train = y_train[idx_balanced]
    return X_train, y_train


def get_data_with_damage_only(X_train, y_train):
    idx_damage = np.where(y_train[:, 1] > 0)[0]
    X_damage = X_train[idx_damage,:]
    y_damage = y_train[idx_damage,:]
    return X_damage, y_damage


def id(X, y):
    return X, y

In [119]:
X, targets = data_loader.load_data_np(datapath)

In [120]:
def fit_regressor_and_predict(X_train, y_train, X_test, y_test, traindata_function: callable):
    X_train, y_train = traindata_function(X_train, y_train)

    model = XGBRegressor(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        objective="reg:squarederror",
    )

    #sample_weights = np.where(y_train[:, 0] > 0, 200.0, 1.0)
    model.fit(X_train, y_train[:, 1])#, sample_weight=sample_weights)

    #model.fit(X_train, y_train[:, 1])
    preds = model.predict(X_test)
    bew = metrics.regression(preds, y_test[:, 1])

    return model, bew, preds

In [121]:
def find_optimal_cost_fp(p_preds, d_preds, y_train):
    def objective(theta):
        values = value_fn(p_preds, d_preds, theta, y_train[:, 0], y_train[:, 1])
        return -np.mean(values)  # negative because we minimize

    res = minimize_scalar(objective, bounds=(0.1, 1000.0), method='bounded')
    return res.x


In [None]:
def create_and_evaluate_model(X_train, y_train, X_test, y_test, train_data_function: callable):
    threshold = 0.01
    clf = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        objective="binary:logistic",
    )
    clf.fit(X_train, y_train[:, 0])
    probs_baseline = clf.predict_proba(X_test)[:, 1]
    preds_baseline = clf.predict(X_test)

    metrics_baseline = metrics.bewertung(probs_baseline, preds_baseline, y_test[:, 0], y_test[:, 1])

    model, bew, preds = fit_regressor_and_predict(X_train, y_train, X_test, y_test, train_data_function)

    damage_preds = model.predict(X_test)

    preds = (probs_baseline > threshold) & cost_fn(probs_baseline, damage_preds, 10)
    metrics_combined = metrics.bewertung(probs_baseline, preds, y_test[:, 0], y_test[:, 1])

    # optimiere cost_fp mit threshold
    probs_test_baseline_op = clf.predict_proba(X_test)[:, 1]
    idx = np.where(probs_test_baseline_op > threshold)[0]
    d_preds_op = model.predict(X_test[idx, :])
    cost_fp_opt = find_optimal_cost_fp(probs_test_baseline_op[idx], d_preds_op, y_test[idx])
    print(f"Optimal cost_fp: {cost_fp_opt}")

    preds = (probs_baseline > threshold) & cost_fn(probs_baseline, damage_preds, cost_fp_opt)
    metrics_combined_opt = metrics.bewertung(probs_baseline, preds, y_test[:, 0], y_test[:, 1])

    return damage_preds, metrics_baseline, metrics_combined, metrics_combined_opt


In [None]:
def run_experiment(X, targets, train_data_function, n_splits=5, n_repeats=1, random_state=42):

    # Initialize the RepeatedStratifiedKFold
    skf = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )

    df = pd.DataFrame()

    for i, (train_idx, test_idx) in enumerate(skf.split(X, targets[:, 0])):
        print(f"Fold {i + 1}/{n_splits * n_repeats}")
        d, m_base, m_comb, m_comb_opt = create_and_evaluate_model(
            X[train_idx],
            targets[train_idx],
            X[test_idx],
            targets[test_idx],
            train_data_function,
        )
        res = {
         "precision_baseline": m_base["precision"],
         "recall_baseline": m_base["recall"],

         "precision_combined": m_comb["precision"],
         "recall_combined": m_comb["recall"],

         "precision_combined_opt": m_comb_opt["precision"],
         "recall_combined_opt": m_comb_opt["recall"],

         "bewertung_baseline": m_base["Bewertung"],
         "bewertung_combined": m_comb["Bewertung"],
         "difference_bewertung": m_comb["Bewertung"] - m_base["Bewertung"],
         "bewertung_combined_opt": m_comb_opt["Bewertung"],
         "difference_bewertung_opt": m_comb_opt["Bewertung"] - m_base["Bewertung"],
        }
        df_fold = pd.DataFrame(res, index=[f"Fold {i + 1}"])
        df = pd.concat([df, df_fold], axis=0)

    return df


In [194]:
n_splits = 5
n_repeats = 5

df_full = run_experiment(X, targets, id, n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
df_balance = run_experiment(X, targets, get_balanced_data, n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
df_damage = run_experiment(X, targets, get_data_with_damage_only, n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

Fold 1/25
Optimal cost_fp: 8.081781308667264
Fold 2/25
Optimal cost_fp: 12.957722732945669
Fold 3/25
Optimal cost_fp: 9.585067743337035
Fold 4/25
Optimal cost_fp: 7.118854367532223
Fold 5/25
Optimal cost_fp: 5.112023051172431
Fold 6/25
Optimal cost_fp: 5.123357473319871
Fold 7/25
Optimal cost_fp: 28.859792339675867
Fold 8/25
Optimal cost_fp: 8.229810676059557
Fold 9/25
Optimal cost_fp: 8.952725302334768
Fold 10/25
Optimal cost_fp: 5.758172291332696
Fold 11/25
Optimal cost_fp: 9.322983566687935
Fold 12/25
Optimal cost_fp: 9.428138565526591
Fold 13/25
Optimal cost_fp: 8.155739485335799
Fold 14/25
Optimal cost_fp: 6.863701372899727
Fold 15/25
Optimal cost_fp: 5.367381889487058
Fold 16/25
Optimal cost_fp: 5.611426963392618
Fold 17/25
Optimal cost_fp: 8.229809856145508
Fold 18/25
Optimal cost_fp: 5.801222230108149
Fold 19/25
Optimal cost_fp: 8.648775105763214
Fold 20/25
Optimal cost_fp: 13.205012337356765
Fold 21/25
Optimal cost_fp: 13.25430895212401
Fold 22/25
Optimal cost_fp: 9.5761059860

In [204]:
df_diff = pd.DataFrame({
    "Full": df_full.difference_bewertung.mean(),
    "Full (Optimized)": df_full.difference_bewertung_opt.mean(),
    "Balanced": df_balance.difference_bewertung.mean(),
    "Balanced (Optimized)": df_balance.difference_bewertung_opt.mean(),
    "Damage Only": df_damage.difference_bewertung.mean(),
    "Damage Only (Optimized)": df_damage.difference_bewertung_opt.mean(),
}, index=["Mittlere Differenz der Bewertung"]).T

In [205]:
df_diff

Unnamed: 0,Mittlere Differenz der Bewertung
Full,0.102
Full (Optimized),55.1872
Balanced,-7.9928
Balanced (Optimized),52.1244
Damage Only,-20.4824
Damage Only (Optimized),30.656


In [214]:
df_full.describe().round(3).iloc[:, 6:]

Unnamed: 0,bewertung_baseline,bewertung_combined,difference_bewertung,bewertung_combined_opt,difference_bewertung_opt
count,25.0,25.0,25.0,25.0,25.0
mean,-1005.2,-1005.098,0.102,-950.013,55.187
std,157.545,164.623,46.659,155.668,94.388
min,-1312.33,-1262.07,-85.97,-1207.74,-299.98
25%,-1102.32,-1135.83,-21.13,-1081.65,20.67
50%,-1024.92,-1011.12,-0.02,-925.12,71.46
75%,-891.98,-914.82,17.6,-847.58,100.85
max,-617.5,-654.62,78.69,-656.17,212.9
