# Optimierung des Parameters cost_fp bei Annahme von konstantem Damage (Mittelwert)

In diesem Notebook wird untersucht, ob bei Annahme eines konstanten Schadens (Mittelwert) eine Optimierung des Parameters `cost_fp` sinnvoll ist. Dieser Parameter bestimmt in der Entscheidungsfunktion, ab welchem Score des Klassifikationsmodells sich eine Kontrolle lohnt.

In [1]:
import numpy as np
from scipy.optimize import minimize_scalar
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
from xgboost import XGBClassifier

import fraud_detection as fd
from fraud_detection import data_loader, metrics
from fraud_detection.models.costoptim import bewertung

# Path of the parquet file handed to data_loader.load_data_np further down.
datapath = "../data/transformed_label_and_damage.parquet"
# Random seed reused for the hold-out split and for the cross-validation folds.
seed = 42

In [2]:
# Repeats the assignment from the first cell with the same value, so it has
# no effect when the cells are executed in order.
seed = 42

In [3]:
# Load the data without the features listed in data_loader.useless_features.
X, targets = data_loader.load_data_np(datapath, drop_features=data_loader.useless_features)

# 80/20 hold-out split, stratified on whether column 1 of `targets` is positive
# (column 1 is the column used as damage amount by the functions in this notebook).
# NOTE(review): X_train / X_test / y_train / y_test are not referenced by any
# cell visible here -- the experiment is run on the full X / targets.
X_train, X_test, y_train, y_test = train_test_split(
    X, targets, test_size=0.2, random_state=seed, stratify=targets[:, 1] > 0
)

In [4]:
def cost_fn(probs, damage, cost_fp, gain_tp=5):
    """
    Decision function: is an inspection worthwhile?

    An inspection is triggered when the fraud score exceeds the threshold
    ``cost_fp / (gain_tp + cost_fp + damage)``.

    Args:
        probs: Fraud score(s) produced by the classifier.
        damage: Damage value(s) entering the threshold.
        cost_fp: Cost term for a false positive; the parameter being tuned
            in this notebook.
        gain_tp: Gain term for a true positive (default 5).

    Returns:
        Result of the ``>`` comparison between ``probs`` and the threshold
        (element-wise for array inputs).
    """
    threshold = cost_fp / (gain_tp + cost_fp + damage)
    return probs > threshold

def bewertung(yhat, y, damage):
    """
    Score predictions with the scoring function of Wertkauf GmbH.

    Per case: +5 for a caught fraud, -10 for a false positive and
    -damage for a missed fraud; all other cases contribute 0.

    Args:
        yhat: Array of predicted decisions (1/True = inspect).
        y: Array of true labels (1 = fraud, 0 = no fraud).
        damage: Damage per case (array or scalar), charged for missed frauds.

    Returns:
        Sum of the per-case scores.
    """
    is_fraud = y == 1
    inspected = yhat == 1

    caught = is_fraud & inspected
    false_alarm = (y == 0) & inspected
    missed = is_fraud & (yhat == 0)

    per_case = np.zeros(yhat.shape)
    per_case += caught * 5
    per_case -= false_alarm * 10
    per_case -= missed * damage
    return per_case.sum()

def calc_bewertung_for_given_cost_fp(probs, damage_preds, cost_fp, label_true, damage_true):
    """
    Compute the score obtained with a given cost_fp value.

    Args:
        probs: Fraud scores passed to ``cost_fn``.
        damage_preds: Predicted damage passed to ``cost_fn``.
        cost_fp: False-positive cost used in the decision threshold.
        label_true: True labels passed to ``bewertung``.
        damage_true: True damage passed to ``bewertung``.

    Returns:
        The value returned by ``bewertung`` for the resulting decisions.
    """
    decisions = cost_fn(probs, damage_preds, cost_fp)
    score = bewertung(decisions, label_true, damage_true)
    return score


In [5]:
def optimize_cost_fp(clf, X_train, y_train):
    """
    Optimize the cost_fp value for the given model on the training data.

    The damage prediction is a constant: the mean of column 1 of ``y_train``.

    Args:
        clf: Fitted classifier; ``predict_proba(X_train)[:, 1]`` is used as
            the fraud score.
        X_train: Training features.
        y_train: 2-D training targets; column 0 is passed on as the true
            label, column 1 as the true damage.

    Returns:
        The ``x`` attribute of the ``minimize_scalar`` result, i.e. the
        cost_fp value found within the bounds (0.01, 1000.0).
    """
    fraud_scores = clf.predict_proba(X_train)[:, 1]
    labels = y_train[:, 0]
    damages = y_train[:, 1]
    constant_damage = np.full(damages.shape, damages.mean())

    def negated_score(candidate_cost_fp):
        score = calc_bewertung_for_given_cost_fp(
            fraud_scores, constant_damage, candidate_cost_fp, labels, damages
        )
        # Negated because minimize_scalar minimizes while the score should be maximized.
        return -np.mean(score)

    result = minimize_scalar(negated_score, bounds=(0.01, 1000.0), method='bounded')
    return result.x

In [6]:
def evaluate_cost_fp(clf, X_test, y_test, cost_fp):
    """
    Score the model's decisions on the test data for a given cost_fp value.

    Args:
        clf: Fitted classifier; ``predict_proba(X_test)[:, 1]`` is used as
            the fraud score.
        X_test: Test features.
        y_test: 2-D test targets; column 0 is passed on as the true label,
            column 1 as the true damage.
        cost_fp: False-positive cost used in the decision threshold.

    Returns:
        Whatever ``metrics.bewertung`` returns for the scores, the decisions,
        and the true label and damage columns.
    """
    fraud_scores = clf.predict_proba(X_test)[:, 1]
    damages = y_test[:, 1]

    # NOTE(review): the constant damage is the mean of the *test* damages,
    # i.e. it is derived from test labels -- confirm this is intended.
    constant_damage = np.full(damages.shape, damages.mean())
    decisions = cost_fn(fraud_scores, constant_damage, cost_fp)

    return metrics.bewertung(fraud_scores, decisions, y_test[:, 0], damages)

In [7]:
def optimize_and_evaluate(clf, X_train, y_train, X_test, y_test):
    """
    Compare the classifier's own predictions with cost_fp-optimized decisions.

    The baseline uses ``clf.predict`` on the test data. The optimized variant
    tunes cost_fp on the training data and applies it to the test data.
    A one-line summary of the score difference is printed.

    Args:
        clf: Fitted classifier.
        X_train: Training features used for tuning cost_fp.
        y_train: Training targets (column 0 label, column 1 damage).
        X_test: Test features used for both evaluations.
        y_test: Test targets (column 0 label, column 1 damage).

    Returns:
        Dict with the keys ``"baseline"`` and ``"optimized_cost_fp"`` holding
        the respective metrics.
    """
    baseline_scores = clf.predict_proba(X_test)[:, 1]
    baseline_decisions = clf.predict(X_test)
    baseline = metrics.bewertung(
        baseline_scores, baseline_decisions, y_test[:, 0], y_test[:, 1]
    )

    tuned_cost_fp = optimize_cost_fp(clf, X_train, y_train)
    tuned = evaluate_cost_fp(clf, X_test, y_test, tuned_cost_fp)

    # Positive diff: the baseline scored higher than the optimized variant.
    diff = baseline["Bewertung"] - tuned["Bewertung"]
    message = (
        f"Optimized cost_fp improved the score by {-diff:.2f} points."
        if diff < 0
        else f"Optimized cost_fp did not improve the score, difference: {diff:.2f}."
    )
    print(message)

    return {"baseline": baseline, "optimized_cost_fp": tuned}

In [8]:
def run_experiment(X, targets, n_splits=5, n_repeats=1, random_state=42):
    """
    Run the baseline-vs-optimized comparison over repeated stratified folds.

    For every fold a fresh XGBClassifier is fitted on the training part
    (label = column 0 of ``targets``) and handed to ``optimize_and_evaluate``.

    Args:
        X: Feature matrix.
        targets: 2-D targets; column 0 is used for stratification and as
            the training label.
        n_splits: Number of folds per repetition.
        n_repeats: Number of repetitions.
        random_state: Seed for the fold generator.

    Returns:
        List with one result of ``optimize_and_evaluate`` per fold.
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    xgb_params = {
        "n_estimators": 100,
        "max_depth": 5,
        "learning_rate": 0.1,
        "objective": "binary:logistic",
    }

    results = []
    for train_idx, test_idx in splitter.split(X, targets[:, 0]):
        X_fit, y_fit = X[train_idx], targets[train_idx]
        X_eval, y_eval = X[test_idx], targets[test_idx]

        model = XGBClassifier(**xgb_params)
        model.fit(X_fit, y_fit[:, 0])

        # Sanity check for the pipeline: pass the evaluation data as the
        # training data too, so cost_fp is tuned and scored on the same samples.
        results.append(optimize_and_evaluate(model, X_fit, y_fit, X_eval, y_eval))

    return results


In [9]:
# 5 folds x 5 repeats = 25 fit/evaluate rounds; each round prints one line
# comparing the baseline score with the cost_fp-optimized score.
model_metrics = run_experiment(X, targets, n_splits=5, n_repeats=5, random_state=seed)

Optimized cost_fp did not improve the score, difference: 182.99.
Optimized cost_fp improved the score by 14.06 points.
Optimized cost_fp improved the score by 100.19 points.
Optimized cost_fp did not improve the score, difference: 170.37.
Optimized cost_fp did not improve the score, difference: 40.00.
Optimized cost_fp improved the score by 12.80 points.
Optimized cost_fp did not improve the score, difference: 73.92.
Optimized cost_fp did not improve the score, difference: 47.89.
Optimized cost_fp improved the score by 36.39 points.
Optimized cost_fp did not improve the score, difference: 75.04.
Optimized cost_fp improved the score by 13.46 points.
Optimized cost_fp did not improve the score, difference: 73.40.
Optimized cost_fp improved the score by 29.25 points.
Optimized cost_fp did not improve the score, difference: 208.95.
Optimized cost_fp did not improve the score, difference: 114.35.
Optimized cost_fp improved the score by 7.38 points.
Optimized cost_fp improved the score by 9.

Die Differenz von Baseline und Resultat mit Optimierung des cost_fp-Wertes ist im Mittel positiv. Das bedeutet, dass die Optimierung des cost_fp-Wertes generell keine Verbesserung bringt. Die Unterschiede sind jedoch sehr gering, was darauf hindeutet, dass die Optimierung des cost_fp-Wertes in diesem Fall keinen signifikanten Einfluss auf das Ergebnis hat. 

Der Schwellwert des Modells von 0.5 ist höchstwahrscheinlich schon relativ nah am Optimum, sodass eine weitere Optimierung des cost_fp-Wertes nur marginale Verbesserungen bringen würde, wenn überhaupt.

In [11]:
# Mean of (baseline score - optimized score) over all rounds; a positive
# value means the baseline scored higher on average.
np.mean([m["baseline"]["Bewertung"] - m["optimized_cost_fp"]["Bewertung"] for m in model_metrics])

np.float64(37.79199999999999)