In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier, XGBRegressor

from fraud_detection import data_loader, metrics

# Relative path of the parquet file that is handed to data_loader.load_data and
# data_loader.load_data_np further down in this notebook.
datapath = "../data/transformed_label_and_damage.parquet"

In [3]:
# One seed for the whole notebook: it seeds NumPy's global RNG here and is later
# passed as `random_state` to run_experiment for the cross-validation splits.
seed = 42
np.random.seed(seed)


In [26]:
def cost_fn(probs, damage, cost_fp, tain_tp=5):
    """Flag cases whose fraud probability exceeds a damage-dependent cut-off.

    The cut-off is ``cost_fp / (tain_tp + cost_fp + damage)``, so a larger
    ``damage`` lowers the probability required to flag a case.

    Args:
        probs: Fraud probability, scalar or NumPy array.
        damage: Damage value(s); combined element-wise with ``probs``.
        cost_fp: Numerator of the cut-off and one term of its denominator
            (presumably the cost of a false positive -- confirm).
        tain_tp: Constant term of the denominator, default 5 (presumably the
            gain of a true positive; the name looks like a typo for
            ``gain_tp`` but is kept because callers may pass it by keyword).

    Returns:
        The result of ``probs > cut-off`` (a boolean array for array inputs).

    Note:
        The denominator is not guarded: a ``damage`` at or below
        ``-(tain_tp + cost_fp)`` yields a zero or negative denominator.
    """
    denominator = tain_tp + cost_fp + damage
    cutoff = cost_fp / denominator
    return probs > cutoff

def id(X, y):
    """Identity training-data transform: return features and targets unchanged.

    Used as the ``train_data_function`` argument when no resampling or
    filtering of the training data is wanted.

    Note:
        Within this notebook the name shadows Python's builtin ``id``.

    Returns:
        The tuple ``(X, y)`` holding the very same objects that were passed in.
    """
    unchanged = (X, y)
    return unchanged

In [8]:
# Load the data once in a form that exposes column names, then list the position
# and name of every column whose name contains "diff". The printed position is
# what the positional column index used later in the notebook refers to.
X, _ = data_loader.load_data(datapath, drop_features=data_loader.useless_features)
diff_columns = [(pos, name) for pos, name in enumerate(X.columns) if "diff" in name]
for pos, name in diff_columns:
    print(f"Feature {pos}: {name}")

Feature 28: calculated_price_difference


In [14]:
# Reload via load_data_np and rebind `X` (previously the object used for the
# column listing). Presumably both results are NumPy arrays -- they are indexed
# positionally below, with targets[:, 0] used as the fraud label and
# targets[:, 1] as the damage amount.
X, targets = data_loader.load_data_np(datapath, drop_features=data_loader.useless_features)

In [24]:
def fit_regressor_and_predict(X_train, y_train, X_test, y_test, traindata_function: callable):
    """Train an XGBoost damage regressor and score it on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets; column 1 is used as the regression target.
        X_test: Test feature matrix.
        y_test: Test targets; column 1 is compared against the predictions.
        traindata_function: Callable ``(X_train, y_train) -> (X, y)`` applied
            to the training data before fitting (e.g. an identity or a
            resampling step).

    Returns:
        Tuple ``(model, bew, preds)``: the fitted ``XGBRegressor``, the value
        returned by ``metrics.regression(preds, y_test[:, 1])``, and the
        predictions for ``X_test``.
    """
    X_fit, y_fit = traindata_function(X_train, y_train)

    regressor_params = {
        "n_estimators": 100,
        "max_depth": 5,
        "learning_rate": 0.1,
        "objective": "reg:squarederror",
    }
    model = XGBRegressor(**regressor_params)

    # Fitted without sample weights; an earlier variant that up-weighted rows
    # with y_train[:, 0] > 0 is currently disabled.
    model.fit(X_fit, y_fit[:, 1])

    preds = model.predict(X_test)
    bew = metrics.regression(preds, y_test[:, 1])
    return model, bew, preds

In [39]:
def eval_rabattfaelle(X, y, preds, discount_col=28):
    """Summarise detection of fraud cases that also carry a discount.

    Args:
        X: 2-D feature array; column ``discount_col`` holds the discount value
            (a case counts as discounted when that value is > 0).
        y: 2-D target array; column 0 is the fraud label (1 = fraud) and
            column 1 the damage amount.
        preds: Per-row predictions aligned with ``X``/``y``; a row counts as
            flagged when its entry equals 1 (``True`` for boolean arrays).
        discount_col: Positional index of the discount feature in ``X``.
            Defaults to 28, the index this notebook's column listing reports
            for ``calculated_price_difference``.

    Returns:
        Dict with the number of discounted fraud cases, the number of those
        that were flagged, and the damage arrays of both groups.

    Raises:
        AssertionError: If any selected fraud case has a damage that is not
            strictly positive.
    """
    # Cases with a discount.
    has_discount = X[:, discount_col] > 0
    # Fraud cases.
    is_fraud = y[:, 0] == 1
    # Cases with both a discount and fraud, as ascending row indices.
    idxrf = np.where(has_discount & is_fraud)[0]

    # Of those, keep the rows the model flagged (i.e. correctly predicted fraud).
    idxrf_pred = idxrf[preds[idxrf] == 1]

    # Number of discounted fraud cases, and how many of them were flagged.
    num_rf = len(idxrf)
    num_rf_pred = len(idxrf_pred)

    # Damage of the discounted fraud cases, and of the flagged ones.
    damage_rf = y[idxrf, 1]
    damage_rf_pred = y[idxrf_pred, 1]

    # `.all()` is vacuously True for an empty selection, whereas `.min()` raises
    # ValueError on a zero-size array (e.g. a fold in which nothing is flagged).
    assert (damage_rf_pred > 0).all(), "Damage of predicted fraud cases with discount should be greater than 0"
    assert (damage_rf > 0).all(), "Damage of fraud cases with discount should be greater than 0"

    return {
        "Anzahl Fälle mit Rabatt und Fraud": num_rf,
        "Anzahl korrekt vorhergesagter Fälle": num_rf_pred,
        "Schaden der Fälle mit Rabatt und Fraud": damage_rf,
        "Schaden der korrekt vorhergesagten Fälle": damage_rf_pred,
    }

In [40]:
def create_and_evaluate_model(
    X_train,
    y_train,
    X_test,
    y_test,
    train_data_function: callable,
    threshold=0.0,
    cost_fp=10,
):
    """Fit a fraud classifier plus a damage regressor and evaluate one fold.

    A case is flagged when its fraud probability exceeds ``threshold`` AND the
    cost-based rule ``cost_fn`` (fed with the regressor's predicted damage)
    also fires. The flagged cases are then summarised by ``eval_rabattfaelle``.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets; column 0 = fraud label, column 1 = damage.
        X_test: Test feature matrix.
        y_test: Test targets, same column layout as ``y_train``.
        train_data_function: Callable ``(X, y) -> (X, y)`` forwarded to
            ``fit_regressor_and_predict``; it is not applied to the
            classifier's training data.
        threshold: Minimum fraud probability for a case to be flagged.
            Defaults to 0.0, the value previously hard-coded here.
        cost_fp: ``cost_fp`` argument forwarded to ``cost_fn``. Defaults to 10,
            the value previously hard-coded here.

    Returns:
        The dict produced by ``eval_rabattfaelle`` for the test split.
    """
    clf = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        objective="binary:logistic",
    )
    clf.fit(X_train, y_train[:, 0])
    probs_baseline = clf.predict_proba(X_test)[:, 1]
    preds_baseline = clf.predict(X_test)

    # The return value is not used here; the call is retained in case
    # metrics.bewertung reports or logs as a side effect (not visible from here).
    metrics.bewertung(probs_baseline, preds_baseline, y_test[:, 0], y_test[:, 1])

    # fit_regressor_and_predict already returns the regressor's predictions for
    # X_test, so they are reused instead of calling model.predict(X_test) again.
    _, _, damage_preds = fit_regressor_and_predict(
        X_train, y_train, X_test, y_test, train_data_function
    )

    flagged = (probs_baseline > threshold) & cost_fn(probs_baseline, damage_preds, cost_fp)

    return eval_rabattfaelle(X_test, y_test, flagged)


In [41]:
def run_experiment(X, targets, train_data_function, n_splits=5, n_repeats=1, random_state=42):
    """Run repeated stratified k-fold evaluation and collect per-fold results.

    Folds are stratified on ``targets[:, 0]``. For every fold,
    ``create_and_evaluate_model`` is called with the train/test slices and its
    result is appended to the returned list. Progress is printed per fold.

    Args:
        X: Feature array, indexable by integer index arrays.
        targets: Target array; column 0 is the stratification label.
        train_data_function: Forwarded to ``create_and_evaluate_model``.
        n_splits: Number of folds per repetition.
        n_repeats: Number of repetitions.
        random_state: Seed for the fold splitter.

    Returns:
        List with one result per fold, ``n_splits * n_repeats`` entries in
        split order.
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    total_folds = n_splits * n_repeats
    strat_labels = targets[:, 0]

    results = []
    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X, strat_labels), start=1):
        print(f"Fold {fold_no}/{total_folds}")
        fold_result = create_and_evaluate_model(
            X[train_idx],
            targets[train_idx],
            X[test_idx],
            targets[test_idx],
            train_data_function,
        )
        results.append(fold_result)

    return results


In [75]:
# Cross-validation setup: 5 folds repeated 10 times gives 50 evaluated folds.
n_splits, n_repeats = 5, 10

# `id` is the notebook's identity transform, so the regressor is trained on the
# unmodified training data.
results = run_experiment(
    X,
    targets,
    id,
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=seed,
)

Fold 1/50
Fold 2/50
Fold 3/50
Fold 4/50
Fold 5/50
Fold 6/50
Fold 7/50
Fold 8/50
Fold 9/50
Fold 10/50
Fold 11/50
Fold 12/50
Fold 13/50
Fold 14/50
Fold 15/50
Fold 16/50
Fold 17/50
Fold 18/50
Fold 19/50
Fold 20/50
Fold 21/50
Fold 22/50
Fold 23/50
Fold 24/50
Fold 25/50
Fold 26/50
Fold 27/50
Fold 28/50
Fold 29/50
Fold 30/50
Fold 31/50
Fold 32/50
Fold 33/50
Fold 34/50
Fold 35/50
Fold 36/50
Fold 37/50
Fold 38/50
Fold 39/50
Fold 40/50
Fold 41/50
Fold 42/50
Fold 43/50
Fold 44/50
Fold 45/50
Fold 46/50
Fold 47/50
Fold 48/50
Fold 49/50
Fold 50/50


In [76]:
# The first two entries of each per-fold result are the case counts (number of
# discounted fraud cases, number of those flagged); build one row per fold.
k0, k1 = list(results[0])[:2]
d1 = pd.DataFrame([{k0: fold[k0], k1: fold[k1]} for fold in results])

In [77]:
# Share of discounted fraud cases that were flagged, per fold.
d1["Anteil erkannt"] = d1[k1].div(d1[k0])
d1

Unnamed: 0,Anzahl Fälle mit Rabatt und Fraud,Anzahl korrekt vorhergesagter Fälle,Anteil erkannt
0,598,367,0.613712
1,608,386,0.634868
2,567,346,0.610229
3,573,349,0.609075
4,558,360,0.645161
5,577,364,0.630849
6,593,362,0.610455
7,567,348,0.613757
8,573,355,0.619546
9,594,360,0.606061


In [84]:
# Mean over all folds of each count column and of the detection share.
d1.describe().loc["mean"].round(2)

Anzahl Fälle mit Rabatt und Fraud      580.80
Anzahl korrekt vorhergesagter Fälle    359.70
Anteil erkannt                           0.62
Name: mean, dtype: float64

In [None]:
# Entries three and four of each per-fold result are damage arrays (all
# discounted fraud cases / the flagged ones); sum them to one total per fold.
k2, k3 = list(results[0])[2:4]
d2 = pd.DataFrame([{k2: fold[k2].sum(), k3: fold[k3].sum()} for fold in results])
# Share of the discounted-fraud damage that falls on flagged cases, per fold.
d2["Anteil Schaden verhindert"] = d2[k3].div(d2[k2])
d2

Unnamed: 0,Schaden der Fälle mit Rabatt und Fraud,Schaden der korrekt vorhergesagten Fälle,Anteil Schaden verhindert
0,3993.35,2478.87,0.620749
1,4215.05,2613.15,0.619957
2,3843.84,2372.37,0.617188
3,3775.16,2307.67,0.611277
4,3867.3,2540.37,0.656885


In [85]:
# Mean over all folds of each damage total and of the prevented-damage share.
d2.describe().loc["mean"].round(2)

Schaden der Fälle mit Rabatt und Fraud      3938.94
Schaden der korrekt vorhergesagten Fälle    2462.49
Anteil Schaden verhindert                      0.63
Name: mean, dtype: float64