# Start

In [1]:
import numpy as np
import pandas as pd
import torch

import fraud_detection as fd

# Input parquet file handed to the fd loader / comparison helpers below.
# The path is relative, so it is resolved against whatever working directory
# is in effect when the file is actually read.
datapath = "./data/transformed_label_and_damage.parquet"

In [2]:
import os

# Switch the working directory to the parent of the directory the kernel was
# started in (presumably the project root, so that the relative `datapath`
# resolves — confirm against the repository layout).
# A bare os.chdir("..") climbs one more level every time this cell is re-run;
# remembering the starting directory once keeps the cell idempotent.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [3]:
# One seed for every source of randomness used in this notebook: numpy, torch,
# and (via `random_state=seed`) the cross-validation calls further down.
seed = 4
np.random.seed(seed)
# The trailing semicolon keeps the returned torch.Generator repr (which contains
# a memory address that changes on every run) out of the saved cell output.
torch.manual_seed(seed);

<torch._C.Generator at 0x759eb02cfd10>

In [4]:
# Reduced feature subset for a linear model: the same three columns the
# logistic-regression comparison further down is trained on.
# NOTE(review): "has_positive_price_difference" is also listed in
# `useless_features`, which is dropped for the other models — confirm that
# keeping it here is intended.
linear_features = [
    "payment_medium",
    "has_positive_price_difference",
    "calculated_price_difference",
]

In [5]:
# Columns excluded from the main model comparison. The list is passed as
# `drop_features` both when loading the data to count input features and when
# calling `compare_models`, so the two stay in sync.
useless_features = [
    "max_product_price",
    "has_positive_price_difference",
    "has_bakery",
    "time_to_first_scan",
    "popularity_max",
    "has_age_restricted",
    "cash_desk",
    "transaction_duration_seconds",
    "feedback_low",
    "feedback_middle",
    "feedback_high",
    "feedback_top",
    "store_id",
    "location",
    "urbanization",
    "has_voided",
    "has_sold_by_weight",
    "has_limited_time_offers",
    "has_fruits_vegetables",
    "has_missing",
    "has_camera_detected_wrong_product",
    "day_of_week",
    "hour_categorical",
]

# Model Vergleich

Vergleich verschiedener Modelle mit Cross-Validation auf identischen Splits.

Die Combined-Modelle bestehen aus einem Klassifikator und einem Regressor, die zusammenarbeiten, um eine bessere Vorhersage zu erzielen.

In [6]:
# Determine the number of input features; `n_input` is what the neural-net
# factory in the model comparison is built with.
# The second return value is not needed here. It is bound to `_unused` rather
# than `_`, because assigning to `_` shadows IPython's "last output" variable.
X, _unused = fd.data_loader.load_data_np(datapath, drop_features=useless_features)
n_input = X.shape[1]

In [7]:
# Tip: put `%%capture` on the very first line of this cell to silence the
# per-round training log.

# Candidate classifiers as (display name, zero-argument factory) pairs.
# Factories are passed instead of instances — presumably so that every
# cross-validation round builds a fresh model; confirm in fd.model_comparison.
classifiers = fd.models.classifiers
models = [
    ("Decision Tree", classifiers.get_decsion_tree),
    ("Random Forest", classifiers.get_random_forest),
    ("LGBMClassifier", classifiers.get_lgmb),
    ("XGBoost simple", classifiers.get_xgb_simple),
    ("XGBoost", classifiers.get_xgb),
    ("CatBoost", classifiers.get_catboost),
    # The network needs the input width, so it keeps a closure over n_input.
    ("NeuralNet", lambda: fd.neuralnets.train_nn.getNN(n_input)),
]

# n_splits=5 with n_repeats=5 matches the 25 rounds (0-24) logged per model.
model_metrics = pd.DataFrame(
    fd.model_comparison.compare_models(
        models,
        datapath,
        n_splits=5,
        n_repeats=5,
        random_state=seed,
        drop_features=useless_features,
    )
)



Start trainings for Decision Tree
Round 0
Round 1
Round 2
Round 3
Round 4
Round 5
Round 6
Round 7
Round 8
Round 9
Round 10
Round 11
Round 12
Round 13
Round 14
Round 15
Round 16
Round 17
Round 18
Round 19
Round 20
Round 21
Round 22
Round 23
Round 24


Start trainings for Random Forest
Round 0
Round 1
Round 2
Round 3
Round 4
Round 5
Round 6
Round 7
Round 8
Round 9
Round 10
Round 11
Round 12
Round 13
Round 14
Round 15
Round 16
Round 17
Round 18
Round 19
Round 20
Round 21
Round 22
Round 23
Round 24


Start trainings for LGBMClassifier
Round 0




Round 1




Round 2




Round 3




Round 4




Round 5




Round 6




Round 7




Round 8




Round 9




Round 10




Round 11




Round 12




Round 13




Round 14




Round 15




Round 16




Round 17




Round 18




Round 19




Round 20




Round 21




Round 22




Round 23




Round 24






Start trainings for XGBoost simple
Round 0
Round 1
Round 2
Round 3
Round 4
Round 5
Round 6
Round 7
Round 8
Round 9
Round 10
Round 11
Round 12
Round 13
Round 14
Round 15
Round 16
Round 17
Round 18
Round 19
Round 20
Round 21
Round 22
Round 23
Round 24


Start trainings for XGBoost
Round 0
Round 1
Round 2
Round 3
Round 4
Round 5
Round 6
Round 7
Round 8
Round 9
Round 10
Round 11
Round 12
Round 13
Round 14
Round 15
Round 16
Round 17
Round 18
Round 19
Round 20
Round 21
Round 22
Round 23
Round 24


Start trainings for CatBoost
Round 0
Round 1
Round 2
Round 3
Round 4
Round 5
Round 6
Round 7
Round 8
Round 9
Round 10
Round 11
Round 12
Round 13
Round 14
Round 15
Round 16
Round 17
Round 18
Round 19
Round 20
Round 21
Round 22
Round 23
Round 24


Start trainings for NeuralNet
Round 0
=> Starting training
=> epoch: 1, loss: 0.0107, duration: 1.8088316917419434
Metric            | train     | test    |
precision         |      0.84 |     0.85|
recall            |      0.34 |     0.35|
f1            

In [21]:
# Linear baseline trained on a reduced feature selection.
# The selection is taken from `linear_features` (defined near the top of the
# notebook) instead of repeating the same three column names here, so the two
# lists cannot drift apart. A copy is used so edits to one do not alias the other.
logreg_features = list(linear_features)

# NOTE: this rebinds `models`; the list built for the main comparison is no
# longer reachable under that name after this cell has run.
models = [
    ("Logistic Regression", fd.models.classifiers.get_logistic_regression),
]

# Same n_splits / n_repeats / random_state as the main comparison, so the
# baseline is presumably evaluated on the same splits — confirm in compare_models.
model_metrics_logreg = pd.DataFrame(
    fd.model_comparison.compare_models(
        models,
        datapath,
        n_splits=5,
        n_repeats=5,
        random_state=seed,
        select_features=logreg_features,
    )
)



Start trainings for Logistic Regression
Round 0
Round 1
Round 2
Round 3
Round 4
Round 5
Round 6
Round 7
Round 8
Round 9
Round 10
Round 11
Round 12
Round 13
Round 14
Round 15
Round 16
Round 17
Round 18
Round 19
Round 20
Round 21
Round 22
Round 23
Round 24


In [23]:
# Attach the baseline's metrics as an extra column of the main comparison table
# (values are aligned on the metric row labels).
logreg_column = "Logistic Regression"
model_metrics[logreg_column] = model_metrics_logreg[logreg_column]

In [None]:
# Column order for the result tables: the logistic-regression baseline first,
# then every other model in its existing order; "XGBoost" is left out.
# A filter is used instead of list.remove(): remove() raises ValueError once
# `model_metrics` has already been narrowed to `cols`, i.e. whenever this cell
# is re-run after the column-selection cell.
baseline = "Logistic Regression"
excluded = {baseline, "XGBoost"}
cols = [baseline, *(col for col in model_metrics.columns if col not in excluded)]

In [79]:
# Write the metrics table, as it stands at this point, to a CSV file including
# the row labels. The file name is relative to the current working directory.
metrics_csv = "model_metrics_grand_comparison_all.csv"
model_metrics.to_csv(metrics_csv, index=True)

In [32]:
# Keep only the columns listed in `cols`, in that order. Columns not listed are
# dropped from `model_metrics` from here on.
model_metrics = model_metrics.loc[:, cols]

In [33]:
# Show the mean / max / min / variance rows of a single metric.
cat = "precision"

# Regex alternation such as "precision_mean|precision_max|...".
stat_pattern = "|".join(f"{cat}_{stat}" for stat in ["mean", "max", "min", "var"])
model_metrics.loc[model_metrics.index.str.contains(stat_pattern)]

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,LGBMClassifier,XGBoost simple,CatBoost,NeuralNet
precision_mean,0.746331,0.749139,0.752982,0.842694,0.84244,0.853805,0.816171
precision_max,0.782082,0.825342,0.816248,0.878354,0.882353,0.8875,0.861507
precision_min,0.703549,0.675573,0.721966,0.816029,0.807356,0.819672,0.773832
precision_var,0.000462,0.000805,0.000541,0.00024,0.000312,0.000285,0.000491


In [65]:
# Build a models-by-metrics table of the mean values.
mean_rows = model_metrics.loc[model_metrics.index.str.contains("mean")]

# Strip the "_mean" suffix and any ":" from the metric labels.
mean_cols = [label.replace("_mean", "").replace(":", "") for label in mean_rows.index]

# Transpose so each model is a row, then cast to float and round to 3 decimals.
mean_vals = mean_rows.T.set_axis(mean_cols, axis=1).astype(float).round(3)
mean_vals[["precision", "recall", "f1", "auc-pr", "damage_prevented", "Bewertung"]]

Unnamed: 0,precision,recall,f1,auc-pr,damage_prevented,Bewertung
Logistic Regression,0.746,0.385,0.508,0.431,2219.208,-3271.956
Decision Tree,0.749,0.561,0.641,0.655,3648.474,-1585.29
Random Forest,0.753,0.54,0.628,0.681,3484.541,-1748.223
LGBMClassifier,0.843,0.549,0.664,0.729,3524.914,-1020.85
XGBoost simple,0.842,0.552,0.667,0.73,3555.049,-982.715
CatBoost,0.854,0.543,0.664,0.733,3510.251,-978.913
NeuralNet,0.816,0.508,0.626,0.681,3356.139,-1468.625


In [78]:
# Split each model's confusion matrix (row "cm") into one column per cell.
# NOTE(review): the positions assume the [[TN, FP], [FN, TP]] layout — confirm
# against how compare_models builds the "cm" entry.
cell_positions = {"TP": (1, 1), "FP": (0, 1), "FN": (1, 0), "TN": (0, 0)}

cm = model_metrics[model_metrics.index.str.contains("cm")].T
for cell_name, position in cell_positions.items():
    # Bind `position` as a default so each lambda keeps its own index pair.
    cm[cell_name] = cm["cm"].apply(lambda matrix, position=position: matrix[position])
cm = cm.drop(columns=["cm"])
cm

Unnamed: 0,TP,FP,FN,TN
Logistic Regression,329.2,112.2,526.4,28561.6
Decision Tree,480.28,162.0,375.32,28511.8
Random Forest,462.16,152.84,393.44,28520.96
LGBMClassifier,469.4,87.76,386.2,28586.04
XGBoost simple,472.6,88.56,383.0,28585.24
CatBoost,464.56,79.68,391.04,28594.12
NeuralNet,434.72,98.32,420.88,28575.48


In [51]:
# Render the mean-value table with bold, centred header cells.
header_style = {
    "selector": "th",
    "props": [("font-weight", "bold"), ("text-align", "center")],
}
df_styled = mean_vals.style.set_table_styles([header_style])

# Shown as the cell output so the rendered table can be copied by hand.
df_styled

Unnamed: 0,precision,recall,f1,mcc,auc-pr:,damage_total,damage_prevented,damage_missed,detected bonus,fp penalty,Bewertung
Logistic Regression,0.746331,0.384758,0.507568,0.526531,0.431369,6015.164,2219.208,3795.956,1646.0,1122.0,-3271.956
Decision Tree,0.749139,0.561338,0.641186,0.639296,0.654941,6015.164,3648.4736,2366.6904,2401.4,1620.0,-1585.2904
Random Forest,0.752982,0.54016,0.62809,0.62836,0.680572,6015.164,3484.5408,2530.6232,2310.8,1528.4,-1748.2232
LGBMClassifier,0.842694,0.548623,0.664384,0.672558,0.728871,6015.164,3524.9136,2490.2504,2347.0,877.6,-1020.8504
XGBoost simple,0.84244,0.552363,0.667069,0.674798,0.730086,6015.164,3555.0488,2460.1152,2363.0,885.6,-982.7152
CatBoost,0.853805,0.542967,0.663622,0.673651,0.732612,6015.164,3510.2512,2504.9128,2322.8,796.8,-978.9128
NeuralNet,0.816171,0.50809,0.626019,0.635912,0.68149,6015.164,3356.1388,2659.0252,2173.6,983.2,-1468.6252


In [15]:
# Select the rows whose index label contains "Bewertung_mean", "Bewertung_max"
# or "Bewertung_min".
is_bewertung_row = model_metrics.index.str.contains("Bewertung_mean|Bewertung_max|Bewertung_min")
model_metrics.loc[is_bewertung_row]

Unnamed: 0,Decision Tree,Random Forest,LGBMClassifier,XGBoost simple,XGBoost,CatBoost,NeuralNet
Bewertung_mean,-1585.2904,-1748.2232,-1020.8504,-982.7152,-1020.0084,-978.9128,-1468.6252
Bewertung_max,-965.78,-1366.7,-527.25,-487.41,-447.84,-406.89,-1047.78
Bewertung_min,-1985.06,-2094.91,-1349.65,-1297.18,-1415.34,-1362.71,-1908.64


In [18]:
# Show the mean rows of a handful of metrics.
# NOTE: `cat` is rebound here from the single metric name used earlier to a list.
# NOTE(review): names that match no row are silently ignored — the recorded
# output lists only precision_mean and recall_mean, so "roc_auc_mean" and
# "accuracy_mean" may not exist in the table; confirm the metric names.
cat = ["precision_mean", "recall_mean", "roc_auc_mean", "accuracy_mean"]
is_requested_row = model_metrics.index.str.contains("|".join(cat))
model_metrics.loc[is_requested_row]

Unnamed: 0,Decision Tree,Random Forest,LGBMClassifier,XGBoost simple,XGBoost,CatBoost,NeuralNet
precision_mean,0.749139,0.752982,0.842694,0.84244,0.843611,0.853805,0.816171
recall_mean,0.561338,0.54016,0.548623,0.552363,0.548529,0.542967,0.50809


In [16]:
# Display the whole metrics table: one row per metric statistic
# (e.g. precision_mean, recall_var), one column per model.
model_metrics

Unnamed: 0,Decision Tree,Random Forest,LGBMClassifier,XGBoost simple,XGBoost,CatBoost,NeuralNet
precision_mean,0.749139,0.752982,0.842694,0.84244,0.843611,0.853805,0.816171
precision_max,0.825342,0.816248,0.878354,0.882353,0.887097,0.8875,0.861507
precision_min,0.675573,0.721966,0.816029,0.807356,0.818493,0.819672,0.773832
precision_var,0.000805,0.000541,0.00024,0.000312,0.000269,0.000285,0.000491
recall_mean,0.561338,0.54016,0.548623,0.552363,0.548529,0.542967,0.50809
recall_max,0.621053,0.600467,0.591813,0.578947,0.584795,0.588304,0.530994
recall_min,0.509346,0.468458,0.521028,0.522196,0.514019,0.514019,0.473131
recall_var,0.000521,0.000926,0.000328,0.000272,0.000302,0.000303,0.000273
f1_mean,0.641186,0.62809,0.664384,0.667069,0.664609,0.663622,0.626019
f1_max,0.676944,0.660984,0.694484,0.699153,0.700637,0.702473,0.649351


Das neuronale Netzwerk ist noch recht instabil und erzielt nicht immer gute Ergebnisse (in diesem Durchlauf schnitt es aber am besten ab).