In [1]:
import pandas as pd

# Laden der Datensätze

In [2]:
tr_label = pd.read_parquet("transformed_label.parquet", engine="fastparquet")


In [3]:
tr_damage = pd.read_parquet("transformed_damage.parquet", engine="fastparquet")

In [4]:
tr_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148024 entries, 0 to 148023
Data columns (total 55 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   label                                             148024 non-null  object 
 1   cash_desk                                         148024 non-null  object 
 2   total_amount                                      148024 non-null  float64
 3   n_lines                                           148024 non-null  int64  
 4   payment_medium                                    148024 non-null  object 
 5   has_feedback                                      148024 non-null  bool   
 6   feedback_categorical                              148024 non-null  object 
 7   feedback_low                                      148024 non-null  bool   
 8   feedback_middle                                   148024 non-null  bool   
 9   feed

# Auswahl der wesentlichen Merkmale

In [5]:
tr_label = tr_label[['label','transaction_id','n_lines', 'payment_medium',
       'has_feedback', 'feedback_categorical', 'hour',
       'transaction_duration_seconds', 
       'has_voided',  'has_unscanned',
       'has_camera_detected_wrong_product',
       'has_camera_detected_wrong_product_high_certainty',
       'calculated_price_difference', 'has_positive_price_difference',
       'has_snacks']]

In [6]:
tr_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148024 entries, 0 to 148023
Data columns (total 15 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   label                                             148024 non-null  object 
 1   transaction_id                                    148024 non-null  object 
 2   n_lines                                           148024 non-null  int64  
 3   payment_medium                                    148024 non-null  object 
 4   has_feedback                                      148024 non-null  bool   
 5   feedback_categorical                              148024 non-null  object 
 6   hour                                              148024 non-null  int8   
 7   transaction_duration_seconds                      148024 non-null  int64  
 8   has_voided                                        148024 non-null  bool   
 9   has_

# Ermitteln der Datensätze mit sales price = 0, die als Fraud gelabelt wurden, da diese anderweitig ermittelt werden könnten

In [7]:
sales_price_0 = tr_label[(tr_label.has_unscanned == True ) & (tr_label.label == "FRAUD")]

In [8]:
sales_price_0.shape

(377, 15)

# Ermitteln der Datensätze mit rechnerischer Differenz (Rabattbetrug), die als Fraud gelabelt wurden, da diese anderweitig ermittelt werden könnten

In [9]:
rabatt_betrug = tr_label[(tr_label.has_positive_price_difference == True) & (tr_label.label == "FRAUD")]

In [10]:
rabatt_betrug.shape

(2713, 15)

# Ermitteln der Datensätze ohne die oben ermittelten FRAUD-Fälle (die restlcihen FRAUD-Fälle)

In [11]:
# 1. IDs der auszuschließenden Transaktionen ermitteln
exclude_ids = pd.concat([sales_price_0["transaction_id"], rabatt_betrug["transaction_id"]]).unique()

# 2. Alle Transaktionen aus tr_label, die NICHT in exclude_ids sind
tr_label_filtered = tr_label[~tr_label["transaction_id"].isin(exclude_ids)]


In [12]:
tr_label_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144965 entries, 0 to 148022
Data columns (total 15 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   label                                             144965 non-null  object 
 1   transaction_id                                    144965 non-null  object 
 2   n_lines                                           144965 non-null  int64  
 3   payment_medium                                    144965 non-null  object 
 4   has_feedback                                      144965 non-null  bool   
 5   feedback_categorical                              144965 non-null  object 
 6   hour                                              144965 non-null  int8   
 7   transaction_duration_seconds                      144965 non-null  int64  
 8   has_voided                                        144965 non-null  bool   
 9   has_

# Einfügen der Spalte damage, da diese für die Kostenfunktion benötigt wird

In [13]:

df = tr_label_filtered.merge(
    tr_damage[["transaction_id", "damage"]],
    on="transaction_id",
    how="left"  # falls manche keinen Schaden haben
)


# Ggf. 0,00 € als Schaden hinzufügen

In [14]:
df["damage"] = df["damage"].fillna(0)


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144965 entries, 0 to 144964
Data columns (total 16 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   label                                             144965 non-null  object 
 1   transaction_id                                    144965 non-null  object 
 2   n_lines                                           144965 non-null  int64  
 3   payment_medium                                    144965 non-null  object 
 4   has_feedback                                      144965 non-null  bool   
 5   feedback_categorical                              144965 non-null  object 
 6   hour                                              144965 non-null  int8   
 7   transaction_duration_seconds                      144965 non-null  int64  
 8   has_voided                                        144965 non-null  bool   
 9   has_

In [16]:
df.damage.sum()

11786.330000000002

# Zielvariable vorbereiten

In [17]:
from sklearn.preprocessing import LabelEncoder

df = df.copy()
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])  # FRAUD = 1, NORMAL = 0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144965 entries, 0 to 144964
Data columns (total 17 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   label                                             144965 non-null  object 
 1   transaction_id                                    144965 non-null  object 
 2   n_lines                                           144965 non-null  int64  
 3   payment_medium                                    144965 non-null  object 
 4   has_feedback                                      144965 non-null  bool   
 5   feedback_categorical                              144965 non-null  object 
 6   hour                                              144965 non-null  int8   
 7   transaction_duration_seconds                      144965 non-null  int64  
 8   has_voided                                        144965 non-null  bool   
 9   has_

# Features und Ziel trennen

In [19]:
X = df.drop(columns=["label", "label_encoded", "transaction_id"])  
y = df["label_encoded"]


In [20]:
X = pd.get_dummies(X, columns=["payment_medium"], drop_first=True)
X = pd.get_dummies(X, columns=["feedback_categorical"], drop_first=True)



In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144965 entries, 0 to 144964
Data columns (total 17 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   n_lines                                           144965 non-null  int64  
 1   has_feedback                                      144965 non-null  bool   
 2   hour                                              144965 non-null  int8   
 3   transaction_duration_seconds                      144965 non-null  int64  
 4   has_voided                                        144965 non-null  bool   
 5   has_unscanned                                     144965 non-null  bool   
 6   has_camera_detected_wrong_product                 144965 non-null  bool   
 7   has_camera_detected_wrong_product_high_certainty  144965 non-null  bool   
 8   calculated_price_difference                       144965 non-null  float64
 9   has_

In [22]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Falls notwendig:
# Bool-Spalten in int umwandeln
bool_cols = X.select_dtypes(include=["bool"]).columns
X[bool_cols] = X[bool_cols].astype(int)

# Zielvariable
y = df["label"].map({"NORMAL": 0, "FRAUD": 1})


In [23]:
X_train, X_test, y_train, y_test, damage_train, damage_test = train_test_split(
    X, y, df["damage"], test_size=0.2, stratify=y, random_state=42
)


In [25]:
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    class_weight="balanced",  # gleicht Klassenungleichheit aus
    random_state=42
)
model.fit(X_train, y_train)


In [26]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # für Threshold-Tuning später


In [27]:
print("Konfusionsmatrix:")
print(confusion_matrix(y_test, y_pred))

print("\nKlassifikationsbericht:")
print(classification_report(y_test, y_pred))


Konfusionsmatrix:
[[28674     0]
 [    1   318]]

Klassifikationsbericht:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28674
           1       1.00      1.00      1.00       319

    accuracy                           1.00     28993
   macro avg       1.00      1.00      1.00     28993
weighted avg       1.00      1.00      1.00     28993



In [28]:
def cost_score(y_true, y_pred, damage_values):
    cost = 0
    for yt, yp, dmg in zip(y_true, y_pred, damage_values):
        if yt == 1 and yp == 1:       # True Positive
            cost += 5
        elif yt == 1 and yp == 0:     # False Negative
            cost -= dmg
        elif yt == 0 and yp == 1:     # False Positive
            cost -= 10
        # TN → 0 €
    return cost


In [29]:
total_cost = cost_score(y_test, y_pred, damage_test)
print(f"\nGesamtnutzen laut Bewertungsfunktion: {total_cost:,.2f} €")



Gesamtnutzen laut Bewertungsfunktion: 1,589.97 €
