In [221]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.feature_selection import chi2

In [223]:
# Lade die bereinigten Parquet-Dateien
df_damage = pd.read_parquet("transformed_damage_first.parquet")
df_label = pd.read_parquet("transformed_label_first.parquet")

In [225]:
df_damage.head(10)

Unnamed: 0,damage,cash_desk,total_amount,n_lines,payment_medium,has_feedback,feedback_low,feedback_middle,feedback_high,feedback_top,...,has_limited_time_offers,has_fruits_vegetables_pieces,has_beverages,has_long_shelf_life,has_missing,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end,days_since_sco_introduction
0,0.0,0,56.45,2,CREDIT_CARD,False,False,False,False,False,...,False,True,False,False,False,9.205984,9.205984,2.601115,6.62837,479
1,0.0,3,149.73,25,CREDIT_CARD,False,False,False,False,False,...,False,True,True,True,False,5.579174,13.259153,1.794106,10.571143,479
2,0.0,0,38.33,3,CREDIT_CARD,False,False,False,False,False,...,False,False,True,False,False,8.046765,14.779962,1.183578,4.660375,479
3,0.0,0,57.54,7,CREDIT_CARD,False,False,False,False,False,...,False,True,True,False,False,6.438,10.540794,7.088195,8.478462,479
4,0.0,2,50.9,8,CREDIT_CARD,False,False,False,False,False,...,False,True,True,False,False,6.887883,12.634943,1.037482,9.131956,479
5,0.0,0,267.51,25,CREDIT_CARD,True,False,False,False,True,...,False,True,True,False,False,7.342497,16.359716,7.11674,17.909933,479
6,0.0,0,155.52,22,CREDIT_CARD,False,False,False,False,False,...,False,True,True,True,False,6.425951,14.351451,7.948612,12.095884,479
7,0.0,2,35.31,4,CREDIT_CARD,False,False,False,False,False,...,False,True,True,False,False,6.078169,10.406037,7.029285,14.999726,479
8,0.0,0,63.14,3,CREDIT_CARD,False,False,False,False,False,...,False,False,True,False,False,3.544712,5.00205,4.077807,10.656555,479
9,0.0,1,115.25,12,CREDIT_CARD,True,False,False,False,True,...,False,True,True,False,False,6.893364,15.111831,1.674608,16.217764,479


In [227]:
# Downsampling: z. B. 500 FRAUD und 500 NO_FRAUD
n_fraud = df_label[df_label["label"] == "FRAUD"].shape[0]
n_nofraud = df_label[df_label["label"] == "NORMAL"].shape[0]
n_available = min(n_fraud, n_nofraud)#, 500)  # maximal 500, aber nicht mehr als vorhanden
df_fraud = df_label[df_label["label"] == "FRAUD"].sample(n=n_available, random_state=42)
df_nofraud = df_label[df_label["label"] == "NORMAL"].sample(n=n_available, random_state=42)
df_label = pd.concat([df_fraud, df_nofraud]).sample(frac=1, random_state=42).reset_index(drop=True)

In [229]:
n_fraud

4656

In [231]:
# Gemeinsame Länge bestimmen
#min_len = min(len(df_damage), len(df_label))

In [233]:
# Trimmen auf gleiche Länge
#df_damage = df_damage.iloc[:min_len].reset_index(drop=True)
#df_label = df_label.iloc[:min_len].reset_index(drop=True)

In [235]:
# Dann gemeinsame Permutation
np.random.seed(42)
#permutation = np.random.permutation(min_len)
perm_damage = np.random.RandomState(seed=42).permutation(len(df_damage))
perm_label = np.random.RandomState(seed=42).permutation(len(df_label))
df_damage = df_damage.iloc[perm_damage].reset_index(drop=True)
df_label = df_label.iloc[perm_label].reset_index(drop=True)

In [237]:
len(df_label)

9312

In [239]:
# Aufteilen in Trainings- und Testdaten
df_damage_train, df_damage_test = train_test_split(df_damage, test_size=0.2, random_state=42)
df_label_train, df_label_test = train_test_split(df_label, test_size=0.2, random_state=42)

In [246]:
# Analysefunktion pro Feature
import warnings
from statsmodels.tools.sm_exceptions import PerfectSeparationError, PerfectSeparationWarning

# Unterdrücke Warnungen dieser Art
warnings.simplefilter("ignore", PerfectSeparationWarning)

def analyze_feature_general(df_train, df_test, feature, target_name):
    result = {"feature": feature}
    significance_label = "nicht signifikant"

    try:
        y_train = df_train[target_name]
        y_test = df_test[target_name]

        # Für Klassifikation: binäre Codierung
        if target_name == "label":
            y_train = y_train.map({"NORMAL": 0, "FRAUD": 1})
            y_test = y_test.map({"NORMAL": 0, "FRAUD": 1})

        # Feature vorbereiten
        X_train = df_train[[feature]].copy()
        X_test = df_test[[feature]].copy()

        is_categorical = (
            X_train[feature].dtype in ["object", "bool", "category"]
            or df_train[feature].nunique() <= 10
        )

        if is_categorical:
            X_train = pd.get_dummies(X_train, drop_first=True)
            X_test = pd.get_dummies(X_test, drop_first=True)
            X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        X_train = X_train.astype("float64")
        X_test = X_test.astype("float64")

        valid_train = np.isfinite(X_train).all(axis=1) & np.isfinite(y_train)
        valid_test = np.isfinite(X_test).all(axis=1) & np.isfinite(y_test)
        X_train = X_train[valid_train]
        y_train = y_train[valid_train]
        X_test = X_test[valid_test]
        y_test = y_test[valid_test]

        X_train_const = sm.add_constant(X_train)
        X_test_const = sm.add_constant(X_test)

        if target_name == "label":
            # Chi²-Test nur für kategoriale Merkmale mit einer Spalte
            if is_categorical and X_train.shape[1] == 1:
                try:
                    chi2_score_val, chi2_p_val = chi2(X_train, y_train)
                    result["chi2_score"] = float(chi2_score_val[0])
                    result["chi2_p_value"] = float(chi2_p_val[0])
                    if chi2_p_val[0] < 0.01:
                        significance_label = "sehr signifikant"
                    elif chi2_p_val[0] < 0.05:
                        significance_label = "weniger signifikant"
                except Exception as e:
                    result["chi2_score"] = np.nan
                    result["chi2_p_value"] = np.nan
                    result["error"] = f"Chi²-Fehler: {str(e)}"
            else:
                result["chi2_score"] = np.nan
                result["chi2_p_value"] = np.nan

            try:
                model = sm.Logit(y_train, X_train_const).fit(disp=False)
                y_test_pred = model.predict(X_test_const)
                pseudo_r2 = 1 - model.llf / model.llnull
                if not np.isfinite(pseudo_r2) or pseudo_r2 > 1e6:
                    pseudo_r2 = np.nan
                    result["model_type"] = "logit_unstable"

                result.update({
                    "pseudo_r2_test": pseudo_r2,
                    "accuracy_test": np.mean((y_test_pred > 0.5) == y_test),
                    "p_value_train": model.pvalues.iloc[1] if len(model.pvalues) > 1 else np.nan,
                    "n_obs_train": len(y_train),
                    "n_obs_test": len(y_test),
                    "model_type": result.get("model_type", "logit")
                })

                # Signifikanz bei kontinuierlichen Merkmalen aus Regression
                if not is_categorical and len(model.pvalues) > 1:
                    p_val = model.pvalues.iloc[1]
                    if p_val < 0.01:
                        significance_label = "sehr signifikant"
                    elif p_val < 0.05:
                        significance_label = "weniger signifikant"

            except PerfectSeparationError:
                result.update({
                    "pseudo_r2_test": np.nan,
                    "accuracy_test": np.nan,
                    "p_value_train": np.nan,
                    "n_obs_train": len(y_train),
                    "n_obs_test": len(y_test),
                    "model_type": "logit_perfect",
                    "error": "Perfect separation – Modell nicht identifizierbar"
                })

        else:
            # Regressionsmodell (z. B. damage)
            model = sm.OLS(y_train, X_train_const).fit()
            y_train_pred = model.predict(X_train_const)
            y_test_pred = model.predict(X_test_const)

            correlation = np.nan
            if X_test.shape[1] == 1:
                correlation = np.sqrt(abs(r2_score(y_test, y_test_pred))) * np.sign(np.corrcoef(X_test.iloc[:, 0], y_test)[0, 1])

            het_pval = sm.stats.diagnostic.het_breuschpagan(model.resid, X_train_const)[1] if X_train_const.shape[1] > 1 else np.nan

            result.update({
                "r2_train": r2_score(y_train, y_train_pred),
                "r2_test": r2_score(y_test, y_test_pred),
                "correlation_test": correlation,
                "p_value_train": model.pvalues.iloc[1] if len(model.pvalues) > 1 else np.nan,
                "heteroskedasticity_pval_train": het_pval,
                "normality_pval_train": stats.normaltest(model.resid)[1] if len(model.resid) >= 8 else np.nan,
                "n_obs_train": len(y_train),
                "n_obs_test": len(y_test),
                "model_type": "ols",
                "chi2_score": np.nan,
                "chi2_p_value": np.nan
            })

            # Signifikanz
            if len(model.pvalues) > 1:
                p_val = model.pvalues.iloc[1]
                if p_val < 0.01:
                    significance_label = "sehr signifikant"
                elif p_val < 0.05:
                    significance_label = "weniger signifikant"

        result["significance"] = significance_label

    except Exception as e:
        result.update({
            "r2_train": np.nan,
            "r2_test": np.nan,
            "correlation_test": np.nan,
            "p_value_train": np.nan,
            "heteroskedasticity_pval_train": np.nan,
            "normality_pval_train": np.nan,
            "accuracy_test": np.nan,
            "pseudo_r2_test": np.nan,
            "n_obs_train": 0,
            "n_obs_test": 0,
            "model_type": "error",
            "chi2_score": np.nan,
            "chi2_p_value": np.nan,
            "significance": "nicht signifikant",
            "error": str(e)
        })

    # Relevanzbewertung nach Modellgüte
    relevance_label = "nicht relevant"

    if target_name == "label":
        if result.get("pseudo_r2_test", 0) >= 0.2:
            relevance_label = "sehr relevant"
        elif result.get("pseudo_r2_test", 0) >= 0.05:
            relevance_label = "weniger relevant"
    else:
        if result.get("r2_test", 0) >= 0.1:
            relevance_label = "sehr relevant"
        elif result.get("r2_test", 0) >= 0.02:
            relevance_label = "weniger relevant"

    result["relevance"] = relevance_label

    return result



In [248]:
# Hauptschleife für beide Targets
# Hauptschleife für beide Targets
for target_name, df_train, df_test in [
    ("damage", df_damage_train, df_damage_test),
    ("label", df_label_train, df_label_test)
]:
    result_rows = []
    for feature in df_train.columns:
        if feature == target_name:
            continue
        result = analyze_feature_general(df_train, df_test, feature, target_name)
        result_rows.append(result)

    # Ergebnis abspeichern
    df_results = pd.DataFrame(result_rows)
    df_results.to_excel(f"feature_analysis_{target_name}.xlsx", index=False)

  c /= stddev[:, None]
  c /= stddev[None, :]
