In [1]:
import pandas as pd
import numpy as np

# ------------------------------------
# IQR-based outlier flagging for 'Umsatz', computed separately per 'Warengruppe'.
#
# Pipeline: load the CSV -> add a 0/1 'is_outlier' column -> save a new CSV.

# Groups that never receive an outlier flag (their rows keep is_outlier == 0).
EXCLUDED_GROUPS = ('Saisonbrot',)
# Fence width of the IQR rule: values outside [q1 - k*IQR, q3 + k*IQR] are flagged.
IQR_MULTIPLIER = 1.5


def _iqr_bounds(values, k=IQR_MULTIPLIER):
    """Return the ``(lower, upper)`` IQR fences of *values*.

    NaNs are ignored. Returns ``None`` when no non-NaN value is left, so the
    caller can skip the group instead of calling ``np.percentile`` on an
    empty array.
    """
    clean = values.dropna()
    if clean.empty:
        return None
    q1, q3 = np.percentile(clean.to_numpy(), [25, 75])
    iqr = q3 - q1
    return float(q1 - k * iqr), float(q3 + k * iqr)


def flag_iqr_outliers(df, group_col='Warengruppe', value_col='Umsatz',
                      excluded_groups=EXCLUDED_GROUPS, k=IQR_MULTIPLIER):
    """Flag rows whose value lies outside the IQR fences of their own group.

    Args:
        df: DataFrame containing ``group_col`` and ``value_col``.
        group_col: Column whose values define the groups.
        value_col: Numeric column that is tested against the fences.
        excluded_groups: Iterable of group labels that are never flagged.
        k: Multiplier applied to the IQR when building the fences.

    Returns:
        An ``int64`` Series aligned with ``df.index``: 1 for outliers, else 0.
        Rows with a missing group, a missing value, an excluded group, or a
        group without any non-NaN value are always 0.
    """
    excluded = set(excluded_groups)

    # One pass over the groups to collect the fences; groupby drops NaN keys.
    lower_by_group = {}
    upper_by_group = {}
    for group, values in df.groupby(group_col, sort=False)[value_col]:
        if group in excluded:
            continue
        bounds = _iqr_bounds(values, k)
        if bounds is None:
            continue
        lower_by_group[group], upper_by_group[group] = bounds

    # Broadcast the fences back to the rows. Groups without fences map to NaN,
    # and every comparison against NaN is False, so those rows stay unflagged.
    row_lower = df[group_col].map(pd.Series(lower_by_group, dtype='float64'))
    row_upper = df[group_col].map(pd.Series(upper_by_group, dtype='float64'))
    outside = (df[value_col] < row_lower) | (df[value_col] > row_upper)
    return outside.astype('int64')


# The guard is true both when run as a script and inside a notebook cell, so
# the names below are still defined at top level; it only keeps the helpers
# importable without touching the file system.
if __name__ == "__main__":
    # ------------------------------------
    # 1. Load the data ('Datum' is parsed as a date column)
    file_path = '0_DataPreparation/initialdata/cleaned_data_imputed.csv'  # adjust to your path
    df = pd.read_csv(file_path, parse_dates=['Datum'])

    # ------------------------------------
    # 2. Add the outlier flag
    # Names of the groups that are checked; not used below, kept defined
    # because the original script exposed it at top level.
    product_groups = [
        p for p in df['Warengruppe'].dropna().unique() if p not in EXCLUDED_GROUPS
    ]
    df['is_outlier'] = flag_iqr_outliers(df)

    # ------------------------------------
    # 3. Save to a new CSV (written to the current working directory)
    output_file = 'cleaned_data_imputed_with_outliers.csv'
    df.to_csv(output_file, index=False)
    print(f"✅ Done. Saved dataset with 'is_outlier' column to: {output_file}")


✅ Done. Saved dataset with 'is_outlier' column to: cleaned_data_imputed_with_outliers.csv
