In [46]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

# Load the pre-split training and test tables.
train = pd.read_csv("analysis/train_split_merged_data_updated.csv")
test = pd.read_csv("analysis/test_split_merged_data_updated.csv")

# Restrict both splits to product group (warengruppe) 4. The .copy() gives each
# split its own frame, so the column assignments below do not write into a
# slice of the originally loaded table.
train = train.loc[train["warengruppe"] == 4].copy()
test = test.loc[test["warengruppe"] == 4].copy()

# Derive the same calendar features on both splits in one pass, so the two
# frames cannot drift apart.
for df in (train, test):
    # Parse the date column; the .dt accessor below needs a datetime dtype.
    df["date"] = pd.to_datetime(df["date"])

    # Weekday name, month number and day of the year.
    df["Wochentag"] = df["date"].dt.day_name()
    df["Month"] = df["date"].dt.month
    df["dayofyear"] = df["date"].dt.dayofyear

    # Position in the year as a sine/cosine pair with a fixed 365-day period
    # (day 366 of a leap year therefore lands just past one full cycle).
    df["sin_season"] = np.sin(2 * np.pi * df["dayofyear"] / 365)
    df["cos_season"] = np.cos(2 * np.pi * df["dayofyear"] / 365)

    # Kieler Woche flag: missing values become 0, then cast to int.
    df["KielerWoche"] = df["KielerWoche"].fillna(0).astype(int)

train.head()


Unnamed: 0,date,warengruppe,id,umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,school_holiday,public_holiday,Wochentag,Month,dayofyear,sin_season,cos_season
3,2013-07-01,4,1307014,65.890169,6.0,17.8375,15.0,20.0,0,1,0,Monday,7,182,0.008607,-0.999963
8,2013-07-02,4,1307024,74.543917,3.0,17.3125,10.0,,0,1,0,Tuesday,7,183,-0.008607,-0.999963
13,2013-07-03,4,1307034,69.262728,7.0,21.075,6.0,61.0,0,1,0,Wednesday,7,184,-0.025818,-0.999667
18,2013-07-04,4,1307044,61.490175,7.0,18.85,7.0,20.0,0,1,0,Thursday,7,185,-0.043022,-0.999074
23,2013-07-05,4,1307054,86.759861,5.0,19.975,12.0,,0,1,0,Friday,7,186,-0.060213,-0.998186


In [47]:
# --- HOLIDAY FLAGS ---
# The preceding cell already parsed `date` and derived Wochentag, Month,
# dayofyear, sin_season / cos_season and an integer KielerWoche on both
# `train` and `test`. Recomputing them here produced identical columns, so
# this cell now does only the work that is actually new: cleaning the two
# holiday indicators (missing values become 0, then cast to int).
for df in [train, test]:
    for flag in ("school_holiday", "public_holiday"):
        df[flag] = df[flag].fillna(0).astype(int)




In [48]:
# Model inputs and target variable.
# Wochentag is the only categorical column; all remaining features are
# treated as numeric.
categorical_features = ["Wochentag"]
numeric_features = [
    "Temperatur",
    "KielerWoche",
    "school_holiday",
    "public_holiday",
    "Month",
    "sin_season",
    "cos_season",
]

# Column order of the design frames (the preprocessor selects columns by name).
features = [
    "Temperatur",
    "KielerWoche",
    "school_holiday",
    "public_holiday",
    "Wochentag",
    "Month",
    "sin_season",
    "cos_season",
]
target = "umsatz"

X, y = train[features], train[target]
X_test = test[features]

# Preprocessing.
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps transform from failing on a category
# that was not present during fit.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: mean imputation only; values are passed through unscaled.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

# Output columns follow the order of this list: "cat" first, then "num".
preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_pipeline, categorical_features),
    ("num", numeric_pipeline, numeric_features),
])


In [49]:
# Linear regression model: preprocessing and ordinary least squares chained
# into a single estimator.
model = Pipeline([
    ("prep", preprocessor),
    ("reg", LinearRegression()),
])

# Fit on the training frame and predict the very same rows back.
y_train_pred = model.fit(X, y).predict(X)

# In-sample R²: measured on the rows used for fitting, so it says nothing
# about performance on unseen data.
r2_train = r2_score(y_true=y, y_pred=y_train_pred)
print(f"R² für Trainingsdaten: {r2_train:.4f}")


R² für Trainingsdaten: 0.3843


In [50]:
# Predict the test rows and store the result next to their ids.
test_predictions = model.predict(X_test)
test["umsatz_Prediction"] = test_predictions

print(test[["date", "id", "umsatz_Prediction"]].head())

# Save the submission as CSV: only id and prediction, without the index.
submission = test.loc[:, ["id", "umsatz_Prediction"]].copy()
submission.to_csv("submission_linear_regression_byGulfem.csv", index=False)
submission.head()


         date       id  umsatz_Prediction
3  2018-08-01  1808014          72.789293
8  2018-08-02  1808024          72.506666
13 2018-08-03  1808034          75.859727
18 2018-08-04  1808044          82.541483
23 2018-08-05  1808054         134.833713


Unnamed: 0,id,umsatz_Prediction
3,1808014,72.789293
8,1808024,72.506666
13,1808034,75.859727
18,1808044,82.541483
23,1808054,134.833713


In [51]:
# Extract the fitted coefficients together with readable feature names.
prep_step = model.named_steps["prep"]
reg_step = model.named_steps["reg"]

# Feature names after preprocessing, rebuilt in the order the preprocessor
# emits its columns: one-hot weekday columns ("cat") first, then the numeric
# columns ("num") unchanged.
ohe = prep_step.named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(categorical_features)
num_feature_names = np.array(numeric_features)
all_features = np.concatenate([cat_feature_names, num_feature_names])

# Coefficients and intercept from the regressor.
coefs = reg_step.coef_
intercept = reg_step.intercept_

coef_df = pd.DataFrame({"Feature": all_features, "Coefficient": coefs})

print("Intercept:", intercept)
print("\nKoeffizienten (sortiert):")
display(coef_df.sort_values(by="Coefficient", ascending=False))


Intercept: 99.44976257036387

Koeffizienten (sortiert):


Unnamed: 0,Feature,Coefficient
10,public_holiday,59.690177
3,Wochentag_Sunday,50.867783
13,cos_season,8.514998
9,school_holiday,4.035333
7,Temperatur,0.169146
8,KielerWoche,-0.540761
2,Wochentag_Saturday,-1.934676
11,Month,-2.29036
12,sin_season,-4.034864
1,Wochentag_Monday,-7.602357


In [52]:
# === RUN ANOVA FOR ALL PRODUCT GROUPS ===
# For every product group: fit the full linear model on that group's training
# rows, then refit with one feature group left out at a time. The drop in
# in-sample R² (ΔR²) is reported as that feature group's contribution.

product_groups = [1, 2, 3, 4, 5, 6]
target = "umsatz"

# --- FULL FEATURE SET (updated with holidays) ---
full_features = [
    "Temperatur",
    "KielerWoche",
    "school_holiday",
    "public_holiday",
    "Wochentag",
    "Month",
    "sin_season",
    "cos_season",
]

# --- FEATURE GROUPS (updated with holidays) ---
# Each entry lists the columns removed together when that group is dropped.
feature_groups = {
    "Weekday": ["Wochentag"],
    "Temperature": ["Temperatur"],
    "KielerWoche": ["KielerWoche"],
    "School Holiday": ["school_holiday"],
    "Public Holiday": ["public_holiday"],
    "Month": ["Month"],
    "Seasonality": ["sin_season", "cos_season"],
}


def compute_r2_local(features, data=None, target_col=None):
    """Fit a linear regression on `features` and return its in-sample R².

    Parameters
    ----------
    features : list of str
        Predictor column names. "Wochentag" is imputed with the most frequent
        value and one-hot encoded; every other column is mean-imputed and
        used as a numeric predictor.
    data : pandas.DataFrame, optional
        Frame holding the predictors and the target. When omitted, the
        notebook-level `df` is used so one-argument calls keep working.
    target_col : str, optional
        Name of the target column. When omitted, the notebook-level `target`
        is used.

    Returns
    -------
    float
        R² of the fitted model, evaluated on the same rows it was fitted on.
    """
    # Fall back to the notebook globals only when the caller did not pass
    # the data explicitly.
    if data is None:
        data = df
    if target_col is None:
        target_col = target

    X = data[features]
    y = data[target_col]

    categorical_features = [f for f in features if f == "Wochentag"]
    numeric_features = [f for f in features if f != "Wochentag"]

    # Only add a branch when it has columns; a reduced feature set may have
    # no categorical column at all.
    transformers = []
    if categorical_features:
        transformers.append(
            ("cat", Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore"))
            ]), categorical_features)
        )
    if numeric_features:
        transformers.append(
            ("num", Pipeline([
                ("imputer", SimpleImputer(strategy="mean"))
            ]), numeric_features)
        )

    model_tmp = Pipeline([
        ("prep", ColumnTransformer(transformers=transformers)),
        ("reg", LinearRegression())
    ])

    model_tmp.fit(X, y)
    return r2_score(y, model_tmp.predict(X))


# Load the full training dataset once
train_all = pd.read_csv("analysis/train_split_merged_data_updated.csv")

for wg in product_groups:
    print("\n======================")
    print(f"   Warengruppe {wg}")
    print("======================")

    # Filter for this product group
    df = train_all[train_all["warengruppe"] == wg].copy()

    if df.empty:
        print("No data for this product group.")
        continue

    # --- FEATURE ENGINEERING ---
    df["date"] = pd.to_datetime(df["date"])
    df["Wochentag"] = df["date"].dt.day_name()
    df["Month"] = df["date"].dt.month
    df["dayofyear"] = df["date"].dt.dayofyear
    df["sin_season"] = np.sin(2 * np.pi * df["dayofyear"] / 365)
    df["cos_season"] = np.cos(2 * np.pi * df["dayofyear"] / 365)

    # Clean integer features: missing values become 0, then cast to int
    df["KielerWoche"]    = df["KielerWoche"].fillna(0).astype(int)
    df["school_holiday"] = df["school_holiday"].fillna(0).astype(int)
    df["public_holiday"] = df["public_holiday"].fillna(0).astype(int)

    # Full model R²
    r2_full = compute_r2_local(full_features, df, target)
    print(f"Full model R²: {r2_full:.3f}")

    # Refit once per feature group with that group's columns removed.
    results = []
    for name, group in feature_groups.items():
        reduced_features = [f for f in full_features if f not in group]
        r2_reduced = compute_r2_local(reduced_features, df, target)
        delta_r2 = r2_full - r2_reduced
        results.append([name, delta_r2, r2_reduced])

    importance_df = pd.DataFrame(results, columns=["Feature Group", "ΔR²", "Reduced Model R²"])
    importance_df = importance_df.sort_values("ΔR²", ascending=False)

    print("ANOVA-like variance partitioning:")
    display(importance_df)



   Warengruppe 1
Full model R²: 0.438
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.294034,0.144303
3,School Holiday,0.048521,0.389815
4,Public Holiday,0.028416,0.40992
6,Seasonality,0.012468,0.425869
5,Month,0.001683,0.436653
2,KielerWoche,0.00048,0.437856
1,Temperature,0.000131,0.438205



   Warengruppe 2
Full model R²: 0.713
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.26841,0.444234
3,School Holiday,0.102827,0.609817
6,Seasonality,0.033636,0.679007
4,Public Holiday,0.020258,0.692386
5,Month,0.004247,0.708397
2,KielerWoche,0.004215,0.708428
1,Temperature,0.00351,0.709134



   Warengruppe 3
Full model R²: 0.686
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.132262,0.5538
3,School Holiday,0.101953,0.584108
6,Seasonality,0.068875,0.617186
5,Month,0.012093,0.673968
4,Public Holiday,0.009234,0.676828
1,Temperature,0.00198,0.684082
2,KielerWoche,0.000741,0.68532



   Warengruppe 4
Full model R²: 0.384
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.322933,0.061378
4,Public Holiday,0.029749,0.354561
5,Month,0.015941,0.368369
6,Seasonality,0.011158,0.373152
3,School Holiday,0.001686,0.382624
1,Temperature,0.000223,0.384088
2,KielerWoche,5e-06,0.384306



   Warengruppe 5
Full model R²: 0.129
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.054745,0.074611
3,School Holiday,0.040984,0.088372
6,Seasonality,0.01039,0.118966
4,Public Holiday,0.004812,0.124544
1,Temperature,0.001916,0.12744
2,KielerWoche,0.000425,0.128931
5,Month,0.000394,0.128962



   Warengruppe 6
Full model R²: 0.351
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
6,Seasonality,0.1917859,0.159182
0,Weekday,0.02679419,0.324174
5,Month,0.003624518,0.347343
1,Temperature,0.003596543,0.347371
3,School Holiday,0.0001364255,0.350831
2,KielerWoche,2.220446e-16,0.350968
4,Public Holiday,2.220446e-16,0.350968
