In [32]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

# 1. Load the train/test splits and keep only product group 4.
#    `.copy()` gives each subset its own DataFrame so the column
#    assignments below do not write into a slice of the full frame.
train = pd.read_csv("analysis/train_split_merged_data_updated.csv")
train = train.loc[train["warengruppe"] == 4].copy()

test = pd.read_csv("analysis/test_split_merged_data_updated.csv")
test = test.loc[test["warengruppe"] == 4].copy()

# 2. Derive the same calendar features on both splits, in place.
for df in (train, test):
    # Parse the date strings so the `.dt` accessor can be used
    df["date"] = pd.to_datetime(df["date"])

    # Weekday name, month number and day-of-year
    df["Wochentag"] = df["date"].dt.day_name()
    df["Month"] = df["date"].dt.month
    df["dayofyear"] = df["date"].dt.dayofyear

    # Seasonal sine/cosine pair: day-of-year mapped onto a 365-day cycle
    season_angle = 2 * np.pi * df["dayofyear"] / 365
    df["sin_season"] = np.sin(season_angle)
    df["cos_season"] = np.cos(season_angle)

    # Kieler Woche: a missing flag is treated as 0, then cast to int
    df["KielerWoche"] = df["KielerWoche"].fillna(0).astype(int)

train.head()


Unnamed: 0,date,warengruppe,id,umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Wochentag,Month,dayofyear,sin_season,cos_season
3,2013-07-01,4,1307014,65.890169,6.0,17.8375,15.0,20.0,0,Monday,7,182,0.008607,-0.999963
8,2013-07-02,4,1307024,74.543917,3.0,17.3125,10.0,,0,Tuesday,7,183,-0.008607,-0.999963
13,2013-07-03,4,1307034,69.262728,7.0,21.075,6.0,61.0,0,Wednesday,7,184,-0.025818,-0.999667
18,2013-07-04,4,1307044,61.490175,7.0,18.85,7.0,20.0,0,Thursday,7,185,-0.043022,-0.999074
23,2013-07-05,4,1307054,86.759861,5.0,19.975,12.0,,0,Friday,7,186,-0.060213,-0.998186


In [33]:
# --- DATE FEATURES ---
# NOTE(review): these are the same columns the loading cell above already
# derives; re-running them is harmless because every assignment simply
# overwrites the column with the same values.


def _add_date_features(frame):
    """Add calendar-derived columns to ``frame`` in place.

    Columns written: ``date`` (parsed to datetime), ``Wochentag`` (weekday
    name), ``Month`` (1-12), ``dayofyear`` (1-366), ``sin_season`` /
    ``cos_season`` (day-of-year on a 365-day cycle) and ``KielerWoche``
    (NaN replaced by 0, cast to int).
    """
    frame["date"] = pd.to_datetime(frame["date"])

    dates = frame["date"]
    frame["Wochentag"] = dates.dt.day_name()
    frame["Month"] = dates.dt.month
    frame["dayofyear"] = dates.dt.dayofyear

    frame["sin_season"] = np.sin(2 * np.pi * frame["dayofyear"] / 365)
    frame["cos_season"] = np.cos(2 * np.pi * frame["dayofyear"] / 365)

    frame["KielerWoche"] = frame["KielerWoche"].fillna(0).astype(int)


for df in [train, test]:
    _add_date_features(df)


In [34]:
# 5. Features and target variable
target = "umsatz"

# Columns grouped by how the preprocessor treats them
categorical_features = ["Wochentag"]
numeric_features = ["Temperatur", "KielerWoche", "Month", "sin_season", "cos_season"]

# Full list of model inputs
features = [
    "Temperatur",
    "KielerWoche",
    "Wochentag",
    "Month",
    "sin_season",
    "cos_season",
]

X = train[features]
y = train[target]
X_test = test[features]

# 6. Preprocessing:
#    - Wochentag: most-frequent imputation, then one-hot encoding
#      (categories unseen during fit are ignored instead of raising)
#    - all numeric columns (Temperatur, KielerWoche, Month, sin_season,
#      cos_season): mean imputation only, no scaling
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_pipeline, categorical_features),
        ("num", numeric_pipeline, numeric_features),
    ]
)


In [35]:
# 7. Linear regression chained behind the preprocessing step.
#    `fit` returns the pipeline itself, so `model` is the fitted pipeline.
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", LinearRegression()),
    ]
).fit(X, y)

# R² on the training data (in-sample fit, not a hold-out score)
y_train_pred = model.predict(X)
r2_train = r2_score(y, y_train_pred)
print(f"R² für Trainingsdaten: {r2_train:.4f}")


R² für Trainingsdaten: 0.3529


In [36]:
# 8. Predict on the test set and store the result next to the inputs
prediction_col = "umsatz_Prediction"
test[prediction_col] = model.predict(X_test)

preview_cols = ["date", "id", prediction_col]
print(test[preview_cols].head())

# 9. Save the submission (id + prediction) as CSV, without the index column
submission = test.loc[:, ["id", prediction_col]].copy()
submission.to_csv("submission_linear_regression_byGulfem.csv", index=False)
submission.head()


         date       id  umsatz_Prediction
3  2018-08-01  1808014          70.387085
8  2018-08-02  1808024          71.497570
13 2018-08-03  1808034          73.614325
18 2018-08-04  1808044          80.258261
23 2018-08-05  1808054         132.540429


Unnamed: 0,id,umsatz_Prediction
3,1808014,70.387085
8,1808024,71.49757
13,1808034,73.614325
18,1808044,80.258261
23,1808054,132.540429


In [37]:
# 10. Extract the fitted coefficients

fitted_prep = model.named_steps["prep"]
fitted_reg = model.named_steps["reg"]

# a) Feature names after preprocessing.
#    The order must mirror the ColumnTransformer: one-hot weekday columns
#    ("cat") first, then the numeric columns ("num"), which are only imputed
#    and therefore keep their original names.
ohe = fitted_prep.named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(categorical_features)
num_feature_names = np.array(numeric_features)
all_features = np.concatenate([cat_feature_names, num_feature_names])

# b) Coefficients and intercept of the regressor
coefs = fitted_reg.coef_
intercept = fitted_reg.intercept_

coef_df = pd.DataFrame({"Feature": all_features, "Coefficient": coefs})

print("Intercept:", intercept)
print("\nKoeffizienten (sortiert):")
display(coef_df.sort_values(by="Coefficient", ascending=False))


Intercept: 100.45269342127146

Koeffizienten (sortiert):


Unnamed: 0,Feature,Coefficient
3,Wochentag_Sunday,50.270864
11,cos_season,7.143013
7,Temperatur,0.139442
9,Month,-2.149384
2,Wochentag_Saturday,-2.428183
8,KielerWoche,-3.167509
10,sin_season,-3.505696
1,Wochentag_Monday,-5.530898
0,Wochentag_Friday,-9.307187
5,Wochentag_Tuesday,-10.141788


In [38]:
# === RUN "ANOVA-LIKE" ΔR² ANALYSIS FOR ALL PRODUCT GROUPS ===
#
# For every product group a linear regression is fitted on the full feature
# set and then refitted with one feature group left out at a time. The drop in
# in-sample R² (ΔR² = R²_full - R²_reduced) is reported per feature group.

product_groups = [1, 2, 3, 4, 5, 6]

# Loop-invariant configuration, defined once instead of on every iteration
target = "umsatz"

full_features = [
    "Temperatur",
    "KielerWoche",
    "Wochentag",
    "Month",
    "sin_season",
    "cos_season",
]

feature_groups = {
    "Weekday": ["Wochentag"],
    "Temperature": ["Temperatur"],
    "KielerWoche": ["KielerWoche"],
    "Month": ["Month"],
    "Seasonality": ["sin_season", "cos_season"],
}


def compute_r2_local(features, data=None, target_col=None):
    """Fit a linear regression on ``features`` and return its in-sample R².

    Parameters
    ----------
    features : list of str
        Column names used as model inputs. ``"Wochentag"`` is treated as
        categorical (most-frequent imputation + one-hot encoding); every
        other column is treated as numeric (mean imputation).
    data : pandas.DataFrame, optional
        Frame holding the feature and target columns. When omitted, the
        notebook-level ``df`` is used, which keeps the earlier one-argument
        call form working.
    target_col : str, optional
        Name of the target column. When omitted, the notebook-level
        ``target`` is used.

    Returns
    -------
    float
        R² of the fitted model evaluated on the same rows it was fitted on.
    """
    if data is None:
        data = df
    if target_col is None:
        target_col = target

    X_local = data[features]
    y_local = data[target_col]

    cat_cols = [f for f in features if f == "Wochentag"]
    num_cols = [f for f in features if f != "Wochentag"]

    # Only register a transformer when it has columns to work on, so reduced
    # feature sets (e.g. without the weekday) still build a valid preprocessor.
    transformers = []
    if cat_cols:
        transformers.append(
            ("cat", Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore"))
            ]), cat_cols)
        )
    if num_cols:
        transformers.append(
            ("num", Pipeline([
                ("imputer", SimpleImputer(strategy="mean"))
            ]), num_cols)
        )

    model_tmp = Pipeline(steps=[
        ("prep", ColumnTransformer(transformers=transformers)),
        ("reg", LinearRegression())
    ])

    model_tmp.fit(X_local, y_local)
    return r2_score(y_local, model_tmp.predict(X_local))


# Load the full training dataset once
train_all = pd.read_csv("analysis/train_split_merged_data_updated.csv")

for wg in product_groups:
    print("\n======================")
    print(f"   Warengruppe {wg}")
    print("======================")

    # Filter for this product group
    df = train_all[train_all["warengruppe"] == wg].copy()

    if df.empty:
        print("No data for this product group.")
        continue

    # --- FEATURE ENGINEERING ---
    df["date"] = pd.to_datetime(df["date"])
    df["Wochentag"] = df["date"].dt.day_name()
    df["Month"] = df["date"].dt.month
    df["dayofyear"] = df["date"].dt.dayofyear
    df["sin_season"] = np.sin(2 * np.pi * df["dayofyear"] / 365)
    df["cos_season"] = np.cos(2 * np.pi * df["dayofyear"] / 365)
    df["KielerWoche"] = df["KielerWoche"].fillna(0).astype(int)

    # Full model R²
    r2_full = compute_r2_local(full_features, data=df, target_col=target)
    print(f"Full model R²: {r2_full:.3f}")

    # Refit with each feature group removed and record the loss in R²
    results = []
    for name, group in feature_groups.items():
        reduced_features = [f for f in full_features if f not in group]
        r2_reduced = compute_r2_local(reduced_features, data=df, target_col=target)
        delta_r2 = r2_full - r2_reduced
        results.append([name, delta_r2, r2_reduced])

    importance_df = pd.DataFrame(results, columns=["Feature Group", "ΔR²", "Reduced Model R²"])
    importance_df = importance_df.sort_values("ΔR²", ascending=False)

    print("ANOVA-like variance partitioning:")
    display(importance_df)



   Warengruppe 1
Full model R²: 0.361
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.295867,0.065468
4,Seasonality,0.010808,0.350528
3,Month,0.001764,0.359571
1,Temperature,0.000162,0.361173
2,KielerWoche,4e-06,0.361331



   Warengruppe 2
Full model R²: 0.590
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.242652,0.346985
4,Seasonality,0.040786,0.548852
1,Temperature,0.006879,0.582759
3,Month,0.002252,0.587385
2,KielerWoche,0.000552,0.589086



   Warengruppe 3
Full model R²: 0.575
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.115961,0.458966
4,Seasonality,0.078038,0.496889
3,Month,0.008901,0.566027
1,Temperature,0.004819,0.570108
2,KielerWoche,0.000164,0.574764



   Warengruppe 4
Full model R²: 0.353
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.317036,0.035824
3,Month,0.014066,0.338793
4,Seasonality,0.008088,0.344771
2,KielerWoche,0.000161,0.352698
1,Temperature,0.000152,0.352707



   Warengruppe 5
Full model R²: 0.084
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
0,Weekday,0.046915,0.036669
4,Seasonality,0.012846,0.070739
1,Temperature,0.000814,0.082771
3,Month,9e-05,0.083494
2,KielerWoche,2.5e-05,0.083559



   Warengruppe 6
Full model R²: 0.351
ANOVA-like variance partitioning:


Unnamed: 0,Feature Group,ΔR²,Reduced Model R²
4,Seasonality,0.201496,0.149335
0,Weekday,0.026896,0.323935
3,Month,0.004043,0.346789
1,Temperature,0.003657,0.347174
2,KielerWoche,0.0,0.350831
