In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

# =====================================================
# 1. Load train / test
# =====================================================

# Read both splits from the analysis folder.
train = pd.read_csv("analysis/train_split_merged_data_updated.csv")
test = pd.read_csv("analysis/test_split_merged_data_updated.csv")

# Independent copy of the test ids, kept aside for the submission.
test_ids = test["id"].copy()

# Tag every row with the split it came from, then stack both splits into a
# single frame so that time-based features (lags, rolling statistics) can be
# derived over one continuous timeline per warengruppe.
for _frame, _label in ((train, "train"), (test, "test")):
    _frame["dataset"] = _label

df_all = pd.concat([train, test], ignore_index=True)

# =====================================================
# 2. Basic date & calendar features
# =====================================================

# Parse the date column once; every calendar feature below is derived from it.
df_all["date"] = pd.to_datetime(df_all["date"])
date_parts = df_all["date"].dt

df_all["Wochentag"] = date_parts.day_name()
df_all["Month"] = date_parts.month
df_all["dayofyear"] = date_parts.dayofyear

# Day-of-year counts up to the end of the year and then jumps back to 1, so as
# a raw number Dec 31 and Jan 1 look far apart even though they are adjacent.
# Mapping the day onto a circle (sine/cosine of the yearly angle) gives a
# linear model a smooth, cyclical encoding of the season, from which it can
# learn yearly patterns such as high sales in summer and low sales in winter.
year_angle = 2 * np.pi * df_all["dayofyear"] / 365
df_all["sin_season"] = np.sin(year_angle)
df_all["cos_season"] = np.cos(year_angle)

# Weekend indicator: pandas numbers weekdays Monday=0 ... Sunday=6, so
# Saturday and Sunday are the values 5 and 6.
df_all["is_weekend"] = (date_parts.dayofweek >= 5).astype(int)

# Calendar flags: a missing value means "not flagged"; store them as integers.
calendar_flags = ("KielerWoche", "school_holiday", "public_holiday")
for flag in calendar_flags:
    filled = df_all[flag].fillna(0)
    df_all[flag] = filled.astype(int)

# =====================================================
# 3. Lag features & rolling statistics per warengruppe
# =====================================================

# Order rows chronologically within each product group so that the positional
# operations below (shift, rolling) walk forward in time.
df_all = df_all.sort_values(["warengruppe", "date"])

# Group the target once; every lag/rolling feature is derived per warengruppe.
sales_by_group = df_all.groupby("warengruppe")["umsatz"]

# NOTE(review): rows of the test split presumably carry no umsatz (NaN), so
# lag/rolling values inside the test period will be mostly NaN -- confirm that
# this is intended before relying on these features for prediction.

# Lags (memory of the past) of the target: daily and weekly patterns.
# Bakery demand is often autocorrelated: if croissant sales were high
# yesterday, they are likely to be high today too. With lags the model also
# sees the "recent demand level".
for lag in [1, 2, 7, 14]:
    df_all[f"lag_{lag}"] = sales_by_group.shift(lag)

# Rolling mean & std of past sales: smooth the demand and capture the
# "recent level" and the "volatility".
#   * rolling mean: "on average, how much has this product group sold
#     recently?" -- smooths out day-to-day noise.
#   * rolling std:  "how stable has demand been recently?" -- captures how
#     noisy the recent sales are.
# shift(1) keeps the current day's value out of its own window (no leakage).
# Both the shift AND the rolling window are evaluated inside each group:
# groupby(...).shift(1) returns an ordinary, ungrouped Series, so chaining
# .rolling() onto it would let the window run across group boundaries and mix
# the last sales of one warengruppe into the first rows of the next.
for window in [7, 14, 30]:
    df_all[f"roll{window}_mean"] = sales_by_group.transform(
        lambda sales, window=window: sales.shift(1).rolling(window).mean()
    )
    df_all[f"roll{window}_std"] = sales_by_group.transform(
        lambda sales, window=window: sales.shift(1).rolling(window).std()
    )

# =====================================================
# 4. One-hot encode weekday (on full df), then split back
# =====================================================

# Expand the weekday name into indicator columns on the combined frame, so
# both splits end up with the same dummy columns. drop_first=True omits the
# first weekday level.
df_all = pd.get_dummies(df_all, columns=["Wochentag"], drop_first=True)

# Undo the concatenation using the "dataset" marker column; take copies so the
# two feature frames are independent of df_all.
dataset_label = df_all["dataset"]
train_fe = df_all.loc[dataset_label == "train"].copy()
test_fe = df_all.loc[dataset_label == "test"].copy()

# =====================================================
# 5. Define feature set
# =====================================================

# Weekday indicator columns produced by the one-hot encoding step.
weekday_cols = [name for name in train_fe.columns if name.startswith("Wochentag_")]

# Weather, calendar and seasonality inputs.
base_features = [
    "Temperatur",
    "KielerWoche",
    "school_holiday",
    "public_holiday",
    "Month",
    "sin_season",
    "cos_season",
    "is_weekend",
]

# Lagged sales, followed by the rolling mean/std pair for each window.
lag_features = [f"lag_{days}" for days in (1, 2, 7, 14)]
rolling_features = [
    f"roll{days}_{stat}" for days in (7, 14, 30) for stat in ("mean", "std")
]

# Final model inputs, in a fixed order: base, lags, rolling stats, weekdays.
feature_cols = base_features + lag_features + rolling_features + weekday_cols

# Column the models are trained to predict.
target_col = "umsatz"

# =====================================================
# 6. Train one LinearRegression model per product group
# =====================================================

def _make_regressor(columns):
    """Return an unfitted pipeline: mean-impute *columns*, then fit OLS.

    All model inputs are treated as numeric, so the only preprocessing is
    replacing missing values with the column mean.
    """
    impute_missing = Pipeline([("imputer", SimpleImputer(strategy="mean"))])
    prep = ColumnTransformer(transformers=[("num", impute_missing, columns)])
    return Pipeline(steps=[("prep", prep), ("reg", LinearRegression())])


product_groups = sorted(train_fe["warengruppe"].unique())
models_by_wg = {}  # fitted pipeline per product group, e.g. to inspect coefficients
pred_list = []     # one prediction frame per product group

for wg in product_groups:
    print("\n==============================")
    print(f" Training model for wg = {wg}")
    print("==============================")

    # Restrict both splits to the current product group.
    train_wg = train_fe.loc[train_fe["warengruppe"] == wg].copy()
    test_wg = test_fe.loc[test_fe["warengruppe"] == wg].copy()

    X_train, y_train = train_wg[feature_cols], train_wg[target_col]
    X_test = test_wg[feature_cols]

    # A fresh pipeline per group: each group gets its own imputation means
    # and its own regression coefficients.
    model = _make_regressor(feature_cols).fit(X_train, y_train)
    models_by_wg[wg] = model

    # In-sample fit quality (R² on the rows the model was trained on).
    train_r2 = r2_score(y_train, model.predict(X_train))
    print(f"Train R² for wg {wg}: {train_r2:.4f}")

    # Predictions for this group's test rows, keyed by their submission id.
    group_predictions = pd.DataFrame(
        {
            "id": test_wg["id"].values,
            "umsatz_Prediction": model.predict(X_test),
        }
    )
    pred_list.append(group_predictions)

# =====================================================
# 7. Build submission file
# =====================================================

# Stack the per-group prediction frames and order the rows by id.
submission = pd.concat(pred_list, ignore_index=True).sort_values("id")

# Write the submission without the DataFrame index column.
submission.to_csv("submission_linear_regression_byGulfem_2.csv", index=False)

# Last expression of the cell: shows the first rows of the submission.
submission.head()



 Training model for wg = 1
Train R² for wg 1: 0.5143

 Training model for wg = 2
Train R² for wg 2: 0.8545

 Training model for wg = 3
Train R² for wg 3: 0.8449

 Training model for wg = 4
Train R² for wg 4: 0.5228

 Training model for wg = 5
Train R² for wg 5: 0.2480

 Training model for wg = 6
Train R² for wg 6: 0.4082


Unnamed: 0,id,umsatz_Prediction
0,1808011,148.362227
355,1808012,554.938936
710,1808013,287.473863
1065,1808014,85.512216
1419,1808015,271.070706
