In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as smf



In [2]:
features = pd.read_csv("../../0_DataPreparation/features.csv")
features

Unnamed: 0,id,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,CPI,...,Weather_Other,Weather_Rain,Weather_Showers_and_Thunderstorms,Weather_Snow_and_Ice,Group_brot,Group_brotchen,Group_croissant,Group_konditorei,Group_kuchen,Group_saisonbrot
0,1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,93.5,...,0,0,0,0,1,0,0,0,0,0
1,1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,93.5,...,1,0,0,0,1,0,0,0,0,0
2,1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,93.5,...,0,1,0,0,1,0,0,0,0,0
3,1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,93.5,...,0,0,0,0,1,0,0,0,0,0
4,1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,93.5,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11159,1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,98.5,...,1,0,0,0,0,0,0,0,0,1
11160,1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,98.5,...,0,1,0,0,0,0,0,0,0,1
11161,1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,98.5,...,0,0,0,0,0,0,0,0,0,1
11162,1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,98.5,...,0,0,0,0,0,0,0,0,0,1


## baseline model

In [3]:

train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'


train_data = features[features['Datum']<=train_end_date].set_index("id")
vali_data = features[(features['Datum']> train_end_date) & (features['Datum']<=validation_end_date)].set_index("id")
test_data = features[(features['Datum']> validation_end_date)].set_index("id")


In [4]:
# Set 'id' as index
df = train_data

# Drop rows with missing values for simplicity in this baseline modeling
df_clean = df.dropna()

groups = {
    "Zeit" : "Wochentag + Monat + Wochenende",
    "Wetter" : "Temperatureclass_enc + Cloudclass_enc + Windclass_enc ",
    "Wetter_Group" : "Weather_Other + Weather_Rain + Weather_After_Rain + Weather_Showers_and_Thunderstorms + Weather_Snow_and_Ice + Weather_Fog",
    "Product" : "Warengruppe + Group_brot + Group_brotchen + Group_croissant + Group_konditorei + Group_kuchen + Group_saisonbrot",
    "Ökonomie": "CPI + Unemployment + GDP",
    "Holiday" : "Schulferien"
}

# Define a list of formulas for different baseline models
formulas = {
    "Zeit": f"Umsatz ~ {groups['Zeit']}",
    "Wetter": f"Umsatz ~ {groups['Wetter']}",
    "Product": f"Umsatz ~ {groups['Product']}",
    "Ökonomie": f"Umsatz ~ {groups['Ökonomie']}",
    "Zeit+Wetter": f"Umsatz ~ {groups['Zeit']} + {groups['Wetter']}",
    "Zeit+Product": f"Umsatz ~ {groups['Zeit']} + {groups['Product']}",
    "Zeit+Ökonomie": f"Umsatz ~ {groups['Zeit']} + {groups['Ökonomie']}",
    "Zeit+Holiday": f"Umsatz ~ {groups['Zeit']} + {groups['Holiday']}",
    "Zeit+Wetter+Product": f"Umsatz ~ {groups['Zeit']} + {groups['Wetter']} + {groups['Product']}",
    "Vollmodell": f"Umsatz ~ {groups['Zeit']} + {groups['Wetter']} + {groups['Product']} + {groups['Ökonomie']} + {groups['Holiday']}",
    "Vollmodell+": f"Umsatz ~ {groups['Zeit']} + {groups['Wetter']} + {groups['Wetter_Group']} + {groups['Product']} + {groups['Ökonomie']} + {groups['Holiday']}"
}

# Fit models and collect performance metrics
results = []
for name, formula in formulas.items():
    model = smf.ols(formula, data=df_clean).fit()
    results.append({
        "Modell": name,
        "R²": round(model.rsquared, 4),
        "AIC": round(model.aic, 2),
        "BIC": round(model.bic, 2),
        "Anzahl Parameter": int(model.df_model + 1)
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results).sort_values(by="R²", ascending=False)
print(results_df)

                 Modell      R²       AIC       BIC  Anzahl Parameter
10          Vollmodell+  0.7194  62142.34  62280.82                21
9            Vollmodell  0.7192  62137.33  62242.83                16
8   Zeit+Wetter+Product  0.6936  62599.34  62678.46                12
5          Zeit+Product  0.6640  63091.62  63150.97                 9
2               Product  0.6305  63603.67  63656.42                 8
4           Zeit+Wetter  0.0705  68581.33  68627.49                 7
7          Zeit+Holiday  0.0624  68624.24  68657.21                 5
6         Zeit+Ökonomie  0.0572  68658.48  68704.64                 7
1                Wetter  0.0436  68729.67  68756.04                 4
0                  Zeit  0.0304  68803.33  68829.70                 4
3              Ökonomie  0.0268  68823.69  68850.07                 4


In [5]:
model = smf.ols(formulas['Vollmodell+'], data=train_data.dropna()).fit()
model.summary()

0,1,2,3
Dep. Variable:,Umsatz,R-squared:,0.719
Model:,OLS,Adj. R-squared:,0.718
Method:,Least Squares,F-statistic:,689.5
Date:,"Tue, 17 Jun 2025",Prob (F-statistic):,0.0
Time:,16:27:10,Log-Likelihood:,-31050.0
No. Observations:,5399,AIC:,62140.0
Df Residuals:,5378,BIC:,62280.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-606.8412,147.554,-4.113,0.000,-896.107,-317.575
Wochentag,1.4730,0.851,1.731,0.083,-0.195,3.141
Monat,1.3425,0.456,2.944,0.003,0.449,2.236
Wochenende,49.7730,3.766,13.216,0.000,42.390,57.156
Temperatureclass_enc,25.4082,1.986,12.796,0.000,21.515,29.301
Cloudclass_enc,2.9769,2.377,1.252,0.210,-1.683,7.637
Windclass_enc,0.5607,1.126,0.498,0.619,-1.647,2.769
Weather_Other,-105.6750,24.734,-4.273,0.000,-154.163,-57.187
Weather_Rain,-105.4655,24.616,-4.284,0.000,-153.723,-57.208

0,1,2,3
Omnibus:,6161.743,Durbin-Watson:,1.158
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2068743.864
Skew:,5.423,Prob(JB):,0.0
Kurtosis:,98.281,Cond. No.,1.78e+18


In [6]:
pred = model.predict(test_data).clip(lower=0)
pred_df = pred.rename("Umsatz").reset_index()
pred_df.to_csv("prognose_kaggle_baseline.csv", index=False)

pred_df

Unnamed: 0,id,Umsatz
0,1808011,174.991319
1,1808021,201.872462
2,1808031,203.345448
3,1808041,254.591405
4,1808051,227.888795
...,...,...
1825,1812226,118.447012
1826,1812236,145.537631
1827,1812246,63.326599
1828,1812276,93.153713


In [8]:
# Separating features and labels
training_features = train_data.drop('Umsatz', axis=1)
validation_features = vali_data.drop('Umsatz', axis=1)
test_features = test_data.drop('Umsatz', axis=1)

training_labels = train_data[['Umsatz']]
validation_labels = vali_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam

# model = Sequential([
#   InputLayer(shape=(training_features.shape[1], )),
#   BatchNormalization(),
#   Dense(10, activation='relu'),
#   Dense(4, activation='relu'),
#   Dense(1, activation='relu')
# ])

# model.summary()


: 

: 