In [None]:
##preparation of wether data --> definig for each day if temperature is low, medium or high based on standard deviation
import pandas as pd
import numpy as np
import os

# Set working directory
os.chdir('C:\\Users\\julia\\Documents\\Uni_lokal\\ML')  

# Load weather data
wetter= pd.read_csv('wetter.csv')
wetter['Datum'] = pd.to_datetime(wetter['Datum'])

# Function to categorize temperature based on standard deviation
# extract month and date from the date column and add into one new column
wetter['monat'] = wetter['Datum'].dt.month
wetter['tag'] = wetter['Datum'].dt.day
wetter['kalendertag'] = wetter['Datum'].dt.strftime('%m-%d')  # Format: '01-15' für 15. Januar

# calculate historical stats for each calendar day
historical_stats = wetter.groupby('kalendertag')['Temperatur'].agg([
    ('mittelwert', 'mean'),
    ('std_abweichung', 'std')
]).reset_index()

# Merge historical stats back to the original dataframe
wetter = pd.merge(wetter, historical_stats, on='kalendertag', how='left')

#calculation normalized deviation
wetter['normalized_dev'] = (wetter['Temperatur'] - wetter['mittelwert']) / wetter['std_abweichung']

# Define temperature categories based on normalized deviation with thresholds at -1 and +1 standard deviations
def categorize_temperature(dev):
    if dev < -1:
        return 'low'
    elif dev > 1:
        return 'high'
    else:
        return 'medium' 
wetter['temp_category'] = wetter['normalized_dev'].apply(categorize_temperature)
#wetter = wetter.drop(columns=['mittelwert', 'std_abweichung', 'normalized_dev', 'monat', 'tag', 'kalendertag'])


print(wetter.head())

# Save the processed weather data
wetter.to_csv('wetter_processed.csv', index=False)



       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  monat  \
0 2012-01-01         8.0      9.8250                   14        58.0      1   
1 2012-01-02         7.0      7.4375                   12         NaN      1   
2 2012-01-03         8.0      5.5375                   18        63.0      1   
3 2012-01-04         4.0      5.6875                   19        80.0      1   
4 2012-01-05         6.0      5.3000                   23        80.0      1   

   tag kalendertag  mittelwert  std_abweichung  normalized_dev temp_category  
0    1       01-01    5.908929        1.968869        1.988996          high  
1    2       01-02    4.667857        2.954021        0.937584        medium  
2    3       01-03    3.564286        4.489864        0.439482        medium  
3    4       01-04    3.776786        4.404173        0.433842        medium  
4    5       01-05    3.291071        4.597229        0.436987        medium  


In [9]:
# 5. Weitere nützliche Features: rolling window
wetter['temp_delta_vortag'] = wetter['Temperatur'].diff()
wetter['temp_rolling_7d'] = wetter['Temperatur'].rolling(window=7, min_periods=1).mean()

print(wetter.head())



       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  monat  \
0 2012-01-01         8.0      9.8250                   14        58.0      1   
1 2012-01-02         7.0      7.4375                   12         NaN      1   
2 2012-01-03         8.0      5.5375                   18        63.0      1   
3 2012-01-04         4.0      5.6875                   19        80.0      1   
4 2012-01-05         6.0      5.3000                   23        80.0      1   

   tag kalendertag  mittelwert  std_abweichung  normalized_dev temp_category  \
0    1       01-01    5.908929        1.968869        1.988996          high   
1    2       01-02    4.667857        2.954021        0.937584        medium   
2    3       01-03    3.564286        4.489864        0.439482        medium   
3    4       01-04    3.776786        4.404173        0.433842        medium   
4    5       01-05    3.291071        4.597229        0.436987        medium   

   temp_delta_vortag  temp_rolling_7d 

In [21]:
# aggrigate wettercode to coarser classes

def aggregate_weather_dwd(ww_code):
    if pd.isna(ww_code):
        return '-999'
    if ww_code in [0, 1, 2, 3]:  
        return '1' # good weather
    elif ww_code in [50, 51, 53, 55, 56, 57, 58, 61, 63]:
        return '3' #moderate rain
    elif ww_code in [65, 66, 67, 77, 80, 81, 82, 95, 96, 97]:
        return '4' # bad weather
    elif ww_code in [70, 71, 73, 75, 77, 85, 86]:
        return '5' # snow
    else:
        return '2' # cloudy
wetter['wettercode_agg'] = wetter['Wettercode'].apply(aggregate_weather_dwd)
print(wetter.head())




       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0 2012-01-01         8.0      9.8250                   14        58.0   
1 2012-01-02         7.0      7.4375                   12         NaN   
2 2012-01-03         8.0      5.5375                   18        63.0   
3 2012-01-04         4.0      5.6875                   19        80.0   
4 2012-01-05         6.0      5.3000                   23        80.0   

  temp_category  temp_delta_vortag  temp_rolling_7d wettercode_agg  
0          high                NaN         9.825000              3  
1        medium            -2.3875         8.631250           -999  
2        medium            -1.9000         7.600000              3  
3        medium             0.1500         7.121875              4  
4        medium            -0.3875         6.757500              4  


In [None]:
# delete unnecessary columns
wetter = wetter.drop(columns=['mittelwert', 'std_abweichung', 'normalized_dev', 'monat', 'tag', 'kalendertag'])


# save as csv
wetter.to_csv('wetter_edited.csv', index=False, sep =';')

In [None]:
# join to other merged data

#load data
total_data = pd.read_csv('total_weekdays.csv', sep=';')
total_data['Datum'] = pd.to_datetime(total_data['Datum'])
feiertage = pd.read_csv('Feiertage_holidays_sh_2013_2019.csv', sep=',')
feiertage['Datum'] = pd.to_datetime(feiertage['Datum'])


total_data = total_data.merge(wetter[['Datum','temp_category', 'temp_rolling_7d', 'wettercode_agg']], on="Datum", how="left")
total_data = total_data.merge(feiertage[['Datum','is_holiday']], on="Datum", how="left")

print(total_data.head())
total_data.to_csv('total_weekdays_wetter.csv', index=False, sep=';')



   Unnamed: 0      Datum      Umsatz  Bewoelkung  Temperatur  \
0           0 2013-07-01  148.828353         6.0     17.8375   
1           1 2013-07-01  535.856285         6.0     17.8375   
2           2 2013-07-01  201.198426         6.0     17.8375   
3           3 2013-07-01   65.890169         6.0     17.8375   
4           4 2013-07-01  317.475875         6.0     17.8375   

   Windgeschwindigkeit  Wettercode  KielerWoche  Warengruppe         id  \
0                 15.0        20.0       -999.0          1.0  1307011.0   
1                 15.0        20.0       -999.0          2.0  1307012.0   
2                 15.0        20.0       -999.0          3.0  1307013.0   
3                 15.0        20.0       -999.0          4.0  1307014.0   
4                 15.0        20.0       -999.0          5.0  1307015.0   

   Wochentag temp_category  temp_rolling_7d wettercode_agg  is_holiday  
0          0        medium        14.469643              2           0  
1          0      

In [41]:
## split data into train, validation and test sets for linear regression
# splitting 
train_split = total_data[total_data["Datum"] <= "2017-07-31"]
validation_split = total_data[(total_data["Datum"] > "2017-07-31")
                          & (total_data["Datum"] <= "2018-07-31")]
test_split = total_data[(total_data["Datum"] > "2018-07-31")
                  & (total_data["Datum"] <= "2019-07-31")]

import statsmodels.formula.api as smf

#omit -999 clumnes for the model

train_clean = train_split.replace(-999, pd.NA)



In [42]:
#simple linear regression model with Wochentag as predictor
model = smf.ols(formula='Umsatz ~ wettercode_agg + Wochentag + temp_category + is_holiday', data=train_clean).fit()
print(model.summary())

# use model to predict Umsatz for test set
test_clean = test_split.replace(-999, pd.NA)
umsatz_pred_test = model.predict(test_clean)
print(umsatz_pred_test)

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     26.27
Date:                Thu, 11 Dec 2025   Prob (F-statistic):           4.12e-45
Time:                        11:15:53   Log-Likelihood:                -47909.
No. Observations:                7487   AIC:                         9.584e+04
Df Residuals:                    7477   BIC:                         9.591e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 