In [9]:
import pandas as pd
import numpy as np

# Read the cleaned, merged dataset from disk.
df = pd.read_csv('merged_data_clean.csv')

# Parse the date strings so the .dt accessor can be used below.
df['Datum'] = pd.to_datetime(df['Datum'])

# Place the weekday name in the second column, directly after the date.
df.insert(loc=1, column='Weekday', value=df['Datum'].dt.day_name())

# Helper columns used only while the temperature features are being built.
df['Year'] = df['Datum'].dt.year
df['Week'] = df['Datum'].dt.isocalendar().week

# Mean temperature for each ISO calendar week, pooled over every year in the data.
weekly_avg_temp = (
    df.groupby('Week')['Temperatur']
    .mean()
    .rename('AverageTemp')
    .reset_index()
)

# Attach the weekly mean to every row; the left join keeps all original rows.
df = df.merge(weekly_avg_temp, how='left', on='Week')

# Positive values mean the day was warmer than the average for its calendar week.
df['Temp_Deviation'] = df['Temperatur'].sub(df['AverageTemp'])

# Categorize temperature deviation
def categorize_temp_deviation(deviation):
    """Translate a temperature deviation into a descriptive label.

    Parameters
    ----------
    deviation : float
        Observed temperature minus the weekly average temperature
        (the values stored in ``Temp_Deviation``).

    Returns
    -------
    str
        ``"Normal"`` for deviations within +/-0.5 (inclusive),
        ``"Slightly warmer"`` / ``"Slightly colder"`` for magnitudes above
        0.5 and up to 1.5 (inclusive),
        ``"Significantly warmer"`` / ``"Significantly colder"`` beyond 1.5,
        and ``"Unknown"`` when the deviation is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing deviation would fall through to the last branch and be
    # mislabelled as "Significantly colder".
    if pd.isna(deviation):
        return "Unknown"

    # Walk the thresholds from warmest to coldest; each test implies that
    # all warmer categories have already been ruled out.
    if deviation > 1.5:
        return "Significantly warmer"
    if deviation > 0.5:
        return "Slightly warmer"
    if deviation >= -0.5:
        return "Normal"
    if deviation >= -1.5:
        return "Slightly colder"
    return "Significantly colder"

# Turn every numeric deviation into its descriptive label.
df['Temp_vs_Avg'] = df['Temp_Deviation'].map(categorize_temp_deviation)

# The helper columns are no longer needed once the features exist.
df = df.drop(columns=['Year', 'Week'])

# Translate the numeric Warengruppe codes into product names.
# Codes that are not listed here come out of .map() as NaN.
warengruppe_mapping = dict(
    enumerate(
        ['Brot', 'Brötchen', 'Croissant', 'Konditorei', 'Kuchen', 'Saisonbrot'],
        start=1,
    )
)
df['Warengruppe'] = df['Warengruppe'].map(warengruppe_mapping)

# Quick visual check of the enriched frame.
print("First few rows with new features:")
print(df.head(10))

# Persist the enriched frame (without the index) for the following steps.
df.to_csv('merged_data_with_temperature.csv', index=False)

First few rows with new features:
       Datum    Weekday  Bewoelkung  Temperatur  Windgeschwindigkeit  \
0 2012-01-01     Sunday         8.0        9.82                   14   
1 2012-01-02     Monday         7.0        7.44                   12   
2 2012-01-03    Tuesday         8.0        5.54                   18   
3 2012-01-04  Wednesday         4.0        5.69                   19   
4 2012-01-05   Thursday         6.0        5.30                   23   
5 2012-01-06     Friday         3.0        2.62                   10   
6 2012-01-07   Saturday         7.0        6.53                   14   
7 2012-01-08     Sunday         7.0        5.96                   10   
8 2012-01-09     Monday         8.0        5.15                   12   
9 2012-01-10    Tuesday         6.0        6.11                   10   

   Wettercode  id Warengruppe  Umsatz  KielerWoche  AverageTemp  \
0        58.0 NaN         NaN     NaN          NaN     6.504342   
1         NaN NaN         NaN     NaN  

In [10]:
import pandas as pd
import numpy as np

# Read the cleaned, merged dataset from disk.
df = pd.read_csv('merged_data_clean.csv')

# Parse the date strings so the .dt accessor can be used below.
df['Datum'] = pd.to_datetime(df['Datum'])

# Place the weekday name in the second column, directly after the date.
df.insert(loc=1, column='Weekday', value=df['Datum'].dt.day_name())

# Helper columns used only while the temperature features are being built.
df['Year'] = df['Datum'].dt.year
df['Week'] = df['Datum'].dt.isocalendar().week

# Mean temperature for each ISO calendar week, pooled over every year in the data.
weekly_avg_temp = (
    df.groupby('Week')['Temperatur']
    .mean()
    .rename('AverageTemp')
    .reset_index()
)

# Attach the weekly mean to every row; the left join keeps all original rows.
df = df.merge(weekly_avg_temp, how='left', on='Week')

# Positive values mean the day was warmer than the average for its calendar week.
df['Temp_Deviation'] = df['Temperatur'].sub(df['AverageTemp'])

# Categorize temperature deviation
def categorize_temp_deviation(deviation):
    """Translate a temperature deviation into a descriptive label.

    Parameters
    ----------
    deviation : float
        Observed temperature minus the weekly average temperature
        (the values stored in ``Temp_Deviation``).

    Returns
    -------
    str
        ``"Normal"`` for deviations within +/-0.5 (inclusive),
        ``"Slightly warmer"`` / ``"Slightly colder"`` for magnitudes above
        0.5 and up to 1.5 (inclusive),
        ``"Significantly warmer"`` / ``"Significantly colder"`` beyond 1.5,
        and ``"Unknown"`` when the deviation is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing deviation would fall through to the last branch and be
    # mislabelled as "Significantly colder".
    if pd.isna(deviation):
        return "Unknown"

    # Walk the thresholds from warmest to coldest; each test implies that
    # all warmer categories have already been ruled out.
    if deviation > 1.5:
        return "Significantly warmer"
    if deviation > 0.5:
        return "Slightly warmer"
    if deviation >= -0.5:
        return "Normal"
    if deviation >= -1.5:
        return "Slightly colder"
    return "Significantly colder"

# Turn every numeric deviation into its descriptive label.
df['Temp_vs_Avg'] = df['Temp_Deviation'].map(categorize_temp_deviation)

# The helper columns are no longer needed once the features exist.
df = df.drop(columns=['Year', 'Week'])

# Translate the numeric Warengruppe codes into product names.
# Codes that are not listed here come out of .map() as NaN.
warengruppe_mapping = dict(
    enumerate(
        ['Brot', 'Brötchen', 'Croissant', 'Konditorei', 'Kuchen', 'Saisonbrot'],
        start=1,
    )
)
df['Warengruppe'] = df['Warengruppe'].map(warengruppe_mapping)

# Show only the date and the temperature-related columns.
preview_columns = ['Datum', 'Weekday', 'Temperatur', 'AverageTemp', 'Temp_Deviation', 'Temp_vs_Avg']
print("First few rows with new features:")
print(df[preview_columns].head(10))

# How many rows fall into each temperature category.
print("\nDistribution of temperature categories:")
print(df['Temp_vs_Avg'].value_counts())

# Summary statistics of the raw deviation values.
print("\nTemperature deviation statistics:")
print(df['Temp_Deviation'].describe())

First few rows with new features:
       Datum    Weekday  Temperatur  AverageTemp  Temp_Deviation  \
0 2012-01-01     Sunday        9.82     6.504342        3.315658   
1 2012-01-02     Monday        7.44     3.062303        4.377697   
2 2012-01-03    Tuesday        5.54     3.062303        2.477697   
3 2012-01-04  Wednesday        5.69     3.062303        2.627697   
4 2012-01-05   Thursday        5.30     3.062303        2.237697   
5 2012-01-06     Friday        2.62     3.062303       -0.442303   
6 2012-01-07   Saturday        6.53     3.062303        3.467697   
7 2012-01-08     Sunday        5.96     3.062303        2.897697   
8 2012-01-09     Monday        5.15     4.240262        0.909738   
9 2012-01-10    Tuesday        6.11     4.240262        1.869738   

            Temp_vs_Avg  
0  Significantly warmer  
1  Significantly warmer  
2  Significantly warmer  
3  Significantly warmer  
4  Significantly warmer  
5                Normal  
6  Significantly warmer  
7  Signif

In [11]:
import pandas as pd

# 1. + 2. Read the temperature-enriched dataset and replace missing weather
# codes with 0 in a single expression.
# NOTE(review): weather_rating rates code 0 as "very good", so rows with no
# recorded weather code become indistinguishable from genuine code-0 rows —
# confirm this is the intended treatment of missing observations.
df = pd.read_csv("merged_data_with_temperature.csv").assign(
    Wettercode=lambda frame: frame['Wettercode'].fillna(0)
)

# 3. Function to classify weather codes into categories
# Lookup table from integer weather code to impression label. Codes that do
# not appear here are rated "unknown".
# NOTE(review): the notebook output shows code 58 occurring in the data and
# being rated "unknown" — confirm the gaps in this table are intentional.
_WEATHER_CODE_RATINGS = {
    **dict.fromkeys((0,), "very good"),
    **dict.fromkeys((1, 2, 3, 45, 48), "good"),
    **dict.fromkeys((*range(51, 58), 61, 62), "okay"),
    **dict.fromkeys((*range(63, 68), *range(71, 78), 80, 81, 82), "bad"),
    **dict.fromkeys((85, 86, *range(95, 100)), "very bad"),
}


def weather_rating(code):
    """Classify a numeric weather code into a coarse impression label.

    Parameters
    ----------
    code : int, float or str
        Weather code. It is converted with ``int()``, so floats are
        truncated towards zero.

    Returns
    -------
    str
        One of ``"very good"``, ``"good"``, ``"okay"``, ``"bad"``,
        ``"very bad"``; ``"unknown"`` when the value cannot be converted to
        an integer (None, NaN, infinity, non-numeric text) or when the code
        is not part of the lookup table.
    """
    try:
        code = int(code)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "unknown". A bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit and
        # hide unrelated programming errors.
        return "unknown"

    return _WEATHER_CODE_RATINGS.get(code, "unknown")

# 4. Rate every weather code and keep the label in a new column.
df['Weather_Impression'] = df['Wettercode'].map(weather_rating)

# Quick look at the codes next to their assigned labels.
print(df[['Wettercode', 'Weather_Impression']].head(10))

# 5. Write the enriched frame (without the index) for the modelling cells.
df.to_csv("merged_data_with_weather_impression.csv", index=False)
print("New file saved as: merged_data_with_weather_impression.csv")


   Wettercode Weather_Impression
0        58.0            unknown
1         0.0          very good
2        63.0                bad
3        80.0                bad
4        80.0                bad
5         0.0          very good
6        61.0               okay
7        80.0                bad
8        61.0               okay
9         0.0          very good
New file saved as: merged_data_with_weather_impression.csv


In [15]:
import pandas as pd
import statsmodels.api as sm

# Read the dataset that already carries the Weather_Impression label.
df = pd.read_csv("merged_data_with_weather_impression.csv")

# Rows without a sales figure cannot serve as regression targets.
df = df.dropna(subset=['Umsatz'])

# Predictor groups: the categorical weather label and two numeric weather columns.
predictors_weather = ['Weather_Impression']
additional_vars = ['Bewoelkung', 'Windgeschwindigkeit']

# Dummy-encode the weather label; the first level is dropped and acts as the reference.
weather_dummies = pd.get_dummies(df[predictors_weather], dtype=int, drop_first=True)

# Target variable shared by both models.
y = df['Umsatz']

# Model A: Weather_Impression only.
X_A = sm.add_constant(weather_dummies.copy())

# Keep only rows that are complete in both the predictors and the target.
valid_idx_A = X_A.notna().all(axis=1) & y.notna()
X_A = X_A.loc[valid_idx_A]
y_A = y[valid_idx_A]

model_A = sm.OLS(y_A, X_A).fit()

# Model B: Weather_Impression + Bewoelkung + Windgeschwindigkeit.
# The weather dummies and the numeric columns are placed side by side.
X_B = sm.add_constant(pd.concat([weather_dummies, df[additional_vars]], axis=1))

# Keep only rows that are complete in both the predictors and the target.
# NOTE(review): if the numeric columns contain missing values, model B is
# fitted on fewer rows than model A, so the two R² values are not measured
# on the same sample — confirm this is acceptable for the comparison.
valid_idx_B = X_B.notna().all(axis=1) & y.notna()
X_B = X_B.loc[valid_idx_B]
y_B = y[valid_idx_B]

model_B = sm.OLS(y_B, X_B).fit()

# Report the in-sample fit of both models.
print("Model A (only Weather_Impression): R² =", model_A.rsquared)
print("Model B (+ Bewoelkung & Windgeschwindigkeit): R² =", model_B.rsquared)

# Uncomment for the full regression tables:
# print(model_A.summary())
# print(model_B.summary())



Model A (only Weather_Impression): R² = 0.007074879585455829
Model B (+ Bewoelkung & Windgeschwindigkeit): R² = 0.011598883806447247


In [17]:
import pandas as pd
import statsmodels.api as sm

# Read the temperature-enriched dataset and keep only rows that have a
# sales figure (the regression target).
df = pd.read_csv("merged_data_with_temperature.csv").dropna(subset=['Umsatz'])

# Regression target.
y = df['Umsatz']

# Dummy-encode the temperature categories as integers; the first level is
# dropped and acts as the reference category.
temp_dummies = pd.get_dummies(df['Temp_vs_Avg'], drop_first=True, dtype=int)

# Design matrix: intercept plus the category dummies.
X = sm.add_constant(temp_dummies)

# Ordinary least squares fit of sales on the temperature categories.
model = sm.OLS(y, X).fit()

# Report the in-sample fit; uncomment the last line for the full table.
print("Model with temperature deviation categories: R² =", model.rsquared)
#print(model.summary())



Model with temperature deviation categories: R² = 0.0005627975133447594
