In [1]:
import pandas as pd
import numpy as np

# Read the cleaned, merged dataset
df = pd.read_csv('merged_data_clean.csv')

# Parse 'Datum' so the .dt accessor can be used below
df['Datum'] = pd.to_datetime(df['Datum'])

# Weekday name goes into the second column, directly beside the date
df.insert(1, 'Weekday', df['Datum'].dt.day_name())

# Helper columns; both are dropped again once the features are built
df['Year'] = df['Datum'].dt.year
df['Week'] = df['Datum'].dt.isocalendar().week

# Mean temperature for each ISO calendar week, pooled over every year in the data
weekly_avg_temp = (
    df.groupby('Week')['Temperatur']
    .mean()
    .rename('AverageTemp')
    .reset_index()
)

# Attach the weekly mean to each row (left join keeps every original row)
df = pd.merge(df, weekly_avg_temp, how='left', on='Week')

# Observed temperature minus the weekly mean
df['Temp_Deviation'] = df['Temperatur'].sub(df['AverageTemp'])

# Categorize temperature deviation
def categorize_temp_deviation(deviation):
    """Turn a temperature deviation into a descriptive label.

    Parameters
    ----------
    deviation : float
        Observed temperature minus the reference average, in the same unit
        as the temperatures it was computed from.

    Returns
    -------
    str
        "Unknown"               if the deviation is missing (NaN/None/NA)
        "Significantly warmer"  if deviation > 1.5
        "Slightly warmer"       if 0.5 < deviation <= 1.5
        "Normal"                if -0.5 <= deviation <= 0.5
        "Slightly colder"       if -1.5 <= deviation < -0.5
        "Significantly colder"  if deviation < -1.5
    """
    # NaN compares False against everything, so without this guard a missing
    # temperature would fall through every branch and be labelled
    # "Significantly colder".
    if pd.isna(deviation):
        return "Unknown"

    # Walk the thresholds from warmest to coldest; each test only needs the
    # lower bound because the upper bound was excluded by the test before it.
    if deviation > 1.5:
        return "Significantly warmer"
    if deviation > 0.5:
        return "Slightly warmer"
    if deviation >= -0.5:
        return "Normal"
    if deviation >= -1.5:
        return "Slightly colder"
    return "Significantly colder"

# Label every row according to its deviation from the weekly mean
df['Temp_vs_Avg'] = df['Temp_Deviation'].apply(categorize_temp_deviation)

# The helper columns are no longer needed
df = df.drop(columns=['Year', 'Week'])

# Numeric Warengruppe code -> product name (codes start at 1)
warengruppe_mapping = dict(
    enumerate(
        ['Brot', 'Brötchen', 'Croissant', 'Konditorei', 'Kuchen', 'Saisonbrot'],
        start=1,
    )
)
df['Warengruppe'] = df['Warengruppe'].map(warengruppe_mapping)

# Preview of the enriched frame
print("First few rows with new features:")
print(df.head(10))

# Persist the enriched frame for the following steps
df.to_csv('merged_data_with_temperature.csv', index=False)

First few rows with new features:
       Datum    Weekday  Bewoelkung  Temperatur  Windgeschwindigkeit  \
0 2012-01-01     Sunday         8.0        9.82                   14   
1 2012-01-02     Monday         7.0        7.44                   12   
2 2012-01-03    Tuesday         8.0        5.54                   18   
3 2012-01-04  Wednesday         4.0        5.69                   19   
4 2012-01-05   Thursday         6.0        5.30                   23   
5 2012-01-06     Friday         3.0        2.62                   10   
6 2012-01-07   Saturday         7.0        6.53                   14   
7 2012-01-08     Sunday         7.0        5.96                   10   
8 2012-01-09     Monday         8.0        5.15                   12   
9 2012-01-10    Tuesday         6.0        6.11                   10   

   Wettercode  id Warengruppe  Umsatz  KielerWoche  AverageTemp  \
0        58.0 NaN         NaN     NaN          NaN     6.504342   
1         NaN NaN         NaN     NaN  

In [10]:
import pandas as pd
import numpy as np

# Read the cleaned, merged dataset
df = pd.read_csv('merged_data_clean.csv')

# Parse 'Datum' so the .dt accessor can be used below
df['Datum'] = pd.to_datetime(df['Datum'])

# Weekday name goes into the second column, directly beside the date
df.insert(1, 'Weekday', df['Datum'].dt.day_name())

# Helper columns; both are dropped again once the features are built
df['Year'] = df['Datum'].dt.year
df['Week'] = df['Datum'].dt.isocalendar().week

# Mean temperature for each ISO calendar week, pooled over every year in the data
weekly_avg_temp = (
    df.groupby('Week')['Temperatur']
    .mean()
    .rename('AverageTemp')
    .reset_index()
)

# Attach the weekly mean to each row (left join keeps every original row)
df = pd.merge(df, weekly_avg_temp, how='left', on='Week')

# Observed temperature minus the weekly mean
df['Temp_Deviation'] = df['Temperatur'].sub(df['AverageTemp'])

# Categorize temperature deviation
def categorize_temp_deviation(deviation):
    """Turn a temperature deviation into a descriptive label.

    Parameters
    ----------
    deviation : float
        Observed temperature minus the reference average, in the same unit
        as the temperatures it was computed from.

    Returns
    -------
    str
        "Unknown"               if the deviation is missing (NaN/None/NA)
        "Significantly warmer"  if deviation > 1.5
        "Slightly warmer"       if 0.5 < deviation <= 1.5
        "Normal"                if -0.5 <= deviation <= 0.5
        "Slightly colder"       if -1.5 <= deviation < -0.5
        "Significantly colder"  if deviation < -1.5
    """
    # NaN compares False against everything, so without this guard a missing
    # temperature would fall through every branch and be labelled
    # "Significantly colder".
    if pd.isna(deviation):
        return "Unknown"

    # Walk the thresholds from warmest to coldest; each test only needs the
    # lower bound because the upper bound was excluded by the test before it.
    if deviation > 1.5:
        return "Significantly warmer"
    if deviation > 0.5:
        return "Slightly warmer"
    if deviation >= -0.5:
        return "Normal"
    if deviation >= -1.5:
        return "Slightly colder"
    return "Significantly colder"

# Label every row according to its deviation from the weekly mean
df['Temp_vs_Avg'] = df['Temp_Deviation'].apply(categorize_temp_deviation)

# The helper columns are no longer needed
df = df.drop(columns=['Year', 'Week'])

# Numeric Warengruppe code -> product name (codes start at 1)
warengruppe_mapping = dict(
    enumerate(
        ['Brot', 'Brötchen', 'Croissant', 'Konditorei', 'Kuchen', 'Saisonbrot'],
        start=1,
    )
)
df['Warengruppe'] = df['Warengruppe'].map(warengruppe_mapping)

# Preview restricted to the date and temperature-related columns
preview_columns = [
    'Datum', 'Weekday', 'Temperatur', 'AverageTemp', 'Temp_Deviation', 'Temp_vs_Avg',
]
print("First few rows with new features:")
print(df[preview_columns].head(10))

# How often each category label occurs
print("\nDistribution of temperature categories:")
print(df['Temp_vs_Avg'].value_counts())

# Summary statistics of the raw deviation
print("\nTemperature deviation statistics:")
print(df['Temp_Deviation'].describe())

First few rows with new features:
       Datum    Weekday  Temperatur  AverageTemp  Temp_Deviation  \
0 2012-01-01     Sunday        9.82     6.504342        3.315658   
1 2012-01-02     Monday        7.44     3.062303        4.377697   
2 2012-01-03    Tuesday        5.54     3.062303        2.477697   
3 2012-01-04  Wednesday        5.69     3.062303        2.627697   
4 2012-01-05   Thursday        5.30     3.062303        2.237697   
5 2012-01-06     Friday        2.62     3.062303       -0.442303   
6 2012-01-07   Saturday        6.53     3.062303        3.467697   
7 2012-01-08     Sunday        5.96     3.062303        2.897697   
8 2012-01-09     Monday        5.15     4.240262        0.909738   
9 2012-01-10    Tuesday        6.11     4.240262        1.869738   

            Temp_vs_Avg  
0  Significantly warmer  
1  Significantly warmer  
2  Significantly warmer  
3  Significantly warmer  
4  Significantly warmer  
5                Normal  
6  Significantly warmer  
7  Signif

In [10]:
import pandas as pd

# 1. Read the temperature-enriched dataset
df = pd.read_csv("merged_data_with_temperature.csv")

# 2. Rows without a weather code are given code 0
df = df.assign(Wettercode=df['Wettercode'].fillna(0))

# 3. Classify weather codes into categories
# Rating groups in ascending priority: when a code is listed in more than one
# group, the group that appears LATER in this tuple wins. "very bad" is last
# on purpose: codes 29, 30-39, 85 and 86 are listed explicitly as "very bad"
# but are also covered by the broad "good" (28-49) and "bad" (83-88) ranges,
# which previously shadowed them and made those "very bad" entries unreachable.
_WEATHER_RATING_GROUPS = (
    ("very good", [0, 1, 2]),
    ("good", [3, 4, 5, 6, 7, 8, 10, 11, 12] + list(range(28, 50))),
    ("okay", [20, 21, 24, 25] + list(range(50, 60)) + list(range(60, 66)) + [91, 92]),
    ("bad", [13, 22, 23, 26, 27] + list(range(68, 80)) + list(range(83, 89)) + [93, 94]),
    ("very bad", [9, 17, 18, 19, 29] + list(range(30, 40)) + [80, 81, 82, 85, 86] + list(range(95, 100))),
)

# Flat code -> rating lookup, built once instead of rebuilding lists per call.
_WEATHER_RATING_BY_CODE = {
    code: label for label, codes in _WEATHER_RATING_GROUPS for code in codes
}


def weather_rating_revised(code):
    """Map a numeric weather code to a coarse impression label.

    Parameters
    ----------
    code : int, float or str
        Weather code; anything `int()` accepts. Floats are truncated.
        NOTE(review): the values look like WMO present-weather codes (0-99);
        confirm against the data source.

    Returns
    -------
    str
        One of "very good", "good", "okay", "bad", "very bad", or "unknown".
        "unknown" is returned for values that cannot be converted to an
        integer (None, NaN, infinity, non-numeric text) and for codes that
        belong to no group (e.g. 14-16, 66, 67, 89, 90).
    """
    try:
        code = int(code)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: NaN or non-numeric
        # text; OverflowError: +/- infinity. Other exceptions (for example
        # KeyboardInterrupt) are deliberately not swallowed.
        return "unknown"

    return _WEATHER_RATING_BY_CODE.get(code, "unknown")

# 4. Derive the impression label for every row from its weather code
df = df.assign(Weather_Impression=df['Wettercode'].apply(weather_rating_revised))


# 5. Write the result and report the file name
out_csv = "merged_data_temperature+weather_impression.csv"
df.to_csv(out_csv, index=False)
print(f"New file saved as: {out_csv}")




New file saved as: merged_data_temperature+weather_impression.csv


Adding German holidays to the CSV

In [13]:
import pandas as pd

# 1. Load main data
df_main = pd.read_csv("merged_data_temperature+weather_impression.csv")

# 2. Load holidays data
df_holidays = pd.read_csv("german_holidays.csv")

# 3. Convert 'Datum' columns to datetime so both sides join on the same dtype
df_main['Datum'] = pd.to_datetime(df_main['Datum'])
df_holidays['Datum'] = pd.to_datetime(df_holidays['Datum'])

# 4. Collapse the holiday table to exactly one flag per date.
#    If a date occurs more than once in the holiday file, merging the raw
#    table would duplicate every matching row of df_main. Taking the max
#    keeps the date flagged whenever any of its entries is flagged.
holiday_flags = df_holidays.groupby('Datum', as_index=False)['Is_Holiday'].max()

# 5. Left-merge on 'Datum'; df_main may hold many rows per date, the holiday
#    side must be unique. validate= makes pandas raise a MergeError instead of
#    silently multiplying rows should that assumption ever be violated.
df_merged = pd.merge(
    df_main,
    holiday_flags,
    on='Datum',
    how='left',
    validate='many_to_one',
)

# 6. Fill NaNs in 'Is_Holiday' with 0 (date not in the holiday table = non-holiday)
df_merged['Is_Holiday'] = df_merged['Is_Holiday'].fillna(0).astype(int)

# 7. Save the updated file
# NOTE(review): this is an absolute, environment-specific path, while the other
# steps read and write relative to the working directory; consider deriving it
# from a configurable data directory.
df_merged.to_csv("/workspaces/Bakery_predictions/0_DataPreparation/initialdata/merged_data_temperature+holidays+weather_impressions.csv", index=False)

print("Merged file saved with holiday information.")


Merged file saved with holiday information.
