Create common Pandas Dataframe with all Data to do further analysis

In [52]:
import pandas as pd
import numpy as np

DATA_DIR = "/workspaces/bakery_sales_prediction/0_DataPreparation/raw-data"

## Holidays for SH
- Request Holidays from API
- Combine single dataframe with the holidays

In [4]:
all_holidays = []

for year in range(2013, 2020):
    holidays = pd.read_json(f'https://feiertage-api.de/api/?jahr={year}&nur_land=SH').T
    for holiday_name, row in holidays.iterrows():
        all_holidays.append({
        'datum': row['datum'],
        'holiday_name': holiday_name
        })
    
df_holiday = pd.DataFrame(all_holidays)
df_holiday['datum'] = pd.to_datetime(df_holiday['datum'])
df_holiday = df_holiday.sort_values('datum')

In [None]:
# Load the CSV files
# file1 = f'{DATA_DIR}/umsatzdaten_gekuerzt.csv'
file2 = f'{DATA_DIR}/wetter.csv'
file3 = f'{DATA_DIR}/kiwo.csv'
file4 = f'{DATA_DIR}/holidays.school.sh.csv'
file5 = f'{DATA_DIR}/holstein_kiel.csv'
file6 = f'{DATA_DIR}/thw_kiel_heimspiel.csv'
# umsatz = pd.read_csv(file1)
wetter = pd.read_csv(file2, parse_dates=['Datum'])
kiwo = pd.read_csv(file3, parse_dates=['Datum'])
school_holidays = pd.read_csv(file4, sep=";", usecols=["StartDate","EndDate","Name"], parse_dates=["StartDate","EndDate"])
holstein_kiel = pd.read_csv(file5, sep=";", index_col=False, parse_dates=['Datum'], usecols=["Datum","Heimspiel"], dayfirst=True)
thw_kiel = pd.read_csv(file6, sep=";", parse_dates=['Datum'], dayfirst=True)

# print(wetter['Wettercode'].value_counts())

# print(wetter.info())
# print(kiwo.info())
# print(school_holidays.info())
# print(holstein_kiel.info())
# print(thw_kiel.info())


In [19]:
niederschlag_art_codes = {
    # "Kein Niederschlag": [0, 1, 2, 3, 5, 10, 13, 17, 20, 21, 22, 25, 26, 28, 29, 43, 45, 47, 49],
    "Regen": [51, 53, 55, 58, 60, 61, 63, 65, 80, 81, 91],
    "Schnee": [71, 72, 73, 75, 77, 78, 85],
    "Gemischt": [68, 69, 79, 95]
}

niederschlag_art_number = {
    "Regen": 1,
    "Schnee": 2,
    "Gemischt": 3
}

niederschlag_intensität_codes = {
    "Leicht": [51, 60, 61, 68, 71, 80, 85],
    "Mittel": [53, 63, 69, 73, 81],
    "Stark": [55, 65, 75]    
}

niederschlag_intensität_number = {
    "Leicht": 1,
    "Mittel": 2,
    "Stark": 3 
}

gewitter_codes = [91, 95]

In [20]:
def kategorisiere_temperatur(temperatur, monat):
    if monat in [12, 1, 2]:  # Winter
        if temperatur < -2:
            return "kalt"
        elif -2 <= temperatur <= 5:
            return "mild"
        else:
            return "warm"
    elif monat in [3, 4, 5, 9, 10, 11]:  # Frühling/Herbst
        if temperatur < 8:
            return "kalt"
        elif 8 <= temperatur <= 15:
            return "mild"
        else:
            return "warm"
    else:  # Sommer
        if temperatur < 16:
            return "kalt"
        elif 16 <= temperatur <= 22:
            return "mild"
        else:
            return "warm"

def get_niederschlag_art(code):
    for key, values in niederschlag_art_codes.items():
        if code in values:
            return key
    return "Kein Niederschlag"

def get_niederschlag_intensität(code):
    for key, values in niederschlag_intensität_codes.items():
        if code in values:
            return key
    return "Keine"

def is_gewitter(code):
    return 1 if code in gewitter_codes else 0

def get_category_number(code, category_dict, number_dict):
    # Iterate through the dictionary
    for category, codes in category_dict.items():
        if code in codes:  # Check if the code exists in the list
            return number_dict[category]  # Return the corresponding number
    return 0  # Return None if no match is found

### Anpassung Wetterdaten

In [59]:
merged_df = wetter.copy()

merged_df['Niederschlag'] = merged_df['Wettercode'].apply(lambda x: get_category_number(x, niederschlag_art_codes, niederschlag_art_number))
merged_df['Niederschlag_Intesitaet'] = merged_df['Wettercode'].apply(lambda x: get_category_number(x, niederschlag_intensität_codes, niederschlag_intensität_number))
merged_df['is_Gewitter'] = merged_df['Wettercode'].apply(lambda x: is_gewitter(x)) 
merged_df['Temperatur_Kategorie'] = merged_df.apply(lambda row: kategorisiere_temperatur(row['Temperatur'], row['Datum'].month), axis=1)
merged_df = merged_df.drop(columns=['Wettercode'])

Merge data sets

In [60]:
# Merge the datasets on the "Datum" column
merged_df = pd.merge(merged_df, kiwo, on='Datum', how='left')
merged_df = merged_df.rename(columns={"KielerWoche": "is_KielerWoche"})
merged_df['is_KielerWoche'] = merged_df['is_KielerWoche'].fillna(0)

merged_df = pd.merge(merged_df, df_holiday, left_on='Datum', right_on="datum", how='left')
merged_df.drop('datum', axis=1, inplace=True)
merged_df['is_Holiday'] = np.where(merged_df['holiday_name'].notna(), 1, merged_df['holiday_name'])

merged_df = pd.merge(merged_df, thw_kiel, on='Datum', how='left')
merged_df = merged_df.rename(columns={"He": "is_KielerWoche"})



# Display the first few rows of the merged dataframe
merged_df.head()

Unnamed: 0,Datum,Bewoelkung,Temperatur,Windgeschwindigkeit,Niederschlag,Niederschlag_Intesitaet,is_Gewitter,Temperatur_Kategorie,is_KielerWoche,holiday_name,is_Holiday
0,2012-01-01,8.0,9.825,14,1,0,False,warm,0.0,,
1,2012-01-02,7.0,7.4375,12,0,0,False,warm,0.0,,
2,2012-01-03,8.0,5.5375,18,1,2,False,warm,0.0,,
3,2012-01-04,4.0,5.6875,19,1,1,False,warm,0.0,,
4,2012-01-05,6.0,5.3,23,1,1,False,warm,0.0,,


Create overview about missing data

In [None]:
import matplotlib.pyplot as plt

# Prepare data for visualization
columns = merged_df.columns.drop('Datum')  # Exclude the date column
dates = pd.to_datetime(merged_df['Datum'])

# Create a binary representation for data presence (1 if data is present, 0 otherwise)
presence_data = merged_df[columns].notna().astype(int)
presence_data['Datum'] = dates

# Reshape data for plotting
melted_data = presence_data.melt(id_vars='Datum', var_name='Column', value_name='Has Data')

# Plotting
plt.figure(figsize=(15, 8))
for i, column in enumerate(columns):
    column_data = melted_data[melted_data['Column'] == column]
    plt.scatter(
        column_data['Datum'], 
        [i] * len(column_data), 
        c=column_data['Has Data'], 
        cmap='coolwarm', 
        label=column, 
        marker='|'
    )

# Customizing the plot
plt.yticks(range(len(columns)), columns)
plt.xlabel("Date")
plt.ylabel("Columns")
plt.title("Data Presence Across Columns Over Time")
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.legend(title="Columns", loc='upper left', bbox_to_anchor=(1.05, 1), fontsize='small')
plt.tight_layout()
plt.show()


save common data frame

In [11]:
# Save the merged dataframe to a CSV file
merged_df.to_csv('/workspaces/bakery_sales_prediction/0_DataPreparation/01_merged_data.csv', index=False)