In [43]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import statsmodels.formula.api as smf



# 1 Initial Data
### 1.1 Import all datasets

#### Original data

In [44]:
# Raw inputs: sales, weather, Kieler Woche dates and the ids to predict for.
sales, weather, kiwo, test_ids = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./umsatzdaten_gekuerzt.csv",
        "./wetter.csv",
        "./kiwo.csv",
        "./test.csv",
    )
)

# "Datum" is the join key used by the later merges, so parse it as datetime
# in every frame.
for frame in (sales, weather, kiwo, test_ids):
    frame["Datum"] = pd.to_datetime(frame["Datum"])


#### Economic indicators

In [6]:
# Destatis open-data CSV endpoints for the three economic indicators
cpi_url = 'https://www.destatis.de/static/de_/opendata/data/verbraucherpreisindex_gesamtindex_bv41.csv'
gdp_url = 'https://www.destatis.de/static/de_/opendata/data/bruttoinlandsprodukt_originalwert.csv'
unemp_url = 'https://www.destatis.de/static/de_/opendata/data/arbeitslosenquote_deutschland_originalwert.csv'

# The files are semicolon-separated and their first line is skipped.
# The unemployment file is read as latin-1, and malformed lines are dropped.
cpi = pd.read_csv(cpi_url, sep=';', skiprows=1)
gdp = pd.read_csv(gdp_url, sep=';', skiprows=1)
unemp = pd.read_csv(unemp_url, sep=';', encoding='latin-1', skiprows=1, on_bad_lines='skip')

# Snap every observation date to the first day of its month (CPI, unemployment)
# or quarter (GDP) so the series can be joined on a common period key.
date_format = '%d/%m/%Y'
cpi['Month'] = pd.to_datetime(cpi['Datum'], format=date_format).dt.to_period('M').dt.start_time
unemp['Month'] = pd.to_datetime(unemp['Datum'], format=date_format).dt.to_period('M').dt.start_time
gdp['Quarter'] = pd.to_datetime(gdp['Datum'], format=date_format).dt.to_period('Q').dt.start_time

# Keep only the period key and the value column of each series, and give the
# value column a short name.
cpi = cpi[['Month', 'Originalwert, 2020=100']].rename(
    columns={'Originalwert, 2020=100': 'CPI'}
)
gdp = gdp[['Quarter', 'in jeweiligen Preisen, Mrd. EUR, Originalwert']].rename(
    columns={'in jeweiligen Preisen, Mrd. EUR, Originalwert': 'GDP'}
)
unemp = unemp[['Month', 'Arbeitslosenquote aller zivilen Erwerbspersonen, insgesamt in %']].rename(
    columns={'Arbeitslosenquote aller zivilen Erwerbspersonen, insgesamt in %': 'Unemployment'}
)


In [7]:
# Join the monthly series (CPI, unemployment) on "Month", then attach the
# quarterly GDP via the quarter each month belongs to.
econ = pd.merge(cpi, unemp, on='Month', how='outer')
econ['Quarter'] = econ['Month'].dt.to_period('Q').dt.start_time
econ = pd.merge(econ, gdp, on='Quarter', how='outer')

# Keep the months from 2012-01-01 up to and including 2019-08-01.
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
in_window = (econ['Month'] >= '2012-01-01') & (econ['Month'] <= '2019-08-01')
econ = econ[in_window].copy()

# The values are strings with a decimal comma; convert them to float64.
for col in ['GDP', 'CPI', 'Unemployment']:
    econ[col] = econ[col].str.replace(',', '.').astype(float)

### 1.2 Merge all the dataframes step by step
One dataframe merges sales with weather and kiwo.

A second dataframe merges the test ids with weather and kiwo.

Both are concatenated into `feature_collection`. Here we can collect all additional features we want in our model.

We will split the feature df later into training, validation and test data.

In [8]:
# Display the weather table read from ./wetter.csv
weather

  final_merged = pd.concat([merged, test_merged], ignore_index=True)


Unnamed: 0,Datum,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode
0,2012-01-01,8.0,9.8250,14,58.0
1,2012-01-02,7.0,7.4375,12,
2,2012-01-03,8.0,5.5375,18,63.0
3,2012-01-04,4.0,5.6875,19,80.0
4,2012-01-05,6.0,5.3000,23,80.0
...,...,...,...,...,...
2596,2019-07-28,3.0,23.3500,14,5.0
2597,2019-07-29,6.0,25.2500,7,61.0
2598,2019-07-30,7.0,20.7375,8,61.0
2599,2019-07-31,6.0,20.4500,7,61.0


### 1.2 Merge all the dataframes step by step
One dataframe merges sales with weather and kiwo.

A second dataframe merges the test ids with weather and kiwo.

Both are concatenated into `feature_collection`. Here we can collect all additional features we want in our model.

We will split the feature df later into training, validation and test data.

In [46]:
# 1. Attach the weather observations to every sales row (left join keeps all sales rows)
merged = pd.merge(sales, weather, on="Datum", how="left")

# 2. Flag Kieler Woche days; dates that do not appear in kiwo become 0
merged = pd.merge(merged, kiwo, on="Datum", how="left")
merged["KielerWoche"] = merged["KielerWoche"].fillna(0).astype(int)

# 3. Same enrichment for the test ids
test_merged = pd.merge(test_ids, weather, on="Datum", how="left")
test_merged = pd.merge(test_merged, kiwo, on="Datum", how="left")
test_merged["KielerWoche"] = test_merged["KielerWoche"].fillna(0).astype(int)

# 4. Add an empty sales column to the test rows.
#    A float NaN is used instead of pd.NA: pd.NA creates an all-NA object column,
#    and concatenating that with the float "Umsatz" column relies on deprecated
#    pandas dtype inference. With NaN both sides are float64.
test_merged["Umsatz"] = float("nan")

# 5. Stack training and test rows into one frame
final_merged = pd.concat([merged, test_merged], ignore_index=True)

# 6. Attach the economic indicators via the first day of each row's month
final_merged['Month'] = final_merged['Datum'].dt.to_period('M').dt.start_time
feature_collection = pd.merge(final_merged, econ, on='Month', how='left')

# 7. "id" stays a regular column here; it is set as the index in the time split
feature_collection

Unnamed: 0_level_0,id,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Month,CPI,Unemployment,Quarter,GDP
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23
1,1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23
2,1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23
3,1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23
4,1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,
11159,1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,2018-12-01,98.5,4.9,2018-10-01,881.52
11160,1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52
11161,1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52
11162,1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52


In [9]:
# Column dtypes and non-null counts of the merged frame
feature_collection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11164 entries, 1307011 to 1812286
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Datum                11164 non-null  datetime64[ns]
 1   Warengruppe          11164 non-null  int64         
 2   Umsatz               9334 non-null   float64       
 3   Bewoelkung           11029 non-null  float64       
 4   Temperatur           11083 non-null  float64       
 5   Windgeschwindigkeit  11083 non-null  float64       
 6   Wettercode           8502 non-null   float64       
 7   KielerWoche          11164 non-null  int64         
 8   Month                11164 non-null  datetime64[ns]
 9   CPI                  11164 non-null  float64       
 10  Unemployment         11164 non-null  float64       
 11  Quarter              11164 non-null  datetime64[ns]
 12  GDP                  11164 non-null  float64       
dtypes: datetime64[ns](3), float6

# 2 Feature Engineering
In this section we add additional features to our initial dataset. 
### 2.1 Weather Categorisation
Meteorological data can be highly variable and noisy. Therefore, it can be useful to categorize weather variables into broader classes (e.g., "Rain", "Sunny") to reduce noise and make patterns more interpretable for the model. This approach helps to generalize the effect of weather on sales and can improve the robustness of the predictions.

In [10]:
def group_weather(code):
    """Map a numeric weather code to a coarse weather category.

    The ranges follow the original inline descriptions, which look like the
    WMO present-weather code table (0-99) -- TODO confirm the code table used
    by the "Wettercode" column.

    Parameters
    ----------
    code : number, numeric string, or missing value
        The weather code. It is truncated to an integer with ``int()``.

    Returns
    -------
    str
        One of "Rain", "After Rain", "Showers and Thunderstorms",
        "Snow and Ice", "Fog" or "Other". Missing, non-numeric and
        unmapped codes return "Other".
    """
    if pd.isna(code):
        return "Other"
    try:
        code = int(code)
    except (TypeError, ValueError, OverflowError):
        # Not convertible to an integer (e.g. free text, infinity)
        return "Other"

    # Bounds are inclusive. The previous range() upper bounds left 29 and 69
    # unmapped, although they belong to the "After Rain" and "Rain" decades.
    if 50 <= code <= 69:  # Drizzle, rain, freezing rain, sleet
        return "Rain"
    if 20 <= code <= 29:  # After rain
        return "After Rain"
    if 80 <= code <= 84 or 91 <= code <= 99:  # Showers, sleet showers, thunderstorms
        return "Showers and Thunderstorms"
    if 70 <= code <= 79 or 85 <= code <= 90:  # Snowfall, ice, snow/hail showers
        return "Snow and Ice"
    if 10 <= code <= 12 or 40 <= code <= 49:  # Haze, fog
        return "Fog"
    return "Other"
    
def temperature_class(temp):
    """Bucket a temperature value into a coarse label.

    Missing values give "Unknown_temp". Otherwise values below 5 are "cold",
    below 15 "cool", below 25 "mild", and everything else is "warm".
    """
    if pd.isna(temp):
        return "Unknown_temp"
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((5, "cold"), (15, "cool"), (25, "mild")):
        if temp < upper_bound:
            return label
    return "warm"

def bewoelkung_klasse(value):
    """Classify a cloud-cover value as "sunny" (<= 6) or "cloudy" (> 6).

    Missing values give "Unknown_cloud".
    """
    if pd.isna(value):
        return "Unknown_cloud"
    return "sunny" if value <= 6 else "cloudy"

def windklasse(wind):
    """Bucket a wind-speed value into "breeze", "wind" or "storm".

    Missing values give "Unknown_wind". Values below 10 are "breeze",
    values from 10 up to (but excluding) 20 are "wind", and 20 or more
    is "storm".
    """
    if pd.isna(wind):
        return "Unknown_wind"
    if wind >= 20:
        return "storm"
    if wind >= 10:
        return "wind"
    return "breeze"

In [11]:
# One derived class column per raw weather measurement:
# new column -> (source column, classifier function)
weather_classifiers = {
    "Weathercategorie": ("Wettercode", group_weather),
    "Temperatureclass": ("Temperatur", temperature_class),
    "Cloudclass": ("Bewoelkung", bewoelkung_klasse),
    "Windclass": ("Windgeschwindigkeit", windklasse),
}
for new_col, (raw_col, classify) in weather_classifiers.items():
    feature_collection[new_col] = feature_collection[raw_col].apply(classify)
feature_collection

Unnamed: 0_level_0,id,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Month,CPI,Unemployment,Quarter,GDP,Weathercategorie,Temperatureclass,Cloudclass,Windclass
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,After Rain,mild,sunny,wind
1,1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23,Other,mild,sunny,wind
2,1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,Rain,mild,cloudy,breeze
3,1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,After Rain,mild,cloudy,breeze
4,1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23,Other,mild,sunny,wind
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,
11159,1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,2018-12-01,98.5,4.9,2018-10-01,881.52,Other,cold,cloudy,breeze
11160,1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,Rain,cool,cloudy,breeze
11161,1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,After Rain,cold,cloudy,wind
11162,1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,After Rain,cool,cloudy,wind


### 2.2 Day of the Week
Information about the day of the week and whether it is a weekend could be relevant for the sales prediction model. Purchasing behavior and sales often differ significantly between weekdays and weekends. Therefore, we will add new features for the day of the week and a weekend flag.


In [12]:
# Calendar features derived from the date: weekday (Monday=0 ... Sunday=6),
# a weekend flag for Saturday/Sunday, and the month number.
datum = feature_collection["Datum"].dt
weekday = datum.weekday
feature_collection["Wochentag"] = weekday
feature_collection["Wochenende"] = weekday.isin([5, 6]).astype(int)
feature_collection["Monat"] = datum.month
feature_collection

Unnamed: 0_level_0,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Month,CPI,Unemployment,Quarter,GDP,Weathercategorie,Temperatureclass,Cloudclass,Windclass,Wochentag,Wochenende,Monat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,After Rain,mild,sunny,wind,0,0,7
1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23,Other,mild,sunny,wind,1,0,7
1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,Rain,mild,cloudy,breeze,2,0,7
1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,After Rain,mild,cloudy,breeze,3,0,7
1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23,Other,mild,sunny,wind,4,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,2018-12-01,98.5,4.9,2018-10-01,881.52,Other,cold,cloudy,breeze,5,1,12
1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,Rain,cool,cloudy,breeze,6,1,12
1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,After Rain,cold,cloudy,wind,0,0,12
1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,After Rain,cool,cloudy,wind,3,0,12


### 2.3  School Holidays


In [13]:
# NOTE(review): this cell repeats the weekday / weekend / month cell from
# section 2.2 verbatim. It recomputes the same three columns from "Datum",
# so it can probably be deleted -- confirm.
feature_collection["Wochentag"] = feature_collection["Datum"].dt.weekday
feature_collection["Wochenende"] = feature_collection["Wochentag"].isin([5, 6]).astype(int)
feature_collection["Monat"] = feature_collection["Datum"].dt.month
feature_collection


Unnamed: 0,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Month,CPI,...,Quarter,GDP,Weathercategorie,Temperatureclass,Cloudclass,Windclass,Wochentag,Wochenende,Monat,Unnamed: 21
0,1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,2013-07-01,93.5,...,2013-07-01,728.23,After Rain,mild,sunny,wind,0,0,7
1,1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,2013-07-01,93.5,...,2013-07-01,728.23,Other,mild,sunny,wind,1,0,7
2,1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,2013-07-01,93.5,...,2013-07-01,728.23,Rain,mild,cloudy,breeze,2,0,7
3,1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,2013-07-01,93.5,...,2013-07-01,728.23,After Rain,mild,cloudy,breeze,3,0,7
4,1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,2013-07-01,93.5,...,2013-07-01,728.23,Other,mild,sunny,wind,4,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11159,1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,Other,cold,cloudy,breeze,5,1,12,,,,,
11160,1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,Rain,cool,cloudy,breeze,6,1,12,,,,,
11161,1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,After Rain,cold,cloudy,wind,0,0,12,,,,,
11162,1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,After Rain,cool,cloudy,wind,3,0,12,,,,,


### 2.3  School Holidays


In [50]:
school = pd.read_csv("./school_holidays.csv")
school["Datum"] = pd.to_datetime(school["Datum"])

# Guard against merging twice when the cell is re-run: only join while the
# "Schulferien" column is still missing.
already_merged = "Schulferien" in feature_collection.columns
if not already_merged:
    # Drop the file's own "Wochentag" column; feature_collection has its own.
    holidays = school.drop(columns=['Wochentag'])
    feature_collection = feature_collection.merge(holidays, on="Datum", how="left")
feature_collection


Unnamed: 0,id,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Weathercategorie,Temperatureclass,Cloudclass,Windclass,Wochentag,Wochenende,Monat,Schulferien,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,After Rain,mild,sunny,wind,0,0,7,1,,,,,
1,1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,Other,mild,sunny,wind,1,0,7,1,,,,,
2,1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,Rain,mild,cloudy,breeze,2,0,7,1,,,,,
3,1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,After Rain,mild,cloudy,breeze,3,0,7,1,,,,,
4,1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,Other,mild,sunny,wind,4,0,7,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,,,,,
11159,1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,2018-12-01,98.5,...,2018-10-01,881.52,Other,cold,cloudy,breeze,5.0,1.0,12.0,1.0
11160,1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,2018-12-01,98.5,...,2018-10-01,881.52,Rain,cool,cloudy,breeze,6.0,1.0,12.0,1.0
11161,1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,2018-12-01,98.5,...,2018-10-01,881.52,After Rain,cold,cloudy,wind,0.0,0.0,12.0,1.0
11162,1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,2018-12-01,98.5,...,2018-10-01,881.52,After Rain,cool,cloudy,wind,3.0,0.0,12.0,1.0


In [51]:
# feature_collection = feature_collection.set_index("id")
# feature_collection

### 2.4 Further Dataset - Ideas

- Public Holidays
- Christmas Market
- Inflation Rate, GDP, CPI, Unemployment
- Cruise Ships
- Rolling mean 7-day sales (sales trend)
- Rolling mean 7-day temperature (temperature trend)


# 3 Feature Preparation
For the final training dataframe, we need to replace all strings with numerical data. There are two main approaches to achieve this:

### 3.1 Encoding
#### Label Encoding
Each category gets a label, e.g., cold = 0, cool = 1, etc.  
For this approach, it is important that the data is ordinal, meaning the labeled categories have a fixed order relative to each other.  
For example: cold < cool < mild < warm, 0 < 1 < 2 < 3

#### One-Hot Encoding
For nominal (unordered) data like the weather code, one-hot encoding is more suitable. In this approach, each category gets its own column, filled with 1 or 0 depending on whether the category applies or not.


In [14]:
# Work on a copy so that feature_collection itself is not modified by this
# cell. Previously the *_enc columns were written into feature_collection,
# which defeated the copy.
feature_encoded_1 = feature_collection.copy()

# Label encoding for the class columns. The fitted encoders are kept in case
# the codes need to be decoded again later.
# NOTE(review): LabelEncoder numbers classes in sorted (lexicographic) order,
# not by meaning. For "Windclass" that would give breeze < storm < wind, and
# any "Unknown_*" class sorts first -- confirm this is acceptable or switch to
# an explicit ordinal mapping.
label_encoders = {}
for col in ["Temperatureclass", "Cloudclass", "Windclass"]:
    le = LabelEncoder()
    feature_encoded_1[col + "_enc"] = le.fit_transform(feature_encoded_1[col])
    label_encoders[col] = le

# One-hot encoding for the non-ordinal weather category
feature_encoded_1 = pd.get_dummies(feature_encoded_1, columns=["Weathercategorie"], prefix="Weather")
feature_encoded_1

Unnamed: 0,id,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Month,CPI,...,Schulferien,Temperatureclass_enc,Cloudclass_enc,Windclass_enc,Weather_After Rain,Weather_Fog,Weather_Other,Weather_Rain,Weather_Showers and Thunderstorms,Weather_Snow and Ice
0,1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,2013-07-01,93.5,...,1,3,2,3,True,False,False,False,False,False
1,1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,2013-07-01,93.5,...,1,3,2,3,False,False,True,False,False,False
2,1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,2013-07-01,93.5,...,1,3,1,1,False,False,False,True,False,False
3,1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,2013-07-01,93.5,...,1,3,1,1,True,False,False,False,False,False
4,1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,2013-07-01,93.5,...,1,3,2,3,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,
11159,2018-12-22,6,,8.0,4.3000,4.0,,0,2018-12-01,98.5,...,1,1,1,1,False,False,True,False,False,False,
11160,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,2018-12-01,98.5,...,1,2,1,1,False,False,False,True,False,False,
11161,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,2018-12-01,98.5,...,1,1,1,3,True,False,False,False,False,False,
11162,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,2018-12-01,98.5,...,1,2,1,3,True,False,False,False,False,False,


In [15]:
# Map the 'Warengruppe' to product names
mapping = {1: 'brot', 2: 'brotchen', 3: 'croissant', 4: 'konditorei', 5: 'kuchen', 6: 'saisonbrot'}
feature_encoded_1['Product'] = feature_encoded_1['Warengruppe'].map(mapping)
feature_encoded = pd.get_dummies(feature_encoded_1, columns=["Product"], prefix="Group")

In [16]:
# Show all columns of feature_encoded
pd.set_option('display.max_columns', None)
feature_encoded

Unnamed: 0,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Month,CPI,Unemployment,Quarter,GDP,Temperatureclass,Cloudclass,Windclass,Wochentag,Wochenende,Monat,Schulferien,Temperatureclass_enc,Cloudclass_enc,Windclass_enc,Weather_After Rain,Weather_Fog,Weather_Other,Weather_Rain,Weather_Showers and Thunderstorms,Weather_Snow and Ice,Group_brot,Group_brotchen,Group_croissant,Group_konditorei,Group_kuchen,Group_saisonbrot,Unnamed: 36
0,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,mild,sunny,wind,0,0,7,1,3,2,3,True,False,False,False,False,False,True,False,False,False,False,False,
1,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23,mild,sunny,wind,1,0,7,1,3,2,3,False,False,True,False,False,False,True,False,False,False,False,False,
2,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,mild,cloudy,breeze,2,0,7,1,3,1,1,False,False,False,True,False,False,True,False,False,False,False,False,
3,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,2013-07-01,93.5,6.8,2013-07-01,728.23,mild,cloudy,breeze,3,0,7,1,3,1,1,True,False,False,False,False,False,True,False,False,False,False,False,
4,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,2013-07-01,93.5,6.8,2013-07-01,728.23,mild,sunny,wind,4,0,7,1,3,2,3,False,False,True,False,False,False,True,False,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,
11159,1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,2018-12-01,98.5,4.9,2018-10-01,881.52,cold,cloudy,breeze,5,1,12,1,1,1,1,False,False,True,False,False,False,False,False,False,False,False,True
11160,1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,cool,cloudy,breeze,6,1,12,1,2,1,1,False,False,False,True,False,False,False,False,False,False,False,True
11161,1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,cold,cloudy,wind,0,0,12,1,1,1,3,True,False,False,False,False,False,False,False,False,False,False,True
11162,1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,2018-12-01,98.5,4.9,2018-10-01,881.52,cool,cloudy,wind,3,0,12,1,2,1,3,True,False,False,False,False,False,False,False,False,False,False,True


### 3.2 Clean Up
Drop all columns that should not be in the training dataset. 

In [17]:
# Drop the string class columns (their *_enc counterparts stay) and the
# "Month" / "Quarter" keys that were only needed for merging.
columns_to_drop = ['Temperatureclass', 'Cloudclass', 'Windclass', 'Month', 'Quarter']
features = feature_encoded.drop(columns=columns_to_drop)
pd.options.display.max_columns = None
features

Unnamed: 0,id,Datum,Warengruppe,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,CPI,Unemployment,GDP,Wochentag,Wochenende,Monat,Schulferien,Temperatureclass_enc,Cloudclass_enc,Windclass_enc,Weather_After Rain,Weather_Fog,Weather_Other,Weather_Rain,Weather_Showers and Thunderstorms,Weather_Snow and Ice,Group_brot,Group_brotchen,Group_croissant,Group_konditorei,Group_kuchen,Group_saisonbrot
0,1307011,2013-07-01,1,148.828353,6.0,17.8375,15.0,20.0,0,93.5,6.8,728.23,0,0,7,1,3,2,3,True,False,False,False,False,False,True,False,False,False,False,False
1,1307021,2013-07-02,1,159.793757,3.0,17.3125,10.0,,0,93.5,6.8,728.23,1,0,7,1,3,2,3,False,False,True,False,False,False,True,False,False,False,False,False
2,1307031,2013-07-03,1,111.885594,7.0,21.0750,6.0,61.0,0,93.5,6.8,728.23,2,0,7,1,3,1,1,False,False,False,True,False,False,True,False,False,False,False,False
3,1307041,2013-07-04,1,168.864941,7.0,18.8500,7.0,20.0,0,93.5,6.8,728.23,3,0,7,1,3,1,1,True,False,False,False,False,False,True,False,False,False,False,False
4,1307051,2013-07-05,1,171.280754,5.0,19.9750,12.0,,0,93.5,6.8,728.23,4,0,7,1,3,2,3,False,False,True,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,
11159,1812226,2018-12-22,6,,8.0,4.3000,4.0,,0,98.5,4.9,881.52,5,1,12,1,1,1,1,False,False,True,False,False,False,False,False,False,False,False,True
11160,1812236,2018-12-23,6,,7.0,6.4500,9.0,61.0,0,98.5,4.9,881.52,6,1,12,1,2,1,1,False,False,False,True,False,False,False,False,False,False,False,True
11161,1812246,2018-12-24,6,,7.0,2.5000,10.0,22.0,0,98.5,4.9,881.52,0,0,12,1,1,1,3,True,False,False,False,False,False,False,False,False,False,False,True
11162,1812276,2018-12-27,6,,7.0,7.1250,12.0,20.0,0,98.5,4.9,881.52,3,0,12,1,2,1,3,True,False,False,False,False,False,False,False,False,False,False,True


# 4 Time Split
We split the dataset into training, validation and test data.


In [18]:
# Inclusive end dates of the training and validation windows
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'

# Chronological split: train up to train_end_date, validation up to
# validation_end_date, everything later is test. "id" becomes the index.
dates = features['Datum']
in_train = dates <= train_end_date
in_validation = (dates > train_end_date) & (dates <= validation_end_date)
in_test = dates > validation_end_date

train_data = features.loc[in_train].set_index("id")
vali_data = features.loc[in_validation].set_index("id")
test_data = features.loc[in_test].set_index("id")

# 5 Simple Linear Regression Model

In this simple test model, I fit the sales to a combination of product group and day of the week.

### 5.1 Create a model with the train_data

In [19]:
# Fit the linear regression model
model = smf.ols('Umsatz ~ Warengruppe + Wochentag', data=train_data).fit()

# Modellzusammenfassung ausgeben
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     83.29
Date:                Wed, 04 Jun 2025   Prob (F-statistic):           1.67e-36
Time:                        09:23:06   Log-Likelihood:                -47982.
No. Observations:                7493   AIC:                         9.597e+04
Df Residuals:                    7490   BIC:                         9.599e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     198.0404      4.629     42.780      

### 5.3 Predict the Sales of the test_data


In [20]:
# Predict sales for the test rows. test_data is indexed by "id", so the
# resulting Series carries the ids as its index.
pred = model.predict(test_data)
pred

id
1808011    211.946204
1808021    221.880669
1808031    231.815134
1808041    241.749599
1808051    251.684063
              ...    
1812226    211.933919
1812236    221.868383
1812246    162.261595
1812276    192.064989
1812286    201.999454
Length: 1830, dtype: float64

### 5.4 CSV output for Kaggle



In [None]:
# Turn the id-indexed predictions into a two-column table (id, Umsatz) and
# write it out without the positional index.
pred_df = pred.to_frame(name="Umsatz").reset_index()
pred_df.to_csv("prognose_kaggle.csv", index=False)

In [21]:
# Persist the cleaned feature table (training, validation and test rows) without the positional index
features.to_csv("features.csv", index=False)