### Import Libraries and Data


In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam


2025-06-28 20:20:37.150085: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-28 20:20:37.187243: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-28 20:20:37.459287: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-28 20:20:37.583624: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751142037.883636    2428 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751142037.97

In [2]:
# Show the directory this notebook is being executed from
print(os.getcwd())

# Location of the input CSV (absolute path, so it does not depend on the
# working directory printed above)
data_path = "/workspaces/Bakery_predictions/1_DatasetCharacteristics/merged_data_clean.csv"

# Load the CSV into a DataFrame and preview the first rows
pdata = pd.read_csv(data_path)
pdata.head()


/workspaces/Bakery_predictions/3_Model/Neural_Net_Melissa


Unnamed: 0,Datum,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,id,Warengruppe,Umsatz,KielerWoche
0,2012-01-01,8.0,9.82,14,58.0,,,,
1,2012-01-02,7.0,7.44,12,,,,,
2,2012-01-03,8.0,5.54,18,63.0,,,,
3,2012-01-04,4.0,5.69,19,80.0,,,,
4,2012-01-05,6.0,5.3,23,80.0,,,,


### Data Preparation


In [4]:
# Parse the date column into pandas datetime values
pdata['Datum'] = pd.to_datetime(pdata['Datum'])

# Replace missing values with 0 in the KielerWoche, id and Warengruppe columns
for column in ('KielerWoche', 'id', 'Warengruppe'):
    pdata[column] = pdata[column].fillna(0)


In [5]:
# Feature groups: categorical columns are one-hot encoded below,
# numerical columns are passed through unchanged
cat_feat = ['Warengruppe', 'KielerWoche', 'Wettercode']
num_feat = ['Bewoelkung', 'Temperatur', 'Windgeschwindigkeit']

# Cast every categorical feature column to the pandas 'category' dtype
pdata = pdata.astype({name: 'category' for name in cat_feat})

# Inspect dtypes and the distinct values of the categorical columns
cat_frame = pdata[cat_feat]
print(cat_frame.dtypes)
print(cat_frame.apply(lambda column: column.unique()))

# One-hot encode the categorical columns (first level of each column is dropped)
# NOTE(review): Wettercode contains NaN and get_dummies is called without
# dummy_na, so missing codes presumably become all-zero dummy rows instead of
# being removed by dropna below - confirm this is intended.
dummies = pd.get_dummies(cat_frame, drop_first=True, dtype=int)

# Feature matrix: dummy columns, then the numerical columns, then the date
features = pd.concat([dummies, pdata[num_feat], pdata[['Datum']]], axis=1)

# Put the dependent variable (Umsatz) in front of the features and
# discard every row that still contains a NaN in any column
prepared_data = pd.concat([pdata[['Umsatz']], features], axis=1).dropna()

# Report the size of the prepared dataset and preview it
print(prepared_data.shape)
prepared_data.head()





Warengruppe    category
KielerWoche    category
Wettercode     category
dtype: object
Warengruppe    [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
Categories...
KielerWoche       [0.0, 1.0]
Categories (2, float64): [0.0, 1.0]
Wettercode     [58.0, NaN, 63.0, 80.0, 61.0, ..., 75.0, 53.0,...
dtype: object
(9264, 52)


Unnamed: 0,Umsatz,Warengruppe_1.0,Warengruppe_2.0,Warengruppe_3.0,Warengruppe_4.0,Warengruppe_5.0,Warengruppe_6.0,KielerWoche_1.0,Wettercode_1.0,Wettercode_2.0,...,Wettercode_79.0,Wettercode_80.0,Wettercode_81.0,Wettercode_85.0,Wettercode_91.0,Wettercode_95.0,Bewoelkung,Temperatur,Windgeschwindigkeit,Datum
394,148.83,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
395,535.86,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
396,201.2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
397,65.89,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
398,317.48,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01


### Selection of Training, Validation and Test Data


In [6]:
# Random seed for reproducibility; also applied to NumPy's global RNG
SEED = 42
np.random.seed(SEED)

# Shuffle into a NEW frame with an explicit random_state instead of
# overwriting prepared_data. Overwriting made this cell non-idempotent:
# re-running it reshuffled the already-shuffled rows and silently produced a
# different train/validation/test split. prepared_data is left untouched.
shuffled_data = prepared_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Calculate the number of rows for each dataset (70 % / 20 % / remainder)
n_total = len(shuffled_data)
n_training = int(0.7 * n_total)
n_validation = int(0.20 * n_total)

# Split the data into training, validation, and test sets;
# the test set receives whatever is left after the first two slices
train_data = shuffled_data.iloc[:n_training]
validation_data = shuffled_data.iloc[n_training:n_training + n_validation]
test_data = shuffled_data.iloc[n_training + n_validation:]

# Sanity check: the three splits together must cover every row exactly once
assert len(train_data) + len(validation_data) + len(test_data) == n_total, \
    "train/validation/test split does not cover all rows"

# Separating features and labels (labels stay single-column DataFrames)
training_features = train_data.drop('Umsatz', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1)
test_features = test_data.drop('Umsatz', axis=1)

training_labels = train_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)

print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)

Training features dimensions: (6484, 51)
Validation features dimensions: (1852, 51)
Test features dimensions: (928, 51)
Training labels dimensions: (6484, 1)
Validation labels dimensions: (1852, 1)
Test labels dimensions: (928, 1)


### Data Export

In [7]:
# Target folder for the exported pickle files (created if it does not exist)
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Map each output file name to the dataframe that is written to it
exports = {
    "training_features.pkl": training_features,
    "training_labels.pkl": training_labels,
    "validation_features.pkl": validation_features,
    "validation_labels.pkl": validation_labels,
    "test_features.pkl": test_features,
    "test_labels.pkl": test_labels,
}

# Write every dataframe to its pickle file inside the subdirectory
for filename, frame in exports.items():
    frame.to_pickle(os.path.join(subdirectory, filename))
