In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from scikeras.wrappers import KerasRegressor
from sklearn.base import BaseEstimator, RegressorMixin


# 1. Daten einlesen
data = pd.read_csv("/workspaces/bakery_sales_prediction/0_DataPreparation/01_merged_data.csv", parse_dates=['Datum'])

# 2. Datenvorbereitung
relevant_columns = ["id", "Datum", "Warengruppe", "Umsatz", "Bewoelkung", "Temperatur", "Windgeschwindigkeit", "wochentag", 
                    "is_niederschlag", "niederschlag_intesitaet", "is_gewitter", "temperatur_cat", "is_kielerWoche", 
                    "is_holiday", "is_heimspiel_thw", "is_heimspiel_holstein", "is_school_holidays"]

data = data.dropna(subset=[col for col in data.columns if col != 'Umsatz'])

data['is_gewitter'] = data['is_gewitter'].astype(bool)
data['is_kielerWoche'] = data['is_kielerWoche'].astype(bool)
data['is_holiday'] = data['is_holiday'].astype(bool)
data['is_heimspiel_thw'] = data['is_heimspiel_thw'].astype(bool)
data['is_heimspiel_holstein'] = data['is_heimspiel_holstein'].astype(bool)
data['is_school_holidays'] = data['is_school_holidays'].astype(bool)

data['Bewoelkung'] = data['Bewoelkung'].astype(int)
data['wochentag'] = data['wochentag'].astype(int)
data['is_niederschlag'] = data['is_niederschlag'].astype(int)
data['niederschlag_intesitaet'] = data['niederschlag_intesitaet'].astype(int)
data['wochentag'] = data['wochentag'].astype(int)
data['temperatur_cat'] = data['temperatur_cat'].astype(int)

data['holiday_name'] = data['holiday_name'].astype(str)
data['school_holiday_name'] = data['school_holiday_name'].astype(str)

print(data.info())

# Trennen der Daten in Train, Validation und Test
train_data = data[(data['Datum'] >= '2013-07-01') & (data['Datum'] <= '2017-07-31')]
val_data = data[(data['Datum'] >= '2017-08-01') & (data['Datum'] <= '2018-07-31')]
test_data = data[(data['Datum'] >= '2018-08-01') & (data['Datum'] <= '2019-07-31')]

# Features und Ziel definieren
features = [col for col in relevant_columns if col not in ['id', 'Datum', 'Umsatz']]
target = "Umsatz"

X_train = train_data[features]
y_train = train_data[target]
X_val = val_data[features]
y_val = val_data[target]
X_test = test_data[features]

# 3. Feature Engineering und Skalierung der Features
numeric_features = ["Bewoelkung", "Temperatur", "Windgeschwindigkeit"]
categorical_features = ["Warengruppe", "wochentag", "is_niederschlag", "niederschlag_intesitaet", "temperatur_cat"]
binary_features = [ "is_gewitter", "is_kielerWoche", "is_holiday", "is_heimspiel_thw", "is_heimspiel_holstein", "is_school_holidays"]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features),
    ],
    remainder='passthrough'
)

# 4. Neuronales Netz definieren
def create_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='linear')
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])
    return model

class KerasWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, input_dim):
        self.input_dim = input_dim
        self.model = None
    
    def fit(self, X, y):
        self.model = create_model(self.input_dim)
        self.model.fit(X, y, epochs=100, batch_size=32, verbose=0)
        return self
    
    def predict(self, X):
        return self.model.predict(X).flatten()

# Get the input dimension
X_train_preprocessed = preprocessor.fit_transform(X_train)
input_dim = X_train_preprocessed.shape[1]

# Create the KerasRegressor
model = KerasRegressor(build_fn=lambda: create_model(input_dim), epochs=1000, batch_size=32, verbose=1)

# Bestimmen Sie die Eingabedimension nach der Vorverarbeitung
X_train_preprocessed = preprocessor.fit_transform(X_train)
input_dim = X_train_preprocessed.shape[1]

# Erstellen Sie die Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KerasWrapper(input_dim))
])

# Trainieren Sie das Modell
pipeline.fit(X_train, y_train)

# 8. Modell bewerten
val_predictions = pipeline.predict(X_val)
mae = np.mean(np.abs(val_predictions - y_val))
mse = np.mean((val_predictions - y_val)**2)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

# 9. Vorhersagen für Testdaten
predictions = pipeline.predict(X_test)

# 10. Ergebnisse in Submission-Format bringen
predictions_df = pd.DataFrame({
    'id': test_data['id'],
    'Umsatz': predictions
})

submission = pd.read_csv("0_DataPreparation/raw-data/test.csv", usecols=["id"])

submission = pd.merge(submission, predictions_df, on="id", how=left)


print(submission.info())
print(submission.head())

# Optional: Submission.csv speichern
submission.to_csv("/workspaces/bakery_sales_prediction/submission.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 289 to 10816
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   id                       30 non-null     int64         
 1   Datum                    30 non-null     datetime64[ns]
 2   Warengruppe              30 non-null     int64         
 3   Umsatz                   25 non-null     float64       
 4   Bewoelkung               30 non-null     int64         
 5   Temperatur               30 non-null     float64       
 6   Windgeschwindigkeit      30 non-null     float64       
 7   wochentag                30 non-null     int64         
 8   is_niederschlag          30 non-null     int64         
 9   niederschlag_intesitaet  30 non-null     int64         
 10  is_gewitter              30 non-null     bool          
 11  temperatur_cat           30 non-null     int64         
 12  is_kielerWoche           30 non-null  

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Mean Absolute Error: 192.11792673960696
Mean Squared Error: 60024.620253129586




ValueError: Found unknown categories [np.int64(2)] in column 1 during transform