In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
import optuna

def preprocess_data(data):
    # Reduziere die Trainingsdaten auf die letzten 4 Jahre
    last_date = data["date"].max()
    last_date = pd.to_datetime(last_date)
    last_date = last_date - pd.DateOffset(years=4)
    data = data[pd.to_datetime(data.date) > last_date]

    # Entferne aus den Trainingsdaten die Zeilen, die um 23 Uhr sind, weil wir nur an Samstag Sonntag bis 23 Uhr offen haben, das aber unsere
    # Periodizität kaputt macht
    data = data[data["hour"] != 23]

    # Entferne die Spalten "year", "week", "coronaImpact", weil diese keinen Impact haben
    data = data.drop(columns=["year", "week", "coronaImpact"])

    # Sinus und Cosinus Transformation für "hour" und "dayOfMonth"
    data["hour_sin"] = np.sin(2 * np.pi * data["hour"] / 24)
    data["hour_cos"] = np.cos(2 * np.pi * data["hour"] / 24)
    data["dayOfMonth_sin"] = np.sin(2 * np.pi * data["dayOfMonth"] / 31)
    data["dayOfMonth_cos"] = np.cos(2 * np.pi * data["dayOfMonth"] / 31)
    data = data.drop(columns=["hour", "dayOfMonth"])

    return data

# Vorverarbeitung deiner Daten
train_data = pd.read_csv("train_data_till_2023.csv")
train_data = preprocess_data(train_data)

train_data.to_csv("train_data_till_2023_preprocessed.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
import optuna

def preprocess_data(data):
    # Reduziere die Trainingsdaten auf die letzten 4 Jahre
    last_date = data["date"].max()
    last_date = pd.to_datetime(last_date)
    last_date = last_date - pd.DateOffset(years=4)
    data = data[pd.to_datetime(data.date) > last_date]

    # Entferne aus den Trainingsdaten die Zeilen, die um 23 Uhr sind, weil wir nur an Samstag Sonntag bis 23 Uhr offen haben, das aber unsere
    # Periodizität kaputt macht
    data = data[data["hour"] != 23]

    # Entferne die Spalten "year", "week", "coronaImpact", weil diese keinen Impact haben
    data = data.drop(columns=["year", "week", "coronaImpact"])

    # Sinus und Cosinus Transformation für "hour" und "dayOfMonth"
    data["hour_sin"] = np.sin(2 * np.pi * data["hour"] / 24)
    data["hour_cos"] = np.cos(2 * np.pi * data["hour"] / 24)
    data["dayOfMonth_sin"] = np.sin(2 * np.pi * data["dayOfMonth"] / 31)
    data["dayOfMonth_cos"] = np.cos(2 * np.pi * data["dayOfMonth"] / 31)
    data = data.drop(columns=["hour", "dayOfMonth"])

    return data

# Vorverarbeitung deiner Daten
train_data = pd.read_csv("train_data_till_2023.csv")
train_data = preprocess_data(train_data)

# Optuna-Objektiv-Funktion
def objective(trial, X_train, y_train, X_val, y_val):
    n_hidden_layers = trial.suggest_int("n_hidden_layers", 1, 5)
    hidden_layer_sizes = []
    for i in range(n_hidden_layers):
        n_neurons = trial.suggest_int(f"n_neurons_layer_{i}", 10, 200)
        hidden_layer_sizes.append(n_neurons)

    mlp = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation="relu",
        solver="adam",
        alpha=0.0001,
        batch_size="auto",
        learning_rate="constant",
        learning_rate_init=0.001,
        max_iter=1000,
        shuffle=True,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
    )

    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_val)

    # Entferne Zeilen mit y_val == 0 vor der MAPE-Berechnung
    nonzero_mask = y_val != 0
    y_val = y_val[nonzero_mask]
    y_pred = y_pred[nonzero_mask]

    mape = np.mean(np.abs((y_val - y_pred) / y_val)) * 100
    return mape  # Optuna maximiert den Wert, daher nehmen wir das negative MAPE

# Erstelle mehrere Train-Validierungs-Splits
train_data['date'] = pd.to_datetime(train_data['date'])

# Finde den Startmonat und das Startjahr
start_year = train_data['date'].dt.year.min()
start_month = train_data['date'][train_data['date'].dt.year == start_year].dt.month.min()

# Berechne den absoluten Monat für jeden Eintrag
train_data['abs_month'] = (train_data['date'].dt.year - start_year) * 12 + (train_data['date'].dt.month - start_month)

num_splits = 5
train_val_splits = []

for i in range(num_splits):
    # Da wir die ersten 19 Monate nutzen wollen, adjustiere die Grenzen entsprechend
    train_months_end = 18 + i # Der 19. Monat (0-basiert)
    val_month = 19 + i  # Der Validierungsmonat folgt auf die ersten 19 Monate

    # Filtere die Daten
    train_data_split = train_data[train_data['abs_month'] <= train_months_end]
    val_data_split = train_data[train_data['abs_month'] == val_month]

    # Entferne die Hilfsspalten für die Aufteilung
    train_data_split = train_data_split.drop(columns=["date", "abs_month"])
    val_data_split = val_data_split.drop(columns=["date", "abs_month"])

    # Teile die Daten in Features und Zielvariable
    X_train = train_data_split.drop(columns=["revenue"])
    y_train = train_data_split["revenue"]
    X_val = val_data_split.drop(columns=["revenue"])
    y_val = val_data_split["revenue"]

    train_val_splits.append((X_train, y_train, X_val, y_val))

# Führe die Optimierung mit Optuna durch
study = optuna.create_study(direction="minimize")

# Definiere, wie viele Trials pro Datensatz durchgeführt werden sollen
trials_per_dataset = 40

# Iteriere durch die Trainings-Validierungs-Splits
for X_train, y_train, X_val, y_val in train_val_splits:
    # Führe Optimierung mit der aktuellen Datenpartition durch, nutze das existierende Studienobjekt
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=trials_per_dataset, n_jobs=-1)

# Nachdem alle Optimierungen durchgeführt wurden, extrahiere die besten Parameter
best_params = study.best_params  # Nutze .best_params direkt vom Studienobjekt
best_score = study.best_value

print(best_params)

# Extrahiere die besten Hyperparameter
n_hidden_layers = best_params["n_hidden_layers"]
hidden_layer_sizes = [best_params[f"n_neurons_layer_{i}"] for i in range(n_hidden_layers)]

# Trainiere das finale Modell mit den besten Hyperparametern
mlp = MLPRegressor(
    hidden_layer_sizes=hidden_layer_sizes,
    activation="relu",
    solver="adam",
    alpha=0.0001,
    batch_size="auto",
    learning_rate="constant",
    learning_rate_init=0.001,
    max_iter=1000,
    shuffle=True,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
)

mlp.fit(X_train, y_train)

[I 2024-03-18 10:37:09,236] A new study created in memory with name: no-name-de070d33-5d62-4d41-ad25-d41708152f32


[I 2024-03-18 10:37:12,364] Trial 4 finished with value: 109.68225992078592 and parameters: {'n_hidden_layers': 2, 'n_neurons_layer_0': 17, 'n_neurons_layer_1': 12}. Best is trial 4 with value: 109.68225992078592.
[I 2024-03-18 10:37:24,536] Trial 5 finished with value: 106.2334073578606 and parameters: {'n_hidden_layers': 1, 'n_neurons_layer_0': 54}. Best is trial 5 with value: 106.2334073578606.


In [9]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
import optuna

def preprocess_data(data):
    # Reduziere die Trainingsdaten auf die letzten 4 Jahre
    last_date = data["date"].max()
    last_date = pd.to_datetime(last_date)
    last_date = last_date - pd.DateOffset(years=4)
    data = data[pd.to_datetime(data.date) > last_date]

    # Entferne aus den Trainingsdaten die Zeilen, die um 23 Uhr sind, weil wir nur an Samstag Sonntag bis 23 Uhr offen haben, das aber unsere
    # Periodizität kaputt macht
    data = data[data["hour"] != 23]

    # Entferne die Spalten "year", "week", "coronaImpact", weil diese keinen Impact haben
    data = data.drop(columns=["year", "week", "coronaImpact"])

    # Sinus und Cosinus Transformation für "hour" und "dayOfMonth"
    # data["hour_sin"] = np.sin(2 * np.pi * data["hour"] / 24)
    # data["hour_cos"] = np.cos(2 * np.pi * data["hour"] / 24)
    # data["dayOfMonth_sin"] = np.sin(2 * np.pi * data["dayOfMonth"] / 31)
    # data["dayOfMonth_cos"] = np.cos(2 * np.pi * data["dayOfMonth"] / 31)
    # data = data.drop(columns=["hour", "dayOfMonth"])

    return data

# Vorverarbeitung deiner Daten
train_data = pd.read_csv("train_data_till_2023.csv")
train_data = preprocess_data(train_data)

# Optuna-Objektiv-Funktion
def objective(trial, X_train, y_train, X_val, y_val):
    n_hidden_layers = trial.suggest_int("n_hidden_layers", 1, 5)
    hidden_layer_sizes = []
    for i in range(n_hidden_layers):
        n_neurons = trial.suggest_int(f"n_neurons_layer_{i}", 10, 200)
        hidden_layer_sizes.append(n_neurons)

    mlp = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation="relu",
        solver="adam",
        alpha=0.0001,
        batch_size="auto",
        learning_rate="constant",
        learning_rate_init=0.001,
        max_iter=1000,
        shuffle=True,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
    )

    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_val)

    # Entferne Zeilen mit y_val == 0 vor der MAPE-Berechnung
    nonzero_mask = y_val != 0
    y_val = y_val[nonzero_mask]
    y_pred = y_pred[nonzero_mask]

    mape = np.mean(np.abs((y_val - y_pred) / y_val)) * 100
    return mape  # Optuna maximiert den Wert, daher nehmen wir das negative MAPE

# Erstelle mehrere Train-Validierungs-Splits
train_data['date'] = pd.to_datetime(train_data['date'])

# Finde den Startmonat und das Startjahr
start_year = train_data['date'].dt.year.min()
start_month = train_data['date'][train_data['date'].dt.year == start_year].dt.month.min()

# Berechne den absoluten Monat für jeden Eintrag
train_data['abs_month'] = (train_data['date'].dt.year - start_year) * 12 + (train_data['date'].dt.month - start_month)

num_splits = 5
train_val_splits = []

for i in range(num_splits):
    # Da wir die ersten 19 Monate nutzen wollen, adjustiere die Grenzen entsprechend
    train_months_end = 18 + i # Der 19. Monat (0-basiert)
    val_month = 19 + i  # Der Validierungsmonat folgt auf die ersten 19 Monate

    # Filtere die Daten
    train_data_split = train_data[train_data['abs_month'] <= train_months_end]
    val_data_split = train_data[train_data['abs_month'] == val_month]

    # Entferne die Hilfsspalten für die Aufteilung
    train_data_split = train_data_split.drop(columns=["date", "abs_month"])
    val_data_split = val_data_split.drop(columns=["date", "abs_month"])

    # Teile die Daten in Features und Zielvariable
    X_train = train_data_split.drop(columns=["revenue"])
    y_train = train_data_split["revenue"]
    X_val = val_data_split.drop(columns=["revenue"])
    y_val = val_data_split["revenue"]

    train_val_splits.append((X_train, y_train, X_val, y_val))

# Führe die Optimierung mit Optuna durch
study = optuna.create_study(direction="minimize")

# Definiere, wie viele Trials pro Datensatz durchgeführt werden sollen
trials_per_dataset = 40

# Iteriere durch die Trainings-Validierungs-Splits
for X_train, y_train, X_val, y_val in train_val_splits:
    # Führe Optimierung mit der aktuellen Datenpartition durch, nutze das existierende Studienobjekt
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=trials_per_dataset, n_jobs=-1)

# Nachdem alle Optimierungen durchgeführt wurden, extrahiere die besten Parameter
best_params = study.best_params  # Nutze .best_params direkt vom Studienobjekt
best_score = study.best_value

print(best_params)

# Extrahiere die besten Hyperparameter
n_hidden_layers = best_params["n_hidden_layers"]
hidden_layer_sizes = [best_params[f"n_neurons_layer_{i}"] for i in range(n_hidden_layers)]

# Trainiere das finale Modell mit den besten Hyperparametern
mlp = MLPRegressor(
    hidden_layer_sizes=hidden_layer_sizes,
    activation="relu",
    solver="adam",
    alpha=0.0001,
    batch_size="auto",
    learning_rate="constant",
    learning_rate_init=0.001,
    max_iter=1000,
    shuffle=True,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
)

mlp.fit(X_train, y_train)

[I 2024-03-15 13:33:33,329] A new study created in memory with name: no-name-87142464-5897-43d7-977d-a7ee11302845
[I 2024-03-15 13:33:36,750] Trial 0 finished with value: 102.64726108988573 and parameters: {'n_hidden_layers': 1, 'n_neurons_layer_0': 22}. Best is trial 0 with value: 102.64726108988573.
[I 2024-03-15 13:33:54,559] Trial 5 finished with value: 88.90524122710488 and parameters: {'n_hidden_layers': 2, 'n_neurons_layer_0': 83, 'n_neurons_layer_1': 105}. Best is trial 5 with value: 88.90524122710488.
[I 2024-03-15 13:34:12,993] Trial 3 finished with value: 121.0128408858345 and parameters: {'n_hidden_layers': 4, 'n_neurons_layer_0': 159, 'n_neurons_layer_1': 71, 'n_neurons_layer_2': 143, 'n_neurons_layer_3': 23}. Best is trial 5 with value: 88.90524122710488.
[I 2024-03-15 13:34:21,331] Trial 6 finished with value: 84.83949045462683 and parameters: {'n_hidden_layers': 2, 'n_neurons_layer_0': 81, 'n_neurons_layer_1': 179}. Best is trial 6 with value: 84.83949045462683.
[I 2024

{'n_hidden_layers': 5, 'n_neurons_layer_0': 190, 'n_neurons_layer_1': 133, 'n_neurons_layer_2': 68, 'n_neurons_layer_3': 75, 'n_neurons_layer_4': 65}


In [6]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
import optuna
from lightgbm import LGBMRegressor

def preprocess_data(data):
    # Reduziere die Trainingsdaten auf die letzten 4 Jahre
    last_date = data["date"].max()
    last_date = pd.to_datetime(last_date)
    last_date = last_date - pd.DateOffset(years=4)
    data = data[pd.to_datetime(data.date) > last_date]

    # Entferne aus den Trainingsdaten die Zeilen, die um 23 Uhr sind, weil wir nur an Samstag Sonntag bis 23 Uhr offen haben, das aber unsere
    # Periodizität kaputt macht
    data = data[data["hour"] != 23]

    # Entferne die Spalten "year", "week", "coronaImpact", weil diese keinen Impact haben
    data = data.drop(columns=["year", "week", "coronaImpact"])

    # Sinus und Cosinus Transformation für "hour" und "dayOfMonth"
    data["hour_sin"] = np.sin(2 * np.pi * data["hour"] / 24)
    data["hour_cos"] = np.cos(2 * np.pi * data["hour"] / 24)
    data["dayOfMonth_sin"] = np.sin(2 * np.pi * data["dayOfMonth"] / 31)
    data["dayOfMonth_cos"] = np.cos(2 * np.pi * data["dayOfMonth"] / 31)
    data = data.drop(columns=["hour", "dayOfMonth"])

    return data

# Vorverarbeitung deiner Daten
train_data = pd.read_csv("train_data_till_2023.csv")
train_data = preprocess_data(train_data)

# Optuna-Objektiv-Funktion
def objective(trial, X_train, y_train, X_val, y_val):
    lgbm = LGBMRegressor(verbose=-1, importance_type="gain")

    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_val)

    # Entferne Zeilen mit y_val == 0 vor der MAPE-Berechnung
    nonzero_mask = y_val != 0
    y_val = y_val[nonzero_mask]
    y_pred = y_pred[nonzero_mask]

    mape = np.mean(np.abs((y_val - y_pred) / y_val)) * 100
    return -mape  # Optuna maximiert den Wert, daher nehmen wir das negative MAPE

# Erstelle mehrere Train-Validierungs-Splits
train_data['date'] = pd.to_datetime(train_data['date'])

# Finde den Startmonat und das Startjahr
start_year = train_data['date'].dt.year.min()
start_month = train_data['date'][train_data['date'].dt.year == start_year].dt.month.min()

# Berechne den absoluten Monat für jeden Eintrag
train_data['abs_month'] = (train_data['date'].dt.year - start_year) * 12 + (train_data['date'].dt.month - start_month)

num_splits = 5
train_val_splits = []

for i in range(num_splits):
    # Da wir die ersten 19 Monate nutzen wollen, adjustiere die Grenzen entsprechend
    train_months_end = 18 + i # Der 19. Monat (0-basiert)
    val_month = 19 + i  # Der Validierungsmonat folgt auf die ersten 19 Monate

    # Filtere die Daten
    train_data_split = train_data[train_data['abs_month'] <= train_months_end]
    val_data_split = train_data[train_data['abs_month'] == val_month]

    # Entferne die Hilfsspalten für die Aufteilung
    train_data_split = train_data_split.drop(columns=["date", "abs_month"])
    val_data_split = val_data_split.drop(columns=["date", "abs_month"])

    # Teile die Daten in Features und Zielvariable
    X_train = train_data_split.drop(columns=["revenue"])
    y_train = train_data_split["revenue"]
    X_val = val_data_split.drop(columns=["revenue"])
    y_val = val_data_split["revenue"]

    train_val_splits.append((X_train, y_train, X_val, y_val))

# Führe die Optimierung mit Optuna durch
best_params = None
best_score = np.inf
for X_train, y_train, X_val, y_val in train_val_splits:
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=2, n_jobs=-1)

    score = study.best_value
    if score < best_score:
        best_score = score
        best_params = study.best_trial.params

print(best_params)

# Extrahiere die besten Hyperparameter
n_hidden_layers = best_params["n_hidden_layers"]
hidden_layer_sizes = [best_params[f"n_neurons_layer_{i}"] for i in range(n_hidden_layers)]

# Trainiere das finale Modell mit den besten Hyperparametern
mlp = MLPRegressor(
    hidden_layer_sizes=hidden_layer_sizes,
    activation="relu",
    solver="adam",
    alpha=0.0001,
    batch_size="auto",
    learning_rate="constant",
    learning_rate_init=0.001,
    max_iter=1000,
    shuffle=True,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
)

mlp.fit(X_train, y_train)

[I 2024-03-15 11:53:18,062] A new study created in memory with name: no-name-88105b49-8269-481e-8045-3f67ac100733
[I 2024-03-15 11:53:18,695] Trial 1 finished with value: -82.53301220769768 and parameters: {}. Best is trial 1 with value: -82.53301220769768.
[I 2024-03-15 11:53:18,757] Trial 0 finished with value: -82.53301220769768 and parameters: {}. Best is trial 1 with value: -82.53301220769768.
[I 2024-03-15 11:53:18,758] A new study created in memory with name: no-name-03f3cf09-acfa-44de-9adb-9cebccbeeb65
[I 2024-03-15 11:53:19,352] Trial 0 finished with value: -78.37508818297174 and parameters: {}. Best is trial 0 with value: -78.37508818297174.
[I 2024-03-15 11:53:19,393] Trial 1 finished with value: -78.37508818297174 and parameters: {}. Best is trial 0 with value: -78.37508818297174.
[I 2024-03-15 11:53:19,395] A new study created in memory with name: no-name-4d28fed1-bcd8-4bac-9e10-8daec014ad08
[I 2024-03-15 11:53:19,924] Trial 0 finished with value: -74.21867471572082 and pa

{}


KeyError: 'n_hidden_layers'