In [1]:
import pandas as pd
import numpy as np

def preprocess_data(data):    
    # Entferne aus den Trainingsdaten die Zeilen, die um 23 Uhr sind, weil wir nur an Samstag Sonntag bis 23 Uhr offen haben, das aber unsere
    # Periodizität kaputt macht
    data = data[data["hour"] != 23]

    #Entferne die Spalten "year", "week", "coronaImpact", weil diese keinen Impact haben
    data = data.drop(columns=["year", "week", "coronaImpact"])

    # Sinus and Cosinus Transformation for "hour" and "dayOfMonth"
    # data["hour_sin"] = np.sin(2 * np.pi * data["hour"] / 24)
    # data["hour_cos"] = np.cos(2 * np.pi * data["hour"] / 24)
    # data["dayOfMonth_sin"] = np.sin(2 * np.pi * data["dayOfMonth"] / 31)
    # data["dayOfMonth_cos"] = np.cos(2 * np.pi * data["dayOfMonth"] / 31)
    # data = data.drop(columns=["hour", "dayOfMonth"])

    return data

def split_train_test_data(data, target_col="revenue"):
    # date in datetime umwandeln
    data["date"] = pd.to_datetime(data["date"])

    # Auswählen des Testdatensatzes (letzter Monat)
    last_month = data['date'].dt.month.max()
    test_data = data[data['date'].dt.month == last_month]
    train_data = data[data['date'].dt.month != last_month]

    # Reduziere die Trainingsdaten auf die letzten 2 Jahre
    last_date = train_data["date"].max()
    last_date = pd.to_datetime(last_date)
    last_date = last_date - pd.DateOffset(years=2)
    train_data = train_data[pd.to_datetime(train_data.date) > last_date]

    # Entferne Spalte "date"
    train_data = train_data.drop(columns=["date"])
    test_data = test_data.drop(columns=["date"])
    
    return train_data, test_data

    # x, y trennen
    # X_train = train_data.drop(columns=[target_col])
    # y_train = train_data[target_col]
    # X_test = test_data.drop(columns=[target_col])
    # y_test = test_data[target_col]

    # return X_train, y_train, X_test, y_test
    
    

In [2]:
train_data = pd.read_csv("train_data_till_2023.csv")

train_data = preprocess_data(train_data)
train_data.to_csv("train_data_till_2023_preprocessed.csv", index=False)

In [6]:
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared, DotProduct
import optuna

# Vorverarbeitung deiner Daten
X_train = np.column_stack([train_data['day_x_hour'], train_data['hour_sin'], train_data['hour_cos'], 
                           train_data['dayOfMonth_sin'], train_data['dayOfMonth_cos']])
y_train = (train_data['revenue'] - train_data['revenue'].mean()) / train_data['revenue'].std()

# Definiere die Kernelkomponenten
kernel_components = [
    RBF(length_scale=1.0, length_scale_bounds=(1e-5, 100.0)),
    WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-5, 100.0)),
    RationalQuadratic(length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-5, 100.0), alpha_bounds=(1e-5, 100.0)),
    ExpSineSquared(length_scale=1.0, periodicity=1.0, length_scale_bounds=(1e-5, 100.0), periodicity_bounds=(1e-5, 100.0)),
    DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-5, 100.0))
]

# Optuna-Objektiv-Funktion
def objective(trial):
    kernel = 1.0
    for component in kernel_components:
        if trial.suggest_categorical(f"use_{component.__class__.__name__}", [0, 1]):
            if isinstance(component, RBF):
                length_scale = trial.suggest_float(f"length_scale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(length_scale=length_scale)
            elif isinstance(component, WhiteKernel):
                noise_level = trial.suggest_float(f"noise_level_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(noise_level=noise_level)
            elif isinstance(component, RationalQuadratic):
                length_scale = trial.suggest_float(f"length_scale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                alpha = trial.suggest_float(f"alpha_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(length_scale=length_scale, alpha=alpha)
            elif isinstance(component, ExpSineSquared):
                length_scale = trial.suggest_float(f"length_scale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                periodicity = trial.suggest_float(f"periodicity_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(length_scale=length_scale, periodicity=periodicity)
            elif isinstance(component, DotProduct):
                sigma_0 = trial.suggest_float(f"sigma_0_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(sigma_0=sigma_0)
            kernel *= component
        
    gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0, normalize_y=True)
    gp.fit(X_train, y_train)
    
    X_val = np.column_stack([val_data['day_x_hour'], val_data['hour_sin'], val_data['hour_cos'],
                             val_data['dayOfMonth_sin'], val_data['dayOfMonth_cos']])
    y_val = (val_data['revenue'] - val_data['revenue'].mean()) / val_data['revenue'].std()
    
    y_pred, _ = gp.predict(X_val, return_std=False)
    score = -np.mean((y_val - y_pred) ** 2)  # Negatives durchschnittliches quadratisches Fehler
    return score

# Führe die Optimierung mit Optuna durch
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Extrahiere die besten Hyperparameter und Kernelzusammensetzung
best_params = study.best_trial.params
kernel = 1.0
for component in kernel_components:
    if best_params.get(f"use_{component.__class__.__name__}", 0):
        component_params = {param: best_params[param] for param in best_params if param.startswith(f"{component.__class__.__name__}")}
        component.set_params(**component_params)
        kernel *= component

# Trainiere das endgültige Modell mit den besten Hyperparametern
gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0, normalize_y=True)
gp.fit(X_train, y_train)

# Mache Vorhersagen auf Testdaten
X_test = np.column_stack([test_data['day_x_hour'], test_data['hour_sin'], test_data['hour_cos'],
                          test_data['dayOfMonth_sin'], test_data['dayOfMonth_cos']])
y_pred, _ = gp.predict(X_test, return_std=False)
y_pred = y_pred * test_data['revenue'].std() + test_data['revenue'].mean()

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-09 21:55:30,972] A new study created in memory with name: no-name-eb9166c0-8113-40a3-bac7-255db6fb7ba9
[W 2024-03-09 21:59:05,596] Trial 0 failed with parameters: {'use_RBF': 1, 'length_scale_RBF': 0.39609252380096555, 'use_WhiteKernel': 1, 'noise_level_WhiteKernel': 13.527684959285995, 'use_RationalQuadratic': 1, 'length_scale_RationalQuadratic': 0.0023366099412723555, 'alpha_RationalQuadratic': 0.060521280345881, 'use_ExpSineSquared': 1, 'length_scale_ExpSineSquared': 0.5116120480590125, 'periodicity_ExpSineSquared': 0.00015876594548668886, 'use_DotProduct': 0} because of the following error: NameError("name 'val_data' is not defined").
Traceback (most recent call last):
  File "/home/tc/vscode/BA_XAI/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_495460/1667466441.py", line 47, in objective
    X_val = np.column_stack([val_data

NameError: name 'val_data' is not defined

In [3]:
import GPy
from scipy.stats import norm, gamma

# Definiere die Kernel-Komposition
kernel = GPy.kern.Sum(
    GPy.kern.Periodic(1, period=10, lengthscale=1.0, lower=1e-5, upper=100.0),  # Täglicher Zyklus (10 Stunden)
    GPy.kern.Periodic(1, period=70, lengthscale=1.0, lower=1e-5, upper=100.0), # Wöchentlicher Zyklus (10 Stunden * 7 Tage)
    GPy.kern.Linear(),  
    GPy.kern.RBF(1, lengthscale=1.0, lower=1e-5, upper=100.0),
    GPy.kern.Spectral1D(1, lengthscale=1.0, lower=1e-5, upper=100.0), 
    GPy.kern.Spectral1D(1, lengthscale=1.0, lower=1e-5, upper=100.0)
)

num_timeseries = len(train_data) // 100 * 3 # Verwende z.B. 3% der Daten für das hierarchische Modell
with pm.Model() as hier_model:
    # Hyperpriors
    nu_s = pm.Normal('nu_s', mu=0, sigma=5)
    nu_r = pm.Normal('nu_r', mu=0, sigma=5)
    nu_p1 = pm.Normal('nu_p1', mu=0, sigma=5)
    nu_p2 = pm.Normal('nu_p2', mu=0, sigma=5)
    nu_m1 = pm.Normal('nu_m1', mu=-1.5, sigma=5)
    nu_m2 = pm.Normal('nu_m2', mu=0, sigma=5)
    lambda_s = pm.Gamma('lambda_s', alpha=1, beta=1)
    lambda_l = pm.Gamma('lambda_l', alpha=1, beta=1)

    # Hierarchical priors
    sigma_s = pm.LogNormal('sigma_s', nu_s, lambda_s, shape=num_timeseries)
    l_rbf = pm.LogNormal('l_rbf', nu_r, lambda_l, shape=num_timeseries)
    l_per1 = pm.LogNormal('l_per1', nu_p1, lambda_l, shape=num_timeseries)
    l_per2 = pm.LogNormal('l_per2', nu_p2, lambda_l, shape=num_timeseries)
    l_sm1 = pm.LogNormal('l_sm1', nu_m1, lambda_l, shape=num_timeseries)
    l_sm2 = pm.LogNormal('l_sm2', nu_m2, lambda_l, shape=num_timeseries)

    # Likelihood (Beispiel für eine Normalverteilung)
    y_obs = pm.Normal('y_obs', mu=0, sigma=sigma_s, observed=y_train)

    # Inferenz
    trace = pm.sample(1000, chains=2, cores=2)

# Extrahiere die a-posteriori Verteilungen der Hyperparameter
nu_s_post = trace['nu_s'].mean()
nu_r_post = trace['nu_r'].mean()
nu_p1_post = trace['nu_p1'].mean()
nu_p2_post = trace['nu_p2'].mean()
nu_m1_post = trace['nu_m1'].mean()
nu_m2_post = trace['nu_m2'].mean()
lambda_s_post = trace['lambda_s'].mean()
lambda_l_post = trace['lambda_l'].mean()

print(nu_s_post, nu_r_post, nu_p1_post, nu_p2_post, nu_m1_post, nu_m2_post, lambda_s_post, lambda_l_post)

# Erstelle das GP-Modell mit den Priors
gp = GPy.models.GPRegression(X_train, y_train, kernel=kernel)
gp.kern.rbf.lengthscale.unconstrain()
gp.kern.periodic.lengthscales.unconstrain()
gp.kern.spectral1d_1.lengthscale.unconstrain()
gp.kern.spectral1d_2.lengthscale.unconstrain()

# Setze die Priors mit den geschätzten Werten
gp.kern.rbf.lengthscale.set_prior(norm(nu_r_post, lambda_l_post))
gp.kern.periodic.lengthscales.set_prior(norm([nu_p1_post, nu_p2_post], lambda_l_post))
gp.kern.spectral1d_1.lengthscale .set_prior(norm(nu_m1_post, lambda_l_post))
gp.kern.spectral1d_2.lengthscale.set_prior(norm(nu_m2_post, lambda_l_post))

gp.kern.rbf.variance.set_prior(gamma(shape=lambda_s_post, scale=1/nu_s_post))
gp.kern.periodic.variance.set_prior(gamma(shape=lambda_s_post, scale=1/nu_s_post))
gp.kern.linear.variances.set_prior(gamma(shape=lambda_s_post, scale=1/nu_s_post))
gp.kern.spectral1d_1.variance.set_prior(gamma(shape=lambda_s_post, scale=1/nu_s_post))
gp.kern.spectral1d_2.variance.set_prior(gamma(shape=lambda_s_post, scale=1/nu_s_post))

# Trainiere das GP-Modell
gp.optimize()

# Mache Vorhersagen
y_pred, y_var = gp.predict(X_test)

ModuleNotFoundError: No module named 'GPy'

In [11]:
import pickle

# Speichere das Kernel-Objekt
with open('optimized_kernel.pkl', 'wb') as file:
    pickle.dump(gp.kern, file)


print("RBF Kernel:")
print(f"Lengthscale: {gp.kern.rbf.lengthscale.values}")
print(f"Variance: {gp.kern.rbf.variance.values}")

print("Periodischer Kernel 1 (täglich):")
print(f"Periode: {gp.kern.periodic.period}")  
print(f"Lengthscale: {gp.kern.periodic.lengthscale.values}")
print(f"Variance: {gp.kern.periodic.variance.values}")
print("Periodischer Kernel 2 (wöchentlich):")
print(f"Periode: {gp.kern.periodic.period}")
print(f"Lengthscale: {gp.kern.periodic.lengthscale.values}")
print(f"Variance: {gp.kern.periodic.variance.values}")
print("Linear Kernel:")
print(f"Variances: {gp.kern.linear.variances.values}")
print("Spectral Kernel 1:")
print(f"Lengthscale: {gp.kern.spectral1d_1.lengthscale.values}")
print(f"Variance: {gp.kern.spectral1d_1.variance.values}")
print("Spectral Kernel 2:")
print(f"Lengthscale: {gp.kern.spectral1d_2.lengthscale.values}")


# Laden Sie es später wieder ein
# with open('optimized_kernel.pkl', 'rb') as file:
#     optimized_kernel = pickle.load(file)

# # Verwenden Sie das geladene Kernel in einem neuen GPy-Modell
# new_gp = GPy.models.GPRegression(X_train, y_train, kernel=optimized_kernel)



NameError: name 'gp' is not defined

In [1]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared, DotProduct
import optuna

def preprocess_data(data):
    # Reduziere die Trainingsdaten auf die letzten 4 Jahre
    last_date = data["date"].max()
    last_date = pd.to_datetime(last_date)
    last_date = last_date - pd.DateOffset(years=4)
    data = data[pd.to_datetime(data.date) > last_date]

    
    # Entferne aus den Trainingsdaten die Zeilen, die um 23 Uhr sind, weil wir nur an Samstag Sonntag bis 23 Uhr offen haben, das aber unsere
    # Periodizität kaputt macht
    data = data[data["hour"] != 23]

    #Entferne die Spalten "year", "week", "coronaImpact", weil diese keinen Impact haben
    data = data.drop(columns=["year", "week", "coronaImpact"])

    # Sinus and Cosinus Transformation for "hour" and "dayOfMonth"
    data["hour_sin"] = np.sin(2 * np.pi * data["hour"] / 24)
    data["hour_cos"] = np.cos(2 * np.pi * data["hour"] / 24)
    data["dayOfMonth_sin"] = np.sin(2 * np.pi * data["dayOfMonth"] / 31)
    data["dayOfMonth_cos"] = np.cos(2 * np.pi * data["dayOfMonth"] / 31)
    data = data.drop(columns=["hour", "dayOfMonth"])

    return data

# Vorverarbeitung deiner Daten
train_data = pd.read_csv("train_data_till_2023.csv")
train_data = preprocess_data(train_data)

# Definiere die Kernelkomponenten
kernel_components = [
    # RBF(length_scale=1.0, length_scale_bounds=(1e-5, 100.0)),
    WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-5, 100.0)),
    RationalQuadratic(length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-5, 100.0), alpha_bounds=(1e-5, 100.0)),
    ExpSineSquared(length_scale=1.0, periodicity=10.0, length_scale_bounds=(1e-5, 100.0), periodicity_bounds=(1e-5, 100.0)),
    ExpSineSquared(length_scale=1.0, periodicity=70.0, length_scale_bounds=(1e-5, 100.0), periodicity_bounds=(1e-5, 100.0)),
    DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-5, 100.0))
]

# Optuna-Objektiv-Funktion
def objective(trial, X_train, y_train, X_val, y_val):
    kernel = 1.0
    for component in kernel_components:
        if trial.suggest_categorical(f"use_{component.__class__.__name__}", [0, 1]):
            if isinstance(component, RBF):
                length_scale = trial.suggest_float(f"length_scale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(length_scale=length_scale)
            elif isinstance(component, WhiteKernel):
                noise_level = trial.suggest_float(f"noise_level_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(noise_level=noise_level)
            elif isinstance(component, RationalQuadratic):
                length_scale = trial.suggest_float(f"length_scale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                alpha = trial.suggest_float(f"alpha_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(length_scale=length_scale, alpha=alpha)
            elif isinstance(component, ExpSineSquared):
                length_scale = trial.suggest_float(f"length_scale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                periodicity = component.periodicity  # Keine Optimierung der Periodizität
                component.set_params(length_scale=length_scale)
            elif isinstance(component, DotProduct):
                sigma_0 = trial.suggest_float(f"sigma_0_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.set_params(sigma_0=sigma_0)
            kernel *= component
        
    gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
    gp.fit(X_train, y_train)
    
    y_pred = gp.predict(X_val, return_std=False)
    score = -np.mean((y_val - y_pred) ** 2)  # Negatives durchschnittliches quadratisches Fehler
    return score

# Erstelle mehrere Train-Validierungs-Splits
train_data['date'] = pd.to_datetime(train_data['date'])

# Finde den Startmonat und das Startjahr
start_year = train_data['date'].dt.year.min()
start_month = train_data['date'][train_data['date'].dt.year == start_year].dt.month.min()

# Berechne den absoluten Monat für jeden Eintrag
train_data['abs_month'] = (train_data['date'].dt.year - start_year) * 12 + (train_data['date'].dt.month - start_month)

num_splits = 5
train_val_splits = []

for i in range(num_splits):
    # Da wir die ersten 19 Monate nutzen wollen, adjustiere die Grenzen entsprechend
    train_months_end = 18 + i # Der 19. Monat (0-basiert)
    val_month = 19 + i  # Der Validierungsmonat folgt auf die ersten 19 Monate

    # Filtere die Daten
    train_data_split = train_data[train_data['abs_month'] <= train_months_end]
    val_data_split = train_data[train_data['abs_month'] == val_month]

    # Entferne die Hilfsspalten für die Aufteilung
    train_data_split = train_data_split.drop(columns=["date", "abs_month"])
    val_data_split = val_data_split.drop(columns=["date", "abs_month"])

    # Teile die Daten in Features und Zielvariable
    X_train = train_data_split.drop(columns=["revenue"])
    y_train = train_data_split["revenue"]
    X_val = val_data_split.drop(columns=["revenue"])
    y_val = val_data_split["revenue"]

    train_val_splits.append((X_train, y_train, X_val, y_val))

# Führe die Optimierung mit Optuna durch
best_params = None
best_score = -np.inf
for X_train, y_train, X_val, y_val in train_val_splits:
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=100, n_jobs=1)
    
    score = study.best_value
    if score > best_score:
        best_score = score
        best_params = study.best_trial.params

# Extrahiere die besten Hyperparameter und Kernelzusammensetzung
kernel = 1.0
for component in kernel_components:
    if best_params.get(f"use_{component.__class__.__name__}", 0):
        component_params = {param: best_params[param] for param in best_params if param.startswith(f"{component.__class__.__name__}")}
        component.set_params(**component_params)
        kernel *= component




  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-10 17:43:16,963] A new study created in memory with name: no-name-7d279b24-f2c5-46be-b9b0-ac05c7b70e31
[I 2024-03-10 17:46:45,378] Trial 0 finished with value: -434177.04080032464 and parameters: {'use_WhiteKernel': 1, 'noise_level_WhiteKernel': 0.0014986089319315565, 'use_RationalQuadratic': 1, 'length_scale_RationalQuadratic': 1.4640211525479963e-05, 'alpha_RationalQuadratic': 8.236409913565002, 'use_ExpSineSquared': 0, 'use_DotProduct': 0}. Best is trial 0 with value: -434177.04080032464.
[W 2024-03-10 17:46:57,260] Trial 1 failed with parameters: {'use_WhiteKernel': 0, 'use_RationalQuadratic': 0, 'use_ExpSineSquared': 1, 'length_scale_ExpSineSquared': 0.025046943972269153, 'use_DotProduct': 0} because of the following error: LinAlgError("The kernel, 1**2 * ExpSineSquared(length_scale=0.025, periodicity=10) * ExpSineSquared(length_scale=0.025, periodicity=70), is not returning a positive definite matrix. Try gradually incr

LinAlgError: ("The kernel, 1**2 * ExpSineSquared(length_scale=0.025, periodicity=10) * ExpSineSquared(length_scale=0.025, periodicity=70), is not returning a positive definite matrix. Try gradually increasing the 'alpha' parameter of your GaussianProcessRegressor estimator.", '130-th leading minor of the array is not positive definite')

In [None]:
# gebe die besten Hyperparameter und Kernelzusammensetzung aus
print("Beste Hyperparameter:")
print(best_params)
print("Beste Kernelzusammensetzung:")
print(kernel)

In [None]:

# Trainiere das endgültige Modell mit den besten Hyperparametern
# X_train = np.column_stack([train_data['day_x_hour'], train_data['hour_sin'], train_data['hour_cos'],
#                            train_data['dayOfMonth_sin'], train_data['dayOfMonth_cos']])
# y_train = (train_data['revenue'] - train_data['revenue'].mean()) / train_data['revenue'].std()

# gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0, normalize_y=True)
# gp.fit(X_train, y_train)

# # Mache Vorhersagen auf Testdaten
# test_data = train_data[train_data['date'].dt.month == train_data['date'].dt.month.max()]
# X_test = np.column_stack([test_data['day_x_hour'], test_data['hour_sin'], test_data['hour_cos'],
#                           test_data['dayOfMonth_sin'], test_data['dayOfMonth_cos']])
# y_pred, _ = gp.predict(X_test, return_std=False)
# y_pred = y_pred * test_data['revenue'].std() + test_data['revenue'].mean()

In [1]:
import pandas as pd
import numpy as np
import optuna
import GPy

def preprocess_data(data):
    # Reduziere die Trainingsdaten auf die letzten 4 Jahre
    last_date = data["date"].max()
    last_date = pd.to_datetime(last_date)
    last_date = last_date - pd.DateOffset(months=25)
    data = data[pd.to_datetime(data.date) > last_date]

    
    # Entferne aus den Trainingsdaten die Zeilen, die um 23 Uhr sind, weil wir nur an Samstag Sonntag bis 23 Uhr offen haben, das aber unsere
    # Periodizität kaputt macht
    data = data[data["hour"] != 23]

    #Entferne die Spalten "year", "week", "coronaImpact", weil diese keinen Impact haben
    data = data.drop(columns=["year", "week", "coronaImpact"])

    # Sinus and Cosinus Transformation for "hour" and "dayOfMonth"
    data["hour_sin"] = np.sin(2 * np.pi * data["hour"] / 24)
    data["hour_cos"] = np.cos(2 * np.pi * data["hour"] / 24)
    data["dayOfMonth_sin"] = np.sin(2 * np.pi * data["dayOfMonth"] / 31)
    data["dayOfMonth_cos"] = np.cos(2 * np.pi * data["dayOfMonth"] / 31)
    data = data.drop(columns=["hour", "dayOfMonth"])

    return data

# Vorverarbeitung deiner Daten
train_data = pd.read_csv("train_data_till_2023.csv")
train_data = preprocess_data(train_data)

# Definiere die Kernelkomponenten
kernel_components = [
    GPy.kern.StdPeriodic(1, period=10, lengthscale=1.0, variance=1.0), # Täglicher Zyklus (10 Stunden)
    GPy.kern.StdPeriodic(1, period=70, lengthscale=1.0, variance=1.0), # Wöchentlicher Zyklus (10 Stunden * 7 Tage)
    GPy.kern.Linear(1),
    GPy.kern.RBF(1, lengthscale=1.0),
]

# Optuna-Objektiv-Funktion
def objective(trial, X_train, y_train, X_val, y_val):
    kernel_list = []
    for component in kernel_components:
        if trial.suggest_categorical(f"use_{component.__class__.__name__}", [0, 1]):
            if isinstance(component, GPy.kern.StdPeriodic):
                lengthscale = trial.suggest_float(f"lengthscale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.lengthscale = lengthscale
            elif isinstance(component, GPy.kern.RBF):
                lengthscale = trial.suggest_float(f"lengthscale_{component.__class__.__name__}", 1e-5, 100.0, log=True)
                component.lengthscale = lengthscale
            kernel_list.append(component)
    
    print("end=", kernel_list)
    kernel = kernel_list[0]
    for k in kernel_list[1:]:
        kernel += k

    gp = GPy.models.GPRegression(X_train, y_train, kernel=kernel)
    gp.optimize()
    
    y_pred, y_var = gp.predict(X_val)
    score = -np.mean((y_val - y_pred) ** 2)  # Negatives durchschnittliches quadratisches Fehler
    return score


# Erstelle mehrere Train-Validierungs-Splits
train_data['date'] = pd.to_datetime(train_data['date'])

# Finde den Startmonat und das Startjahr
start_year = train_data['date'].dt.year.min()
start_month = train_data['date'][train_data['date'].dt.year == start_year].dt.month.min()

# Berechne den absoluten Monat für jeden Eintrag
train_data['abs_month'] = (train_data['date'].dt.year - start_year) * 12 + (train_data['date'].dt.month - start_month)

num_splits = 5
train_val_splits = []

for i in range(num_splits):
    # Da wir die ersten 19 Monate nutzen wollen, adjustiere die Grenzen entsprechend
    train_months_end = 18 + i # Der 19. Monat (0-basiert)
    val_month = 19 + i  # Der Validierungsmonat folgt auf die ersten 19 Monate

    # Filtere die Daten
    train_data_split = train_data[train_data['abs_month'] <= train_months_end]
    val_data_split = train_data[train_data['abs_month'] == val_month]

    # Entferne die Hilfsspalten für die Aufteilung
    train_data_split = train_data_split.drop(columns=["date", "abs_month"])
    val_data_split = val_data_split.drop(columns=["date", "abs_month"])

    # Teile die Daten in Features und Zielvariable
    X_train = train_data_split.drop(columns=["revenue"])
    y_train = train_data_split["revenue"]
    X_val = val_data_split.drop(columns=["revenue"])
    y_val = val_data_split["revenue"]

    train_val_splits.append((X_train, y_train, X_val, y_val))

# Führe die Optimierung mit Optuna durch
best_params = None
best_score = -np.inf
for X_train, y_train, X_val, y_val in train_val_splits:
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=100, n_jobs=1)
    
    score = study.best_value
    if score > best_score:
        best_score = score
        best_params = study.best_trial.params

# Extrahiere die besten Hyperparameter und Kernelzusammensetzung
# kernel_list = []
# for component in kernel_components:
#     if best_params.get(f"use_{component.__class__.__name__}", 0):
#         component_params = {param: best_params[param] for param in best_params if param.startswith(f"{component.__class__.__name__}")}
#         component.set_params(**component_params)
#         kernel_list.append(component)

# kernel = GPy.kern.Sum(*kernel_list)

# # Trainiere das GP-Modell mit dem optimierten Kernel
# gp = GPy.models.GPRegression(X_train, y_train, kernel=kernel)
# gp.optimize()

# # Mache Vorhersagen
# y_pred, y_var = gp.predict(X_test)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-10 18:54:58,017] A new study created in memory with name: no-name-52dfaa71-84f7-4f60-ae09-b443489a3f9d
[W 2024-03-10 18:54:58,024] Trial 0 failed with parameters: {'use_StdPeriodic': 0, 'use_Linear': 1, 'use_RBF': 1, 'lengthscale_RBF': 1.2915675217069649} because of the following error: AssertionError().
Traceback (most recent call last):
  File "/home/tc/vscode/BA_XAI/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_70759/3252712527.py", line 107, in <lambda>
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=100, n_jobs=1)
  File "/tmp/ipykernel_70759/3252712527.py", line 60, in objective
    gp = GPy.models.GPRegression(X_train, y_train, kernel=kernel)
  File "/home/tc/vscode/BA_XAI/.venv/lib/python3.10/site-packages/paramz/parameterized.py", line 53, in __call__
    self = super(Parameter

end= [<GPy.kern.src.linear.Linear object at 0x7fae0c4a01f0>, <GPy.kern.src.rbf.RBF object at 0x7fadc2d2f4c0>]


AssertionError: 