In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
import xarray as xr
import plotly.express as px
import statsmodels.formula.api as smf
from tqdm import tqdm
from datetime import datetime
from tensorflow.keras.layers import LSTM, Dropout, Dense, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from keras.layers import Input
import os
from itertools import product

In [2]:
# Funktion zur Vorbereitung von CSV-Daten (Konvertierung der 'dtm' Spalte in datetime-Format)
def pre_csv(df):
    df.dtm = pd.to_datetime(df.dtm)  # Konvertiert die 'dtm' Spalte in das Datumsformat
    return df

# Funktion zur Vorbereitung von DWD-Daten (Konvertierung und Bereinigung)
def pre_dwd(df):
    df = df.to_dataframe().reset_index().rename(columns={"ref_datetime": "reference_time", "valid_datetime": "valid_time"})
    df.reference_time = df.reference_time.dt.tz_localize("UTC")  # Lokalisierung der 'reference_time' auf UTC
    df.valid_time = df.reference_time + df.valid_time * pd.Timedelta(1, "h")  # Berechnung von 'valid_time' basierend auf Stunden
    return df

# Platzhalter für zukünftige NCEP-Datenvorbereitung
def pre_ncep(df):
    return df

# Öffnen und Vorbereiten mehrerer DWD-Datensätze (PES)
df_pes_0 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_pes10_20200920_20231027.nc"))
df_pes_1 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_pes10_20231027_20240108.nc"))
df_pes_2 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_pes10_20240108_20240129.nc"))
df_pes_3 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_pes10_20240129_20240519.nc"))

# Zusammenführen, Sortieren und Bereinigen der PES-Daten
df_pes = pd.concat([df_pes_0, df_pes_1, df_pes_2, df_pes_3]).sort_values(["reference_time", "valid_time"]).reset_index(drop=True)
del df_pes_0, df_pes_1, df_pes_2, df_pes_3

df_pes = df_pes.groupby(["reference_time", "valid_time"]).mean().reset_index().drop(columns=["point", "longitude", "latitude"])

# Interpolation auf 30-Minuten-Intervalle
df_pes = df_pes.set_index("valid_time").groupby(["reference_time"]).resample("30min").interpolate("linear").drop(columns="reference_time").reset_index()

# Gleicher Prozess für Hornsea-Daten
df_hornsea_0 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_hornsea_1_20200920_20231027.nc"))
df_hornsea_1 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_hornsea_1_20231027_20240108.nc"))
df_hornsea_2 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_hornsea_1_20240108_20240129.nc"))
df_hornsea_3 = pre_dwd(xr.open_dataset("data/dwd_icon_eu_hornsea_1_20240129_20240519.nc"))

df_hornsea = pd.concat([df_hornsea_0, df_hornsea_1, df_hornsea_2, df_hornsea_3]).sort_values(["reference_time", "valid_time"]).reset_index(drop=True)
del df_hornsea_0, df_hornsea_1, df_hornsea_2, df_hornsea_3

df_hornsea = df_hornsea.groupby(["reference_time", "valid_time"]).mean().reset_index().drop(columns=["longitude", "latitude"])

df_hornsea = df_hornsea.set_index("valid_time").groupby(["reference_time"]).resample("30min").interpolate("linear").drop(columns="reference_time").reset_index()

# Einlesen und Vorbereiten von CSV-Daten
df_0 = pre_csv(pd.read_csv("data/Energy_Data_20200920_20240118.csv"))
df_1 = pre_csv(pd.read_csv("data/Energy_Data_20240119_20240519.csv"))

# Zusammenführen und Sortieren der CSV-Daten
df = pd.concat([df_0, df_1]).sort_values(["dtm"]).reset_index(drop=True)
del df_0, df_1

# Berechnen von Wind- und Solar-MWh-Krediten
df["Wind_MWh_credit"] = 0.5 * df["Wind_MW"] - df["boa_MWh"]
df["Solar_MWh_credit"] = 0.5 * df["Solar_MW"]

# Zusammenführen der PES- und Hornsea-Daten
df_full = pd.merge(df_pes, df_hornsea, on=["reference_time", "valid_time"])

# Zusammenführen mit CSV-Daten basierend auf der 'valid_time'
df_full = df_full.merge(df[["dtm", "Wind_MWh_credit", "Solar_MWh_credit"]], left_on="valid_time", right_on="dtm", how="left")

# Berechnung zusätzlicher Spalten
df_full["forcast_hours"] = (df_full.valid_time - df_full.reference_time) / pd.Timedelta(1, "h")
df_full["year"] = df_full.valid_time.dt.year
df_full["month"] = df_full.valid_time.dt.month
df_full["day"] = df_full.valid_time.dt.day
df_full["hour"] = df_full.valid_time.dt.hour

# Berechnung der Gesamterzeugung (MWh)
df_full["total_generation_MWh"] = df_full["Wind_MWh_credit"] + df_full["Solar_MWh_credit"]

df_train = df_full.loc[df_full.reference_time < "2023-05-20"]
df_test = df_full.loc[df_full.reference_time >= "2023-05-20"]


In [7]:
df_test.shape

(351137, 20)

In [10]:
def pinball(y, q, alpha):
    return (y - q) * (alpha - (y < q))

def pinball_score(df, model_idx):
    score = []
    for qu in range(10, 100, 10):
        y_true = df['total_generation_MWh']
        y_pred = df[f'Model_{model_idx}_q{qu}']
        score.append(pinball(y_true, y_pred, qu / 100).mean())
    return score

# Annahme: df_full ist bereits geladen und enthält die Spalte 'reference_time'
df_train = df_full.loc[df_full.reference_time < "2023-05-20"]
df_test = df_full.loc[df_full.reference_time >= "2023-05-20"]

data_sizes = [299008]#, 200000, 300000]  # Hier kannst du weitere Größen hinzufügen
regularizers_faktor = [0.4,  0.1, ]
dropout_rates_1 = [0.3, ]
dropout_rates_2 = [0.1, ]
epochs_options = [5, ]
batch_sizes = [2048]  #1024
sequence_lengths = [12]
lstm_units_1 = [64]  
lstm_units_2 = [32] 

current_time = datetime.datetime.now().strftime("%d_%H-%M-%S")
results_file_name = f"data/results_{current_time}.csv"

# Sicherstellen, dass das Verzeichnis existiert
os.makedirs(os.path.dirname(results_file_name), exist_ok=True)

with open(results_file_name, 'w') as f:
    f.write('Epochs,Batch_Size,Sequence_Length,Dropout_Rate_1,Dropout_Rate_2,Regularizers_Faktor,LSTM_Units_1,LSTM_Units_2,Data_Size,Pinball_Score_Train,Pinball_Score_Test\n')

for data_size in data_sizes:
    df_train_subset = df_train.head(data_size)
    df_test_subset = df_test.head(data_size)

    features = ['CloudCover', 'SolarDownwardRadiation', 'Temperature_x',
                'RelativeHumidity', 'Temperature_y', 'WindDirection',
                'WindSpeed', 'forcast_hours', 'year', 'month', 'day', 'hour']
    target = 'total_generation_MWh'

    scaler = MinMaxScaler()
    scaled_train_data = scaler.fit_transform(df_train_subset[features + [target]])

    # Sequenzen für Trainingsdaten erstellen
    X_train, y_train = [], []
    for i in range(sequence_lengths[0], len(scaled_train_data)):
        X_train.append(scaled_train_data[i-sequence_lengths[0]:i, :-1])
        y_train.append(scaled_train_data[i, -1])
    X_train, y_train = np.array(X_train), np.array(y_train)

    scaled_test_data = scaler.transform(df_test_subset[features + [target]])

    # Sequenzen für Testdaten erstellen
    X_test, y_test = [], []
    for i in range(sequence_lengths[0], len(scaled_test_data)):
        X_test.append(scaled_test_data[i-sequence_lengths[0]:i, :-1])
        y_test.append(scaled_test_data[i, -1])
    X_test, y_test = np.array(X_test), np.array(y_test)

    # Parameterkombinationen definieren, einschließlich der LSTM-Einheiten
    
    param_combinations = list(product(
        epochs_options,
        batch_sizes,
        dropout_rates_1,
        dropout_rates_2,
        regularizers_faktor,
        lstm_units_1,
        lstm_units_2  
    ))

    for epochs, batch_size, dropout_rate_1, dropout_rate_2, reg_factor, units_1, units_2 in param_combinations:
        print(f"Trainiere Modell mit {units_1} {units_2} LSTM-Einheiten, Epochs: {epochs}, Batch Size: {batch_size}, "
            f"regularizers_faktor: {reg_factor}, Dropout Rate 1: {dropout_rate_1}, Dropout Rate 2: {dropout_rate_2}")

        model = Sequential()
        # Hier fügen wir den Input-Layer hinzu
        model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))
        
        model.add(LSTM(units_1, return_sequences=True,
                    kernel_regularizer=tf.keras.regularizers.l2(reg_factor)))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate_1))
        model.add(LSTM(units_2, return_sequences=False, kernel_regularizer=tf.keras.regularizers.l2(reg_factor)))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate_2))
        model.add(Dense(1))

        if tf.config.list_physical_devices('GPU'):
            print("GPU gefunden. Das Modell wird auf der GPU trainiert.")
        else:
            print("Keine GPU gefunden. Das Modell wird auf der CPU trainiert.")

        model.compile(optimizer='adam', loss='mean_squared_error')
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2,
                  callbacks=[early_stopping], verbose=0)

        df_train_subset['Model_1_q50'] = np.nan
        train_predictions = model.predict(X_train)
        df_train_subset.loc[df_train_subset.index[-len(train_predictions):], 'Model_1_q50'] = train_predictions.flatten()

        for qu in range(10, 100, 10):
            df_train_subset[f"Model_1_q{qu}"] = df_train_subset['Model_1_q50'] * (qu / 100)

        train_score = pinball_score(df_train_subset, 1)
        overall_pinball_score_train = sum(train_score) / len(train_score)

        df_test_subset['Model_1_q50'] = np.nan
        test_predictions = model.predict(X_test)
        df_test_subset.loc[df_test_subset.index[-len(test_predictions):], 'Model_1_q50'] = test_predictions.flatten()

        for qu in range(10, 100, 10):
            df_test_subset[f"Model_1_q{qu}"] = df_test_subset['Model_1_q50'] * (qu / 100)

        test_score = pinball_score(df_test_subset, 1)
        overall_pinball_score_test = sum(test_score) / len(test_score)

        # Ergebnisse in die CSV schreiben, einschließlich der LSTM-Einheiten
        with open(results_file_name, 'a') as f:
            f.write(f"{epochs},{batch_size},{sequence_lengths[0]},{dropout_rate_1},{dropout_rate_2},{reg_factor},{units_1},{units_2},{data_size},{overall_pinball_score_train},{overall_pinball_score_test}\n")

        results_df = pd.read_csv(results_file_name)
        best_result = results_df.loc[results_df['Pinball_Score_Test'].idxmin()]
        print(f"Beste Kombination: Epochs: {best_result['Epochs']}, Batch Size: {best_result['Batch_Size']}, "
              f"regularizers_faktor: {best_result['Regularizers_Faktor']}, LSTM Units 1: {best_result['LSTM_Units_1']},LSTM Units 2: {best_result['LSTM_Units_2']}, "
              f"Dropout Rate 1: {best_result['Dropout_Rate_1']}, Dropout Rate 2: {best_result['Dropout_Rate_2']}, "
              f"Pinball Score Test: {best_result['Pinball_Score_Test']}")


Trainiere Modell mit 64 32 LSTM-Einheiten, Epochs: 5, Batch Size: 2048, regularizers_faktor: 0.4, Dropout Rate 1: 0.3, Dropout Rate 2: 0.1
Keine GPU gefunden. Das Modell wird auf der CPU trainiert.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_subset['Model_1_q50'] = np.nan


[1m9344/9344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_subset[f"Model_1_q{qu}"] = df_train_subset['Model_1_q50'] * (qu / 100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_subset[f"Model_1_q{qu}"] = df_train_subset['Model_1_q50'] * (qu / 100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_subset[f"Model_1_q{qu}"] = df_train_

[1m9344/9344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_subset[f"Model_1_q{qu}"] = df_test_subset['Model_1_q50'] * (qu / 100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_subset[f"Model_1_q{qu}"] = df_test_subset['Model_1_q50'] * (qu / 100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_subset[f"Model_1_q{qu}"] = df_test_subset

Beste Kombination: Epochs: 5.0, Batch Size: 2048.0, regularizers_faktor: 0.4, LSTM Units 1: 64.0,LSTM Units 2: 32.0, Dropout Rate 1: 0.3, Dropout Rate 2: 0.1, Pinball Score Test: 190.5070803122052
Trainiere Modell mit 64 32 LSTM-Einheiten, Epochs: 5, Batch Size: 2048, regularizers_faktor: 0.1, Dropout Rate 1: 0.3, Dropout Rate 2: 0.1
Keine GPU gefunden. Das Modell wird auf der CPU trainiert.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_subset['Model_1_q50'] = np.nan


[1m9344/9344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_subset[f"Model_1_q{qu}"] = df_train_subset['Model_1_q50'] * (qu / 100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_subset['Model_1_q50'] = np.nan


[1m9344/9344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step
Beste Kombination: Epochs: 5.0, Batch Size: 2048.0, regularizers_faktor: 0.1, LSTM Units 1: 64.0,LSTM Units 2: 32.0, Dropout Rate 1: 0.3, Dropout Rate 2: 0.1, Pinball Score Test: 190.50482848337572


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_subset[f"Model_1_q{qu}"] = df_test_subset['Model_1_q50'] * (qu / 100)
