# LSTM Modell erstellen

In [1]:
from statsmodels.tsa.arima.model import ARIMA
import pandas as pd
import numpy as np

In [2]:
# Daten laden
data = pd.read_csv('cleaned_data.csv', parse_dates=['timestamp'])

In [3]:
data

Unnamed: 0,box_id,sensor_id,timestamp,value,sensor_title,sensor_unit,hour,weekday
0,5a8c3d36bc2d4100190c49fb,5a8c3d36bc2d4100190c49ff,2022-06-22 13:54:28.029,2.22,PM10,µg/m³,13,2
1,5a8c3d36bc2d4100190c49fb,5a8c3d36bc2d4100190c49ff,2022-06-22 13:57:20.241,1.90,PM10,µg/m³,13,2
2,5a8c3d36bc2d4100190c49fb,5a8c3d36bc2d4100190c49ff,2022-06-22 14:00:13.641,2.35,PM10,µg/m³,14,2
3,5a8c3d36bc2d4100190c49fb,5a8c3d36bc2d4100190c49fe,2022-06-22 14:00:13.641,1.73,PM2.5,µg/m³,14,2
4,5a8c3d36bc2d4100190c49fb,5a8c3d36bc2d4100190c49ff,2022-06-22 14:03:15.741,1.88,PM10,µg/m³,14,2
...,...,...,...,...,...,...,...,...
3195350,5984c712e3b1fa0010691509,5984c712e3b1fa001069150d,2024-06-20 13:12:17.745,2.20,PM10,µg/m³,13,3
3195351,5984c712e3b1fa0010691509,5984c712e3b1fa001069150c,2024-06-20 13:14:56.923,1.37,PM2.5,µg/m³,13,3
3195352,5984c712e3b1fa0010691509,5984c712e3b1fa001069150c,2024-06-20 13:17:42.294,1.50,PM2.5,µg/m³,13,3
3195353,5984c712e3b1fa0010691509,5984c712e3b1fa001069150c,2024-06-20 13:20:16.131,2.57,PM2.5,µg/m³,13,3


In [4]:
# Pivotieren der Daten, wobei Timestamp, Hour und Weekday als Index verwendet werden
pivoted_data = data.pivot_table(index='timestamp', 
                                columns='sensor_title', 
                                values='value', 
                                aggfunc='first').reset_index()

In [5]:
pivoted_data

sensor_title,timestamp,PM10,PM2.5,Temperatur,rel. Luftfeuchte
0,2022-06-22 13:54:28.029,2.22,,,
1,2022-06-22 13:57:20.241,1.90,,,
2,2022-06-22 14:00:13.641,2.35,1.73,,
3,2022-06-22 14:03:15.741,1.88,1.70,,
4,2022-06-22 14:05:49.439,6.10,,,
...,...,...,...,...,...
1181125,2024-06-20 13:12:17.745,2.20,1.20,,
1181126,2024-06-20 13:14:56.923,,1.37,,
1181127,2024-06-20 13:17:42.294,,1.50,,
1181128,2024-06-20 13:20:16.131,,2.57,,


In [6]:
pivoted_data['hour'] = pivoted_data['timestamp'].dt.hour
pivoted_data['weekday'] = pivoted_data['timestamp'].dt.dayofweek
pivoted_data['month'] = pivoted_data['timestamp'].dt.month

# Anwendung von Sinus- und Kosinus-Transformationen auf 'hour'
pivoted_data['hour_sin'] = np.sin(2 * np.pi * pivoted_data['hour'] / 24)
pivoted_data['hour_cos'] = np.cos(2 * np.pi * pivoted_data['hour'] / 24)

In [7]:
# Entfernen der ursprünglichen 'hour' Spalte
pivoted_data.drop('hour', axis=1, inplace=True)

In [8]:
# One-Hot-Encoding für 'weekday' und 'month'
pivoted_data = pd.get_dummies(pivoted_data, columns=['weekday', 'month'], prefix=['weekday', 'month'])

In [9]:
pivoted_data= pivoted_data.set_index('timestamp')

In [10]:
pivoted_data

Unnamed: 0_level_0,PM10,PM2.5,Temperatur,rel. Luftfeuchte,hour_sin,hour_cos,weekday_0,weekday_1,weekday_2,weekday_3,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-22 13:54:28.029,2.22,,,,-0.258819,-0.965926,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 13:57:20.241,1.90,,,,-0.258819,-0.965926,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:00:13.641,2.35,1.73,,,-0.500000,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:03:15.741,1.88,1.70,,,-0.500000,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:05:49.439,6.10,,,,-0.500000,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-20 13:12:17.745,2.20,1.20,,,-0.258819,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 13:14:56.923,,1.37,,,-0.258819,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 13:17:42.294,,1.50,,,-0.258819,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 13:20:16.131,,2.57,,,-0.258819,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [11]:
# Entfernen aller Zeilen, die NaN-Werte enthalten
cleaned_data = pivoted_data.dropna()

In [12]:
cleaned_data

Unnamed: 0_level_0,PM10,PM2.5,Temperatur,rel. Luftfeuchte,hour_sin,hour_cos,weekday_0,weekday_1,weekday_2,weekday_3,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-22 14:19:36.070,6.28,1.40,31.0,38.8,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:22:06.691,3.47,1.15,31.0,39.2,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:24:39.047,3.65,1.10,31.0,38.9,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:27:10.928,3.00,1.23,31.0,38.5,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:29:43.294,4.15,1.42,31.1,38.8,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-20 11:52:12.327,1.33,0.98,22.4,49.5,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 11:54:46.598,1.88,1.00,22.3,49.7,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 11:57:15.518,1.60,0.88,22.5,49.2,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 11:59:48.684,2.30,1.90,22.5,49.0,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [13]:
# Skalieren der kontinuierlichen Umweltmesswerte
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features_to_scale = ['PM10', 'PM2.5', 'Temperatur', 'rel. Luftfeuchte']
cleaned_data.loc[:, features_to_scale] = scaler.fit_transform(cleaned_data[features_to_scale])

# Zusammenführen der skalierten Daten mit den One-Hot-encoded und Sinus-/Kosinus-transformierten Daten
final_features = cleaned_data[features_to_scale + \
                              ['hour_sin', 'hour_cos'] +\
                              cleaned_data.columns[cleaned_data.columns.str.startswith('weekday_')].tolist() + \
                              cleaned_data.columns[cleaned_data.columns.str.startswith('month_')].tolist()]
data_scaled = final_features.values

In [14]:
final_features

Unnamed: 0_level_0,PM10,PM2.5,Temperatur,rel. Luftfeuchte,hour_sin,hour_cos,weekday_0,weekday_1,weekday_2,weekday_3,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-22 14:19:36.070,0.003140,0.00140,0.795501,0.222646,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:22:06.691,0.001735,0.00115,0.795501,0.227735,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:24:39.047,0.001825,0.00110,0.795501,0.223919,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:27:10.928,0.001500,0.00123,0.795501,0.218830,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
2022-06-22 14:29:43.294,0.002075,0.00142,0.797546,0.222646,-5.000000e-01,-0.866025,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-20 11:52:12.327,0.000665,0.00098,0.619632,0.358779,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 11:54:46.598,0.000940,0.00100,0.617587,0.361323,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 11:57:15.518,0.000800,0.00088,0.621677,0.354962,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2024-06-20 11:59:48.684,0.001150,0.00190,0.621677,0.352417,2.588190e-01,-0.965926,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [15]:
cleaned_data[features_to_scale].iloc[::,0:2]

Unnamed: 0_level_0,PM10,PM2.5
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-06-22 14:19:36.070,0.003140,0.00140
2022-06-22 14:22:06.691,0.001735,0.00115
2022-06-22 14:24:39.047,0.001825,0.00110
2022-06-22 14:27:10.928,0.001500,0.00123
2022-06-22 14:29:43.294,0.002075,0.00142
...,...,...
2024-06-20 11:52:12.327,0.000665,0.00098
2024-06-20 11:54:46.598,0.000940,0.00100
2024-06-20 11:57:15.518,0.000800,0.00088
2024-06-20 11:59:48.684,0.001150,0.00190


In [16]:
# Funktion zum Erstellen von Zeitfenstern für LSTM
def create_dataset(X, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X[i:(i + time_steps), :]
        Xs.append(v)
        ys.append(X[i + time_steps, 0:2])  # Nehmt an, dass die ersten beiden Spalten PM10 und PM2.5 sind
    return np.array(Xs), np.array(ys)

# Dataset erstellen
time_steps = 10
X, y = create_dataset(data_scaled, time_steps)

# Aufteilung der Daten
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Überprüfung der Eingabe-Dimensionen für das Modell
print("Input shape for the model:", X_train.shape)


Input shape for the model: (333228, 10, 25)


In [17]:
X_train = X_train.astype('float32')
y_train = y_train.astype('float32')
X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

In [18]:
# from keras.models import Sequential
# from keras.layers import Dense, LSTM, Input

# # Modellstruktur
# model = Sequential()
# model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))
# model.add(LSTM(50))
# model.add(Dense(10, activation='relu'))  # Beispiel für eine versteckte Schicht
# model.add(Dense(2))  # für PM10 und PM2.5
# model.compile(loss='mean_squared_error', optimizer='adam')


In [19]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Dropout

model = Sequential()
model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))

# Erste LSTM-Schicht mit Dropout
model.add(LSTM(50, return_sequences=True))  # `return_sequences=True` für Stacking
model.add(Dropout(0.2))

# Zweite LSTM-Schicht mit Dropout
model.add(LSTM(50))  # Letzte LSTM-Schicht, `return_sequences` default ist False
model.add(Dropout(0.2))

# Dichte Schicht
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.2))  # Dropout nach der dichten Schicht

# Ausgabeschicht
model.add(Dense(2))  # für PM10 und PM2.5

# Modell kompilieren
model.compile(loss='mean_squared_error', optimizer='adam')


In [20]:
# Überprüfe die Form der Eingabedaten
print("Train input shape:", X_train.shape)
print("Train output shape:", y_train.shape)

Train input shape: (333228, 10, 25)
Train output shape: (333228, 2)


In [21]:
# Modell trainieren
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test), verbose=2, shuffle=False)


Epoch 1/30
10414/10414 - 82s - 8ms/step - loss: 1.6115e-04 - val_loss: 1.3163e-04
Epoch 2/30
10414/10414 - 89s - 9ms/step - loss: 1.5072e-04 - val_loss: 1.3163e-04
Epoch 3/30
10414/10414 - 86s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 4/30
10414/10414 - 87s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 5/30
10414/10414 - 85s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 6/30
10414/10414 - 85s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 7/30
10414/10414 - 88s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 8/30
10414/10414 - 85s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 9/30
10414/10414 - 84s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 10/30
10414/10414 - 86s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 11/30
10414/10414 - 86s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 12/30
10414/10414 - 85s - 8ms/step - loss: 1.5071e-04 - val_loss: 1.3163e-04
Epoch 13/30
1

In [22]:
# Modellstruktur überprüfen
model.summary()

In [23]:
from sklearn.metrics import mean_squared_error

# Vorhersagen auf dem Testdatensatz
predictions = model.predict(X_test)

# Berechnung des Mean Squared Error
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

[1m2604/2604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step
Mean Squared Error: 0.00013161374954506755


In [24]:
# Modell Keras-Format speichern
model.save('lstm_modell.keras')

In [28]:
predictions

array([[0.00457867, 0.00316421],
       [0.00457867, 0.00316421],
       [0.00457867, 0.00316421],
       ...,
       [0.00457867, 0.00316421],
       [0.00457867, 0.00316421],
       [0.00457867, 0.00316421]], dtype=float32)

In [29]:
# Angenommen, du hast den gleichen Scaler für alles verwendet und die Indizes der Zielvariablen sind bekannt
# Zum Beispiel: PM10 und PM2.5 sind die ersten beiden Features im skalierten Array

# Extrahiere die relevanten Scales und Minima für die Zielvariablen
scale = scaler.scale_[:2]  # Ersten zwei Scales
min_ = scaler.min_[:2]  # Ersten zwei Minima

# Manuelle Rückskalierung
predictions_rescaled = predictions * scale + min_
y_test_rescaled = y_test * scale + min_

# Berechnung des Mean Squared Error nach der Reskalierung
mse_rescaled = mean_squared_error(y_test_rescaled, predictions_rescaled)
print(f"Mean Squared Error (rescaled): {mse_rescaled}")


Mean Squared Error (rescaled): 1.0454849216244361e-10


In [None]:
predictions_rescaled

In [None]:
# Modell laden
from tensorflow.keras.models import load_model
model = load_model('lstm_modell.keras')