In [None]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from keras.optimizers import Adam, AdamW
from keras.layers import Bidirectional, BatchNormalization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import r2_score
from tensorflow.python.keras.losses import mean_squared_error, mean_absolute_error
from tensorflow.keras.layers import Conv1D, MaxPooling1D

In [None]:
# Read the dataset from CSV (path is relative to the working directory) and
# show its (rows, columns) shape as the cell output.
df = pd.read_csv(
    filepath_or_buffer="thesis_code/thesis/webapp/dataset/final_data.csv",
)
df.shape

In [None]:
# Preview the first two rows of the freshly loaded frame.
df.head(n=2)

In [None]:
# Drop the stray "Unnamed: 0" column. Using a non-mutating drop with
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

# Parse the "Date" column into datetime64 values (a no-op if already parsed).
df["Date"] = pd.to_datetime(df["Date"])

In [None]:
# Re-check the first two rows after the column drop and date parsing.
df.head(n=2)

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Model inputs. The pollutant columns below are currently disabled:
#   "Station1_CO", "Station1_NO2", "Station1_NOx",
#   "Station2_CO", "Station2_NO2", "Station2_NOx", "Station2_O3",
#   "Station1_SO2", "Station2_SO2", "Station1_PM10",
weather_features = [
    "temp",
    "humidity",
    "precip",
    "precipcover",
    "cloudcover",
    "windspeed",
    "visibility",
    "winddir_sin",
    "winddir_cos",
]
calendar_features = [
    "is_heating_season",
    "is_work_day",
    "year",
    "month",
    "day",
]
# Column order matters: it fixes the feature order of the scaled matrix and
# therefore the index at which the target column is appended later.
features = weather_features + calendar_features

# Column the model is trained to predict.
target = "Station2_PM10"

In [None]:
# Standardise inputs and target with separate scalers so predictions can be
# mapped back to the original target scale later via `scaler_target`.
#
# The scalers are fitted on the first 80% of rows only, mirroring the 80/20
# ordered train/test split applied to the windows later in the notebook.
# Fitting on the full frame would leak the mean/variance of the hold-out
# period into the training data.
# NOTE(review): this assumes the rows of `df` are in chronological order -
# confirm, or sort by "Date" before scaling.
n_fit_rows = int(0.8 * len(df))

scaler_features = StandardScaler().fit(df[features].iloc[:n_fit_rows])
scaler_target = StandardScaler().fit(df[[target]].iloc[:n_fit_rows])

# Transform *all* rows with the train-fitted statistics.
df_features_scaled = scaler_features.transform(df[features])
df_target_scaled = scaler_target.transform(df[[target]])

# Final matrix layout: feature columns first, scaled target as the last column.
df_scaled = np.hstack((df_features_scaled, df_target_scaled))

In [None]:
def create_sequences(data, target_index, seq_length):
    """Slice a 2-D array into sliding windows with next-step targets.

    For every start position ``s`` in ``range(len(data) - seq_length)`` the
    window ``data[s:s + seq_length]`` becomes one input sample and the value
    ``data[s + seq_length, target_index]`` (the row right after the window)
    becomes its target.

    Args:
        data: 2-D array indexed as ``data[row, column]``.
        target_index: Column index of the value to predict.
        seq_length: Number of consecutive rows in each window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the windows and their
        targets. Both are empty when ``len(data) <= seq_length``.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [data[start + seq_length, target_index] for start in window_starts]
    return np.array(windows), np.array(targets)

In [None]:
# Build sliding windows of 31 consecutive rows. The scaled target is the last
# column of df_scaled, i.e. it sits at index len(features).
X, y = create_sequences(df_scaled, seq_length=31, target_index=len(features))

# Ordered 80/20 split without shuffling: the first 80% of the windows are used
# for training, the remaining ones for testing.
split_idx = int(len(X) * 0.8)
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

In [None]:
# # prompt: generate train data to be from 2021 to 2023 and test 2024

# # Assuming 'df' is your DataFrame with a 'Date' column
# train_data = df[(df["Date"].dt.year >= 2021) & (df["Date"].dt.year <= 2023)]
# test_data = df[df["Date"].dt.year == 2024]

# # Now split the scaled data
# split_idx_train = len(train_data) - 31  # Account for sequence length

# X_train, y_train = create_sequences(df_scaled[:split_idx_train], target_index=len(features), seq_length=31)
# X_test, y_test = create_sequences(df_scaled[split_idx_train:], target_index=len(features), seq_length=31)


In [None]:
# train_data = df_scaled[(df_scaled['year'] < 2024) & (df['year'] > 2022)]
# test_data = df[df['year'] == 2024]

# X_train = train_data[features]
# y_train = train_data[target]

# X_test = test_data[features]
# y_test = test_data[target]


In [None]:
# Stacked LSTM regressor: four LSTM layers (512 -> 512 -> 256 -> 128 units),
# each followed by dropout and batch normalisation, then a dense head that
# outputs one value per input window (the scaled target).
model = Sequential([
    # The input shape (timesteps, n_columns) is read from X instead of being
    # hard-coded, so it always matches the window length used to build the
    # sequences. A fixed 72 here conflicts with the 31-step windows in X and
    # makes Keras reject the input at fit time.
    LSTM(
        512,
        return_sequences=True,
        recurrent_activation='sigmoid',
        input_shape=(X.shape[1], X.shape[2]),
    ),
    Dropout(0.3),
    BatchNormalization(),

    LSTM(512, return_sequences=True, recurrent_activation='sigmoid'),
    Dropout(0.3),
    BatchNormalization(),

    LSTM(256, return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),

    # Last recurrent layer returns only its final state, collapsing the time axis.
    LSTM(128, return_sequences=False),
    Dropout(0.2),
    BatchNormalization(),

    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])
# model = load_model("pm10_lstm.h5")

In [None]:
# AdamW with a learning rate of 1e-3; mean absolute error serves both as the
# training loss and as the reported metric.
optimizer = AdamW(learning_rate=1e-3)
model.compile(
    loss='mae',
    optimizer=optimizer,
    metrics=['mae'],
)

In [None]:
# Stop training once `val_loss` has not improved for 15 consecutive epochs and
# roll the model back to the best weights seen. `val_loss` is only reported
# when fit() is given validation data.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)

In [None]:
# Train for up to 100 epochs in batches of 96 windows.
#
# `early_stopping` monitors `val_loss`, which Keras only computes when
# validation data is supplied. Without it the callback never triggers and never
# restores the best weights. `validation_split` holds out the *last* 10% of the
# training windows (taken before any shuffling) for that purpose.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=96,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Run the trained model on the test windows; predictions are still in the
# standardised target scale.
y_pred = model.predict(x=X_test)

In [None]:
# Undo the target standardisation so that both series are expressed in the
# original units of the target column. The scaler expects a 2-D column, hence
# the added axis on y_test; results are flattened back to 1-D.
y_test_actual = scaler_target.inverse_transform(y_test[:, np.newaxis]).ravel()
y_pred_actual = scaler_target.inverse_transform(y_pred).ravel()

In [None]:
# Evaluate on the original (unscaled) target values.
#
# MAE and MSE are computed directly with NumPy so they are plain Python floats.
# The `mean_absolute_error` / `mean_squared_error` names imported at the top of
# the notebook are Keras *loss* functions from a private TensorFlow module;
# they return tensors, so the prints would show `tf.Tensor(...)` instead of a
# number.
residuals = y_test_actual - y_pred_actual

mae = float(np.mean(np.abs(residuals)))
print(f"MAE: {mae}")
mse = float(np.mean(np.square(residuals)))
print(f"MSE: {mse}")
r2 = r2_score(y_test_actual, y_pred_actual)
print(f"R2: {r2}")

In [None]:
# Overlay the actual and predicted series for the test samples.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test_actual, label='Actual PM10', alpha=0.8)
ax.plot(y_pred_actual, label='Predicted PM10', marker='.', alpha=0.7)
ax.set_title('Actual vs Predicted PM10 Values')
ax.set_xlabel('Test Sample index')
ax.set_ylabel('PM10')
ax.legend()
plt.show()

In [None]:
# Running (cumulative) MAE over the test set: element i is the mean absolute
# error of the first i + 1 test predictions. Computed in one vectorised pass
# with a cumulative sum instead of re-evaluating the error on every prefix,
# which was O(n^2).
abs_errors = np.abs(y_test_actual - y_pred_actual)
losses = np.cumsum(abs_errors) / np.arange(1, len(abs_errors) + 1)

# This is not a per-epoch training loss: the x-axis is the test sample index,
# so the labels say so.
plt.figure(figsize=(10, 6))
plt.plot(losses)
plt.xlabel("Test sample index")
plt.ylabel("Cumulative Mean Absolute Error")
plt.title("Running MAE over the Test Set")
plt.grid(True)
plt.show()