In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Cells below that list several bare expressions rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Integer encodings for the text columns of the dataset.
# In each mapping the code is the 1-based position of the label in its list,
# so the order of the lists below is significant.

# Weather-event labels ('inconnu' -> 1 ... 'brouillard fort' -> 10).
weather_event_cat = {
    label: code
    for code, label in enumerate(
        [
            'inconnu', 'pluie faible', 'ciel clair', 'brouillard faible', 'pluie',
            'brouillard', 'neige faible', 'pluie forte', 'neige', 'brouillard fort',
        ],
        start=1,
    )
}

# ATMO index labels ('bon' -> 1 ... "extrêmement mauvais" -> 6).
atmo_cat = {
    label: code
    for code, label in enumerate(
        ['bon', 'moyen', 'dégradé', 'mauvais', "très mauvais", "extrêmement mauvais"],
        start=1,
    )
}

# Wind-direction labels ('SO' -> 1 ... 'ESE' -> 16); the codes follow the list
# order, not the compass order.
wind_dir_cat = {
    label: code
    for code, label in enumerate(
        [
            'SO', 'O', 'SSO', 'N', 'S', 'NE', 'OSO', 'NNO',
            'ONO', 'ENE', 'E', 'NNE', 'NO', 'SSE', 'SE', 'ESE',
        ],
        start=1,
    )
}

In [None]:
# Load the merged dataset, discard rows with missing values and the "day"
# column, then swap the three text columns for their integer codes.
# A label missing from a mapping raises KeyError (plain dict lookup).
df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/thibaultrichel/citeos-air-quality/main/data/final/merged-final.csv",
        sep=';',
    )
    .dropna()
    .drop("day", axis=1)
    .assign(
        # Keyword order fixes the order of the new columns; later cells rely on it.
        wind_dir_cat=lambda frame: frame["wind_dir"].apply(wind_dir_cat.__getitem__),
        weather_event_cat=lambda frame: frame["weather_event"].apply(weather_event_cat.__getitem__),
        atmo_cat=lambda frame: frame["ATMO"].apply(atmo_cat.__getitem__),
    )
    .drop(columns=["ATMO", "weather_event", "wind_dir"])
)
df

Unnamed: 0,date,PM10,PM25,NO2,SO2,NO,NOX,O3,temp,wind_speed,hum,press,wind_dir_cat,weather_event_cat,atmo_cat
0,2019-01-01 01:00:00,5.9,9.2,18.6,2.4,1.3,20.7,41.8,8.2,3.7,84.0,1036.0,9,1,1
1,2019-01-01 02:00:00,5.4,10.3,19.7,2.3,1.4,21.9,39.8,7.9,3.7,90.0,1036.2,8,1,2
2,2019-01-01 03:00:00,8.6,12.9,24.3,2.1,0.7,25.4,32.8,7.7,3.7,88.0,1035.8,2,1,2
3,2019-01-01 04:00:00,10.2,12.5,25.4,2.6,1.0,27.0,36.6,7.9,3.7,82.0,1035.4,9,1,2
4,2019-01-01 05:00:00,11.1,6.9,18.2,3.1,0.6,19.2,48.9,8.0,3.7,81.0,1034.8,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26271,2021-12-30 17:00:00,11.4,8.1,23.9,0.9,1.5,26.2,25.2,13.9,14.8,87.0,1022.3,1,3,1
26272,2021-12-30 18:00:00,10.8,7.5,27.7,0.6,2.3,31.2,20.9,14.0,14.8,87.0,1022.7,3,3,1
26273,2021-12-30 19:00:00,11.9,8.4,25.8,0.6,1.1,27.6,21.2,13.4,13.0,88.0,1022.8,3,3,1
26274,2021-12-30 20:00:00,11.8,8.8,31.1,0.6,1.1,32.9,14.2,12.5,11.1,91.0,1022.9,3,3,1


In [None]:
def multivariate_data(dataset, target, start_index, end_index, history_size, target_size):
    """Cut a time-ordered array into fixed-length history windows with one label each.

    For every position ``i`` in ``range(start_index + history_size, end_index)``
    the sample is ``dataset[i - history_size:i]`` (the ``history_size`` rows
    immediately before ``i``) and its label is ``target[i + target_size]``.
    The label row therefore lies ``target_size + 1`` rows after the last row
    of the window.

    Args:
        dataset: Array of observations indexed by time along axis 0.
        target: Array of labels aligned row-by-row with ``dataset``.
        start_index: First row that may appear in a window.
        end_index: Exclusive upper bound for ``i``. ``None`` means
            ``len(dataset) - target_size``, the largest bound for which every
            label index is still valid.
        history_size: Number of consecutive rows per window.
        target_size: Offset added to ``i`` to locate the label row.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays holding one window and one
        label per position ``i``.

    Raises:
        IndexError: If ``i + target_size`` runs past the end of ``target``.
    """
    first_position = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    data, labels = [], []
    for i in range(first_position, end_index):
        # A contiguous slice selects the same rows as an explicit index list,
        # without building that list for every window.
        data.append(dataset[i - history_size:i])
        labels.append(target[i + target_size])
    return np.array(data), np.array(labels)

In [None]:
# Feature matrix: every column of df except the timestamp, as a NumPy array.
X = df.drop(columns=["date"]).to_numpy()
print(f"{X}\n{X.shape}")

[[ 5.9  9.2 18.6 ...  9.   1.   1. ]
 [ 5.4 10.3 19.7 ...  8.   1.   2. ]
 [ 8.6 12.9 24.3 ...  2.   1.   2. ]
 ...
 [11.9  8.4 25.8 ...  3.   3.   1. ]
 [11.8  8.8 31.1 ...  3.   3.   1. ]
 [14.1 10.5 35.9 ...  3.   3.   2. ]]
(25611, 14)


In [None]:
# One-hot encode the ATMO class of every row (rows are aligned with X, which
# is built from the same df).
# - The column is selected by name instead of by position, so adding or
#   reordering feature columns cannot silently change the target.
# - The category set is fixed to all codes declared in atmo_cat, so the number
#   of one-hot columns does not depend on which classes occur in the data.
# - dtype is explicit because the get_dummies default differs between pandas
#   versions (uint8 vs bool).
atmo_classes = pd.CategoricalDtype(categories=sorted(atmo_cat.values()))
targets = pd.get_dummies(df["atmo_cat"].astype(atmo_classes), dtype="uint8").to_numpy()
targets

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]], dtype=uint8)

In [None]:
# Window settings: each sample holds n_past consecutive rows; n_future is the
# label offset passed to multivariate_data as target_size.
n_past, n_future = 120, 12
# Positional (unshuffled) split: window positions below 75% of the rows go to
# the training set.
train_size = int(0.75 * X.shape[0])

X_train, y_train = multivariate_data(X, targets, 0, train_size, n_past, n_future)

X_train.shape, y_train.shape

((19088, 120, 14), (19088, 6))

In [None]:
# Test windows are cut from the rows after the training split; the upper bound
# leaves n_future rows at the end so that every label index stays in range.
X_test, y_test = multivariate_data(
    X,
    targets,
    start_index=train_size,
    end_index=X.shape[0] - n_future,
    history_size=n_past,
    target_size=n_future,
)

X_test.shape, y_test.shape

((6271, 120, 14), (6271, 6))

In [None]:
# LSTM classifier over the one-hot ATMO classes.
# The targets are one-hot vectors, so the head is a softmax layer trained with
# categorical cross-entropy; a linear head with MAE loss treats the problem as
# regression and produces scores that are not class probabilities.
n_classes = y_train.shape[1]  # one output unit per one-hot column

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(128))
model.add(tf.keras.layers.Dense(n_classes, activation="softmax"))
# "mae" is kept next to "accuracy" so existing history keys remain available.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy", "mae"])

In [None]:
# Fit for 20 epochs; shuffle=False keeps the samples in the order in which the
# windows were built.
history = model.fit(x=X_train, y=y_train, epochs=20, shuffle=False, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_absolute_percentage_error

In [None]:
# Predicted class = index of the highest-scoring output unit.
# argmax is taken on the raw scores: rounding (or taking abs) first discards
# the ordering between units, so rows whose scores all round to 0 collapse to
# class 0 and negative scores can win.
# The values are 0-based column indices of the one-hot targets, not the
# 1-based codes stored in atmo_cat.
y_scores = model.predict(X_test)
y_pred = pd.Series(np.argmax(y_scores, axis=1))
y_pred.value_counts()

1    5726
0     545
dtype: int64

In [None]:
# Spot check on the first test sample: the true label (one-hot vector) next to
# the predicted class index. Both values are displayed because
# ast_node_interactivity is set to "all" at the top of the notebook.
y_test[0]
y_pred[0]

array([0, 1, 0, 0, 0, 0], dtype=uint8)

1

In [None]:
# y_test is one-hot while y_pred holds class indices, so both are brought to
# class indices before scoring (scikit-learn rejects a mix of the two formats).
y_true = np.argmax(y_test, axis=1)
y_pred_ok = np.asarray(y_pred)

acc = accuracy_score(y_true, y_pred_ok)
# More than two classes: an explicit averaging mode is required. "macro"
# weights every class equally; zero_division=0 scores never-predicted classes
# as 0 instead of emitting a warning.
f1 = f1_score(y_true, y_pred_ok, average="macro", zero_division=0)
mae = mean_absolute_error(y_true, y_pred_ok)
# MAPE divides by the true value, so class index 0 would make the ratio
# explode; both sides are shifted to 1-based values first (presumably equal to
# the atmo_cat codes when all classes are present - confirm).
mape = mean_absolute_percentage_error(y_true + 1, y_pred_ok + 1)

print(f"acc = {acc}\nf1 = {f1}\nmae = {mae}\nmape = {mape}")