In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [4]:
weather_event_cat = {
    'inconnu': 1, 'pluie faible': 2, 'ciel clair': 3, 'brouillard faible': 4, 'pluie': 5, 'brouillard': 6,
    'neige faible': 7, 'pluie forte': 8, 'neige': 9, 'brouillard fort': 10
}

atmo_cat = {'bon': 1, 'moyen': 2, 'dégradé': 3, 'mauvais': 4, "très mauvais": 5, "extrêmement mauvais": 6}

wind_dir_cat = {
    'SO': 1, 'O': 2, 'SSO': 3, 'N': 4, 'S': 5, 'NE': 6, 'OSO': 7, 'NNO': 8, 'ONO': 9, 'ENE': 10, 'E': 11,
    'NNE': 12, 'NO': 13, 'SSE': 14, 'SE': 15, 'ESE': 16
}

In [5]:
df = pd.read_csv("./data/final/merged-final.csv", sep=';').dropna().drop("day", axis=1)
df["wind_dir_cat"] = df.wind_dir.apply(lambda x: wind_dir_cat[x])
df["weather_event_cat"] = df.weather_event.apply(lambda x: weather_event_cat[x])
df["atmo_cat"] = df.ATMO.apply(lambda x: atmo_cat[x])
df = df.drop(["ATMO", "weather_event", "wind_dir"], axis=1)

# Décalage d'une heure pour prédire le PM10 de l'heure suivante
pm10_values = df.PM10.tolist()
pm10_values.insert(0, 0)
pm10_values.pop()
len(pm10_values), len(df.PM10.tolist())
df.PM10 = pm10_values
df = df.drop(0)
df

14.1

(25611, 25611)

Unnamed: 0,date,PM10,PM25,NO2,SO2,NO,NOX,O3,temp,wind_speed,hum,press,wind_dir_cat,weather_event_cat,atmo_cat
1,2019-01-01 02:00:00,5.9,10.3,19.7,2.3,1.4,21.9,39.8,7.9,3.7,90.0,1036.2,8,1,2
2,2019-01-01 03:00:00,5.4,12.9,24.3,2.1,0.7,25.4,32.8,7.7,3.7,88.0,1035.8,2,1,2
3,2019-01-01 04:00:00,8.6,12.5,25.4,2.6,1.0,27.0,36.6,7.9,3.7,82.0,1035.4,9,1,2
4,2019-01-01 05:00:00,10.2,6.9,18.2,3.1,0.6,19.2,48.9,8.0,3.7,81.0,1034.8,2,1,1
5,2019-01-01 06:00:00,11.1,6.1,26.3,3.2,1.6,28.8,34.8,8.3,3.7,80.0,1034.9,9,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26271,2021-12-30 17:00:00,9.6,8.1,23.9,0.9,1.5,26.2,25.2,13.9,14.8,87.0,1022.3,1,3,1
26272,2021-12-30 18:00:00,11.4,7.5,27.7,0.6,2.3,31.2,20.9,14.0,14.8,87.0,1022.7,3,3,1
26273,2021-12-30 19:00:00,10.8,8.4,25.8,0.6,1.1,27.6,21.2,13.4,13.0,88.0,1022.8,3,3,1
26274,2021-12-30 20:00:00,11.9,8.8,31.1,0.6,1.1,32.9,14.2,12.5,11.1,91.0,1022.9,3,3,1


# Training

In [None]:
targets = ["PM10", "PM25", "NO2", "SO2", "O3"]

In [6]:
y = df["PM10"]
X = df.drop(["date", "PM10"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
param_grid = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [6, 12, 24],
    "criterion": ["squared_error", "absolute_error"],
    "min_samples_split": [4, 8, 12],
    "min_samples_leaf": [4, 8, 12]
}
 
rf = RandomForestRegressor()

gcv = GridSearchCV(
    estimator=rf,
    param_grid=param_grid
)

In [None]:
gcv.fit(X_train, y_train)

In [None]:
best_params = gcv.best_params_

In [None]:
custom_params = {"criterion": "entropy", "max_depth": 24, "min_samples_split": 8, "n_estimators": 200}
rf = RandomForestClassifier(**best_params)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
mae = mean_absolute_error(y_pred, y_test)

print(f"r2  = {r2}\nmse = {mse}\nmae = {mae}")

In [None]:
y_pred

In [None]:
y_test