In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
df = pd.read_csv("./data/final/merged-final.csv", sep=';').dropna().drop("day", axis=1)
df["wind_dir_cat"] = df.wind_dir.apply(lambda x: wind_dir_cat[x])
df["weather_event_cat"] = df.weather_event.apply(lambda x: weather_event_cat[x])
df["atmo_cat"] = df.ATMO.apply(lambda x: atmo_cat[x])
df = df.drop(["ATMO", "weather_event", "wind_dir"], axis=1)

# Décalage d'une heure pour prédire le PM10 de l'heure suivante
pm10_values = df.PM10.tolist()
pm10_values.insert(0, 0)
pm10_values.pop()
len(pm10_values), len(df.PM10.tolist())
df.PM10 = pm10_values
df = df.drop(0)
df

Unnamed: 0,date,PM10,PM25,NO2,SO2,NO,NOX,O3,temp,wind_speed,hum,press,wind_dir_cat,weather_event_cat,atmo_cat,atmo_mean_previous_3h,atmo_target_1h,atmo_cat_h-1,atmo_cat_h-2,atmo_cat_h-3
0,2019-01-01 04:00:00,10.2,12.5,25.4,2.6,1.0,27.0,36.6,7.9,3.7,82.0,1035.4,9,1,2,1.666667,1,2,2,1
1,2019-01-01 05:00:00,11.1,6.9,18.2,3.1,0.6,19.2,48.9,8.0,3.7,81.0,1034.8,2,1,1,2.000000,1,2,2,2
2,2019-01-01 06:00:00,10.7,6.1,26.3,3.2,1.6,28.8,34.8,8.3,3.7,80.0,1034.9,9,1,1,1.666667,2,1,2,2
3,2019-01-01 07:00:00,15.6,10.4,31.2,3.0,2.7,35.3,36.3,8.4,3.7,79.0,1035.0,9,1,2,1.333333,2,1,1,2
4,2019-01-01 08:00:00,18.2,11.8,25.1,3.1,2.8,29.4,39.8,8.5,5.6,79.0,1035.0,13,1,2,1.333333,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25595,2021-12-30 16:00:00,9.6,6.7,25.6,0.8,2.8,30.0,26.0,13.7,13.0,87.0,1022.0,1,3,1,1.000000,1,1,1,1
25596,2021-12-30 17:00:00,11.4,8.1,23.9,0.9,1.5,26.2,25.2,13.9,14.8,87.0,1022.3,1,3,1,1.000000,1,1,1,1
25597,2021-12-30 18:00:00,10.8,7.5,27.7,0.6,2.3,31.2,20.9,14.0,14.8,87.0,1022.7,3,3,1,1.000000,1,1,1,1
25598,2021-12-30 19:00:00,11.9,8.4,25.8,0.6,1.1,27.6,21.2,13.4,13.0,88.0,1022.8,3,3,1,1.000000,1,1,1,1


# Training

In [None]:
targets = ["PM10", "PM25", "NO2", "SO2", "O3"]

In [None]:
y = df["PM10"]
X = df.drop(["date", "PM10"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [6, 12, 24],
    "criterion": ["squared_error"],
    "min_samples_split": [4, 8, 12]
}
 
rf = RandomForestRegressor()

gcv = GridSearchCV(
    estimator=rf,
    param_grid=param_grid
)

In [8]:
gcv.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'criterion': ['squared_error'],
                         'max_depth': [6, 12, 24],
                         'min_samples_split': [4, 8, 12],
                         'n_estimators': [200, 300, 400]})

In [None]:
best_params = gcv.best_params_

In [None]:
custom_params = {"criterion": "squared_error", "max_depth": 24, "min_samples_split": 8, "n_estimators": 200}
rf = RandomForestRegressor(**custom_params)

In [None]:
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=24, min_samples_split=8, n_estimators=200)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
mae = mean_absolute_error(y_pred, y_test)

print(f"r2  = {r2}\nmse = {mse}\nmae = {mae}")

r2  = -0.9724593363262857
mse = 1.091509984374025
mae = 0.5754740009104322


In [None]:
y_pred

array([1.2972302 , 3.86009537, 3.03199947, ..., 2.00559754, 1.00001695,
       2.26284945])

In [None]:
y_test

10091    1
25218    4
20868    2
23114    1
20629    2
        ..
21262    2
8225     2
13295    2
20765    1
218      2
Name: atmo_target_1h, Length: 5120, dtype: int64