In [1]:
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one (the default setting is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Integer encodings for the categorical columns. Codes are assigned 1, 2, 3, ...
# in the order the labels are listed below, so new labels must only ever be
# appended at the end (inserting one in the middle would renumber the others).
weather_event_cat = {
    label: code
    for code, label in enumerate(
        (
            'inconnu', 'pluie faible', 'ciel clair', 'brouillard faible', 'pluie',
            'brouillard', 'neige faible', 'pluie forte', 'neige', 'brouillard fort',
        ),
        start=1,
    )
}

atmo_cat = {
    label: code
    for code, label in enumerate(
        ('bon', 'moyen', 'dégradé', 'mauvais', "très mauvais", "extrêmement mauvais"),
        start=1,
    )
}

wind_dir_cat = {
    label: code
    for code, label in enumerate(
        (
            'SO', 'O', 'SSO', 'N', 'S', 'NE', 'OSO', 'NNO',
            'ONO', 'ENE', 'E', 'NNE', 'NO', 'SSE', 'SE', 'ESE',
        ),
        start=1,
    )
}

In [4]:
def get_mean_previous(data, n):
    """Return, for every row, the mean of ``atmo_cat`` over the ``n`` preceding rows.

    The current row is excluded from its own window. Rows that do not have
    ``n`` predecessors (the first ``n`` rows, or every row when the frame is
    shorter than ``n``) receive the sentinel value ``-1``.

    Args:
        data: DataFrame exposing an ``atmo_cat`` column.
        n: Window length in rows; must be at least 1.

    Returns:
        A list with exactly ``len(data)`` entries: ``-1`` sentinels followed
        by the window means.

    Raises:
        ValueError: If ``n`` is smaller than 1 (an empty or negative window
            has no meaningful mean).
    """
    if n < 1:
        raise ValueError(f"n must be a positive window length, got {n}")
    atmo = data.atmo_cat.tolist()
    # Cap the padding at the number of rows so the result always lines up with
    # ``data``; a plain ``n * [-1]`` would be too long whenever n > len(data).
    n_padding = min(n, len(atmo))
    means = [np.mean(atmo[i - n:i]) for i in range(n, len(atmo))]
    return n_padding * [-1] + means

In [5]:
def get_target(data, n):
    """Return, for every row, the ``atmo_cat`` value found ``n`` rows later.

    Rows that have no row ``n`` positions after them (the last ``n`` rows, or
    every row when the frame is shorter than ``n``) receive the sentinel
    value ``-1``.

    Args:
        data: DataFrame exposing an ``atmo_cat`` column.
        n: Look-ahead distance in rows; must be non-negative.

    Returns:
        A list with exactly ``len(data)`` entries.

    Raises:
        ValueError: If ``n`` is negative (a negative slice start would
            silently return the tail of the series instead of a shift).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    atmo = data.atmo_cat.tolist()
    # Cap the padding at the number of rows so the result always lines up with
    # ``data``; a plain ``n * [-1]`` would be too long whenever n > len(data).
    n_padding = min(n, len(atmo))
    return atmo[n:] + n_padding * [-1]

In [6]:
def get_previous_atmo(data, n_previous=3):
    """Add lagged copies of ``atmo_cat`` to ``data`` and return it.

    For each lag ``k`` from 1 to ``n_previous`` a column ``atmo_cat_h-k`` is
    written holding the ``atmo_cat`` value found ``k`` rows earlier; rows
    without such a predecessor get the sentinel ``-1``.

    Note that the columns are added to ``data`` itself (the frame is modified
    in place) and that same object is returned.
    """
    history = data.atmo_cat.tolist()
    n_rows = len(history)
    for lag in range(1, n_previous + 1):
        # Prepend ``lag`` sentinels and cut back to the original length:
        # this shifts the series down by ``lag`` rows.
        shifted = ([-1] * lag + history)[:n_rows]
        data[f"atmo_cat_h-{lag}"] = shifted
    return data

In [7]:
def add_features(data, n_mean, n_target):
    """Return a copy of ``data`` extended with the rolling-mean and target columns.

    Two columns are appended, in this order:

    * ``atmo_mean_previous_{n_mean}h`` -- output of ``get_mean_previous``;
    * ``atmo_target_{n_target}h`` -- output of ``get_target``.

    The input frame itself is left unchanged.
    """
    mean_column = f"atmo_mean_previous_{n_mean}h"
    target_column = f"atmo_target_{n_target}h"

    enriched = data.copy()
    enriched[mean_column] = get_mean_previous(data, n_mean)
    enriched[target_column] = get_target(data, n_target)
    return enriched

In [8]:
def preprocess(data, n_previous, n_target, n_mean):
    """Build the feature/target table and drop the rows that cannot be filled.

    The rolling mean, the look-ahead target and the ``n_previous`` lag columns
    are added to a copy of ``data``. Rows at the edges of the frame for which
    one of those derived values is only a ``-1`` placeholder are removed:

    * the first ``max(n_mean, n_previous)`` rows (not enough history);
    * the last ``n_target`` rows (no future value to predict).

    Rows are removed by position rather than by searching every cell for
    ``-1``: a legitimate measurement equal to -1 in any column (for example a
    temperature of -1.0) must not cause the row to be discarded.

    Args:
        data: Source DataFrame with an ``atmo_cat`` column; not modified.
        n_previous: Number of lag columns to add.
        n_target: Look-ahead distance, in rows, of the target column.
        n_mean: Window length, in rows, of the rolling-mean column.

    Returns:
        A new DataFrame containing only the rows whose derived columns are
        all populated.
    """
    res = add_features(data.copy(), n_mean, n_target)
    res = get_previous_atmo(res, n_previous)

    n_rows = len(res)
    first_valid = max(n_mean, n_previous, 0)
    # Clamp at 0: a negative stop would be read by iloc as "count from the end".
    end_valid = max(n_rows - n_target, 0)
    return res.iloc[first_valid:end_valid].copy()

In [9]:
# Load the merged dataset, discard incomplete rows and the "day" column, then
# replace the three text-valued categorical columns by their integer codes.
# An unknown label raises KeyError rather than being silently turned into NaN.
df = (
    pd.read_csv("./data/final/merged-final.csv", sep=';')
    .dropna()
    .drop("day", axis=1)
    .assign(
        wind_dir_cat=lambda frame: frame.wind_dir.apply(lambda label: wind_dir_cat[label]),
        weather_event_cat=lambda frame: frame.weather_event.apply(lambda label: weather_event_cat[label]),
        atmo_cat=lambda frame: frame.ATMO.apply(lambda label: atmo_cat[label]),
    )
    .drop(["ATMO", "weather_event", "wind_dir"], axis=1)
)
df

Unnamed: 0,date,PM10,PM25,NO2,SO2,NO,NOX,O3,temp,wind_speed,hum,press,wind_dir_cat,weather_event_cat,atmo_cat
0,2019-01-01 01:00:00,5.9,9.2,18.6,2.4,1.3,20.7,41.8,8.2,3.7,84.0,1036.0,9,1,1
1,2019-01-01 02:00:00,5.4,10.3,19.7,2.3,1.4,21.9,39.8,7.9,3.7,90.0,1036.2,8,1,2
2,2019-01-01 03:00:00,8.6,12.9,24.3,2.1,0.7,25.4,32.8,7.7,3.7,88.0,1035.8,2,1,2
3,2019-01-01 04:00:00,10.2,12.5,25.4,2.6,1.0,27.0,36.6,7.9,3.7,82.0,1035.4,9,1,2
4,2019-01-01 05:00:00,11.1,6.9,18.2,3.1,0.6,19.2,48.9,8.0,3.7,81.0,1034.8,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26271,2021-12-30 17:00:00,11.4,8.1,23.9,0.9,1.5,26.2,25.2,13.9,14.8,87.0,1022.3,1,3,1
26272,2021-12-30 18:00:00,10.8,7.5,27.7,0.6,2.3,31.2,20.9,14.0,14.8,87.0,1022.7,3,3,1
26273,2021-12-30 19:00:00,11.9,8.4,25.8,0.6,1.1,27.6,21.2,13.4,13.0,88.0,1022.8,3,3,1
26274,2021-12-30 20:00:00,11.8,8.8,31.1,0.6,1.1,32.9,14.2,12.5,11.1,91.0,1022.9,3,3,1


In [10]:
# Build the modelling table: the target is the ATMO category 6 rows ahead, and
# the added features are the mean of the 12 previous rows plus the 6 previous
# values. Rows look hourly in the date column -- TODO confirm that the earlier
# dropna() left no gaps, since every shift here is positional.
dfok = preprocess(data=df, n_target=6, n_mean=12, n_previous=6)
dfok

Unnamed: 0,date,PM10,PM25,NO2,SO2,NO,NOX,O3,temp,wind_speed,...,weather_event_cat,atmo_cat,atmo_mean_previous_12h,atmo_target_6h,atmo_cat_h-1,atmo_cat_h-2,atmo_cat_h-3,atmo_cat_h-4,atmo_cat_h-5,atmo_cat_h-6
12,2019-01-01 13:00:00,14.8,11.1,10.9,3.3,1.5,13.2,62.2,8.6,7.4,...,1,2,1.750000,6,2,2,2,2,2,2
13,2019-01-01 14:00:00,19.4,10.5,20.7,3.6,3.0,25.3,51.6,8.8,5.6,...,1,2,1.833333,2,2,2,2,2,2,2
14,2019-01-01 15:00:00,19.6,9.7,22.3,3.6,2.6,26.3,47.9,8.8,3.7,...,1,1,1.833333,2,2,2,2,2,2,2
15,2019-01-01 16:00:00,18.6,13.6,24.2,3.4,2.5,28.1,45.6,8.7,7.4,...,1,2,1.750000,2,1,2,2,2,2,2
16,2019-01-01 17:00:00,22.8,13.5,27.2,3.4,3.2,32.2,39.8,8.0,5.6,...,1,2,1.750000,1,2,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26265,2021-12-30 11:00:00,11.6,7.8,20.2,0.8,2.5,24.1,35.3,14.2,20.4,...,3,1,1.416667,1,1,1,1,1,1,1
26266,2021-12-30 12:00:00,10.2,6.7,15.9,1.0,3.1,20.6,38.2,14.5,14.8,...,3,1,1.416667,1,1,1,1,1,1,1
26267,2021-12-30 13:00:00,9.0,5.3,16.1,0.8,2.6,20.1,39.5,14.8,13.0,...,3,1,1.000000,1,1,1,1,1,1,1
26268,2021-12-30 14:00:00,8.0,5.3,10.7,0.5,1.7,13.3,46.0,15.2,11.1,...,3,1,1.000000,1,1,1,1,1,1,1


In [11]:
# Persist the preprocessed table as a ';'-separated CSV, without the index column.
dfok.to_csv("./data/final/final-preprocessed-next6h.csv", index=False, sep=';')