In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Vitry-sur-Seine

Data source (Airparif open data, 2021 Vitry dataset): <https://data-airparif-asso.opendata.arcgis.com/datasets/2021-vitry/explore>

In [3]:
def process_data(df):
    """Clean one raw yearly station export into a numeric table.

    Steps, in order:
      1. Rename the first column to ``date`` and every other column except
         the last one to the text found after the first ``:`` of its header.
      2. Drop the ``OBJECTID`` column (assumed to be the last column) and the
         first five rows (presumably non-data header/unit rows of the
         export -- TODO confirm against the raw CSV).
      3. Drop the rows in which every measurement column is missing.
      4. Convert each measurement column to a numeric dtype, then replace the
         remaining missing values with that column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the CSV file.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) that keeps the original index
        labels of the surviving rows.

    Raises
    ------
    IndexError
        If a measurement header contains no ``:``.
    KeyError
        If there is no ``OBJECTID`` column.
    ValueError
        If a measurement cell cannot be parsed as a number.
    """
    original_cols = df.columns.tolist()
    colnames = ["date"] + [col.split(":")[1] for col in original_cols[1:-1]]
    cols = dict(zip(original_cols[:-1], colnames))
    df = df.drop("OBJECTID", axis=1).rename(columns=cols).iloc[5:]

    # Rows with no measurement at all carry no information; filling them with
    # medians would only fabricate data, so they are removed first.
    all_missing = df.drop("date", axis=1).isnull().all(axis=1)
    # Explicit copy so the column assignments below write to an independent
    # frame rather than to a slice of the caller's data.
    df = df.loc[~all_missing].copy()

    for col in df.columns.tolist()[1:]:
        # Parse before taking the median: the raw columns may hold numeric
        # strings, on which a median cannot be computed reliably.
        numeric = pd.to_numeric(df[col])
        # Plain assignment instead of `df[col].fillna(..., inplace=True)`,
        # which acts on a temporary object and may leave `df` unchanged.
        df[col] = numeric.fillna(numeric.median())
    return df

In [4]:
# Read and clean the three yearly exports, one file at a time, in
# chronological order.
df_vitry_2019, df_vitry_2020, df_vitry_2021 = (
    process_data(pd.read_csv(csv_path))
    for csv_path in (
        "./data/2019_VITRY.csv",
        "./data/2020_VITRY.csv",
        "./data/2021_VITRY.csv",
    )
)

In [5]:
# Stack the three cleaned years into a single table.
df_vitry = pd.concat([df_vitry_2019, df_vitry_2020, df_vitry_2021])
# Keep only the first two ":"-separated parts of each timestamp and use "-"
# instead of "/" as the date separator.
df_vitry["date"] = df_vitry["date"].map(
    lambda stamp: ":".join(stamp.split(":")[:2]).replace("/", "-")
)
# Replace the per-file index labels with one continuous 0..n-1 index.
df_vitry = df_vitry.reset_index(drop=True)
df_vitry

Unnamed: 0,date,PM10,PM25,NO2,SO2,NO,NOX,O3
0,2019-01-01 01:00,5.9,9.2,18.6,2.4,1.3,20.7,41.8
1,2019-01-01 02:00,5.4,10.3,19.7,2.3,1.4,21.9,39.8
2,2019-01-01 03:00,8.6,12.9,24.3,2.1,0.7,25.4,32.8
3,2019-01-01 04:00,10.2,12.5,25.4,2.6,1.0,27.0,36.6
4,2019-01-01 05:00,11.1,6.9,18.2,3.1,0.6,19.2,48.9
...,...,...,...,...,...,...,...,...
26234,2021-12-30 17:00,11.4,8.1,23.9,0.9,1.5,26.2,25.2
26235,2021-12-30 18:00,10.8,7.5,27.7,0.6,2.3,31.2,20.9
26236,2021-12-30 19:00,11.9,8.4,25.8,0.6,1.1,27.6,21.2
26237,2021-12-30 20:00,11.8,8.8,31.1,0.6,1.1,32.9,14.2


In [20]:
def get_atmo_cat(row):
    """Return the overall ATMO air-quality label for one row of measurements.

    Each of the five pollutants read from ``row`` (``PM25``, ``PM10``,
    ``NO2``, ``SO2``, ``O3``) gets a level from 1 to 6; the row's label is the
    one of the worst (highest) level. Other columns of ``row`` are ignored.

    A pollutant's level is 1 plus the number of thresholds its value exceeds.
    Thresholds are inclusive upper bounds, so 20 is still level 1 for PM10
    while 20.5 is level 2, and anything above the last threshold is level 6.
    The bands are contiguous: every non-negative value maps to exactly one
    level.

    The threshold values look like the Atmo France index bands -- verify
    against the official table before relying on them.

    Parameters
    ----------
    row : mapping
        Anything indexable by the five pollutant names (a DataFrame row, a
        dict, ...).

    Returns
    -------
    str
        One of "bon", "moyen", "dégradé", "mauvais", "très mauvais",
        "extrêmement mauvais"; or "inconnu" when no pollutant has a usable
        (non-missing, non-negative) value.
    """
    # Inclusive upper bounds of levels 1 to 5 for each pollutant.
    upper_bounds = {
        "PM25": (10, 20, 25, 50, 75),
        "PM10": (20, 40, 50, 100, 150),
        "NO2": (40, 90, 120, 230, 340),
        "SO2": (100, 200, 350, 500, 750),
        "O3": (50, 100, 130, 240, 380),
    }
    categories = {1: "bon", 2: "moyen", 3: "dégradé", 4: "mauvais", 5: "très mauvais", 6: "extrêmement mauvais"}

    levels = []
    for pollutant, bounds in upper_bounds.items():
        value = row[pollutant]
        # NaN fails every comparison, so `not (value >= 0)` rejects both NaN
        # and negative readings; such a pollutant has no level and must not
        # take part in the maximum.
        if value is None or not (value >= 0):
            continue
        levels.append(1 + sum(1 for bound in bounds if value > bound))

    if not levels:
        return "inconnu"
    return categories[max(levels)]

In [21]:
# Compute the overall ATMO label of every row and attach it as the "ATMO" column.
df_vitry = df_vitry.assign(ATMO=df_vitry.apply(get_atmo_cat, axis=1))
df_vitry

Unnamed: 0,date,PM10,PM25,NO2,SO2,NO,NOX,O3,ATMO
0,2019-01-01 01:00,5.9,9.2,18.6,2.4,1.3,20.7,41.8,bon
1,2019-01-01 02:00,5.4,10.3,19.7,2.3,1.4,21.9,39.8,moyen
2,2019-01-01 03:00,8.6,12.9,24.3,2.1,0.7,25.4,32.8,moyen
3,2019-01-01 04:00,10.2,12.5,25.4,2.6,1.0,27.0,36.6,moyen
4,2019-01-01 05:00,11.1,6.9,18.2,3.1,0.6,19.2,48.9,bon
...,...,...,...,...,...,...,...,...,...
26234,2021-12-30 17:00,11.4,8.1,23.9,0.9,1.5,26.2,25.2,bon
26235,2021-12-30 18:00,10.8,7.5,27.7,0.6,2.3,31.2,20.9,bon
26236,2021-12-30 19:00,11.9,8.4,25.8,0.6,1.1,27.6,21.2,bon
26237,2021-12-30 20:00,11.8,8.8,31.1,0.6,1.1,32.9,14.2,bon


In [22]:
# Write the labelled table as a semicolon-separated file, without the row index.
df_vitry.to_csv(path_or_buf="./data/final/data-air-quality.csv", index=False, sep=";")