In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
# Lift the column limit so wide DataFrames are displayed without column truncation.
pd.set_option('display.max_columns', None)

In [3]:
# Read the semicolon-separated CSV file into a DataFrame.
df = pd.read_csv("./data/FR_E2_2021-01-01.csv", sep=';')

In [4]:
# Source column headers mapped to short snake_case names. The keys double as the
# list of columns to keep, so the selection and the renaming cannot drift apart.
col_names = {
    "Date de début": "date_debut",
    "Date de fin": "date_fin",
    "nom site": "site",
    "Polluant": "polluant",
    "valeur brute": "value",
    "unité de mesure": "unit",
    "validité": "valid"
}
# Keep only the "ZAG PARIS" rows and the columns of interest. Rows and columns are
# selected in a single .loc call (no chained indexing), and the result is built as
# a new frame instead of being renamed / re-indexed in place.
df_paris = (
    df.loc[df["Zas"] == "ZAG PARIS", list(col_names)]
    .rename(columns=col_names)
    .reset_index(drop=True)
)
df_paris

Unnamed: 0,date_debut,date_fin,site,polluant,value,unit,valid
0,2021/01/01 00:00:00,2021/01/01 01:00:00,GENNEVILLIERS,NO,25.025,µg-m3,1
1,2021/01/01 01:00:00,2021/01/01 02:00:00,GENNEVILLIERS,NO,26.375,µg-m3,1
2,2021/01/01 02:00:00,2021/01/01 03:00:00,GENNEVILLIERS,NO,21.825,µg-m3,1
3,2021/01/01 03:00:00,2021/01/01 04:00:00,GENNEVILLIERS,NO,20.050,µg-m3,1
4,2021/01/01 04:00:00,2021/01/01 05:00:00,GENNEVILLIERS,NO,23.300,µg-m3,1
...,...,...,...,...,...,...,...
3667,2021/01/01 19:00:00,2021/01/01 20:00:00,Bld peripherique Est,PM2.5,20.925,µg-m3,1
3668,2021/01/01 20:00:00,2021/01/01 21:00:00,Bld peripherique Est,PM2.5,15.725,µg-m3,1
3669,2021/01/01 21:00:00,2021/01/01 22:00:00,Bld peripherique Est,PM2.5,26.275,µg-m3,1
3670,2021/01/01 22:00:00,2021/01/01 23:00:00,Bld peripherique Est,PM2.5,25.200,µg-m3,1


In [5]:
def mgm3_to_ugm3(value, unit):
    """Scale a measurement given in mg-m3 to µg-m3.

    Parameters
    ----------
    value : number
        The measured value.
    unit : str
        The unit label attached to ``value``.

    Returns
    -------
    number
        ``value * 1000`` when ``unit`` is exactly "mg-m3"; otherwise ``value``
        unchanged.
    """
    return value * 1000 if unit == "mg-m3" else value

# Convert each value according to the unit stored on the same row. The two columns
# are walked in lockstep, so this does not depend on the index labels being
# exactly 0..n-1 (a label-based `.loc[idx, ...]` lookup over range(len(...)) does)
# and avoids two scalar lookups per row.
df_paris["value"] = [
    mgm3_to_ugm3(value, unit)
    for value, unit in zip(df_paris["value"], df_paris["unit"])
]
# Every row is now labelled µg-m3; only rows that were in mg-m3 had their value scaled.
df_paris["unit"] = "µg-m3"
df_paris

Unnamed: 0,date_debut,date_fin,site,polluant,value,unit,valid
0,2021/01/01 00:00:00,2021/01/01 01:00:00,GENNEVILLIERS,NO,25.025,µg-m3,1
1,2021/01/01 01:00:00,2021/01/01 02:00:00,GENNEVILLIERS,NO,26.375,µg-m3,1
2,2021/01/01 02:00:00,2021/01/01 03:00:00,GENNEVILLIERS,NO,21.825,µg-m3,1
3,2021/01/01 03:00:00,2021/01/01 04:00:00,GENNEVILLIERS,NO,20.050,µg-m3,1
4,2021/01/01 04:00:00,2021/01/01 05:00:00,GENNEVILLIERS,NO,23.300,µg-m3,1
...,...,...,...,...,...,...,...
3667,2021/01/01 19:00:00,2021/01/01 20:00:00,Bld peripherique Est,PM2.5,20.925,µg-m3,1
3668,2021/01/01 20:00:00,2021/01/01 21:00:00,Bld peripherique Est,PM2.5,15.725,µg-m3,1
3669,2021/01/01 21:00:00,2021/01/01 22:00:00,Bld peripherique Est,PM2.5,26.275,µg-m3,1
3670,2021/01/01 22:00:00,2021/01/01 23:00:00,Bld peripherique Est,PM2.5,25.200,µg-m3,1


In [6]:
# Distinct sites, pollutants and start dates, each listed in the order produced by
# value_counts() (most frequent first).
all_sites, all_polluants, all_start_dates = (
    df_paris[column].value_counts().index.tolist()
    for column in ("site", "polluant", "date_debut")
)

In [7]:
def get_enddate(startdate):
    """Return the timestamp one hour after ``startdate``.

    The previous implementation built the hour as ``"0" + str(hour + 1)``, which
    produced malformed values such as "010:00:00" for a 09:00 start and
    "024:00:00" for a 23:00 start. Real datetime arithmetic keeps the hour
    zero-padded to two digits and rolls over to the next day when needed.

    Parameters
    ----------
    startdate : str
        Timestamp formatted as "YYYY/MM/DD HH:MM:SS".

    Returns
    -------
    str
        ``startdate`` shifted forward by one hour, in the same format.

    Raises
    ------
    ValueError
        If ``startdate`` does not match the expected format.
    """
    # Local import so the function is self-contained within its cell.
    from datetime import datetime, timedelta

    fmt = "%Y/%m/%d %H:%M:%S"
    return (datetime.strptime(startdate, fmt) + timedelta(hours=1)).strftime(fmt)

In [8]:
# Build one wide record per (site, start date): the identifying fields plus one key
# per pollutant found for that site and start date. A record is appended even when
# no measurement matches, in which case it only carries the identifying fields.
data = []
for site in all_sites:
    site_mask = df_paris["site"] == site
    for startdate in [
        "2021/01/01 00:00:00",
        "2021/01/01 01:00:00",
        "2021/01/01 02:00:00",
    ]:
        row = dict(date_debut=startdate, date_fin=get_enddate(startdate), site=site)
        temp = df_paris[(df_paris["date_debut"] == startdate) & site_mask]
        for polluant in temp["polluant"].value_counts().index.tolist():
            # (A disabled filter once excluded "SO2" and "CO" here; every pollutant is kept.)
            # When a pollutant has several matching rows, only the first value is used.
            row[polluant] = temp.loc[temp["polluant"] == polluant, "value"].iloc[0]
        data.append(row)

In [10]:
# Assemble the per-(site, start date) records into a DataFrame; a pollutant key that
# is absent from a record shows up as a missing value in that row.
test = pd.DataFrame(data)
test

Unnamed: 0,date_debut,date_fin,site,SO2,NO,NO2,O3,NOX as NO2,PM10,PM2.5,CO
0,2021/01/01 00:00:00,2021/01/01 01:00:00,VITRY-SUR-SEINE,1.70000,18.450,42.400,2.45,70.650,50.750,43.525,
1,2021/01/01 01:00:00,2021/01/01 02:00:00,VITRY-SUR-SEINE,1.86667,14.500,39.725,2.10,62.000,44.675,39.625,
2,2021/01/01 02:00:00,2021/01/01 03:00:00,VITRY-SUR-SEINE,1.70000,18.675,38.450,2.10,67.100,42.800,37.875,
3,2021/01/01 00:00:00,2021/01/01 01:00:00,GENNEVILLIERS,,25.025,44.025,,82.450,52.650,52.475,
4,2021/01/01 01:00:00,2021/01/01 02:00:00,GENNEVILLIERS,,26.375,43.300,,83.725,56.500,51.350,
...,...,...,...,...,...,...,...,...,...,...,...
112,2021/01/01 01:00:00,2021/01/01 02:00:00,LES ULIS,,,,20.30,,,,
113,2021/01/01 02:00:00,2021/01/01 03:00:00,LES ULIS,,,,17.95,,,,
114,2021/01/01 00:00:00,2021/01/01 01:00:00,NOGENT-SUR-MARNE,,,,,,43.875,,
115,2021/01/01 01:00:00,2021/01/01 02:00:00,NOGENT-SUR-MARNE,,,,,,41.575,,


In [10]:
# Number of distinct sites, followed by their names in value_counts() order
# (most frequent first). The counts are computed once and reused.
site_counts = df_paris["site"].value_counts()
len(site_counts)
site_counts.index.tolist()

39

['VITRY-SUR-SEINE',
 'GENNEVILLIERS',
 'Auto A1 -Saint-Denis',
 'PARIS 1er Les Halles',
 'TREMBLAY-EN-FRANCE',
 'BOBIGNY',
 'RN6-MELUN',
 'LOGNES',
 'PARIS 18eme',
 'Bld peripherique Est',
 'Av Champs Elysees',
 'VILLEMOMBLE',
 'Place Victor Basch',
 'PARIS Stade Lenglen',
 'LA DEFENSE',
 'MONTGERON',
 'Boulevard Haussmann',
 'RN2-PANTIN',
 'NEUILLY-SUR-SEINE',
 "Place de l'Opéra",
 'PARIS 13eme',
 'CHAMPIGNY-SUR-MARNE',
 'MELUN',
 'GONESSE',
 'MANTES-LA-JOLIE',
 'VERSAILLES',
 'EVRY',
 'RN20 - MONTLHERY',
 'ARGENTEUIL',
 'PARIS 12eme',
 'Quai des Celestins',
 'AUBERVILLIERS',
 'Boulevard Soult',
 'SAINT-DENIS',
 'PARIS 7eme',
 'Rue Bonaparte',
 'CERGY-PONTOISE',
 'LES ULIS',
 'NOGENT-SUR-MARNE']