# Engineering

In [5]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import time

from urllib.request import urlopen
import requests
import json
import zipfile

import plotly.graph_objects as go
import plotly.express as px

from netCDF4 import Dataset
import reverse_geocoder as rg

In [6]:
from confidential import secrets

In [30]:
#CONFIG
path_drive="../" #Directory if you use Google Colab and you want to save files
path='../datas_sentinel5/'

# Pour téléchargez les données, veuillez vous enregistrer sur le site ONDA: https://www.onda-dias.eu/cms/
user = secrets.user #Email
password = secrets.password #Password

# Paramètres de téléchargements
Param_API=False #True: Requête vers l'API de ONDA + enregistrement dans un CSV | False: Lecture des CSV enregistrés
Param_Download=False #True: télécharge 1 fichier (i=x) de ONDA. 
Param_Tracking=False #True: write in tracking_files.csv

In [8]:
def f_poly(x):
    poly=[[[elem.replace("((","")]] for elem in x[14:-3].split(")),")]
    poly=[[j.split(" ") for j in poly[i][0][0].split(",")] for i in range(0,len(poly))]
    return [poly]

def timer():
    return '['+datetime.now().strftime("%d/%m/%Y %H:%M:%S")+']'

#Distances
def distance(origin, destination):
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6373 # km

    dlat = np.radians(lat2-lat1)
    dlon = np.radians(lon2-lon1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat1)) \
        * np.cos(np.radians(lat2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = radius * c

    return d

In [9]:
#Vérification existence dossiers
ls=["../datas_sentinel5", "../datas_sentinel5/cleaned", "../datas_sentinel5/archives", "../datas_sentinel5/csv"]
for elem in ls:
    if(os.path.isdir(elem)==0):
        try:
            os.mkdir(elem)
        except OSError:
            print ("Creation of the directory failed")

In [12]:
top=300

#Updating and loaging files
files=os.listdir(path_drive+"datas_sentinel5/csv")
if len(files)!=0:
    #Read more recent file
    ls=[os.path.getmtime(path_drive+"datas_sentinel5/csv/"+elem) for elem in files]
    infos=pd.read_csv(path_drive+"datas_sentinel5/csv/"+files[ls.index(max(ls))], index_col=0)
    infos.creationDate=pd.to_datetime(infos.creationDate)
    #Store max date in a variable
    max_date=infos.creationDate.max()
    
    #If Timedelta > 1 day & Param_API is ON : We start API
    if (pd.Timestamp.today(tz="UTC")-max_date >= pd.Timedelta('1 days')) & Param_API:
        url="https://catalogue.onda-dias.eu/dias-catalogue/Products?$search=%22name:S5P_OFFL_L2__NO2*%22&$top="+str(top)+"&$orderby=creationDate%20desc&$skip=0&$format=json"

        response=requests.get(url)
        response=json.loads(response.content.decode('utf-8'))
        #Cleaning
        keys=["id","name","creationDate","beginPosition","size","downloadable","offline","footprint"]
        infos=[[elem.get(key) for key in keys] for elem in response["value"]]
        infos=pd.DataFrame(infos,columns=keys)
        infos.creationDate=pd.to_datetime(infos.creationDate)
        infos.beginPosition=pd.to_datetime(infos.beginPosition)
        #Remove when creationDate < max_date
        infos=infos[infos.creationDate >= max_date]
        #Save as CSV
        infos.to_csv(path_drive+"datas_sentinel5/csv/infos_"+infos.creationDate.dt.strftime('%Y_%m_%d_%H_%M')[len(infos.creationDate)-1]+"_to_"+infos.creationDate.dt.strftime('%Y_%m_%d_%H_%M')[0]+".csv")
        del(infos)

In [32]:
for full_loop in range(0,1):
    global_start = time.time()
    infos=pd.DataFrame()
    ls=[i for i in os.listdir(path_drive+"datas_sentinel5/csv") if os.path.isfile(os.path.join(path_drive+"datas_sentinel5/csv",i)) and 'infos' in i]
    for elem in ls:
        df=pd.read_csv(path_drive+"datas_sentinel5/csv/"+elem, index_col=0)
        df.creationDate=pd.to_datetime(df.creationDate)
        df.beginPosition=pd.to_datetime(df.beginPosition)
        infos=pd.concat([infos, df], axis=0)
      
    infos=infos.reset_index(drop=True)

    #Group by day
    calendar=infos.groupby(infos.beginPosition.dt.floor("D")).sum()

    #Add absent dates from the last date to today
    while pd.Timestamp.today(tz="UTC").floor("D")-calendar.asfreq('D').index.max()>pd.Timedelta('1 days'):
        calendar_add=pd.DataFrame([[calendar.index.max()+pd.Timedelta('1 days'),0,0,0]], columns=['beginPosition','size','downloadable','offline'])
        calendar_add=calendar_add.set_index("beginPosition")
        calendar=pd.concat([calendar,calendar_add])

    #Adds absent dates and replaces with 0
    calendar=calendar.asfreq('D').fillna(0)

    #Add day of week
    calendar["dayofweek"]=calendar.index.dayofweek.values

  #As categorical
    calendar["categorical"]=0
    calendar.loc[calendar.downloadable > 0,"categorical"]=1
    calendar.loc[calendar.downloadable >= 12,"categorical"]=2

    if os.path.isfile(path_drive+"datas_sentinel5/tracking_files.csv"):
        tracking_files=pd.read_csv(path_drive+"datas_sentinel5/tracking_files.csv", index_col=0)
        tracking_files.date=pd.to_datetime(tracking_files.date)
        tracking_files=tracking_files.drop_duplicates()
    else:
        tracking_files=pd.DataFrame([[pd.Timestamp.today(tz="UTC").floor("D")-pd.Timedelta('31 days'),0]], columns=["date","number"])

    #Group by day
    tracking_files=tracking_files.groupby(tracking_files.date.dt.floor("D")).sum()

    #Add absent dates from the last date to today
    tracking_files_add=pd.DataFrame([[calendar.index.min(),0],[calendar.index.max(),0]], columns=["date","number"]).set_index("date")
    tracking_files=pd.concat([tracking_files_add,tracking_files])
    tracking_files=tracking_files.sort_index()
    tracking_files=tracking_files.asfreq('D').fillna(0)

    #Combine to calendar
    calendar["number"]=tracking_files["number"]
    calendar["new_available"]=calendar.downloadable-calendar.number
    #As categorical
    calendar["categorical"]=0
    calendar.loc[calendar.new_available > 0,"categorical"]=1
    calendar.loc[calendar.new_available >= 12,"categorical"]=2

    if pd.isnull(calendar[calendar.categorical==2].index.max()):
        sys.exit("Aucunes nouvelles données à télécharger")

    #Read files in Archives and Cleansed -> Remove thoses files from the list
    remove_infos=os.listdir(path+"cleaned")
    remove_infos=[elem[:-4]+".zip" for elem in remove_infos]
    infos=infos[~infos.name.isin(remove_infos)]

    #Create link to download
    infos["urls"]="https://catalogue.onda-dias.eu/dias-catalogue/Products("+infos.id+")/$value"
    infos=infos.drop_duplicates()
    infos=infos.sort_values("beginPosition", ascending=False)
    infos=infos[infos.beginPosition.dt.floor("D")==calendar[calendar.categorical==2].index.max()]
    infos=infos.drop_duplicates('name')
    infos=infos.reset_index(drop=True)

    i=list(range(0,infos.shape[0]))

    if Param_Download==True:
        #Check if file doesn't already exist
        for k in i:
            ls=[elem[:-3] for elem in os.listdir(path)]
            if(infos.loc[k,"name"][:-4] in ls)==False:

                print(timer()+'[INFO] Beginning download file '+ infos.name[k])
                r = requests.get(infos.loc[k,"urls"], auth=(user, password))
                print(timer()+'[INFO] Code '+str(r.status_code))
                if r.status_code != 200:
                    while r.status_code != 200:
                        print('\033[1;31;48m'+timer()+'[ERROR] Error '+str(r.status_code)+'. Retry in 1 minute')
                        time.sleep(60)
                        r = requests.get(infos.loc[k,"urls"], auth=(user, password))

                with open(path+infos.loc[k,"name"], 'wb') as f:
                    f.write(r.content)
                print('\033[0m'+timer()+'[INFO] Unzip file')
                with zipfile.ZipFile(path+infos.loc[k,"name"], 'r') as zip_ref:
                    zip_ref.extractall(path)
                print(timer()+'[INFO] Delete zipfile')
                os.remove(path+infos.loc[k,"name"])
                print(timer()+'[Success] The file '+infos.name[k]+' has been downloaded and unzipped.')
                #time.sleep(5)

    if Param_Tracking==True:
        #Save the number of files downloaded in the the tracking file
        if os.path.isfile(path_drive+"datas_sentinel5/tracking_files.csv"):
            temp=pd.read_csv(path_drive+"datas_sentinel5/tracking_files.csv", index_col=0)
            temp_add=pd.DataFrame([[infos.beginPosition.max().floor("D"),infos.shape[0]]], columns=["date","number"])
            temp=pd.concat([temp, temp_add])
            temp.to_csv(path_drive+"datas_sentinel5/tracking_files.csv")
            del(temp, temp_add)
        else:
            pd.DataFrame([[infos.beginPosition.max().floor("D"),infos.shape[0]]], columns=["date","number"]).to_csv(path_drive+"datas_sentinel5/tracking_files.csv")


    #Save full data in dict
    rootgrp=dict()
    for k in i:
        rootgrp[k]=Dataset(path+infos.loc[k,"name"][:-3]+"nc", "r", format="NETCDF4")

    #Rework data as Dataframe
    df_sat=dict()
    for k in i:
        lon_x=rootgrp[k].groups["PRODUCT"].variables["longitude"][0].data.flatten()
        lat_x=rootgrp[k].groups["PRODUCT"].variables["latitude"][0].data.flatten()
        z_value=rootgrp[k].groups["PRODUCT"].variables["nitrogendioxide_tropospheric_column"][0].data.flatten()
        qa_value=rootgrp[k].groups["PRODUCT"].variables["qa_value"][0].data.flatten()
      
        df_sat[k]=pd.DataFrame({'longitude': lon_x, 'latitude': lat_x, 'NO2':z_value,'quality':qa_value})

    for k in i:
        df_sat[k]["date"]=np.full((450,rootgrp[k].groups["PRODUCT"].variables["time_utc"].shape[1]), rootgrp[k].groups["PRODUCT"].variables["time_utc"][0]).flatten("F")

    #Get locations infos from latitude and longitude
    coordinates=dict()
    results=dict()
    start_time = time.time()
    for k in i:
        coordinates[k] =list(zip(df_sat[k]["latitude"], df_sat[k]["longitude"]))
        results[k] =rg.search(coordinates[k])
      
    print(timer()+'[INFO] Linked with countries in %s seconds ---' % (time.time() - start_time))

    #Merge columns created
    for k in i:
        results[k]=pd.DataFrame.from_dict(results[k])
        results[k]=results[k].rename(columns={"lat":"cc_lat","lon":"cc_lon","name":"cc_ville","admin1":"cc_region","admin2":"cc_departement","cc":"cc_pays"})

    for k in i:
        df_sat[k]=pd.concat([df_sat[k], results[k]], axis=1)
        #Convert to float
        df_sat[k].cc_lon=df_sat[k].cc_lon.astype(float)
        df_sat[k].cc_lat=df_sat[k].cc_lat.astype(float)

    for k in i:
        #Compute distances
        df_sat[k]["dist"]=distance([df_sat[k].latitude, df_sat[k].longitude], [df_sat[k].cc_lat, df_sat[k].cc_lon])
        #Keep country infos only if dist <= 30
        df_sat[k].loc[df_sat[k].dist > 30, ["cc_ville","cc_region", "cc_region", "cc_departement", "cc_pays"]]=float("NaN")
        #Remove columns generated
        df_sat[k]=df_sat[k].drop(['cc_lat', 'cc_lon', "dist"], axis=1)

    df_plot=pd.DataFrame()
    for k in i:
        df_sat[k].to_csv(path+"cleaned/"+infos.name[k][:-3]+"csv")
        rootgrp[k].close()
        #os.remove(path+infos.name[k][:-3]+"nc")

    #Make some RAM space:
    del(df_sat, coordinates, lat_x, lon_x, rootgrp, results, start_time)

    #files_list=os.listdir("datas_sentinel5/cleaned")
    files_list=[i for i in os.listdir(path+"cleaned") if os.path.isfile(os.path.join(path+"cleaned/",i)) and 'S5P_OFFL_L2__NO2' in i]
    i=list(range(0,len(files_list)))

    #Row bind for plot
    df_plot=pd.DataFrame()

    for k in range(0,len(files_list)):
        df_sat=pd.read_csv(path+"cleaned/"+files_list[k], index_col=0)
        df_sat.date=pd.to_datetime(df_sat.date)
        df_plot=pd.concat([df_plot, df_sat])
        del(df_sat)
        #Restrictions quality
        df_plot=df_plot[df_plot["quality"]>=0.75]

    #It is mandatory to replace NaN by "Undefined" to not loose datas
    df_plot=df_plot.fillna("Undefined")
    #Add Counter too
    df_plot["counter"]=1

    #Time values
    df_plot["hour_mean"]=df_plot.date.dt.hour
    df_plot["hour_std"]=df_plot.date.dt.hour
    df_plot["day_std"]=df_plot.date.dt.day
    df_plot["day_mean"]=df_plot.date.dt.day
    df_plot["dayofweek_std"]=df_plot.date.dt.dayofweek
    df_plot["dayofweek_mean"]=df_plot.date.dt.dayofweek
    df_plot["week"]=df_plot.date.dt.week
    df_plot["month"]=df_plot.date.dt.month
    df_plot["year"]=df_plot.date.dt.year

    #Aggregation
    df_plot=df_plot.groupby(["year","month","week","cc_pays","cc_departement","cc_region","cc_ville"], as_index=False).agg({'longitude':'mean', 'latitude':'mean', 'NO2':'mean', 'quality':'mean', 'hour_mean':'mean', 'hour_std':'std', 'dayofweek_mean':'mean', 'dayofweek_std':'std', 'day_mean':'mean', 'day_std':'std','counter':'sum'})

    from datetime import date
    filename="archived_"+str(df_plot.year.value_counts()[df_plot.year.value_counts()==df_plot.year.value_counts().max()].index[0])+"_"+str(df_plot.month.value_counts()[df_plot.month.value_counts()==df_plot.month.value_counts().max()].index[0])+"_"+str(int(df_plot.day_mean.value_counts()[df_plot.day_mean.value_counts()==df_plot.day_mean.value_counts().max()].index[0]))+".csv"
    #Save files
    df_plot.to_csv(path_drive+"datas_sentinel5/archives/"+filename)

    #Cleaning
    del(df_plot)
    for k in range(0,len(files_list)):
        os.remove(path+"cleaned/"+files_list[k])

    print(timer()+'[SUCCESS] The day have been downloaded and cleaned in '+str((time.time() - global_start)/60))

FileNotFoundError: [Errno 2] No such file or directory: b'../datas_sentinel5/S5P_OFFL_L2__NO2____20200420T201750_20200420T215920_13059_01_010302_20200422T125917.nc'

In [0]:
i=list(range(0,infos.shape[0]))
path='datas_sentinel5/'

if Param_Download==True:
    #Check if file doesn't already exist
    for k in i:
        ls=[elem[:-3] for elem in os.listdir("datas_sentinel5")]
        if(infos.loc[k,"name"][:-4] in ls)==False:

            print(timer()+'[INFO] Beginning download file '+ infos.name[k])
            r = requests.get(infos.loc[k,"urls"], auth=(user, password))
            print(timer()+'[INFO] Code '+str(r.status_code))
            if r.status_code != 200:
                while r.status_code != 200:
                    print('\033[1;31;48m'+timer()+'[ERROR] Error '+str(r.status_code)+'. Retry in 1 minute')
                    time.sleep(60)
                    r = requests.get(infos.loc[k,"urls"], auth=(user, password))

            with open(path+infos.loc[k,"name"], 'wb') as f:
                f.write(r.content)
            print('\033[0m'+timer()+'[INFO] Unzip file')
            with zipfile.ZipFile(path+infos.loc[k,"name"], 'r') as zip_ref:
                zip_ref.extractall(path)
            print(timer()+'[INFO] Delete zipfile')
            os.remove(path+infos.loc[k,"name"])
            print(timer()+'[Success] The file '+infos.name[k]+' has been downloaded and unzipped.')
            #time.sleep(5)