# **Preprocessing**

In preprocessing the data, the following steps are taken:

> * Prepare packages and setup  
> * Load in the data  
> * Tidy the data and store metadata  
> * Inspect data with various metrics  
> * Inspect data with visualisations 
> * Select locations
> * Select timeframe
> * Feature engineering
> * Perform train-validation-test-split  
> * Normalisation  
> * Create big combined normalised dataframe

#### **Prepare packages and setup**

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['figure.figsize'] = (6, 2) # default figure size for all plots: wide, landscape-shaped

"Global" variables (we're still in a notebook)

In [None]:
DEVICE = 'tinus'                        # for working directory compatibility

# =============================================================================

SUBSET_MONTHS = bool(1)                 # If true, only the month range START_MON..END_MON below is
                                        # used for the training, validation and testing set
START_MON = '08'                        # starting month for the data
END_MON = '12'                          # ending month for the data

# =============================================================================

# Sensor locations in the case of Utrecht area:
DE_BILT = 'S260'                        # starting (and only used) location for meteorological data
TUINDORP = 'NL10636'                    # starting location for contamination data
BREUKELEN = 'NL10641'                   # 'goal' location for contamination data

# =============================================================================

# At multiple locations, a (commented-out) sys.exit() can be enabled to halt the script early

#### **Load in the data**

In [None]:
# Encoding is in ISO-8859-15: https://data.rivm.nl/data/luchtmeetnet/readme.pdf

def read_contaminant_csv_from_data_raw(component, year):
    """Read the raw contaminant CSV of one component and year into a dataframe.

    The file is looked up as ``../data/data_raw/<year>_<component>.csv``
    relative to the working directory. It is parsed as ';'-separated,
    ISO-8859-15 encoded text, skipping the first 9 rows (metadata).
    When DEVICE is 'tinus', the working directory is first changed to the
    author's preprocessing folder.
    """
    on_authors_machine = DEVICE == 'tinus'
    if on_authors_machine:              # adjust/delete this if on another machine
        os.chdir(r"c:\Users\vwold\Documents\thesis\bsc_thesis\preprocessing")

    metadata_rows = 9                   # leading rows that hold metadata, not measurements
    csv_path = f"../data/data_raw/{year}_{component}.csv"
    return pd.read_csv(csv_path, sep=';', encoding='ISO-8859-15',
                       skiprows=metadata_rows)


def read_meteo_csv_from_data_raw(year):
    """Read the raw Utrecht meteo CSV of one year into a dataframe.

    The file is looked up as ``../data/data_raw/<year>_meteo_Utrecht.csv``
    relative to the working directory. It is parsed as ';'-separated UTF-8
    text with the first column used as the index. When DEVICE is 'tinus',
    the working directory is first changed to the author's preprocessing
    folder.
    """
    on_authors_machine = DEVICE == 'tinus'
    if on_authors_machine:              # adjust/delete this if on another machine
        os.chdir(r"c:\Users\vwold\Documents\thesis\bsc_thesis\preprocessing")

    csv_path = f"../data/data_raw/{year}_meteo_Utrecht.csv"
    return pd.read_csv(csv_path, sep=';', encoding='UTF-8', index_col=0)
                                                          
                                                            
def read_four_contaminants(year, contaminants):
    """Read the raw CSVs of the first four entries of `contaminants` for `year`.

    Returns a 4-tuple of dataframes in the same order as `contaminants`.
    """
    return tuple(
        read_contaminant_csv_from_data_raw(contaminants[position], year)
        for position in range(4)
    )


def read_two_meteo_years(yr1, yr2):
    """Read the raw meteo CSVs of two years; returns them as a (yr1, yr2) pair."""
    first_year = read_meteo_csv_from_data_raw(yr1)
    second_year = read_meteo_csv_from_data_raw(yr2)
    return first_year, second_year

In [None]:
# Components to load; the order must match the unpacking order used below
contaminants = ['PM25', 'PM10', 'O3', 'NO2']

# One raw dataframe per contaminant and per year (2016 up to and including 2023)
df_PM25_2016_raw, df_PM10_2016_raw, df_O3_2016_raw, df_NO2_2016_raw = \
    read_four_contaminants(2016, contaminants)
df_PM25_2017_raw, df_PM10_2017_raw, df_O3_2017_raw, df_NO2_2017_raw = \
    read_four_contaminants(2017, contaminants)
df_PM25_2018_raw, df_PM10_2018_raw, df_O3_2018_raw, df_NO2_2018_raw = \
    read_four_contaminants(2018, contaminants)
df_PM25_2019_raw, df_PM10_2019_raw, df_O3_2019_raw, df_NO2_2019_raw = \
    read_four_contaminants(2019, contaminants)
df_PM25_2020_raw, df_PM10_2020_raw, df_O3_2020_raw, df_NO2_2020_raw = \
    read_four_contaminants(2020, contaminants)
df_PM25_2021_raw, df_PM10_2021_raw, df_O3_2021_raw, df_NO2_2021_raw = \
    read_four_contaminants(2021, contaminants)
df_PM25_2022_raw, df_PM10_2022_raw, df_O3_2022_raw, df_NO2_2022_raw = \
    read_four_contaminants(2022, contaminants)
df_PM25_2023_raw, df_PM10_2023_raw, df_O3_2023_raw, df_NO2_2023_raw = \
    read_four_contaminants(2023, contaminants)

In [None]:
# One raw meteorological dataframe per year (2016 up to and including 2023)
df_meteo_2016_raw = read_meteo_csv_from_data_raw(2016)
df_meteo_2017_raw = read_meteo_csv_from_data_raw(2017)
df_meteo_2018_raw = read_meteo_csv_from_data_raw(2018)
df_meteo_2019_raw = read_meteo_csv_from_data_raw(2019)
df_meteo_2020_raw = read_meteo_csv_from_data_raw(2020)
df_meteo_2021_raw = read_meteo_csv_from_data_raw(2021)
df_meteo_2022_raw = read_meteo_csv_from_data_raw(2022)
df_meteo_2023_raw = read_meteo_csv_from_data_raw(2023)

In [None]:
# Sanity check: show the first row of the raw meteo frames (uncomment more years as needed)
# print(df_meteo_2016_raw.head(1))
# print(df_meteo_2017_raw.head(1))
# print(df_meteo_2018_raw.head(1))
# print(df_meteo_2019_raw.head(1))
# print(df_meteo_2020_raw.head(1))
# print(df_meteo_2021_raw.head(1))
print(df_meteo_2022_raw.head(1))
print(df_meteo_2023_raw.head(1))

#### **Tidy the data and store metadata**

First, tidy the contaminant data

In [None]:
def col_contains_NaN(df, col):
    """Tell whether column `col` of `df` holds at least one missing value."""
    missing_mask = df[col].isna()
    return missing_mask.any()


def get_component(df):
    """Return the first entry of the 'Component' column as a string."""
    first_entry = df['Component'].iloc[0]
    return str(first_entry)


def get_unit(df):
    """Return the first entry of the 'Eenheid' (unit) column as a string."""
    first_entry = df['Eenheid'].iloc[0]
    return str(first_entry)


def get_metadata(df):
    """Collect the contaminant's component name and unit.

    Returns a dict with the keys 'comp' and 'unit'.
    """
    return dict(comp=get_component(df), unit=get_unit(df))


def remove_unuseful_cols(df, cols):
    """Return `df` without the column(s) named in `cols` (a label or list of labels)."""
    return df.drop(columns=cols)


def change_contaminant_date_format(df):
    """Parse the 'Begindatumtijd' column to datetimes and rename it to 'DateTime'.

    The values are first parsed with the format '%Y%m%d %H:%M'; if that
    raises a ValueError, they are parsed as ISO 8601 instead.

    Returns a new dataframe; the passed-in frame is left untouched (the
    column is neither overwritten nor renamed on the caller's object).
    """
    raw_stamps = df['Begindatumtijd']
    try:
        parsed = pd.to_datetime(raw_stamps, format='%Y%m%d %H:%M')
    except ValueError:
        # some files use ISO 8601 timestamps instead of the compact format
        parsed = pd.to_datetime(raw_stamps, format='ISO8601')

    # assign/rename both return new frames, so the input is not mutated
    return (df.assign(Begindatumtijd=parsed)
              .rename(columns={'Begindatumtijd': 'DateTime'}))


def strip_dot1_of_col_names(col_names):
    """Return the column names as a list, each with one trailing '.1' removed.

    The '.1' suffix stems from duplicate column names in the raw data.
    """
    cleaned = []
    for label in np.asarray(col_names):
        cleaned.append(label.removesuffix('.1'))
    return cleaned


def resolve_split_columns(df):
    """Merge sensor columns that were split over two columns in the raw data.

    Column names are first stripped of their '.1' suffix (this renames the
    columns of the passed-in frame itself), after which columns sharing a
    name are summed into one. A position stays NaN only when all of the
    merged columns are NaN there.
    """
    df.columns = strip_dot1_of_col_names(df.columns)

    # groupby works on rows, so transpose first and group the (now row)
    # labels by name, keeping their original order
    as_rows = df.transpose()
    same_name_groups = as_rows.groupby(by=df.columns, sort=False)

    # min_count = 1: an all-NaN group yields NaN instead of 0
    merged = same_name_groups.sum(min_count=1)
    return merged.transpose()


def fill_NaNs_forward(df, axis = 1):
    """Fill NaNs by propagating the last valid value forward along `axis`.

    With the default ``axis = 1`` (the original behaviour) a gap is filled
    from the column to its left, i.e. across columns within one row.
    Pass ``axis = 0`` to fill each column from the previous row instead.

    NOTE(review): for a frame with time in the index and sensors in the
    columns, "copying the last value" in time corresponds to ``axis = 0``;
    confirm which of the two is intended by the callers.
    """
    return df.ffill(axis = axis)


def fill_NaNs_linear(df, limit = 24 * 7):
    """Fill NaNs by linear interpolation.

    `limit` is the maximum number of consecutive NaNs that is filled per
    gap; it defaults to 24 * 7 (a week of hourly samples). In a longer gap
    only the first `limit` NaNs are filled, the remainder stays NaN.
    """
    return df.interpolate(method = 'linear', limit = limit)


def subset_month_range(df, start_mon, end_mon, year):
    """Return the rows of `df` from month `start_mon` through month `end_mon` of `year`.

    Both bounds are inclusive; the row labels are matched via the strings
    '<year>-<start_mon>' and '<year>-<end_mon>'.
    """
    first_month = f'{year}-{start_mon}'
    last_month = f'{year}-{end_mon}'
    return df.loc[first_month:last_month]


def delete_feb_29th(df):
    """Return `df` without the rows whose index falls on the 29th of February."""
    stamps = df.index
    # keep a row when it is not in February or not on the 29th
    keep = (stamps.month != 2) | (stamps.day != 29)
    return df[keep]

def delete_firework_days(df):
    """Return `df` without the rows on the 31st of December and the 1st of January."""
    stamps = df.index
    not_new_years_eve = (stamps.month != 12) | (stamps.day != 31)
    not_new_years_day = (stamps.month != 1) | (stamps.day != 1)
    return df[not_new_years_eve & not_new_years_day]


def delete_empty_columns(df, max_nan_fraction = 0.25):
    """Drop the columns in which more than `max_nan_fraction` of the rows is NaN.

    With the default of 0.25 a column is kept when at least 75% of its
    values are present, which matches the original hard-coded behaviour.
    Returns a new dataframe.
    """
    n_rows = df.shape[0]
    # dropna expects an integer count of required non-NaN values; derive it
    # from the number of NaNs that is still tolerated
    tolerated_nans = int(n_rows * max_nan_fraction)
    return df.dropna(thresh = n_rows - tolerated_nans, axis = 1)


def tidy_raw_contaminant_data(df, year, fill_NaNs = True):
    """Turn one raw contaminant frame into a tidy, DateTime-indexed frame.

    Steps, in order: strip whitespace from the column names, drop the
    non-measurement columns, parse the timestamps and use them as index,
    merge sensor columns that were split in two, optionally interpolate
    NaNs (`fill_NaNs`), optionally keep only START_MON..END_MON of `year`
    (when SUBSET_MONTHS is set), drop Feb 29th and the firework days, and
    finally drop the columns that are still too empty.
    """
    # NOTE: this renames the columns of the passed-in frame itself
    df.columns = df.columns.str.strip()

    unused_cols = ['Component', 'Bep.periode', 'Eenheid', 'Einddatumtijd']
    tidy = remove_unuseful_cols(df, unused_cols)

    # parse timestamps (column becomes 'DateTime') and index the frame by them
    tidy = change_contaminant_date_format(tidy)
    tidy = tidy.set_index('DateTime', drop = True)

    # join sensor data that was split over two columns
    tidy = resolve_split_columns(tidy)

    if fill_NaNs:
        tidy = fill_NaNs_linear(tidy)
    if SUBSET_MONTHS:
        tidy = subset_month_range(tidy, START_MON, END_MON, year)

    # remove Feb 29th, then Dec 31st / Jan 1st
    for drop_days in (delete_feb_29th, delete_firework_days):
        tidy = drop_days(tidy)

    # columns that remained too empty after interpolating are dropped last
    return delete_empty_columns(tidy)

In [None]:
# Store component name and unit per contaminant and year; this is done on the raw
# frames because tidying drops the 'Component' and 'Eenheid' columns
PM25_2016_meta = get_metadata(df_PM25_2016_raw)
PM10_2016_meta = get_metadata(df_PM10_2016_raw)
O3_2016_meta   = get_metadata(df_O3_2016_raw)
NO2_2016_meta  = get_metadata(df_NO2_2016_raw)
PM25_2017_meta = get_metadata(df_PM25_2017_raw)
PM10_2017_meta = get_metadata(df_PM10_2017_raw)
O3_2017_meta   = get_metadata(df_O3_2017_raw)
NO2_2017_meta  = get_metadata(df_NO2_2017_raw)
PM25_2018_meta = get_metadata(df_PM25_2018_raw)
PM10_2018_meta = get_metadata(df_PM10_2018_raw)
O3_2018_meta   = get_metadata(df_O3_2018_raw)
NO2_2018_meta  = get_metadata(df_NO2_2018_raw)
PM25_2019_meta = get_metadata(df_PM25_2019_raw)
PM10_2019_meta = get_metadata(df_PM10_2019_raw)
O3_2019_meta   = get_metadata(df_O3_2019_raw)
NO2_2019_meta  = get_metadata(df_NO2_2019_raw)
PM25_2020_meta = get_metadata(df_PM25_2020_raw)
PM10_2020_meta = get_metadata(df_PM10_2020_raw)
O3_2020_meta   = get_metadata(df_O3_2020_raw)
NO2_2020_meta  = get_metadata(df_NO2_2020_raw)
PM25_2021_meta = get_metadata(df_PM25_2021_raw)
PM10_2021_meta = get_metadata(df_PM10_2021_raw)
O3_2021_meta   = get_metadata(df_O3_2021_raw)
NO2_2021_meta  = get_metadata(df_NO2_2021_raw)
PM25_2022_meta = get_metadata(df_PM25_2022_raw)
PM10_2022_meta = get_metadata(df_PM10_2022_raw)
O3_2022_meta   = get_metadata(df_O3_2022_raw)
NO2_2022_meta  = get_metadata(df_NO2_2022_raw)
PM25_2023_meta = get_metadata(df_PM25_2023_raw)
PM10_2023_meta = get_metadata(df_PM10_2023_raw)
O3_2023_meta   = get_metadata(df_O3_2023_raw)
NO2_2023_meta  = get_metadata(df_NO2_2023_raw)

# Tidy every raw contaminant frame; the year is passed as a string because it is
# used to build the month-range labels when SUBSET_MONTHS is set
df_PM25_2016_tidy = tidy_raw_contaminant_data(df_PM25_2016_raw, '2016')
df_PM10_2016_tidy = tidy_raw_contaminant_data(df_PM10_2016_raw, '2016')
df_O3_2016_tidy   = tidy_raw_contaminant_data(df_O3_2016_raw, '2016')
df_NO2_2016_tidy  = tidy_raw_contaminant_data(df_NO2_2016_raw, '2016')
df_PM25_2017_tidy = tidy_raw_contaminant_data(df_PM25_2017_raw, '2017')
df_PM10_2017_tidy = tidy_raw_contaminant_data(df_PM10_2017_raw, '2017')
df_O3_2017_tidy   = tidy_raw_contaminant_data(df_O3_2017_raw, '2017')
df_NO2_2017_tidy  = tidy_raw_contaminant_data(df_NO2_2017_raw, '2017')
df_PM25_2018_tidy = tidy_raw_contaminant_data(df_PM25_2018_raw, '2018')
df_PM10_2018_tidy = tidy_raw_contaminant_data(df_PM10_2018_raw, '2018')
df_O3_2018_tidy   = tidy_raw_contaminant_data(df_O3_2018_raw, '2018')
df_NO2_2018_tidy  = tidy_raw_contaminant_data(df_NO2_2018_raw, '2018')
df_PM25_2019_tidy = tidy_raw_contaminant_data(df_PM25_2019_raw, '2019')
df_PM10_2019_tidy = tidy_raw_contaminant_data(df_PM10_2019_raw, '2019')
df_O3_2019_tidy   = tidy_raw_contaminant_data(df_O3_2019_raw, '2019')
df_NO2_2019_tidy  = tidy_raw_contaminant_data(df_NO2_2019_raw, '2019')
df_PM25_2020_tidy = tidy_raw_contaminant_data(df_PM25_2020_raw, '2020')
df_PM10_2020_tidy = tidy_raw_contaminant_data(df_PM10_2020_raw, '2020')
df_O3_2020_tidy   = tidy_raw_contaminant_data(df_O3_2020_raw, '2020')
df_NO2_2020_tidy  = tidy_raw_contaminant_data(df_NO2_2020_raw, '2020')
df_PM25_2021_tidy = tidy_raw_contaminant_data(df_PM25_2021_raw, '2021')
df_PM10_2021_tidy = tidy_raw_contaminant_data(df_PM10_2021_raw, '2021')
df_O3_2021_tidy   = tidy_raw_contaminant_data(df_O3_2021_raw, '2021')
df_NO2_2021_tidy  = tidy_raw_contaminant_data(df_NO2_2021_raw, '2021')
df_PM25_2022_tidy = tidy_raw_contaminant_data(df_PM25_2022_raw, '2022')
df_PM10_2022_tidy = tidy_raw_contaminant_data(df_PM10_2022_raw, '2022')
df_O3_2022_tidy   = tidy_raw_contaminant_data(df_O3_2022_raw, '2022')
df_NO2_2022_tidy  = tidy_raw_contaminant_data(df_NO2_2022_raw, '2022')
df_PM25_2023_tidy = tidy_raw_contaminant_data(df_PM25_2023_raw, '2023')
df_PM10_2023_tidy = tidy_raw_contaminant_data(df_PM10_2023_raw, '2023')
df_O3_2023_tidy   = tidy_raw_contaminant_data(df_O3_2023_raw, '2023')
df_NO2_2023_tidy  = tidy_raw_contaminant_data(df_NO2_2023_raw, '2023')

In [None]:
# Shapes (rows, columns) of the tidy contaminant frames; 2016 and 2019 are skipped here
# print(df_PM25_2016_tidy.shape)
# print(df_PM10_2016_tidy.shape)
# print(df_O3_2016_tidy.shape)
# print(df_NO2_2016_tidy.shape)
print(df_PM25_2017_tidy.shape)
print(df_PM10_2017_tidy.shape)
print(df_O3_2017_tidy.shape)
print(df_NO2_2017_tidy.shape)
print(df_PM25_2018_tidy.shape)
print(df_PM10_2018_tidy.shape)
print(df_O3_2018_tidy.shape)
print(df_NO2_2018_tidy.shape)
# print(df_PM25_2019_tidy.shape)
# print(df_PM10_2019_tidy.shape)
# print(df_O3_2019_tidy.shape)
# print(df_NO2_2019_tidy.shape)
print(df_PM25_2020_tidy.shape)
print(df_PM10_2020_tidy.shape)
print(df_O3_2020_tidy.shape)
print(df_NO2_2020_tidy.shape)
print(df_PM25_2021_tidy.shape)
print(df_PM10_2021_tidy.shape)
print(df_O3_2021_tidy.shape)
print(df_NO2_2021_tidy.shape)
print(df_PM25_2022_tidy.shape)
print(df_PM10_2022_tidy.shape)
print(df_O3_2022_tidy.shape)
print(df_NO2_2022_tidy.shape)
print(df_PM25_2023_tidy.shape)
print(df_PM10_2023_tidy.shape)
print(df_O3_2023_tidy.shape)
print(df_NO2_2023_tidy.shape)

Second, tidy the meteorological data

In [None]:
def change_meteo_date_format(df):
    """Combine the 'YYYYMMDD' and 'HH' columns into one 'DateTime' column.

    Both columns are converted to strings, joined with a space and parsed
    with the format '%Y%m%d %H'. The result has 'YYYYMMDD' and 'HH'
    removed and 'DateTime' appended as the last column.

    Returns a new dataframe; the passed-in frame is left untouched (no
    'DateTime' column is added to the caller's object).
    """
    date_and_hour = df['YYYYMMDD'].astype(str) + ' ' + df['HH'].astype(str)
    stamps = pd.to_datetime(date_and_hour, format = '%Y%m%d %H')

    # drop first, then append, so the column order matches the in-place version
    return df.drop(columns = ['YYYYMMDD', 'HH']).assign(DateTime = stamps)


def replace_WD_990_with_NaN(df, col):
    """Replace every 990 in column `col` with NaN.

    The column is overwritten on the passed-in frame, which is also returned.
    """
    without_990 = df[col].replace(to_replace = 990, value = np.nan)
    df[col] = without_990
    return df


def extract_tidy_raw_meteo_data(df, col, only_260, year, fill_NaNs = True):
    """Extract one tidy meteo variable for station 260 from a raw meteo frame.

    Steps, in order: strip whitespace from the column names, rename
    '# STN' to 'STN' if present, drop the unused variables, shift the hour
    from the 1-24 to the 0-23 range, build a DateTime index, keep only
    `col` and 'STN', replace 990 by NaN when `col` is 'DD', select the rows
    of station 260, optionally interpolate NaNs (`fill_NaNs`), optionally
    keep only START_MON..END_MON of `year` (when SUBSET_MONTHS is set),
    rename the value column to 'S260' and drop Feb 29th and the firework
    days.

    Returns the resulting single-column frame when `only_260` is truthy,
    and None otherwise (no multi-station result is implemented).
    """
    # NOTE: this renames the columns of the passed-in frame itself
    df.columns = df.columns.str.strip()
    if '# STN' in df.columns:
        df = df.rename(columns = {'# STN' : 'STN'})

    unused_cols = ['T10N', 'FF', 'VV', 'N', 'U',
                   'WW', 'IX', 'M', 'R', 'O', 'S', 'Y']
    df = remove_unuseful_cols(df, unused_cols)

    # hours run from 1 to 24 in the raw data; '%H' parsing needs 0 to 23
    df['HH'] = df['HH'] - 1
    df = change_meteo_date_format(df)
    df = df.set_index('DateTime')

    # keep only the requested variable plus the station number
    df = df[[col, 'STN']].copy()
    if col == 'DD':
        # 990 is replaced by NaN (and thus interpolated below when fill_NaNs is set)
        df = replace_WD_990_with_NaN(df, col)

    # continue with station 260 only
    station_rows = df[df['STN'] == 260]
    df_260 = remove_unuseful_cols(station_rows, 'STN')

    if fill_NaNs:
        df_260 = fill_NaNs_linear(df_260).astype('float64')
    if SUBSET_MONTHS:
        df_260 = subset_month_range(df_260, START_MON, END_MON, year)

    df_260 = df_260.rename(columns = {df.columns[0] : 'S260'})
    df_260 = delete_feb_29th(df_260)
    df_260 = delete_firework_days(df_260)

    return df_260 if only_260 else None

In [None]:
only_DeBilt = True                      # True: only De Bilt is used

# One tidy single-column frame per meteo variable and year.
# NOTE: the commented-out 2016 calls below lack the `year` argument that
# extract_tidy_raw_meteo_data requires (and there is no 'P' call for 2016);
# add it before re-enabling them.
# df_temp_2016_tidy = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'T', only_DeBilt)
# df_dewP_2016_tidy = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'TD', only_DeBilt)
# df_WD_2016_tidy   = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'DD', only_DeBilt)
# df_Wvh_2016_tidy  = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'FH', only_DeBilt)
# df_Wmax_2016_tidy = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'FX', only_DeBilt)
# df_preT_2016_tidy = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'DR', only_DeBilt)
# df_preS_2016_tidy = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'RH', only_DeBilt)
# df_SQ_2016_tidy   = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'SQ', only_DeBilt)
# df_Q_2016_tidy    = extract_tidy_raw_meteo_data(df_meteo_2016_raw, 'Q', only_DeBilt)

df_temp_2017_tidy = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'T', only_DeBilt, '2017')
df_dewP_2017_tidy = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'TD', only_DeBilt, '2017')
df_WD_2017_tidy   = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'DD', only_DeBilt, '2017')
df_Wvh_2017_tidy  = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'FH', only_DeBilt, '2017')
df_Wmax_2017_tidy = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'FX', only_DeBilt, '2017')
df_preT_2017_tidy = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'DR', only_DeBilt, '2017')
df_P_2017_tidy    = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'P', only_DeBilt, '2017')
df_preS_2017_tidy = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'RH', only_DeBilt, '2017')
df_SQ_2017_tidy   = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'SQ', only_DeBilt, '2017')
df_Q_2017_tidy    = extract_tidy_raw_meteo_data(df_meteo_2017_raw, 'Q', only_DeBilt, '2017')

df_temp_2018_tidy = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'T', only_DeBilt, '2018')
df_dewP_2018_tidy = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'TD', only_DeBilt, '2018')
df_WD_2018_tidy   = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'DD', only_DeBilt, '2018')
df_Wvh_2018_tidy  = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'FH', only_DeBilt, '2018')
df_Wmax_2018_tidy = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'FX', only_DeBilt, '2018')
df_preT_2018_tidy = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'DR', only_DeBilt, '2018')
df_P_2018_tidy    = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'P', only_DeBilt, '2018')
df_preS_2018_tidy = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'RH', only_DeBilt, '2018')
df_SQ_2018_tidy   = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'SQ', only_DeBilt, '2018')
df_Q_2018_tidy    = extract_tidy_raw_meteo_data(df_meteo_2018_raw, 'Q', only_DeBilt, '2018')

# df_temp_2019_tidy = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'T', only_DeBilt, '2019')
# df_dewP_2019_tidy = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'TD', only_DeBilt, '2019')
# df_WD_2019_tidy   = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'DD', only_DeBilt, '2019')
# df_Wvh_2019_tidy  = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'FH', only_DeBilt, '2019')
# df_Wmax_2019_tidy = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'FX', only_DeBilt, '2019')
# df_preT_2019_tidy = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'DR', only_DeBilt, '2019')
# df_P_2019_tidy    = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'P', only_DeBilt, '2019')
# df_preS_2019_tidy = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'RH', only_DeBilt, '2019')
# df_SQ_2019_tidy   = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'SQ', only_DeBilt, '2019')
# df_Q_2019_tidy    = extract_tidy_raw_meteo_data(df_meteo_2019_raw, 'Q', only_DeBilt, '2019')

df_temp_2020_tidy = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'T', only_DeBilt, '2020')
df_dewP_2020_tidy = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'TD', only_DeBilt, '2020')
df_WD_2020_tidy   = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'DD', only_DeBilt, '2020')
df_Wvh_2020_tidy  = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'FH', only_DeBilt, '2020')
df_Wmax_2020_tidy = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'FX', only_DeBilt, '2020')
df_preT_2020_tidy = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'DR', only_DeBilt, '2020')
df_P_2020_tidy    = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'P', only_DeBilt, '2020')
df_preS_2020_tidy = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'RH', only_DeBilt, '2020')
df_SQ_2020_tidy   = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'SQ', only_DeBilt, '2020')
df_Q_2020_tidy    = extract_tidy_raw_meteo_data(df_meteo_2020_raw, 'Q', only_DeBilt, '2020')

df_temp_2021_tidy = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'T', only_DeBilt, '2021')
df_dewP_2021_tidy = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'TD', only_DeBilt, '2021')
df_WD_2021_tidy   = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'DD', only_DeBilt, '2021')
df_Wvh_2021_tidy  = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'FH', only_DeBilt, '2021')
df_Wmax_2021_tidy = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'FX', only_DeBilt, '2021')
df_preT_2021_tidy = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'DR', only_DeBilt, '2021')
df_P_2021_tidy    = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'P', only_DeBilt, '2021')
df_preS_2021_tidy = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'RH', only_DeBilt, '2021')
df_SQ_2021_tidy   = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'SQ', only_DeBilt, '2021')
df_Q_2021_tidy    = extract_tidy_raw_meteo_data(df_meteo_2021_raw, 'Q', only_DeBilt, '2021')

df_temp_2022_tidy = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'T', only_DeBilt, '2022')
df_dewP_2022_tidy = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'TD', only_DeBilt, '2022')
df_WD_2022_tidy   = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'DD', only_DeBilt, '2022')
df_Wvh_2022_tidy  = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'FH', only_DeBilt, '2022')
df_Wmax_2022_tidy = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'FX', only_DeBilt, '2022')
df_preT_2022_tidy = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'DR', only_DeBilt, '2022')
df_P_2022_tidy    = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'P', only_DeBilt, '2022')
df_preS_2022_tidy = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'RH', only_DeBilt, '2022')
df_SQ_2022_tidy   = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'SQ', only_DeBilt, '2022')
df_Q_2022_tidy    = extract_tidy_raw_meteo_data(df_meteo_2022_raw, 'Q', only_DeBilt, '2022')

df_temp_2023_tidy = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'T', only_DeBilt, '2023')
df_dewP_2023_tidy = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'TD', only_DeBilt, '2023')
df_WD_2023_tidy   = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'DD', only_DeBilt, '2023')
df_Wvh_2023_tidy  = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'FH', only_DeBilt, '2023')
df_Wmax_2023_tidy = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'FX', only_DeBilt, '2023')
df_preT_2023_tidy = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'DR', only_DeBilt, '2023')
df_P_2023_tidy    = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'P', only_DeBilt, '2023')
df_preS_2023_tidy = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'RH', only_DeBilt, '2023')
df_SQ_2023_tidy   = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'SQ', only_DeBilt, '2023')
df_Q_2023_tidy    = extract_tidy_raw_meteo_data(df_meteo_2023_raw, 'Q', only_DeBilt, '2023')

In [None]:
# Sanity check: first/last rows of a tidy wind-direction frame and first rows of a 'P' frame
print(df_WD_2022_tidy.head(2))
print(df_WD_2022_tidy.tail(2))
print(df_P_2023_tidy.head(2))

In [None]:
# Shapes (rows, columns) of the tidy meteo frames for 2017 and 2023
print(df_temp_2017_tidy.shape)
print(df_dewP_2017_tidy.shape)
print(df_WD_2017_tidy.shape)
print(df_Wvh_2017_tidy.shape)
print(df_Wmax_2017_tidy.shape)
print(df_preT_2017_tidy.shape)
print(df_P_2017_tidy.shape)
print(df_preS_2017_tidy.shape)
print(df_SQ_2017_tidy.shape)
print(df_Q_2017_tidy.shape)

# The 2019 meteo frames are not created above (their calls are commented out)
# print(df_temp_2019_tidy.shape)
# print(df_dewP_2019_tidy.shape)
# print(df_WD_2019_tidy.shape)
# print(df_Wvh_2019_tidy.shape)
# print(df_Wmax_2019_tidy.shape)
# print(df_preT_2019_tidy.shape)
# print(df_P_2019_tidy.shape)
# print(df_preS_2019_tidy.shape)
# print(df_SQ_2019_tidy.shape)
# print(df_Q_2019_tidy.shape)

print(df_temp_2023_tidy.shape)
print(df_dewP_2023_tidy.shape)
print(df_WD_2023_tidy.shape)
print(df_Wvh_2023_tidy.shape)
print(df_Wmax_2023_tidy.shape)
print(df_preT_2023_tidy.shape)
print(df_P_2023_tidy.shape)
print(df_preS_2023_tidy.shape)
print(df_SQ_2023_tidy.shape)
print(df_Q_2023_tidy.shape)

In [None]:
# sys.exit()

#### **Inspect data with various metrics**

Minimum, mean and maximum of a sensor: over the whole (yearly) dataframe, per day, and per month

In [None]:
def get_min_sensor_value(df, sensor):
    """Return the smallest value in column `sensor` of `df`."""
    readings = df[sensor]
    return readings.min()


def get_mean_sensor_value(df, sensor):
    """Return the mean of column `sensor` of `df`."""
    readings = df[sensor]
    return readings.mean()


def get_max_sensor_value(df, sensor):
    """Return the largest value in column `sensor` of `df`."""
    readings = df[sensor]
    return readings.max()


def get_min_per_day(df, sensor):
    """Return a Series with the minimum of `sensor` per 1-day bin.

    The bins are anchored at the first timestamp of the index (origin = 'start').
    """
    daily_bins = df[sensor].resample('D', origin='start')
    return pd.Series(daily_bins.min())


def get_mean_per_day(df, sensor):
    """Return a Series with the mean of `sensor` per 1-day bin.

    The bins are anchored at the first timestamp of the index (origin = 'start').
    """
    daily_bins = df[sensor].resample('D', origin='start')
    return pd.Series(daily_bins.mean())


def get_max_per_day(df, sensor):
    """Return a Series with the maximum of `sensor` per 1-day bin.

    The bins are anchored at the first timestamp of the index (origin = 'start').
    """
    daily_bins = df[sensor].resample('D', origin='start')
    return pd.Series(daily_bins.max())


def get_min_per_month(df, sensor):
    """Return a Series with the minimum of `sensor` per calendar month.

    Each value is labelled with the 15th of its month: the month-start
    labels of the resampling are shifted by 14 days.
    """
    monthly_bins = df[sensor].resample('MS', convention='start')
    month_start_labelled = monthly_bins.min()
    return pd.Series(month_start_labelled.shift(periods=14, freq='D'))
                     

def get_mean_per_month(df, sensor):
    """Return a Series with the mean of `sensor` per calendar month.

    Each value is labelled with the 15th of its month: the month-start
    labels of the resampling are shifted by 14 days.
    """
    monthly_bins = df[sensor].resample('MS', convention='start')
    month_start_labelled = monthly_bins.mean()
    return pd.Series(month_start_labelled.shift(periods=14, freq='D'))


def get_max_per_month(df, sensor):
    """Return a Series with the maximum of `sensor` per calendar month.

    Each value is labelled with the 15th of its month: the month-start
    labels of the resampling are shifted by 14 days.
    """
    monthly_bins = df[sensor].resample('MS', convention='start')
    month_start_labelled = monthly_bins.max()
    return pd.Series(month_start_labelled.shift(periods=14, freq='D'))

Some helper functions

In [None]:
def get_col_measurement_count(df, col):
    """Return how many non-missing entries column `col` of `df` holds."""
    measurements = df[col]
    return measurements.count()


def print_index_sampling_info(df):
    """Print sampling metrics of the index of `df`.

    Based on the differences between consecutive index values, this prints
    how often each difference occurs, the median difference and the mean
    difference.
    """
    # differences between consecutive index values; computed once for all metrics
    sample_times = df.index.to_series().diff()

    print(f'Sample time distribution  =\n{sample_times.value_counts()}')
    # this is the median, so it is labelled as such (not as "most frequent")
    print(f'Median sample time        = {sample_times.median()}')
    print(f'Mean sample time          = {sample_times.mean()}')


def print_sensor_metrics_min_mean_max_entries(df, sensor, meta):
    """Print the min, mean, max and number of entries of one sensor.

    `meta` is a dict with the keys 'comp' and 'unit'. When `sensor` is not
    a column of `df`, a notice is printed instead. Always returns None.
    """
    if sensor not in df.columns:
        print(f"{meta['comp']} measurements for sensor {sensor} are not available\n")
        return

    lowest = get_min_sensor_value(df, sensor)
    average = get_mean_sensor_value(df, sensor)
    highest = get_max_sensor_value(df, sensor)
    n_entries = get_col_measurement_count(df, sensor)

    print(f"[min, mean, max] for sensor {sensor} measuring {meta['comp']} {meta['unit']}:")
    print(f"[{lowest:.4f}, {average:.4f}, {highest:.4f}] with n = {n_entries}")
    print()


def get_daily_sensor_metrics(df, sensor):
    """Return the daily (min, mean, max) Series of `sensor` as a 3-tuple."""
    daily_min = get_min_per_day(df, sensor)
    daily_mean = get_mean_per_day(df, sensor)
    daily_max = get_max_per_day(df, sensor)
    return daily_min, daily_mean, daily_max


def get_monthly_sensor_metrics(df, sensor):
    """Return the monthly (min, mean, max) Series of `sensor` as a 3-tuple."""
    monthly_min = get_min_per_month(df, sensor)
    monthly_mean = get_mean_per_month(df, sensor)
    monthly_max = get_max_per_month(df, sensor)
    return monthly_min, monthly_mean, monthly_max

Print some metrics

In [None]:
# PM2.5 metrics at the Tuindorp sensor per year (2016-2018 disabled)
# print_sensor_metrics_min_mean_max_entries(df_PM25_2016_tidy, TUINDORP, PM25_2016_meta)
# print_sensor_metrics_min_mean_max_entries(df_PM25_2017_tidy, TUINDORP, PM25_2017_meta)
# print_sensor_metrics_min_mean_max_entries(df_PM25_2018_tidy, TUINDORP, PM25_2018_meta)
print_sensor_metrics_min_mean_max_entries(df_PM25_2019_tidy, TUINDORP, PM25_2019_meta)
print_sensor_metrics_min_mean_max_entries(df_PM25_2020_tidy, TUINDORP, PM25_2020_meta)
print_sensor_metrics_min_mean_max_entries(df_PM25_2021_tidy, TUINDORP, PM25_2021_meta)
print_sensor_metrics_min_mean_max_entries(df_PM25_2022_tidy, TUINDORP, PM25_2022_meta)

In [None]:
# PM10 metrics at the Tuindorp sensor per year (2016-2018 disabled)
# print_sensor_metrics_min_mean_max_entries(df_PM10_2016_tidy, TUINDORP, PM10_2016_meta)
# print_sensor_metrics_min_mean_max_entries(df_PM10_2017_tidy, TUINDORP, PM10_2017_meta)
# print_sensor_metrics_min_mean_max_entries(df_PM10_2018_tidy, TUINDORP, PM10_2018_meta)
print_sensor_metrics_min_mean_max_entries(df_PM10_2019_tidy, TUINDORP, PM10_2019_meta)
print_sensor_metrics_min_mean_max_entries(df_PM10_2020_tidy, TUINDORP, PM10_2020_meta)
print_sensor_metrics_min_mean_max_entries(df_PM10_2021_tidy, TUINDORP, PM10_2021_meta)
print_sensor_metrics_min_mean_max_entries(df_PM10_2022_tidy, TUINDORP, PM10_2022_meta)

In [None]:
# O3 metrics at the Tuindorp sensor per year (2016-2018 disabled)
# print_sensor_metrics_min_mean_max_entries(df_O3_2016_tidy, TUINDORP, O3_2016_meta)
# print_sensor_metrics_min_mean_max_entries(df_O3_2017_tidy, TUINDORP, O3_2017_meta)
# print_sensor_metrics_min_mean_max_entries(df_O3_2018_tidy, TUINDORP, O3_2018_meta)
print_sensor_metrics_min_mean_max_entries(df_O3_2019_tidy, TUINDORP, O3_2019_meta)
print_sensor_metrics_min_mean_max_entries(df_O3_2020_tidy, TUINDORP, O3_2020_meta)
print_sensor_metrics_min_mean_max_entries(df_O3_2021_tidy, TUINDORP, O3_2021_meta)
print_sensor_metrics_min_mean_max_entries(df_O3_2022_tidy, TUINDORP, O3_2022_meta)

In [None]:
# NO2 metrics at the Tuindorp sensor per year (2016-2018 disabled)
# print_sensor_metrics_min_mean_max_entries(df_NO2_2016_tidy, TUINDORP, NO2_2016_meta)
# print_sensor_metrics_min_mean_max_entries(df_NO2_2017_tidy, TUINDORP, NO2_2017_meta)
# print_sensor_metrics_min_mean_max_entries(df_NO2_2018_tidy, TUINDORP, NO2_2018_meta)
print_sensor_metrics_min_mean_max_entries(df_NO2_2019_tidy, TUINDORP, NO2_2019_meta)
print_sensor_metrics_min_mean_max_entries(df_NO2_2020_tidy, TUINDORP, NO2_2020_meta)
print_sensor_metrics_min_mean_max_entries(df_NO2_2021_tidy, TUINDORP, NO2_2021_meta)
print_sensor_metrics_min_mean_max_entries(df_NO2_2022_tidy, TUINDORP, NO2_2022_meta)

Free up memory

In [None]:
# Drop the raw frames now that the tidy frames and metadata have been derived from them
del df_PM25_2016_raw, df_PM10_2016_raw, df_O3_2016_raw, df_NO2_2016_raw
del df_PM25_2017_raw, df_PM10_2017_raw, df_O3_2017_raw, df_NO2_2017_raw
del df_PM25_2018_raw, df_PM10_2018_raw, df_O3_2018_raw, df_NO2_2018_raw
del df_PM25_2019_raw, df_PM10_2019_raw, df_O3_2019_raw, df_NO2_2019_raw
del df_PM25_2020_raw, df_PM10_2020_raw, df_O3_2020_raw, df_NO2_2020_raw
del df_PM25_2021_raw, df_PM10_2021_raw, df_O3_2021_raw, df_NO2_2021_raw
del df_PM25_2022_raw, df_PM10_2022_raw, df_O3_2022_raw, df_NO2_2022_raw
del df_PM25_2023_raw, df_PM10_2023_raw, df_O3_2023_raw, df_NO2_2023_raw
del df_meteo_2016_raw
del df_meteo_2017_raw
del df_meteo_2018_raw
del df_meteo_2019_raw
del df_meteo_2020_raw
del df_meteo_2021_raw
del df_meteo_2022_raw
del df_meteo_2023_raw

In [None]:
# sys.exit()

#### **Inspect data with visualisations**

In [None]:
def set_style():
    """Apply the seaborn look shared by the plots: darkgrid style, dark palette, notebook context."""
    sns.set_theme()
    # set_style applies the style; axes_style only returns the parameters
    # (its result was discarded, so that call had no effect)
    sns.set_style('darkgrid')
    sns.set_palette('dark')
    sns.set_context('notebook')
    

def plot_sensor(df, sensor, info = ''):
    """Plot all measurements of one sensor against time.

    `info` is appended to the plot title. When `sensor` is not a column of
    `df`, a notice is printed and nothing is plotted. Always returns None.
    """
    if sensor not in df:
        print(f"Sensor {sensor} is not available\n")
        return

    set_style()
    # 'DateTime' refers to the (named) index of the frame
    sns.lineplot(data = df, x = 'DateTime', y = sensor, color = '#800000')

    plt.title(f"Sensor {sensor} plotted against time - {info}")
    plt.xlabel("Time")
    plt.xticks(rotation = 18)
    plt.ylabel("Measurement value")
    plt.show()


def plot_sensor_meta(df, sensor, meta):
    """Plot all measurements of one sensor against time, labelled with metadata.

    `meta` is a dict with the keys 'comp' and 'unit', used for the y-label.
    When `sensor` is not a column of `df`, a notice is printed and nothing
    is plotted. Always returns None.
    """
    if sensor not in df:
        print(f"{meta['comp']} measurements for sensor {sensor} are not available\n")
        return

    set_style()
    # 'DateTime' refers to the (named) index of the frame
    sns.lineplot(data = df, x = 'DateTime', y = sensor, color = '#800000')

    plt.title(f"Sensor {sensor} plotted against time")
    plt.xlabel("Time")
    plt.xticks(rotation = 18)
    plt.ylabel(f"{meta['comp']} value in {meta['unit']}")
    plt.show()

                                        
def plot_min_mean_max(df, sensor, meta):
    """Draw a sensor's min, mean and max series as three lines over time.

    The three series come from ``get_daily_sensor_metrics(df, sensor)``;
    `meta` supplies 'comp' and 'unit' for the y-axis label.
    """
    lowest, average, highest = get_daily_sensor_metrics(df, sensor)

    set_style()

    # One line per statistic, drawn in the order min, mean, max.
    labelled_series = ((lowest, 'min'), (average, 'mean'), (highest, 'max'))
    for series, series_label in labelled_series:
        sns.lineplot(data = series.to_frame(),
                     x = series.index,
                     y = series.values,
                     label = series_label)

    plt.title(f"Sensor {sensor}'s min, max, mean plotted against time")
    plt.xlabel("Time")
    plt.xticks(rotation = 18)
    plt.ylabel(f"{meta['comp']} value in {meta['unit']}")
    plt.show()


def plot_day_vs_month(df, sensor, meta):
    """Compare a sensor's per-day means (line) with its per-month means (stems).

    The series come from ``get_mean_per_day`` and ``get_mean_per_month``;
    `meta` supplies 'comp' and 'unit' for the y-axis label.
    """
    per_day = get_mean_per_day(df, sensor)
    per_month = get_mean_per_month(df, sensor)

    set_style()

    # Per-day means as a continuous line ...
    sns.lineplot(data = per_day.to_frame(),
                 x = per_day.index,
                 y = per_day.values,
                 label = 'day')
    # ... and per-month means as red dashed stems without a baseline.
    # For more stem customisation see:
    # https://stackoverflow.com/questions/38984959/how-can-i
    # -get-the-stemlines-color-to-match-the-marker-color-in-a-stem-plot
    plt.stem(per_month.index, per_month.values,
             basefmt = ' ', linefmt = '--r', markerfmt = 'or',
             label = 'month')

    plt.title(f"Sensor {sensor}'s daily and monthly average plotted against time")
    plt.xlabel("Time")
    plt.xticks(rotation = 18)
    plt.ylabel(f"{meta['comp']} value in {meta['unit']}")
    plt.legend()
    plt.show()

#### **Select locations**

Here, we'll select the locations we want to use. The I/O-task can be either 0-dimensional, or 1-dimensional.  

EDIT: The project continues with a one-dimensional set-up, but some code may still support both set-ups.

In [None]:
def subset_sensors(df, sensors):
    """Select the column(s) of `df` that belong to the requested sensor(s).

    sensors: a single sensor name (str) or a collection of sensor names.

    A single name returns ``df.loc[:, name]`` (a Series for a plain column).
    A collection returns a DataFrame holding only the columns whose name is
    in the collection; names that are not columns of `df` are ignored.
    """
    if not isinstance(sensors, str):    # several sensors: keep matching columns
        wanted = df.columns.isin(sensors)
        return df.loc[:, wanted]

    return df.loc[:, sensors]           # exactly one sensor, given as a str

In [None]:
# The two contaminant sensors kept for the one-dimensional set-up.
sensors_1D = [TUINDORP, BREUKELEN]

# Reduce every tidy contaminant frame to the columns of those two sensors.
# Years 2017, 2018 and 2020-2023 are subset; 2016 and 2019 are not.
# subset_sensors ignores sensors that are missing from a frame, so a result
# may contain fewer than two columns.
df_PM25_2017_tidy_subset_1D = subset_sensors(df_PM25_2017_tidy, sensors_1D)
df_PM10_2017_tidy_subset_1D = subset_sensors(df_PM10_2017_tidy, sensors_1D)
df_O3_2017_tidy_subset_1D = subset_sensors(df_O3_2017_tidy, sensors_1D)
df_NO2_2017_tidy_subset_1D = subset_sensors(df_NO2_2017_tidy, sensors_1D)
df_PM25_2018_tidy_subset_1D = subset_sensors(df_PM25_2018_tidy, sensors_1D)
df_PM10_2018_tidy_subset_1D = subset_sensors(df_PM10_2018_tidy, sensors_1D)
df_O3_2018_tidy_subset_1D = subset_sensors(df_O3_2018_tidy, sensors_1D)
df_NO2_2018_tidy_subset_1D = subset_sensors(df_NO2_2018_tidy, sensors_1D)
df_PM25_2020_tidy_subset_1D = subset_sensors(df_PM25_2020_tidy, sensors_1D)
df_PM10_2020_tidy_subset_1D = subset_sensors(df_PM10_2020_tidy, sensors_1D)
df_O3_2020_tidy_subset_1D = subset_sensors(df_O3_2020_tidy, sensors_1D)
df_NO2_2020_tidy_subset_1D = subset_sensors(df_NO2_2020_tidy, sensors_1D)
df_PM25_2021_tidy_subset_1D = subset_sensors(df_PM25_2021_tidy, sensors_1D)
df_PM10_2021_tidy_subset_1D = subset_sensors(df_PM10_2021_tidy, sensors_1D)
df_O3_2021_tidy_subset_1D = subset_sensors(df_O3_2021_tidy, sensors_1D)
df_NO2_2021_tidy_subset_1D = subset_sensors(df_NO2_2021_tidy, sensors_1D)
df_PM25_2022_tidy_subset_1D = subset_sensors(df_PM25_2022_tidy, sensors_1D)
df_PM10_2022_tidy_subset_1D = subset_sensors(df_PM10_2022_tidy, sensors_1D)
df_O3_2022_tidy_subset_1D = subset_sensors(df_O3_2022_tidy, sensors_1D)
df_NO2_2022_tidy_subset_1D = subset_sensors(df_NO2_2022_tidy, sensors_1D)
df_PM25_2023_tidy_subset_1D = subset_sensors(df_PM25_2023_tidy, sensors_1D)
df_PM10_2023_tidy_subset_1D = subset_sensors(df_PM10_2023_tidy, sensors_1D)
df_O3_2023_tidy_subset_1D = subset_sensors(df_O3_2023_tidy, sensors_1D)
df_NO2_2023_tidy_subset_1D = subset_sensors(df_NO2_2023_tidy, sensors_1D)

# The full tidy frames of the subset years are unbound here; only the
# *_subset_1D frames are used below.
del df_PM25_2017_tidy, df_PM10_2017_tidy, df_O3_2017_tidy, df_NO2_2017_tidy
del df_PM25_2018_tidy, df_PM10_2018_tidy, df_O3_2018_tidy, df_NO2_2018_tidy
del df_PM25_2020_tidy, df_PM10_2020_tidy, df_O3_2020_tidy, df_NO2_2020_tidy
del df_PM25_2021_tidy, df_PM10_2021_tidy, df_O3_2021_tidy, df_NO2_2021_tidy
del df_PM25_2022_tidy, df_PM10_2022_tidy, df_O3_2022_tidy, df_NO2_2022_tidy
del df_PM25_2023_tidy, df_PM10_2023_tidy, df_O3_2023_tidy, df_NO2_2023_tidy

#### **Select timeframe**

Check the shapes before plotting

In [None]:
# Sanity check: print the shape of every subset frame, one line per year, in
# the order NO2, O3, PM25, PM10. A frame with fewer columns than the others
# in its row is missing one of the selected sensors.
# 2016 and 2019 are not listed because no subsets are created for those years.
print(df_NO2_2017_tidy_subset_1D.shape, df_O3_2017_tidy_subset_1D.shape,
      df_PM25_2017_tidy_subset_1D.shape, df_PM10_2017_tidy_subset_1D.shape)
print(df_NO2_2018_tidy_subset_1D.shape, df_O3_2018_tidy_subset_1D.shape,
      df_PM25_2018_tidy_subset_1D.shape, df_PM10_2018_tidy_subset_1D.shape)
print(df_NO2_2020_tidy_subset_1D.shape, df_O3_2020_tidy_subset_1D.shape,
      df_PM25_2020_tidy_subset_1D.shape, df_PM10_2020_tidy_subset_1D.shape)
print(df_NO2_2021_tidy_subset_1D.shape, df_O3_2021_tidy_subset_1D.shape,
      df_PM25_2021_tidy_subset_1D.shape, df_PM10_2021_tidy_subset_1D.shape)
print(df_NO2_2022_tidy_subset_1D.shape, df_O3_2022_tidy_subset_1D.shape,
      df_PM25_2022_tidy_subset_1D.shape, df_PM10_2022_tidy_subset_1D.shape)
# 2023 is subset as well, so include it in the check.
print(df_NO2_2023_tidy_subset_1D.shape, df_O3_2023_tidy_subset_1D.shape,
      df_PM25_2023_tidy_subset_1D.shape, df_PM10_2023_tidy_subset_1D.shape)

# NOTE: when 2016 and 2019 were still part of the selection, a dummy all-NaN
# TUINDORP column was added to their O3 subsets to stand in for the missing
# column, i.e. df_O3_<year>_tidy_subset_1D[TUINDORP] = np.nan

In [None]:
# sys.exit()

#### **Feature Engineering**

In [None]:
# https://seaborn.pydata.org/examples/many_pairwise_correlations.html
def plot_corr_matrix_pearson(df, threshold = 0, method = 'pearson'):
    """Plot the lower triangle of the pairwise correlation matrix of `df`.

    df:        dataframe whose columns are correlated with each other
    threshold: when truthy, only correlations whose absolute value exceeds
               it are shown; all other cells are left empty
    method:    correlation method handed to ``DataFrame.corr``
    """
    corr = df.corr(method = method)
    if threshold:
        # Boolean-frame indexing turns every non-matching cell into NaN.
        corr = corr[corr.abs() > threshold]

    # Mask the upper triangle including the diagonal, so every pair of
    # columns is drawn only once.
    mask = np.triu(np.ones_like(corr, dtype = bool))

    _, ax = plt.subplots(figsize = (7, 5))
    sns.heatmap(corr, mask = mask, ax = ax,
                vmin = -1, vmax = 1, center = 0,
                square = True, linewidths = .5, cbar_kws = {"shrink": .75})
    plt.tight_layout()
    plt.show()

In [None]:
# https://seaborn.pydata.org/examples/many_pairwise_correlations.html
def plot_corr_matrix_other_method(df, method = 'kendall'):
    """Plot the lower triangle of the pairwise correlation matrix of `df`.

    df:     dataframe whose columns are correlated with each other
    method: correlation method handed to ``DataFrame.corr``
    """
    corr = df.corr(method = method)

    # Mask the upper triangle including the diagonal, so every pair of
    # columns is drawn only once.
    mask = np.triu(np.ones_like(corr, dtype = bool))

    _, ax = plt.subplots(figsize = (7, 5))
    sns.heatmap(corr, mask = mask, ax = ax,
                vmin = -1, vmax = 1, center = 0,
                square = True, linewidths = .5, cbar_kws = {"shrink": .75})
    plt.tight_layout()
    plt.show()

See the paper (README.md) for an analysis of the correlation plots created by the functions above.

#### **Perform train-validation-test-split**

*Training data:*  
September, October, November, December of 2017, 2018, 2020, and September, October, first 2 weeks of November of 2021, 2022  


*Validation data:*  
Last 2 weeks of November, first week of December of 2021, 2022, and September and first 2 weeks of October of 2023  


*Testing data:*  
Last 3 weeks of December of 2022, and last 2 weeks of October and November of 2023  

In [None]:
def perform_data_split(df, days_vali, days_test):
    """Split `df` by row position into train, validation and test parts.

    The last `days_test` days form the test set, the `days_vali` days before
    them the validation set, and everything earlier the training set. Days
    are converted to rows by multiplying with 24, i.e. one row per hour is
    assumed.

    Returns the tuple (df_train, df_vali, df_test).
    Raises ValueError when `df` has fewer rows than the validation and test
    sets together require.
    """
    n_rows = df.shape[0]
                                        # partition points of val/test
                                        # set expressed in hours for indexing
    pp_vali = int(n_rows - (days_vali + days_test) * 24)
    if pp_vali < 0:
        raise ValueError(
            f"Need at least {(days_vali + days_test) * 24} rows for the "
            f"validation and test sets, but got {n_rows}"
        )
    # The validation window is `days_vali` long; advancing by `days_test`
    # here would swap the two set sizes whenever they differ.
    pp_test = int(pp_vali + days_vali * 24)

    df_train = df.iloc[ : pp_vali]
    df_vali  = df.iloc[pp_vali : pp_test]
    df_test  = df.iloc[pp_test : ]

    return df_train, df_vali, df_test

In [None]:
def perform_data_split_without_train(df, days_vali, days_test):
    """Cut a validation and a test part from the start of `df`; no train part.

    The first `days_vali` days become the validation set and the following
    `days_test` days the test set. Days are converted to rows by multiplying
    with 24. Rows after the test window are not returned.

    Returns the tuple (df_vali, df_test).
    """
    # Row positions at which the validation and the test window end.
    vali_end = int(days_vali * 24)
    test_end = int(vali_end + days_test * 24)

    validation_part = df[ : vali_end]
    test_part = df[vali_end : test_end]
    return validation_part, test_part

With these two variables, the size of the validation and testing set is set (in days):

In [None]:
# Length, in days, of the validation and test windows handed to
# perform_data_split for the 2021 and 2022 data (the 2023 split below passes
# its own literal values instead).
days_vali = 21
days_test = 21

A train-validation-test split is performed. Each component is split separately. In the case of a one-dimensional prediction task, additional contaminant data is split as well. (All data remains segregated for now, so that it can be normalised properly later.)

For the final version we use the data described above. This is the splitting code:

In [None]:
# 2017: used entirely for training, so each frame is simply copied.
# Contaminant frames carry the _1D suffix; the meteorological frames
# (temp, dewP, WD, Wvh, P, SQ) do not.
df_PM25_2017_train_1D = df_PM25_2017_tidy_subset_1D.copy()
df_PM10_2017_train_1D = df_PM10_2017_tidy_subset_1D.copy()
df_NO2_2017_train_1D  = df_NO2_2017_tidy_subset_1D.copy()
df_O3_2017_train_1D   = df_O3_2017_tidy_subset_1D.copy()
df_temp_2017_train = df_temp_2017_tidy.copy()
df_dewP_2017_train = df_dewP_2017_tidy.copy()
df_WD_2017_train   = df_WD_2017_tidy.copy()
df_Wvh_2017_train  = df_Wvh_2017_tidy.copy()
df_P_2017_train    = df_P_2017_tidy.copy()
df_SQ_2017_train   = df_SQ_2017_tidy.copy()

# 2018: used entirely for training.
df_PM25_2018_train_1D = df_PM25_2018_tidy_subset_1D.copy()
df_PM10_2018_train_1D = df_PM10_2018_tidy_subset_1D.copy()
df_NO2_2018_train_1D  = df_NO2_2018_tidy_subset_1D.copy()
df_O3_2018_train_1D   = df_O3_2018_tidy_subset_1D.copy()
df_temp_2018_train = df_temp_2018_tidy.copy()
df_dewP_2018_train = df_dewP_2018_tidy.copy()
df_WD_2018_train   = df_WD_2018_tidy.copy()
df_Wvh_2018_train  = df_Wvh_2018_tidy.copy()
df_P_2018_train    = df_P_2018_tidy.copy()
df_SQ_2018_train   = df_SQ_2018_tidy.copy()

# 2020: used entirely for training.
df_PM25_2020_train_1D = df_PM25_2020_tidy_subset_1D.copy()
df_PM10_2020_train_1D = df_PM10_2020_tidy_subset_1D.copy()
df_NO2_2020_train_1D  = df_NO2_2020_tidy_subset_1D.copy()
df_O3_2020_train_1D   = df_O3_2020_tidy_subset_1D.copy()
df_temp_2020_train = df_temp_2020_tidy.copy()
df_dewP_2020_train = df_dewP_2020_tidy.copy()
df_WD_2020_train   = df_WD_2020_tidy.copy()
df_Wvh_2020_train  = df_Wvh_2020_tidy.copy()
df_P_2020_train    = df_P_2020_tidy.copy()
df_SQ_2020_train   = df_SQ_2020_tidy.copy()

# 2021: split into train / validation / test with the window lengths
# days_vali and days_test.
df_PM25_2021_train_1D, df_PM25_2021_val_1D, df_PM25_2021_test_1D = \
    perform_data_split(df_PM25_2021_tidy_subset_1D, days_vali, days_test)
df_PM10_2021_train_1D, df_PM10_2021_val_1D, df_PM10_2021_test_1D = \
    perform_data_split(df_PM10_2021_tidy_subset_1D, days_vali, days_test)
df_NO2_2021_train_1D,  df_NO2_2021_val_1D,  df_NO2_2021_test_1D  = \
    perform_data_split(df_NO2_2021_tidy_subset_1D, days_vali, days_test)
df_O3_2021_train_1D,   df_O3_2021_val_1D,   df_O3_2021_test_1D   = \
    perform_data_split(df_O3_2021_tidy_subset_1D, days_vali, days_test)
df_temp_2021_train, df_temp_2021_val, df_temp_2021_test = \
    perform_data_split(df_temp_2021_tidy, days_vali, days_test)
df_dewP_2021_train, df_dewP_2021_val, df_dewP_2021_test = \
    perform_data_split(df_dewP_2021_tidy, days_vali, days_test)
df_WD_2021_train,   df_WD_2021_val,   df_WD_2021_test   = \
    perform_data_split(df_WD_2021_tidy, days_vali, days_test)
df_Wvh_2021_train,  df_Wvh_2021_val,  df_Wvh_2021_test  = \
    perform_data_split(df_Wvh_2021_tidy, days_vali, days_test)
df_P_2021_train,    df_P_2021_val,    df_P_2021_test    = \
    perform_data_split(df_P_2021_tidy, days_vali, days_test)
df_SQ_2021_train,   df_SQ_2021_val,   df_SQ_2021_test   = \
    perform_data_split(df_SQ_2021_tidy, days_vali, days_test)

# 2022: split into train / validation / test, same window lengths as 2021.
df_PM25_2022_train_1D, df_PM25_2022_val_1D, df_PM25_2022_test_1D = \
    perform_data_split(df_PM25_2022_tidy_subset_1D, days_vali, days_test)
df_PM10_2022_train_1D, df_PM10_2022_val_1D, df_PM10_2022_test_1D = \
    perform_data_split(df_PM10_2022_tidy_subset_1D, days_vali, days_test)
df_NO2_2022_train_1D,  df_NO2_2022_val_1D,  df_NO2_2022_test_1D  = \
    perform_data_split(df_NO2_2022_tidy_subset_1D, days_vali, days_test)
df_O3_2022_train_1D,   df_O3_2022_val_1D,   df_O3_2022_test_1D   = \
    perform_data_split(df_O3_2022_tidy_subset_1D, days_vali, days_test)
df_temp_2022_train, df_temp_2022_val, df_temp_2022_test = \
    perform_data_split(df_temp_2022_tidy, days_vali, days_test)
df_dewP_2022_train, df_dewP_2022_val, df_dewP_2022_test = \
    perform_data_split(df_dewP_2022_tidy, days_vali, days_test)
df_WD_2022_train,   df_WD_2022_val,   df_WD_2022_test   = \
    perform_data_split(df_WD_2022_tidy, days_vali, days_test)
df_Wvh_2022_train,  df_Wvh_2022_val,  df_Wvh_2022_test  = \
    perform_data_split(df_Wvh_2022_tidy, days_vali, days_test)
df_P_2022_train,    df_P_2022_val,    df_P_2022_test    = \
    perform_data_split(df_P_2022_tidy, days_vali, days_test)
df_SQ_2022_train,   df_SQ_2022_val,   df_SQ_2022_test   = \
    perform_data_split(df_SQ_2022_tidy, days_vali, days_test)

# 2023: no training part; only a validation and a test part are taken.
# The window lengths are the literals 63 and 63 (days), not
# days_vali/days_test.
# NOTE(review): confirm that 63/63 matches the 2023 validation and test
# periods described in the markdown above.
df_PM25_2023_val_1D, df_PM25_2023_test_1D = \
    perform_data_split_without_train(df_PM25_2023_tidy_subset_1D, 63, 63)
df_PM10_2023_val_1D, df_PM10_2023_test_1D = \
    perform_data_split_without_train(df_PM10_2023_tidy_subset_1D, 63, 63)
df_NO2_2023_val_1D,  df_NO2_2023_test_1D  = \
    perform_data_split_without_train(df_NO2_2023_tidy_subset_1D, 63, 63)
df_O3_2023_val_1D,   df_O3_2023_test_1D   = \
    perform_data_split_without_train(df_O3_2023_tidy_subset_1D, 63, 63)
df_temp_2023_val,    df_temp_2023_test = \
    perform_data_split_without_train(df_temp_2023_tidy, 63, 63)
df_dewP_2023_val,    df_dewP_2023_test = \
    perform_data_split_without_train(df_dewP_2023_tidy, 63, 63)
df_WD_2023_val,      df_WD_2023_test   = \
    perform_data_split_without_train(df_WD_2023_tidy, 63, 63)
df_Wvh_2023_val,     df_Wvh_2023_test  = \
    perform_data_split_without_train(df_Wvh_2023_tidy, 63, 63)
df_P_2023_val,       df_P_2023_test    = \
    perform_data_split_without_train(df_P_2023_tidy, 63, 63)
df_SQ_2023_val,      df_SQ_2023_test   = \
    perform_data_split_without_train(df_SQ_2023_tidy, 63, 63)

In [None]:
def print_split_ratios(dfs_train: list, df_val, df_test, comp):
    """Print the train/validation/test shares (in percent) for one component.

    dfs_train: list of training frames; their lengths are summed
    df_val:    validation data (only its length is used)
    df_test:   test data (only its length is used)
    comp:      component name shown in the printed line
    """
    n_train = sum(len(frame) for frame in dfs_train)
    n_val, n_test = len(df_val), len(df_test)
    total_len = n_train + n_val + n_test

    print(f"[train/validation/test] %-ratio for {comp} data is: ", end = '')
    # Each share is a percentage of all rows, rounded to one decimal.
    shares = [round(count / total_len * 100, 1)
              for count in (n_train, n_val, n_test)]
    print("[" + "/".join(str(share) for share in shares) + "]")


# Report the split ratio for PM25: five training frames (2017, 2018, 2020,
# 2021, 2022) against the validation and test parts of 2021, 2022 and 2023.
# Only PM25 is reported here.
print_split_ratios([df_PM25_2017_train_1D,
                    df_PM25_2018_train_1D,
                    df_PM25_2020_train_1D,
                    df_PM25_2021_train_1D,
                    df_PM25_2022_train_1D],
                    pd.concat([df_PM25_2021_val_1D,
                               df_PM25_2022_val_1D,
                               df_PM25_2023_val_1D]),
                    pd.concat([df_PM25_2021_test_1D,
                               df_PM25_2022_test_1D,
                               df_PM25_2023_test_1D]),
                    'PM25')

In [None]:
# Show the first and last rows of the 2023 PM25 validation and test parts,
# so the boundaries of the two windows can be checked by eye.
print(df_PM25_2023_val_1D.head())
print(df_PM25_2023_val_1D.tail())
print(df_PM25_2023_test_1D.head())
print(df_PM25_2023_test_1D.tail())

In [None]:
# sys.exit()

#### **Normalisation**

Linear scaling: $x' = (x - x_{min}) / (x_{max} - x_{min})$

In [None]:
def get_df_minimum(df):
    """Return the smallest value found anywhere in `df`."""
    per_column_minimum = df.min()       # first reduce each column ...
    return np.min(per_column_minimum)   # ... then reduce across the columns


def get_df_maximum(df):
    """Return the largest value found anywhere in `df`."""
    per_column_maximum = df.max()       # first reduce each column ...
    return np.max(per_column_maximum)   # ... then reduce across the columns


def calc_combined_min_max_params(dfs: list):
    """Return the overall (minimum, maximum) across a list of dataframes.

    Each frame is reduced with get_df_minimum / get_df_maximum and the
    results are combined. A frame whose own minimum or maximum is NaN is
    ignored, so a single such frame does not turn the combined parameters
    into NaN.
    """
    # Local names avoid shadowing the builtins ``min`` and ``max``.
    overall_min = np.nanmin([get_df_minimum(df) for df in dfs])
    overall_max = np.nanmax([get_df_maximum(df) for df in dfs])
    return overall_min, overall_max


def normalise_linear(df, min, max):
    """Apply min-max scaling: x' = (x - min) / (max - min)."""
    value_range = max - min
    shifted = df - min
    return shifted / value_range


def normalise_linear_inv(df_norm, min, max):
    """Undo min-max scaling: x = x' * (max - min) + min."""
    value_range = max - min
    rescaled = df_norm * value_range
    return rescaled + min

Normalise each component/contaminant separately

In [None]:
# Scaling parameters per component, derived from the training frames only
# (2017, 2018, 2020, 2021 and 2022).

# Contaminants
PM25_min_train, PM25_max_train = calc_combined_min_max_params([
    df_PM25_2017_train_1D, df_PM25_2018_train_1D, df_PM25_2020_train_1D,
    df_PM25_2021_train_1D, df_PM25_2022_train_1D,
])
PM10_min_train, PM10_max_train = calc_combined_min_max_params([
    df_PM10_2017_train_1D, df_PM10_2018_train_1D, df_PM10_2020_train_1D,
    df_PM10_2021_train_1D, df_PM10_2022_train_1D,
])
O3_min_train, O3_max_train = calc_combined_min_max_params([
    df_O3_2017_train_1D, df_O3_2018_train_1D, df_O3_2020_train_1D,
    df_O3_2021_train_1D, df_O3_2022_train_1D,
])
NO2_min_train, NO2_max_train = calc_combined_min_max_params([
    df_NO2_2017_train_1D, df_NO2_2018_train_1D, df_NO2_2020_train_1D,
    df_NO2_2021_train_1D, df_NO2_2022_train_1D,
])

# Meteorological variables
temp_min_train, temp_max_train = calc_combined_min_max_params([
    df_temp_2017_train, df_temp_2018_train, df_temp_2020_train,
    df_temp_2021_train, df_temp_2022_train,
])
dewP_min_train, dewP_max_train = calc_combined_min_max_params([
    df_dewP_2017_train, df_dewP_2018_train, df_dewP_2020_train,
    df_dewP_2021_train, df_dewP_2022_train,
])
WD_min_train, WD_max_train = calc_combined_min_max_params([
    df_WD_2017_train, df_WD_2018_train, df_WD_2020_train,
    df_WD_2021_train, df_WD_2022_train,
])
Wvh_min_train, Wvh_max_train = calc_combined_min_max_params([
    df_Wvh_2017_train, df_Wvh_2018_train, df_Wvh_2020_train,
    df_Wvh_2021_train, df_Wvh_2022_train,
])
P_min_train, P_max_train = calc_combined_min_max_params([
    df_P_2017_train, df_P_2018_train, df_P_2020_train,
    df_P_2021_train, df_P_2022_train,
])
SQ_min_train, SQ_max_train = calc_combined_min_max_params([
    df_SQ_2017_train, df_SQ_2018_train, df_SQ_2020_train,
    df_SQ_2021_train, df_SQ_2022_train,
])

# Table with one row per contaminant and the columns 'min' and 'max'.
# Only the four contaminants are stored; the meteorological parameters are
# not written to disk here.
contaminant_params = {
    'NO2':  [NO2_min_train, NO2_max_train],
    'O3':   [O3_min_train, O3_max_train],
    'PM10': [PM10_min_train, PM10_max_train],
    'PM25': [PM25_min_train, PM25_max_train],
}
df_minmax = pd.DataFrame(contaminant_params, index = ['min', 'max']).T
print(df_minmax)
df_minmax.to_csv("../data/dataset_final/contaminant_minmax.csv",
                 index = True, sep = ';', decimal = '.', encoding = 'utf-8')

In [None]:
# Scale every train, validation and test frame with the min/max parameters
# derived from the training data of the same component, so validation and
# test data never influence the scaling.
#
# Naming: the 2017-2021 frames and the 2022 training frames are called
# df_<comp>_<year>_<split>_norm[_1D], whereas the 2022/2023 validation and
# test frames are called df_<comp>_<split>_<year>_norm[_1D] (split and year
# swapped).

# NO2
df_NO2_2017_train_norm_1D = normalise_linear(df_NO2_2017_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2018_train_norm_1D = normalise_linear(df_NO2_2018_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2020_train_norm_1D = normalise_linear(df_NO2_2020_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2021_train_norm_1D = normalise_linear(df_NO2_2021_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2021_val_norm_1D = normalise_linear(df_NO2_2021_val_1D, NO2_min_train, NO2_max_train)
df_NO2_2021_test_norm_1D = normalise_linear(df_NO2_2021_test_1D, NO2_min_train, NO2_max_train)
df_NO2_2022_train_norm_1D = normalise_linear(df_NO2_2022_train_1D, NO2_min_train, NO2_max_train)
df_NO2_val_2022_norm_1D = normalise_linear(df_NO2_2022_val_1D, NO2_min_train, NO2_max_train)
df_NO2_test_2022_norm_1D = normalise_linear(df_NO2_2022_test_1D, NO2_min_train, NO2_max_train)
df_NO2_val_2023_norm_1D = normalise_linear(df_NO2_2023_val_1D, NO2_min_train, NO2_max_train)
df_NO2_test_2023_norm_1D = normalise_linear(df_NO2_2023_test_1D, NO2_min_train, NO2_max_train)

# O3
df_O3_2017_train_norm_1D = normalise_linear(df_O3_2017_train_1D, O3_min_train, O3_max_train)
df_O3_2018_train_norm_1D = normalise_linear(df_O3_2018_train_1D, O3_min_train, O3_max_train)
df_O3_2020_train_norm_1D = normalise_linear(df_O3_2020_train_1D, O3_min_train, O3_max_train)
df_O3_2021_train_norm_1D = normalise_linear(df_O3_2021_train_1D, O3_min_train, O3_max_train)
df_O3_2021_val_norm_1D = normalise_linear(df_O3_2021_val_1D, O3_min_train, O3_max_train)
df_O3_2021_test_norm_1D = normalise_linear(df_O3_2021_test_1D, O3_min_train, O3_max_train)
df_O3_2022_train_norm_1D = normalise_linear(df_O3_2022_train_1D, O3_min_train, O3_max_train)
df_O3_val_2022_norm_1D = normalise_linear(df_O3_2022_val_1D, O3_min_train, O3_max_train)
df_O3_test_2022_norm_1D = normalise_linear(df_O3_2022_test_1D, O3_min_train, O3_max_train)
df_O3_val_2023_norm_1D = normalise_linear(df_O3_2023_val_1D, O3_min_train, O3_max_train)
df_O3_test_2023_norm_1D = normalise_linear(df_O3_2023_test_1D, O3_min_train, O3_max_train)

# PM10
df_PM10_2017_train_norm_1D = normalise_linear(df_PM10_2017_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2018_train_norm_1D = normalise_linear(df_PM10_2018_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2020_train_norm_1D = normalise_linear(df_PM10_2020_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2021_train_norm_1D = normalise_linear(df_PM10_2021_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2021_val_norm_1D = normalise_linear(df_PM10_2021_val_1D, PM10_min_train, PM10_max_train)
df_PM10_2021_test_norm_1D = normalise_linear(df_PM10_2021_test_1D, PM10_min_train, PM10_max_train)
df_PM10_2022_train_norm_1D = normalise_linear(df_PM10_2022_train_1D, PM10_min_train, PM10_max_train)
df_PM10_val_2022_norm_1D = normalise_linear(df_PM10_2022_val_1D, PM10_min_train, PM10_max_train)
df_PM10_test_2022_norm_1D = normalise_linear(df_PM10_2022_test_1D, PM10_min_train, PM10_max_train)
df_PM10_val_2023_norm_1D = normalise_linear(df_PM10_2023_val_1D, PM10_min_train, PM10_max_train)
df_PM10_test_2023_norm_1D = normalise_linear(df_PM10_2023_test_1D, PM10_min_train, PM10_max_train)

# PM25
df_PM25_2017_train_norm_1D = normalise_linear(df_PM25_2017_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2018_train_norm_1D = normalise_linear(df_PM25_2018_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2020_train_norm_1D = normalise_linear(df_PM25_2020_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2021_train_norm_1D = normalise_linear(df_PM25_2021_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2021_val_norm_1D = normalise_linear(df_PM25_2021_val_1D, PM25_min_train, PM25_max_train)
df_PM25_2021_test_norm_1D = normalise_linear(df_PM25_2021_test_1D, PM25_min_train, PM25_max_train)
df_PM25_2022_train_norm_1D = normalise_linear(df_PM25_2022_train_1D, PM25_min_train, PM25_max_train)
df_PM25_val_2022_norm_1D = normalise_linear(df_PM25_2022_val_1D, PM25_min_train, PM25_max_train)
df_PM25_test_2022_norm_1D = normalise_linear(df_PM25_2022_test_1D, PM25_min_train, PM25_max_train)
df_PM25_val_2023_norm_1D = normalise_linear(df_PM25_2023_val_1D, PM25_min_train, PM25_max_train)
df_PM25_test_2023_norm_1D = normalise_linear(df_PM25_2023_test_1D, PM25_min_train, PM25_max_train)

# temp
df_temp_2017_train_norm = normalise_linear(df_temp_2017_train, temp_min_train, temp_max_train)
df_temp_2018_train_norm = normalise_linear(df_temp_2018_train, temp_min_train, temp_max_train)
df_temp_2020_train_norm = normalise_linear(df_temp_2020_train, temp_min_train, temp_max_train)
df_temp_2021_train_norm = normalise_linear(df_temp_2021_train, temp_min_train, temp_max_train)
df_temp_2021_val_norm = normalise_linear(df_temp_2021_val, temp_min_train, temp_max_train)
df_temp_2021_test_norm = normalise_linear(df_temp_2021_test, temp_min_train, temp_max_train)
df_temp_2022_train_norm = normalise_linear(df_temp_2022_train, temp_min_train, temp_max_train)
df_temp_val_2022_norm = normalise_linear(df_temp_2022_val, temp_min_train, temp_max_train)
df_temp_test_2022_norm = normalise_linear(df_temp_2022_test, temp_min_train, temp_max_train)
df_temp_val_2023_norm = normalise_linear(df_temp_2023_val, temp_min_train, temp_max_train)
df_temp_test_2023_norm = normalise_linear(df_temp_2023_test, temp_min_train, temp_max_train)

# dewP
df_dewP_2017_train_norm = normalise_linear(df_dewP_2017_train, dewP_min_train, dewP_max_train)
df_dewP_2018_train_norm = normalise_linear(df_dewP_2018_train, dewP_min_train, dewP_max_train)
df_dewP_2020_train_norm = normalise_linear(df_dewP_2020_train, dewP_min_train, dewP_max_train)
df_dewP_2021_train_norm = normalise_linear(df_dewP_2021_train, dewP_min_train, dewP_max_train)
df_dewP_2021_val_norm = normalise_linear(df_dewP_2021_val, dewP_min_train, dewP_max_train)
df_dewP_2021_test_norm = normalise_linear(df_dewP_2021_test, dewP_min_train, dewP_max_train)
df_dewP_2022_train_norm = normalise_linear(df_dewP_2022_train, dewP_min_train, dewP_max_train)
df_dewP_val_2022_norm = normalise_linear(df_dewP_2022_val, dewP_min_train, dewP_max_train)
df_dewP_test_2022_norm = normalise_linear(df_dewP_2022_test, dewP_min_train, dewP_max_train)
df_dewP_val_2023_norm = normalise_linear(df_dewP_2023_val, dewP_min_train, dewP_max_train)
df_dewP_test_2023_norm = normalise_linear(df_dewP_2023_test, dewP_min_train, dewP_max_train)

# WD
df_WD_2017_train_norm = normalise_linear(df_WD_2017_train, WD_min_train, WD_max_train)
df_WD_2018_train_norm = normalise_linear(df_WD_2018_train, WD_min_train, WD_max_train)
df_WD_2020_train_norm = normalise_linear(df_WD_2020_train, WD_min_train, WD_max_train)
df_WD_2021_train_norm = normalise_linear(df_WD_2021_train, WD_min_train, WD_max_train)
df_WD_2021_val_norm = normalise_linear(df_WD_2021_val, WD_min_train, WD_max_train)
df_WD_2021_test_norm = normalise_linear(df_WD_2021_test, WD_min_train, WD_max_train)
df_WD_2022_train_norm = normalise_linear(df_WD_2022_train, WD_min_train, WD_max_train)
df_WD_val_2022_norm = normalise_linear(df_WD_2022_val, WD_min_train, WD_max_train)
df_WD_test_2022_norm = normalise_linear(df_WD_2022_test, WD_min_train, WD_max_train)
df_WD_val_2023_norm = normalise_linear(df_WD_2023_val, WD_min_train, WD_max_train)
df_WD_test_2023_norm = normalise_linear(df_WD_2023_test, WD_min_train, WD_max_train)

# Wvh
df_Wvh_2017_train_norm = normalise_linear(df_Wvh_2017_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2018_train_norm = normalise_linear(df_Wvh_2018_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2020_train_norm = normalise_linear(df_Wvh_2020_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2021_train_norm = normalise_linear(df_Wvh_2021_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2021_val_norm = normalise_linear(df_Wvh_2021_val, Wvh_min_train, Wvh_max_train)
df_Wvh_2021_test_norm = normalise_linear(df_Wvh_2021_test, Wvh_min_train, Wvh_max_train)
df_Wvh_2022_train_norm = normalise_linear(df_Wvh_2022_train, Wvh_min_train, Wvh_max_train)
df_Wvh_val_2022_norm = normalise_linear(df_Wvh_2022_val, Wvh_min_train, Wvh_max_train)
df_Wvh_test_2022_norm = normalise_linear(df_Wvh_2022_test, Wvh_min_train, Wvh_max_train)
df_Wvh_val_2023_norm = normalise_linear(df_Wvh_2023_val, Wvh_min_train, Wvh_max_train)
df_Wvh_test_2023_norm = normalise_linear(df_Wvh_2023_test, Wvh_min_train, Wvh_max_train)

# P
df_P_2017_train_norm = normalise_linear(df_P_2017_train, P_min_train, P_max_train)
df_P_2018_train_norm = normalise_linear(df_P_2018_train, P_min_train, P_max_train)
df_P_2020_train_norm = normalise_linear(df_P_2020_train, P_min_train, P_max_train)
df_P_2021_train_norm = normalise_linear(df_P_2021_train, P_min_train, P_max_train)
df_P_2021_val_norm = normalise_linear(df_P_2021_val, P_min_train, P_max_train)
df_P_2021_test_norm = normalise_linear(df_P_2021_test, P_min_train, P_max_train)
df_P_2022_train_norm = normalise_linear(df_P_2022_train, P_min_train, P_max_train)
df_P_val_2022_norm = normalise_linear(df_P_2022_val, P_min_train, P_max_train)
df_P_test_2022_norm = normalise_linear(df_P_2022_test, P_min_train, P_max_train)
df_P_val_2023_norm = normalise_linear(df_P_2023_val, P_min_train, P_max_train)
df_P_test_2023_norm = normalise_linear(df_P_2023_test, P_min_train, P_max_train)

# SQ
df_SQ_2017_train_norm = normalise_linear(df_SQ_2017_train, SQ_min_train, SQ_max_train)
df_SQ_2018_train_norm = normalise_linear(df_SQ_2018_train, SQ_min_train, SQ_max_train)
df_SQ_2020_train_norm = normalise_linear(df_SQ_2020_train, SQ_min_train, SQ_max_train)
df_SQ_2021_train_norm = normalise_linear(df_SQ_2021_train, SQ_min_train, SQ_max_train)
df_SQ_2021_val_norm = normalise_linear(df_SQ_2021_val, SQ_min_train, SQ_max_train)
df_SQ_2021_test_norm = normalise_linear(df_SQ_2021_test, SQ_min_train, SQ_max_train)
df_SQ_2022_train_norm = normalise_linear(df_SQ_2022_train, SQ_min_train, SQ_max_train)
df_SQ_val_2022_norm = normalise_linear(df_SQ_2022_val, SQ_min_train, SQ_max_train)
df_SQ_test_2022_norm = normalise_linear(df_SQ_2022_test, SQ_min_train, SQ_max_train)
df_SQ_val_2023_norm = normalise_linear(df_SQ_2023_val, SQ_min_train, SQ_max_train)
df_SQ_test_2023_norm = normalise_linear(df_SQ_2023_test, SQ_min_train, SQ_max_train)

Plot distributions

In [None]:
def plot_distributions_KDE(data, title):
    """
    Draws kernel density estimates of the given measurements.
    A Series gives a single curve, a DataFrame gives one curve per column.
    The given title is appended to the fixed plot title.
    """
    set_style()

    if not isinstance(data, pd.Series):
        for col_name in data.columns:   # DataFrame: one curve for each column
            sns.kdeplot(data, x = col_name)
    else:
        sns.kdeplot(data)               # Series: a single curve

    # Only the upper limits of both axes are fixed
    plt.xlim(right = 1)
    plt.ylim(top = 10)
    plt.xlabel('Measurement value')
    plt.title(f"Measurement distributions - {title}")
    plt.show()


def plot_multiple_distributions(data: list, title, metadata):
    """
    Draws the kernel density estimates of several dataframes in one figure.
    Every column of every dataframe gets its own curve; the curves are labelled
    with the 1-based position of their dataframe in the list.
    The component name in the plot title is read from metadata['comp'].
    """
    set_style()

    for number, frame in enumerate(data, start = 1):
        for col_name in frame.columns:
            sns.kdeplot(frame, x = col_name, label = number)

    # Only the upper limits of both axes are fixed
    plt.xlim(right = 1)
    plt.ylim(top = 10)
    plt.xlabel('Measurement value')
    plt.title(f"Measurement dist.s for {metadata['comp']} - {title}")
    plt.legend()
    plt.show()

#### **Create big combined normalised dataframe**

In [None]:
def concat_frames_vertically(dfs, keys, value_column = 'Groningen'):   # https://pandas.pydata.org/docs/user_guide/cookbook.html
    """
    Concatenates a list of dataframes into one dataframe with a MultiIndex,
    where the first level ('Component') is the key and the second level
    ('DateTime') is the original index.

    The first column of every dataframe is renamed to the same name, so that
    those values end up stacked in one shared column. Any further columns are
    unionized (outer join). The result is sorted on the 'DateTime' level.

    Parameters:
    - dfs: list of dataframes, each with at least one column;
    - keys: labels for the 'Component' level, one per dataframe, same order;
    - value_column: name given to the shared value column. The default
      'Groningen' is the previously hard-coded name and is kept so existing
      callers get the same output.
    """
    frames = [df.rename(columns = {df.columns[0] : value_column}) for df in dfs]

    return pd.concat(objs = frames,
                     axis = 0,          # concat over row axis while unionizing column-axis
                     join = 'outer',    # create MultiIndex, rename MultiIndex, then sort on date
                     keys = keys).rename_axis(['Component', 'DateTime']).sort_index(level = 'DateTime')

In [None]:
def plot_tails(data, title):            # https://seaborn.pydata.org/generated/seaborn.violinplot.html
    """
    Draws one violin per component for the given measurements.
    The input may be a Series or a DataFrame; after resetting the index it has
    to consist of exactly three columns, which are relabelled to
    'Component', 'DateTime' and 'Value'.
    """
    set_style()

    # A Series is wrapped in a DataFrame first, after which both input types
    # follow the same path
    frame = pd.DataFrame(data) if isinstance(data, pd.Series) else data
    long_df = frame.reset_index()
    long_df.columns = ['Component', 'DateTime', 'Value']
    sns.violinplot(data = long_df, x = 'Component', y = 'Value', hue = 'Component', legend = False)

    plt.title(title)
    plt.ylim(top = 1.0)
    plt.xlabel('Component')
    plt.ylabel('Normalised value')
    plt.show()

In [None]:
# Component labels, in the same order as the frames in the lists below.
# Note that pressure is labelled 'p' here, while its frames are named df_P_*.
keys = ['PM25', 'PM10', 'O3', 'NO2',
        'temp', 'dewP', 'WD', 'Wvh', 'p', 'SQ']

# Create input dataframes (u)
# One list per split and year. Each list holds ten single-column frames:
# the TUINDORP column of the four contaminant frames, followed by the six
# normalised meteorological frames.
# The 2021 frames are named <year>_<split>, whereas the 2022/2023 val and test
# frames are named <split>_<year>.
frames_train_2017_1D_u = [df_PM25_2017_train_norm_1D.loc[:, [TUINDORP]],
                              df_PM10_2017_train_norm_1D.loc[:, [TUINDORP]],
                              df_O3_2017_train_norm_1D.loc[:, [TUINDORP]],
                              df_NO2_2017_train_norm_1D.loc[:, [TUINDORP]],
                              df_temp_2017_train_norm,
                              df_dewP_2017_train_norm,
                              df_WD_2017_train_norm,
                              df_Wvh_2017_train_norm,
                              df_P_2017_train_norm,
                              df_SQ_2017_train_norm]
frames_train_2018_1D_u = [df_PM25_2018_train_norm_1D.loc[:, [TUINDORP]],
                              df_PM10_2018_train_norm_1D.loc[:, [TUINDORP]],
                              df_O3_2018_train_norm_1D.loc[:, [TUINDORP]],
                              df_NO2_2018_train_norm_1D.loc[:, [TUINDORP]],
                              df_temp_2018_train_norm,
                              df_dewP_2018_train_norm,
                              df_WD_2018_train_norm,
                              df_Wvh_2018_train_norm,
                              df_P_2018_train_norm,
                              df_SQ_2018_train_norm]
frames_train_2020_1D_u = [df_PM25_2020_train_norm_1D.loc[:, [TUINDORP]],
                              df_PM10_2020_train_norm_1D.loc[:, [TUINDORP]],
                              df_O3_2020_train_norm_1D.loc[:, [TUINDORP]],
                              df_NO2_2020_train_norm_1D.loc[:, [TUINDORP]],
                              df_temp_2020_train_norm,
                              df_dewP_2020_train_norm,
                              df_WD_2020_train_norm,
                              df_Wvh_2020_train_norm,
                              df_P_2020_train_norm,
                              df_SQ_2020_train_norm]
frames_train_2021_1D_u = [df_PM25_2021_train_norm_1D.loc[:, [TUINDORP]],
                              df_PM10_2021_train_norm_1D.loc[:, [TUINDORP]],
                              df_O3_2021_train_norm_1D.loc[:, [TUINDORP]],
                              df_NO2_2021_train_norm_1D.loc[:, [TUINDORP]],
                              df_temp_2021_train_norm,
                              df_dewP_2021_train_norm,
                              df_WD_2021_train_norm,
                              df_Wvh_2021_train_norm,
                              df_P_2021_train_norm,
                              df_SQ_2021_train_norm]
frames_val_2021_1D_u = [df_PM25_2021_val_norm_1D.loc[:, [TUINDORP]],
                            df_PM10_2021_val_norm_1D.loc[:, [TUINDORP]],
                            df_O3_2021_val_norm_1D.loc[:, [TUINDORP]],
                            df_NO2_2021_val_norm_1D.loc[:, [TUINDORP]],
                            df_temp_2021_val_norm,
                            df_dewP_2021_val_norm,
                            df_WD_2021_val_norm,
                            df_Wvh_2021_val_norm,
                            df_P_2021_val_norm,
                            df_SQ_2021_val_norm]
frames_test_2021_1D_u = [df_PM25_2021_test_norm_1D.loc[:, [TUINDORP]],
                             df_PM10_2021_test_norm_1D.loc[:, [TUINDORP]],
                             df_O3_2021_test_norm_1D.loc[:, [TUINDORP]],
                             df_NO2_2021_test_norm_1D.loc[:, [TUINDORP]],
                             df_temp_2021_test_norm,
                             df_dewP_2021_test_norm,
                             df_WD_2021_test_norm,
                             df_Wvh_2021_test_norm,
                             df_P_2021_test_norm,
                             df_SQ_2021_test_norm]
frames_train_2022_1D_u = [df_PM25_2022_train_norm_1D.loc[:, [TUINDORP]],
                              df_PM10_2022_train_norm_1D.loc[:, [TUINDORP]],
                              df_O3_2022_train_norm_1D.loc[:, [TUINDORP]],
                              df_NO2_2022_train_norm_1D.loc[:, [TUINDORP]],
                              df_temp_2022_train_norm,
                              df_dewP_2022_train_norm,
                              df_WD_2022_train_norm,
                              df_Wvh_2022_train_norm,
                              df_P_2022_train_norm,
                              df_SQ_2022_train_norm]
frames_val_2022_1D_u = [df_PM25_val_2022_norm_1D.loc[:, [TUINDORP]],
                            df_PM10_val_2022_norm_1D.loc[:, [TUINDORP]],
                            df_O3_val_2022_norm_1D.loc[:, [TUINDORP]],
                            df_NO2_val_2022_norm_1D.loc[:, [TUINDORP]],
                            df_temp_val_2022_norm,
                            df_dewP_val_2022_norm,
                            df_WD_val_2022_norm,
                            df_Wvh_val_2022_norm,
                            df_P_val_2022_norm,
                            df_SQ_val_2022_norm]
frames_val_2023_1D_u = [df_PM25_val_2023_norm_1D.loc[:, [TUINDORP]],
                            df_PM10_val_2023_norm_1D.loc[:, [TUINDORP]],
                            df_O3_val_2023_norm_1D.loc[:, [TUINDORP]],
                            df_NO2_val_2023_norm_1D.loc[:, [TUINDORP]],
                            df_temp_val_2023_norm,
                            df_dewP_val_2023_norm,
                            df_WD_val_2023_norm,
                            df_Wvh_val_2023_norm,
                            df_P_val_2023_norm,
                            df_SQ_val_2023_norm]
frames_test_2022_1D_u = [df_PM25_test_2022_norm_1D.loc[:, [TUINDORP]],
                             df_PM10_test_2022_norm_1D.loc[:, [TUINDORP]],
                             df_O3_test_2022_norm_1D.loc[:, [TUINDORP]],
                             df_NO2_test_2022_norm_1D.loc[:, [TUINDORP]],
                             df_temp_test_2022_norm,
                             df_dewP_test_2022_norm,
                             df_WD_test_2022_norm,
                             df_Wvh_test_2022_norm,
                             df_P_test_2022_norm,
                             df_SQ_test_2022_norm]
frames_test_2023_1D_u = [df_PM25_test_2023_norm_1D.loc[:, [TUINDORP]],
                             df_PM10_test_2023_norm_1D.loc[:, [TUINDORP]],
                             df_O3_test_2023_norm_1D.loc[:, [TUINDORP]],
                             df_NO2_test_2023_norm_1D.loc[:, [TUINDORP]],
                             df_temp_test_2023_norm,
                             df_dewP_test_2023_norm,
                             df_WD_test_2023_norm,
                             df_Wvh_test_2023_norm,
                             df_P_test_2023_norm,
                             df_SQ_test_2023_norm]

In [None]:
# Create target dataframes (y)
# One list per split and year. Each list holds the BREUKELEN column of the
# four contaminant frames, in the order PM25, PM10, O3, NO2.
# Every entry is a fresh `.loc` selection, i.e. a different object than the
# TUINDORP selections used for the input (u) lists.
frames_train_2017_1D_y = [df_PM25_2017_train_norm_1D.loc[:, [BREUKELEN]],
                              df_PM10_2017_train_norm_1D.loc[:, [BREUKELEN]],
                              df_O3_2017_train_norm_1D.loc[:, [BREUKELEN]],
                              df_NO2_2017_train_norm_1D.loc[:, [BREUKELEN]]]
frames_train_2018_1D_y = [df_PM25_2018_train_norm_1D.loc[:, [BREUKELEN]],
                              df_PM10_2018_train_norm_1D.loc[:, [BREUKELEN]],
                              df_O3_2018_train_norm_1D.loc[:, [BREUKELEN]],
                              df_NO2_2018_train_norm_1D.loc[:, [BREUKELEN]]]
frames_train_2020_1D_y = [df_PM25_2020_train_norm_1D.loc[:, [BREUKELEN]],
                              df_PM10_2020_train_norm_1D.loc[:, [BREUKELEN]],
                              df_O3_2020_train_norm_1D.loc[:, [BREUKELEN]],
                              df_NO2_2020_train_norm_1D.loc[:, [BREUKELEN]]]
frames_train_2021_1D_y = [df_PM25_2021_train_norm_1D.loc[:, [BREUKELEN]],
                              df_PM10_2021_train_norm_1D.loc[:, [BREUKELEN]],
                              df_O3_2021_train_norm_1D.loc[:, [BREUKELEN]],
                              df_NO2_2021_train_norm_1D.loc[:, [BREUKELEN]]]
frames_val_2021_1D_y = [df_PM25_2021_val_norm_1D.loc[:, [BREUKELEN]],
                            df_PM10_2021_val_norm_1D.loc[:, [BREUKELEN]],
                            df_O3_2021_val_norm_1D.loc[:, [BREUKELEN]],
                            df_NO2_2021_val_norm_1D.loc[:, [BREUKELEN]]]
frames_test_2021_1D_y = [df_PM25_2021_test_norm_1D.loc[:, [BREUKELEN]],
                             df_PM10_2021_test_norm_1D.loc[:, [BREUKELEN]],
                             df_O3_2021_test_norm_1D.loc[:, [BREUKELEN]],
                             df_NO2_2021_test_norm_1D.loc[:, [BREUKELEN]]]
frames_train_2022_1D_y = [df_PM25_2022_train_norm_1D.loc[:, [BREUKELEN]],
                              df_PM10_2022_train_norm_1D.loc[:, [BREUKELEN]],
                              df_O3_2022_train_norm_1D.loc[:, [BREUKELEN]],
                              df_NO2_2022_train_norm_1D.loc[:, [BREUKELEN]]]
frames_val_2022_1D_y = [df_PM25_val_2022_norm_1D.loc[:, [BREUKELEN]],
                            df_PM10_val_2022_norm_1D.loc[:, [BREUKELEN]],
                            df_O3_val_2022_norm_1D.loc[:, [BREUKELEN]],
                            df_NO2_val_2022_norm_1D.loc[:, [BREUKELEN]]]
frames_val_2023_1D_y = [df_PM25_val_2023_norm_1D.loc[:, [BREUKELEN]],
                            df_PM10_val_2023_norm_1D.loc[:, [BREUKELEN]],
                            df_O3_val_2023_norm_1D.loc[:, [BREUKELEN]],
                            df_NO2_val_2023_norm_1D.loc[:, [BREUKELEN]]]
frames_test_2022_1D_y = [df_PM25_test_2022_norm_1D.loc[:, [BREUKELEN]],
                            df_PM10_test_2022_norm_1D.loc[:, [BREUKELEN]],
                            df_O3_test_2022_norm_1D.loc[:, [BREUKELEN]],
                            df_NO2_test_2022_norm_1D.loc[:, [BREUKELEN]]]
frames_test_2023_1D_y = [df_PM25_test_2023_norm_1D.loc[:, [BREUKELEN]],
                            df_PM10_test_2023_norm_1D.loc[:, [BREUKELEN]],
                            df_O3_test_2023_norm_1D.loc[:, [BREUKELEN]],
                            df_NO2_test_2023_norm_1D.loc[:, [BREUKELEN]]]

The sampling of the dataset is done in the model files, not here.

In [None]:
def make_manual_dict_of_dfs(dfs, components):
    """
    Creates a dictionary of dataframes with the components as keys.
    Important:
    It assumes they are in the same order as the components list!

    Raises a ValueError when the number of dataframes differs from the number
    of components, since zipping them would otherwise silently drop the
    surplus entries.
    """
    dfs = list(dfs)
    components = list(components)
    if len(dfs) != len(components):
        raise ValueError(
            f"Got {len(dfs)} dataframes for {len(components)} components"
        )
    return dict(zip(components, dfs))


def sort_dict_of_dfs(dfs_dict, components_sorted):
    """
    Returns the dataframes of the dictionary as a list,
    in the order given by the components list.
    """
    ordered = []
    for component in components_sorted:
        ordered.append(dfs_dict[component])
    return ordered


def print_dict_of_dfs(dfs):
    """Prints each key of the dictionary, followed by the first two rows of its dataframe"""
    for name, frame in dfs.items():
        print(name, frame.head(2), sep = '\n')


def print_dfs_sorted(dfs):
    """Prints the first two rows of every dataframe in the list"""
    for preview in (frame.head(2) for frame in dfs):
        print(preview)


def concat_frames_horizontally(dfs, components):
    """
    Concatenates a list of single-column dataframes side by side into one
    dataframe where:
    - the rows (axis = 0) keep the index of the input frames;
    - the columns (axis = 1) are named after the components.
    The columns are ordered with Python's sorted(), which places uppercase
    names before lowercase ones (e.g. NO2, O3, PM10, PM25, SQ, WD, Wvh, dewP,
    p, temp).

    The following steps are taken:
    1. Pair every dataframe with its component (same order is assumed!);
    2. Sort the component names;
    3. Put the dataframes in that same order;
    4. Concatenate them over the column axis, with the components as keys;
    5. Drop the second column level (i.e. the old column names).
    """
    by_component = make_manual_dict_of_dfs(dfs, components)
    ordered_names = sorted(components)
    ordered_frames = sort_dict_of_dfs(by_component, ordered_names)

    combined = pd.concat(objs = ordered_frames,
                         axis = 1,      # concat over column axis
                         keys = ordered_names)
    combined = combined.sort_index(level = 'DateTime')
    combined.columns = combined.columns.droplevel(1)
    return combined

In [None]:
# The targets are the four contaminants; the inputs are those same four
# contaminants followed by the six meteorological components
target_keys = ['PM25', 'PM10', 'O3', 'NO2']
input_keys = target_keys + ['temp', 'dewP', 'WD', 'Wvh', 'p', 'SQ']

In [None]:
# Show the sorted order of the input keys and the target keys, one list per line
print(*(sorted(key_list) for key_list in (input_keys, target_keys)), sep = '\n')

In [None]:
# Show the timezone of the index of every 2023 validation input frame
# (None means that the index is timezone-naive)
for frame in frames_val_2023_1D_u:
    tz_info = frame.index.tz
    print(tz_info)

The 2023 data seemed to have a different kind of DateTime index, namely one that was timezone-aware instead of timezone-naive. Hence, we delete the timezone information here for uniformity.

In [None]:
# Drop the timezone information of every 2023 frame, so that all indices are
# timezone-naive. The target (y) lists are included as well: their frames are
# separate `.loc` selections, so clearing the input (u) frames does not change
# them. tz_localize(None) leaves an index that is already naive as it is.
for df in [*frames_val_2023_1D_u, *frames_test_2023_1D_u,
           *frames_val_2023_1D_y, *frames_test_2023_1D_y]:
    df.index = df.index.tz_localize(None)

In [None]:
# Build one wide dataframe per split and year for the inputs (u), with the
# components of input_keys as columns
df_train_2017_horizontal_u = concat_frames_horizontally(frames_train_2017_1D_u, input_keys)
df_train_2018_horizontal_u = concat_frames_horizontally(frames_train_2018_1D_u, input_keys)
df_train_2020_horizontal_u = concat_frames_horizontally(frames_train_2020_1D_u, input_keys)
df_train_2021_horizontal_u = concat_frames_horizontally(frames_train_2021_1D_u, input_keys)
df_val_2021_horizontal_u = concat_frames_horizontally(frames_val_2021_1D_u, input_keys)
df_test_2021_horizontal_u = concat_frames_horizontally(frames_test_2021_1D_u, input_keys)
df_train_2022_horizontal_u = concat_frames_horizontally(frames_train_2022_1D_u, input_keys)
df_val_2022_horizontal_u = concat_frames_horizontally(frames_val_2022_1D_u, input_keys)
df_val_2023_horizontal_u = concat_frames_horizontally(frames_val_2023_1D_u, input_keys)
df_test_2022_horizontal_u = concat_frames_horizontally(frames_test_2022_1D_u, input_keys)
df_test_2023_horizontal_u = concat_frames_horizontally(frames_test_2023_1D_u, input_keys)

# Same for the targets (y), with the components of target_keys as columns
df_train_2017_horizontal_y = concat_frames_horizontally(frames_train_2017_1D_y, target_keys)
df_train_2018_horizontal_y = concat_frames_horizontally(frames_train_2018_1D_y, target_keys)
df_train_2020_horizontal_y = concat_frames_horizontally(frames_train_2020_1D_y, target_keys)
df_train_2021_horizontal_y = concat_frames_horizontally(frames_train_2021_1D_y, target_keys)
df_val_2021_horizontal_y = concat_frames_horizontally(frames_val_2021_1D_y, target_keys)
df_test_2021_horizontal_y = concat_frames_horizontally(frames_test_2021_1D_y, target_keys)
df_train_2022_horizontal_y = concat_frames_horizontally(frames_train_2022_1D_y, target_keys)
df_val_2022_horizontal_y = concat_frames_horizontally(frames_val_2022_1D_y, target_keys)
df_val_2023_horizontal_y = concat_frames_horizontally(frames_val_2023_1D_y, target_keys)
df_test_2022_horizontal_y = concat_frames_horizontally(frames_test_2022_1D_y, target_keys)
df_test_2023_horizontal_y = concat_frames_horizontally(frames_test_2023_1D_y, target_keys)

# Print the shapes of a subset of the frames as a quick check.
# The commented-out 2019 lines refer to frames that are not created in this
# cell, so they cannot simply be uncommented.
# print(df_train_2017_horizontal_u.shape)
# print(df_train_2018_horizontal_u.shape)
# print(df_train_2019_horizontal_u.shape)
print(df_train_2020_horizontal_u.shape)
# print(df_train_2021_horizontal_u.shape)
# print(df_train_2022_horizontal_u.shape)
print(df_val_2022_horizontal_u.shape)
print(df_test_2022_horizontal_u.shape)
# print(df_train_2017_horizontal_y.shape)
# print(df_train_2018_horizontal_y.shape)
# print(df_train_2019_horizontal_y.shape)
print(df_train_2020_horizontal_y.shape)
# print(df_train_2021_horizontal_y.shape)
# print(df_train_2022_horizontal_y.shape)
print(df_val_2022_horizontal_y.shape)
print(df_test_2022_horizontal_y.shape)

Save the dataframes to the `data_combined/` folder. The windowing is performed later, by a PyTorch Dataset class in the model notebooks.

In [None]:
# Write every combined input (u) frame to its own CSV file: ';' as separator,
# '.' as decimal sign, UTF-8 encoded and with the index included.
# NOTE: the directory ../data/data_combined/ is assumed to exist already.
df_train_2017_horizontal_u.to_csv("../data/data_combined/train_2017_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2018_horizontal_u.to_csv("../data/data_combined/train_2018_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2020_horizontal_u.to_csv("../data/data_combined/train_2020_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2021_horizontal_u.to_csv("../data/data_combined/train_2021_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_val_2021_horizontal_u.to_csv("../data/data_combined/val_2021_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_test_2021_horizontal_u.to_csv("../data/data_combined/test_2021_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2022_horizontal_u.to_csv("../data/data_combined/train_2022_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_val_2022_horizontal_u.to_csv("../data/data_combined/val_2022_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_val_2023_horizontal_u.to_csv("../data/data_combined/val_2023_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_test_2022_horizontal_u.to_csv("../data/data_combined/test_2022_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_test_2023_horizontal_u.to_csv("../data/data_combined/test_2023_combined_u.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')

# Same for the combined target (y) frames
df_train_2017_horizontal_y.to_csv("../data/data_combined/train_2017_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2018_horizontal_y.to_csv("../data/data_combined/train_2018_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2020_horizontal_y.to_csv("../data/data_combined/train_2020_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2021_horizontal_y.to_csv("../data/data_combined/train_2021_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_val_2021_horizontal_y.to_csv("../data/data_combined/val_2021_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_test_2021_horizontal_y.to_csv("../data/data_combined/test_2021_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_train_2022_horizontal_y.to_csv("../data/data_combined/train_2022_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_val_2022_horizontal_y.to_csv("../data/data_combined/val_2022_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_val_2023_horizontal_y.to_csv("../data/data_combined/val_2023_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_test_2022_horizontal_y.to_csv("../data/data_combined/test_2022_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')
df_test_2023_horizontal_y.to_csv("../data/data_combined/test_2023_combined_y.csv", index = True, sep = ';', decimal = '.', encoding = 'utf-8')

In [None]:
def plot_distributions_KDE(data, title):
    """
    Draws labelled kernel density estimates of the given measurements.
    A Series gives a single curve labelled with its name, a DataFrame gives
    one curve per column, labelled with the column name.
    The given title is appended to the fixed plot title.
    """
    set_style()

    if not isinstance(data, pd.Series):
        for col_name in data.columns:   # DataFrame: one curve for each column
            sns.kdeplot(data, x = col_name, label = col_name)
    else:
        sns.kdeplot(data, label = data.name)    # Series: a single curve

    # The x-axis starts slightly below zero; the y-axis only has its top fixed
    plt.xlim(left = -0.1, right = 1)
    plt.ylim(top = 10)
    plt.xlabel('Measurement value')
    plt.title(f"Measurement distributions - {title}")
    plt.legend()
    plt.show()

In [None]:
# Stack the target (y) frames of all years below each other, per split
df_training_y_combined = pd.concat([
    df_train_2017_horizontal_y,
    df_train_2018_horizontal_y,
    df_train_2020_horizontal_y,
    df_train_2021_horizontal_y,
    df_train_2022_horizontal_y
])
df_val_y_combined = pd.concat([
    df_val_2021_horizontal_y,
    df_val_2022_horizontal_y,
    df_val_2023_horizontal_y
])
df_test_y_combined = pd.concat([
    df_test_2021_horizontal_y,
    df_test_2022_horizontal_y,
    df_test_2023_horizontal_y
])

# Plot the distribution of every target column, one figure per split
plot_distributions_KDE(df_training_y_combined, 'training data - y')
plot_distributions_KDE(df_val_y_combined, 'validation data - y')
plot_distributions_KDE(df_test_y_combined, 'test data - y')