In [1]:
# Imports
# ---------
import sys
import pandas as pd
import numpy as np
import feather
import os
import gc
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
import matplotlib.ticker as ticker
from matplotlib.dates import DateFormatter
import matplotlib as mpl
from datetime import timezone
import pickle

register_matplotlib_converters()

# File locations
# ----------------
the_folder = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/0oxaria/gap_filling/'
oxaria0 = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/0oxaria/'
aurn = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/aurn/'
pngs = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/pngs/gap_filling/'
the_files = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/1oxaria/'

In [2]:
# Read in stable 15min operational data Jan - Nov 2020
# ------------------------------------------------------
oxaria_gases_s15 = pd.read_feather(the_folder +
                                   'oxaria_gases_536_stable15.ftr').set_index(
                                       ['tag', 'rec']).sort_index()
oxaria_pm_s15 = pd.read_feather(the_folder +
                                'oxaria_pm_536_stable15.ftr').set_index(
                                    ['tag', 'rec']).sort_index()
oxaria_climate_s15 = pd.read_feather(the_folder +
                                     'oxaria_climate_536_stable15.ftr').set_index(
                                         ['tag', 'rec']).sort_index()
oxaria_status_s15 = pd.read_feather(the_folder +
                                    'oxaria_status_536_stable15.ftr').set_index(
                                        ['tag', 'rec']).sort_index()

In [3]:
# Get colocated sensor data - St Ebbes
# --------------------------------------
oxaria_sebbes_gases_s15 = oxaria_gases_s15.query('tag == "scs-bgx-538"')
oxaria_sebbes_gases_s15.reset_index().to_feather(
    the_folder + 'oxaria_sebbes_gases_536_s15.ftr')
oxaria_sebbes_pm_s15 = oxaria_pm_s15.query('tag == "scs-bgx-538"')
oxaria_sebbes_pm_s15.reset_index().to_feather(
    the_folder + 'oxaria_sebbes_pm_536_s15.ftr')
oxaria_sebbes_status_s15 = oxaria_status_s15.query('tag == "scs-bgx-538"')
oxaria_sebbes_status_s15.reset_index().to_feather(
    the_folder + 'oxaria_sebbes_536_status_15.ftr')
oxaria_sebbes_climate_s15 = oxaria_climate_s15.query('tag == "scs-bgx-538"')
oxaria_sebbes_climate_s15.reset_index().to_feather(
    the_folder + 'oxaria_sebbes_536_climate_s15.ftr')

In [4]:
# Get colocated sensor data - High St
# --------------------------------------
oxaria_highs_gases_s15 = oxaria_gases_s15.query('tag == "scs-bgx-536"')
oxaria_highs_gases_s15.reset_index().to_feather(
    the_folder + 'oxaria_highs_gases_536_s15.ftr')
oxaria_highs_pm_s15 = oxaria_pm_s15.query('tag == "scs-bgx-536"')
oxaria_highs_pm_s15.reset_index().to_feather(
    the_folder + 'oxaria_highs_pm_536_s15.ftr')
oxaria_highs_status_s15 = oxaria_status_s15.query('tag == "scs-bgx-536"')
oxaria_highs_status_s15.reset_index().to_feather(
    the_folder + 'oxaria_highs_536_status_15.ftr')
oxaria_highs_climate_s15 = oxaria_climate_s15.query('tag == "scs-bgx-536"')
oxaria_highs_climate_s15.reset_index().to_feather(
    the_folder + 'oxaria_highs_536_climate_s15.ftr')

In [5]:
# Generate a base training dataset by merging useful scalar variables from gases, pm, status & climate
# ------------------------------------------------------------------------------------------------------
sebbes_train_s15 = oxaria_sebbes_gases_s15.merge(
    oxaria_sebbes_climate_s15,
    left_index=True,
    right_index=True,
    how='inner',
    suffixes=('_g', '_c')).merge(
        oxaria_sebbes_pm_s15,
        left_index=True,
        right_index=True,
        how='inner',
        suffixes=('_g', '_p')).merge(
            oxaria_sebbes_status_s15,
            left_index=True,
            right_index=True,
            how='inner').drop(['name_x', 'name_y', 'name_c'], axis=1).rename({'name_g': 'name'}, axis=1)
sebbes_train_s15.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25824 entries, ('scs-bgx-538', Timestamp('2020-06-05 00:15:00+0000', tz='UTC')) to ('scs-bgx-538', Timestamp('2021-03-01 00:00:00+0000', tz='UTC'))
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   val.no2.wev        25824 non-null  float32
 1   val.no2.cnc        25824 non-null  float32
 2   val.no2.aev        25824 non-null  float32
 3   val.no2.wec        25824 non-null  float32
 4   val.sht.hmd_g      25824 non-null  float32
 5   val.sht.tmp_g      25824 non-null  float32
 6   val.no2.cnc_1      25613 non-null  float32
 7   name               25824 non-null  object 
 8   val.hmd            25824 non-null  float32
 9   val.tmp            25824 non-null  float32
 10  val.mtf1           25824 non-null  float32
 11  val.pm1            25824 non-null  float32
 12  val.mtf5           25824 non-null  float32
 13  val.pm2p5          25824 non-null  float32
 

In [6]:
# Generate a base training dataset by merging useful scalar variables from gases, pm, status & climate
# ------------------------------------------------------------------------------------------------------
highs_train_s15 = oxaria_highs_gases_s15.merge(
    oxaria_highs_climate_s15,
    left_index=True,
    right_index=True,
    how='inner',
    suffixes=('_g', '_c')).merge(
        oxaria_highs_pm_s15,
        left_index=True,
        right_index=True,
        how='inner',
        suffixes=('_g', '_p')).merge(
            oxaria_highs_status_s15,
            left_index=True,
            right_index=True,
            how='inner').drop(['name_x', 'name_y', 'name_c'], axis=1).rename({'name_g': 'name'}, axis=1)
highs_train_s15.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15072 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-536', Timestamp('2021-03-01 00:00:00+0000', tz='UTC'))
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   val.no2.wev        15072 non-null  float32
 1   val.no2.cnc        15072 non-null  float32
 2   val.no2.aev        15072 non-null  float32
 3   val.no2.wec        15072 non-null  float32
 4   val.sht.hmd_g      15072 non-null  float32
 5   val.sht.tmp_g      15072 non-null  float32
 6   val.no2.cnc_1      15072 non-null  float32
 7   name               15072 non-null  object 
 8   val.hmd            15072 non-null  float32
 9   val.tmp            15072 non-null  float32
 10  val.mtf1           15072 non-null  float32
 11  val.pm1            15072 non-null  float32
 12  val.mtf5           15072 non-null  float32
 13  val.pm2p5          15072 non-null  float32
 

In [7]:
# AURN data for both St Ebbes & High St
# ---------------------------------------
auto_merged = pd.read_feather('/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/0oxaria/' +
                              'auto_merged_ratified.ftr').set_index('rec').sort_index()
auto_merged.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35136 entries, 2020-01-01 00:15:00+00:00 to 2021-01-01 00:00:00+00:00
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sitecode_s     35136 non-null  object 
 1   name_s         35136 non-null  object 
 2   no_ppb_s       34398 non-null  float32
 3   no2_ppb_s      34398 non-null  float32
 4   pm10_ugg_s     35120 non-null  float32
 5   o3_ppb_s       35051 non-null  float32
 6   pm25_ugg_s     35120 non-null  float32
 7   fidas_t_s      35125 non-null  float32
 8   fidas_rh_s     35125 non-null  float32
 9   sitecode_h     35136 non-null  object 
 10  name_h         35136 non-null  object 
 11  no_ppb_h       34189 non-null  float32
 12  no2_ppb_h      34189 non-null  float32
 13  pm10_ugg_h     34549 non-null  float32
 14  o3_ppb_h       0 non-null      float32
 15  pm25_ugg_h     0 non-null      float32
 16  fidas_t_h      0 non-null      float32
 17  fid

In [8]:
# Merge auto / reference data on to sensor data by timestamp
#------------------------------------------------------------
sebbes_train_s15 = sebbes_train_s15.merge(auto_merged,how='inner',left_index=True,right_index=True)
sebbes_train_s15.reset_index().to_feather(the_folder + 'sebbes_train_536_s15_ratified.ftr')
sebbes_train_s15.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 20160 entries, ('scs-bgx-538', Timestamp('2020-06-05 00:15:00+0000', tz='UTC')) to ('scs-bgx-538', Timestamp('2021-01-01 00:00:00+0000', tz='UTC'))
Data columns (total 54 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   val.no2.wev        20160 non-null  float32
 1   val.no2.cnc        20160 non-null  float32
 2   val.no2.aev        20160 non-null  float32
 3   val.no2.wec        20160 non-null  float32
 4   val.sht.hmd_g      20160 non-null  float32
 5   val.sht.tmp_g      20160 non-null  float32
 6   val.no2.cnc_1      19949 non-null  float32
 7   name               20160 non-null  object 
 8   val.hmd            20160 non-null  float32
 9   val.tmp            20160 non-null  float32
 10  val.mtf1           20160 non-null  float32
 11  val.pm1            20160 non-null  float32
 12  val.mtf5           20160 non-null  float32
 13  val.pm2p5          20160 non-null  float32
 

In [9]:
# Merge auto / reference data on to sensor data by timestamp
#------------------------------------------------------------
highs_train_s15 = highs_train_s15.merge(auto_merged,how='inner',left_index=True,right_index=True)
highs_train_s15.reset_index().to_feather(the_folder + 'highs_train_536_s15_ratified.ftr')
highs_train_s15.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9408 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-536', Timestamp('2021-01-01 00:00:00+0000', tz='UTC'))
Data columns (total 54 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   val.no2.wev        9408 non-null   float32
 1   val.no2.cnc        9408 non-null   float32
 2   val.no2.aev        9408 non-null   float32
 3   val.no2.wec        9408 non-null   float32
 4   val.sht.hmd_g      9408 non-null   float32
 5   val.sht.tmp_g      9408 non-null   float32
 6   val.no2.cnc_1      9408 non-null   float32
 7   name               9408 non-null   object 
 8   val.hmd            9408 non-null   float32
 9   val.tmp            9408 non-null   float32
 10  val.mtf1           9408 non-null   float32
 11  val.pm1            9408 non-null   float32
 12  val.mtf5           9408 non-null   float32
 13  val.pm2p5          9408 non-null   float32
 1

In [10]:
# Pull out data points at other locations where evidence shows that
# background & roadside conditions converge
# -------------------------------------------------------------------
# Load datetimes when High St & St Ebbes no2
with open(oxaria0+'convergent_datelist_no2_ratified.pkl', "rb") as input_file:
    convergent_no2 = pickle.load(input_file)

with open(oxaria0+'convergent_datelist_pm10_ratified.pkl', "rb") as input_file:
    convergent_pm10 = pickle.load(input_file)

In [11]:
convergent_gases_observations = []

dfg = oxaria_gases_s15.query('tag != "scs-bgx-538"')

for k,v in dfg.groupby('tag'):
    v.reset_index(inplace=True)
    if v.iloc[0,1] in convergent_no2:
        convergent_gases_observations.append(v)
convergent_gases_obs = pd.concat(convergent_gases_observations)
convergent_gases_obs.set_index(['tag','rec'],inplace=True)
convergent_gases_obs.sort_index(inplace=True)
convergent_gases_obs.reset_index().to_feather(the_folder+'convergent_gases_obs_536_s15_ratified.ftr')


In [12]:
convergent_pm_observations = []

dfp = oxaria_pm_s15.query('tag != "scs-bgx-538"')

for k,v in dfp.groupby('tag'):
    v.reset_index(inplace=True)
    if v.iloc[0,1] in convergent_pm10:
        convergent_pm_observations.append(v)
convergent_pm_obs = pd.concat(convergent_pm_observations)
convergent_pm_obs.set_index(['tag','rec'],inplace=True)
convergent_pm_obs.sort_index(inplace=True)
convergent_pm_obs.reset_index().to_feather(the_folder+'convergent_pm_obs_536_s15_ratified.ftr')


In [13]:
convergent_climate_observations = []

dfc = oxaria_climate_s15.query('tag != "scs-bgx-538"')

for k,v in dfc.groupby('tag'):
    v.reset_index(inplace=True)
    if v.iloc[0,1] in convergent_no2 or v.iloc[0,1] in convergent_pm10:
        convergent_climate_observations.append(v)
convergent_climate_obs = pd.concat(convergent_climate_observations)
convergent_climate_obs.set_index(['tag','rec'],inplace=True)
convergent_climate_obs.sort_index(inplace=True)
convergent_climate_obs.reset_index().to_feather(the_folder+'convergent_climate_obs_536_s15_ratified.ftr')
display(convergent_climate_obs)

Unnamed: 0_level_0,Unnamed: 1_level_0,val.hmd,val.tmp,name
tag,rec,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
scs-bgx-536,2020-09-25 00:15:00+00:00,73.160004,9.673333,High St
scs-bgx-536,2020-09-25 00:30:00+00:00,72.820000,9.600000,High St
scs-bgx-536,2020-09-25 00:45:00+00:00,73.286667,9.593333,High St
scs-bgx-536,2020-09-25 01:00:00+00:00,72.466667,9.653334,High St
scs-bgx-536,2020-09-25 01:15:00+00:00,72.513336,9.633333,High St
...,...,...,...,...
scs-bgx-559,2021-02-28 23:00:00+00:00,70.559998,9.240000,Speedwell St
scs-bgx-559,2021-02-28 23:15:00+00:00,70.580002,9.060000,Speedwell St
scs-bgx-559,2021-02-28 23:30:00+00:00,70.206665,8.920000,Speedwell St
scs-bgx-559,2021-02-28 23:45:00+00:00,70.566666,8.573334,Speedwell St


In [14]:
convergent_status_observations = []

dfs = oxaria_status_s15.query('tag != "scs-bgx-538"')

for k,v in dfs.groupby('tag'):
    v.reset_index(inplace=True)
    if v.iloc[0,1] in convergent_no2 or v.iloc[0,1] in convergent_pm10:
        convergent_status_observations.append(v)
convergent_status_obs = pd.concat(convergent_status_observations)
convergent_status_obs.set_index(['tag','rec'],inplace=True)
convergent_status_obs.sort_index(inplace=True)
convergent_status_obs.reset_index().to_feather(the_folder+'convergent_status_obs_536_s15_ratified.ftr')


In [15]:
convergent_gases_obs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 78815 entries, ('scs-bgx-552', Timestamp('2020-12-15 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-03-01 00:00:00+0000', tz='UTC'))
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   val.no2.wev    78815 non-null  float32
 1   val.no2.cnc    78815 non-null  float32
 2   val.no2.aev    78815 non-null  float32
 3   val.no2.wec    78815 non-null  float32
 4   val.sht.hmd    78815 non-null  float32
 5   val.sht.tmp    78815 non-null  float32
 6   val.no2.cnc_1  78607 non-null  float32
 7   name           78815 non-null  object 
dtypes: float32(7), object(1)
memory usage: 3.7+ MB


In [16]:
convergent_pm_obs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 119649 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-557', Timestamp('2021-03-01 00:00:00+0000', tz='UTC'))
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   val.mtf1     119649 non-null  float32
 1   val.pm1      119649 non-null  float32
 2   val.mtf5     119649 non-null  float32
 3   val.pm2p5    119649 non-null  float32
 4   val.mtf3     119649 non-null  float32
 5   val.pm10     119649 non-null  float32
 6   val.mtf7     119649 non-null  float32
 7   val.per      119649 non-null  float32
 8   val.sfr      119649 non-null  float32
 9   val.sht.hmd  119649 non-null  float32
 10  val.sht.tmp  119649 non-null  float32
 11  val.pm10_1   93799 non-null   float32
 12  val.pm2p5_1  93799 non-null   float32
 13  val.pm1_1    93799 non-null   float32
 14  name         119649 non-null  object 
dtypes: float32(14), object(1)
memory