In [1]:
# Imports
# ---------
import sys
import pandas as pd
import numpy as np
import feather
import os
import gc
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
import matplotlib.ticker as ticker
from matplotlib.dates import DateFormatter
import matplotlib as mpl
from datetime import timezone
import pickle

register_matplotlib_converters()

# File locations
# ----------------
folder0 = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/0oxaria/gap_filling/'

aurn = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/aurn/'
pngs = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/pngs/gap_filling/'

In [2]:
# Read in stable 15min operational data Jan - Nov 2020
# ------------------------------------------------------
oxaria_gases_s15 = pd.read_feather(folder0 +
                                   'oxaria_gases_536_stable15_transients.ftr').set_index(
                                       ['tag', 'rec']).sort_index()
oxaria_pm_s15 = pd.read_feather(folder0 +
                                'oxaria_pm_536_stable15_transients.ftr').set_index(
                                    ['tag', 'rec']).sort_index()
oxaria_climate_s15 = pd.read_feather(folder0 +
                                     'oxaria_climate_536_stable15_transients.ftr').set_index(
                                         ['tag', 'rec']).sort_index()
oxaria_status_s15 = pd.read_feather(folder0 +
                                    'oxaria_status_stable15.ftr').set_index(
                                        ['tag', 'rec']).sort_index()

In [4]:
# Read in stable 15min operational data Jan - June 2021
#------------------------------------------------------
oxaria_gases_s15_q12021 = pd.read_feather(folder0 +
                                   'q12021/oxaria_gases_536_stable15_q12021_transients_v2.ftr').set_index(
                                      ['tag', 'rec']).sort_index()
oxaria_pm_s15_q12021 = pd.read_feather(folder0 +
                                'q12021/oxaria_pm_536_stable15_q12021_transients.ftr').set_index(
                                    ['tag', 'rec']).sort_index()
oxaria_climate_s15_q12021 = pd.read_feather(folder0 +
                                     'q12021/oxaria_climate_536_stable15_q12021_transients.ftr').set_index(
                                         ['tag', 'rec']).sort_index()
oxaria_status_s15_q12021 = pd.read_feather(folder0 +
                                    'q12021/oxaria_status_536_stable15_q12021_transients.ftr').set_index(
                                        ['tag', 'rec']).sort_index()

In [5]:
# Read in stable 15min operational data Sep - June 2021
#------------------------------------------------------
oxaria_gases_s15_oct21 = pd.read_feather(folder0 +
                                   'jun_to_sept_2021/oxaria_gases_stable15_oct21_transients.ftr').set_index(
                                      ['tag', 'rec']).sort_index()
oxaria_pm_s15_oct21 = pd.read_feather(folder0 +
                                'jun_to_sept_2021/oxaria_pm_stable15_oct21_transients.ftr').set_index(
                                    ['tag', 'rec']).sort_index()
oxaria_climate_s15_oct21 = pd.read_feather(folder0 +
                                     'jun_to_sept_2021/oxaria_climate_stable15_oct21_transients.ftr').set_index(
                                         ['tag', 'rec']).sort_index()
oxaria_status_s15_oct21 = pd.read_feather(folder0 +
                                    'jun_to_sept_2021/oxaria_status_stable15_oct21_transients.ftr').set_index(
                                        ['tag', 'rec']).sort_index()

In [39]:
oxaria_climate_s15_oct21.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 186302 entries, ('scs-bgx-536', Timestamp('2021-06-01 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   val.hmd        186302 non-null  float32
 1   val.tmp        186302 non-null  float32
 2   name_x         186302 non-null  object 
 3   mag_hmd_s20c   186300 non-null  float32
 4   mag_tmp_s20c   186300 non-null  float32
 5   mean_hmd_s20c  186300 non-null  float32
 6   mean_tmp_s20c  186300 non-null  float32
 7   max_hmd_s20c   186300 non-null  float32
 8   name_y         186302 non-null  object 
dtypes: float32(7), object(2)
memory usage: 8.7+ MB


In [40]:
# Combine 2020 and 2021
#-----------------------
oxaria_gases_s15_2021 = pd.concat([oxaria_gases_s15,oxaria_gases_s15_q12021,oxaria_gases_s15_oct21])
oxaria_pm_s15_2021 = pd.concat([oxaria_pm_s15,oxaria_pm_s15_q12021,oxaria_pm_s15_oct21])
oxaria_climate_s15_2021 = pd.concat([oxaria_climate_s15,oxaria_climate_s15_q12021,oxaria_climate_s15_oct21])
oxaria_status_s15_2021 = pd.concat([oxaria_status_s15,oxaria_status_s15_q12021,oxaria_status_s15_oct21])


In [41]:
oxaria_pm_s15_2021.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 756885 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 24 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   val.mtf1        756885 non-null  float32
 1   val.pm1         756885 non-null  float32
 2   val.mtf5        756885 non-null  float32
 3   val.pm2p5       756885 non-null  float32
 4   val.mtf3        756885 non-null  float32
 5   val.pm10        756885 non-null  float32
 6   val.mtf7        756885 non-null  float32
 7   val.per         756885 non-null  float32
 8   val.sfr         756885 non-null  float32
 9   val.sht.hmd     756885 non-null  float32
 10  val.sht.tmp     756885 non-null  float32
 11  val.pm10_1      556892 non-null  float32
 12  val.pm2p5_1     556892 non-null  float32
 13  val.pm1_1       556892 non-null  float32
 14  name            341622 non-

In [42]:
# Some housekeeping needed to avoid warning messages later on 
#-------------------------------------------------------------
oxaria_gases_s15_2021['name'] =  np.where(oxaria_gases_s15_2021['name'],np.nan,oxaria_gases_s15_2021['name_x'])
oxaria_pm_s15_2021['name'] =  np.where(oxaria_pm_s15_2021['name'],np.nan,oxaria_pm_s15_2021['name_x'])
#oxaria_status_s15_2021['name'] =  np.where(oxaria_status_s15_2021['name'],np.nan,oxaria_status_s15_2021['name_x'])
oxaria_climate_s15_2021['name'] =  np.where(oxaria_climate_s15_2021['name'],np.nan,oxaria_climate_s15_2021['name_x'])

oxaria_gases_s15_2021.drop(columns=['name_x','name_y'], inplace=True)
oxaria_pm_s15_2021.drop(columns=['name_x','name_y'], inplace=True)
#oxaria_status_s15_2021.drop(columns=['name_x','name_y'], inplace=True)
oxaria_climate_s15_2021.drop(columns=['name_x','name_y'], inplace=True)


In [43]:
# Get colocated sensor data - St Ebbes
# --------------------------------------
# Gases
oxaria_sebbes_gases_s15_2021 = oxaria_gases_s15_2021.query(
    'tag == "scs-bgx-538"')
oxaria_sebbes_gases_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_sebbes_gases_s15_2021_transients.ftr')
# PM
oxaria_sebbes_pm_s15_2021 = oxaria_pm_s15_2021.query(
    'tag == "scs-bgx-538"')
oxaria_sebbes_pm_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_sebbes_pm_s15_2021_transients.ftr')
# Status
oxaria_sebbes_status_s15_2021 = oxaria_status_s15_2021.query(
    'tag == "scs-bgx-538"')
oxaria_sebbes_status_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_sebbes_status_15_2021.ftr')
# Climate
oxaria_sebbes_climate_s15_2021 = oxaria_climate_s15_2021.query(
    'tag == "scs-bgx-538"')
oxaria_sebbes_climate_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_sebbes_climate_s15_2021.ftr')

In [46]:
oxaria_sebbes_climate_s15_2021.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 50937 entries, ('scs-bgx-538', Timestamp('2020-06-05 00:15:00+0000', tz='UTC')) to ('scs-bgx-538', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   val.hmd        50937 non-null  float32
 1   val.tmp        50937 non-null  float32
 2   name           0 non-null      object 
 3   mag_hmd_s20c   50937 non-null  float32
 4   mag_tmp_s20c   50937 non-null  float32
 5   mean_hmd_s20c  50937 non-null  float32
 6   mean_tmp_s20c  50937 non-null  float32
 7   max_hmd_s20c   11262 non-null  float32
dtypes: float32(7), object(1)
memory usage: 4.5+ MB


In [26]:
# Get colocated sensor data - High St
# --------------------------------------
# Gases
oxaria_highs_gases_s15_2021 = oxaria_gases_s15_2021.query(
    'tag == "scs-bgx-536"')
oxaria_highs_gases_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_highs_gases_s15_2021_transients.ftr')
# PM
oxaria_highs_pm_s15_2021 = oxaria_pm_s15_2021.query(
    'tag == "scs-bgx-536"')
oxaria_highs_pm_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_highs_pm_s15_2021_transients.ftr')
# Status
oxaria_highs_status_s15_2021 = oxaria_status_s15_2021.query(
    'tag == "scs-bgx-536"')
oxaria_highs_status_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_highs_status_15_2021.ftr')
# Climate
oxaria_highs_climate_s15_2021 = oxaria_climate_s15_2021.query(
    'tag == "scs-bgx-536"')
oxaria_highs_climate_s15_2021.reset_index().to_feather(
    folder0 + 'oxaria_highs_climate_s15_2021.ftr')

In [27]:
oxaria_highs_climate_s15_2021.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 48855 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-536', Timestamp('2021-09-14 14:15:00+0000', tz='UTC'))
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   val.hmd        48855 non-null  float32
 1   val.tmp        48855 non-null  float32
 2   name           0 non-null      object 
 3   mag_hmd_s20c   48855 non-null  float32
 4   mag_tmp_s20c   48855 non-null  float32
 5   mean_hmd_s20c  48855 non-null  float32
 6   mean_tmp_s20c  48855 non-null  float32
 7   max_hmd_s20c   4795 non-null   float32
dtypes: float32(7), object(1)
memory usage: 4.4+ MB


In [12]:
# Generate a St Ebbes base training dataset by merging useful scalar variables 
# from gases, pm, status & climate
#-----------------------------------------------------------------------------
try:
    oxaria_sebbes_gases_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass
try:
    oxaria_sebbes_climate_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass
try:
    oxaria_sebbes_pm_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass
try:
    oxaria_sebbes_status_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass

sebbes_train_s15_2021 = oxaria_sebbes_gases_s15_2021.merge(
    oxaria_sebbes_climate_s15_2021,
    left_index=True,
    right_index=True,
    how='inner',
    suffixes=('_g', '_c')).merge(
        oxaria_sebbes_pm_s15_2021,
        left_index=True,
        right_index=True,
        how='inner',
        suffixes=('_g', '_p')).merge(
            oxaria_sebbes_status_s15_2021,
            left_index=True,
            right_index=True,
            how='inner').drop(['name_x', 'name_y', 'name_c'],
                              axis=1).rename({'name_g': 'name'}, axis=1)
sebbes_train_s15_2021.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 397302 entries, ('scs-bgx-538', Timestamp('2020-06-05 00:15:00+0000', tz='UTC')) to ('scs-bgx-538', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 58 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   val.no2.wev        397302 non-null  float32
 1   val.no2.cnc        397302 non-null  float32
 2   val.no2.aev        397302 non-null  float32
 3   val.no2.wec        397302 non-null  float32
 4   val.sht.hmd_g      397302 non-null  float32
 5   val.sht.tmp_g      397302 non-null  float32
 6   val.no2.cnc_1      394577 non-null  float32
 7   name               0 non-null       object 
 8   mag_hmd_s20_g      397302 non-null  float32
 9   mag_tmp_s20_g      397302 non-null  float32
 10  mean_hmd_s20_g     397302 non-null  float32
 11  mean_tmp_s20_g     397302 non-null  float32
 12  exg.vb20.no2.cnc   10790 non-null   float32
 13  val.hmd            397302 no

In [13]:
# Generate a High St base training dataset by merging useful scalar variables 
# from gases, pm, status & climate
#-----------------------------------------------------------------------------
try:
    oxaria_highs_gases_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass
try:
    oxaria_highs_climate_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass
try:
    oxaria_highs_pm_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass
try:
    oxaria_highs_status_s15_2021.set_index(['tag', 'rec'], inplace=True)
except Exception:
    pass

highs_train_s15_2021 = oxaria_highs_gases_s15_2021.merge(
    oxaria_highs_climate_s15_2021,
    left_index=True,
    right_index=True,
    how='inner',
    suffixes=('_g', '_c')).merge(
        oxaria_highs_pm_s15_2021,
        left_index=True,
        right_index=True,
        how='inner',
        suffixes=('_g', '_p')).merge(
            oxaria_highs_status_s15_2021,
            left_index=True,
            right_index=True,
            how='inner').drop(['name_x', 'name_y', 'name_c'],
                              axis=1).rename({'name_g': 'name'}, axis=1)
highs_train_s15_2021.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 390084 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-536', Timestamp('2021-09-14 14:15:00+0000', tz='UTC'))
Data columns (total 58 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   val.no2.wev        390084 non-null  float32
 1   val.no2.cnc        390084 non-null  float32
 2   val.no2.aev        390084 non-null  float32
 3   val.no2.wec        390084 non-null  float32
 4   val.sht.hmd_g      390084 non-null  float32
 5   val.sht.tmp_g      390084 non-null  float32
 6   val.no2.cnc_1      384356 non-null  float32
 7   name               0 non-null       object 
 8   mag_hmd_s20_g      390084 non-null  float32
 9   mag_tmp_s20_g      390084 non-null  float32
 10  mean_hmd_s20_g     390084 non-null  float32
 11  mean_tmp_s20_g     390084 non-null  float32
 12  exg.vb20.no2.cnc   4180 non-null    float32
 13  val.hmd            390084 no

In [14]:
# AURN data for both St Ebbes & High St
#---------------------------------------
auto_merged = pd.read_feather(folder0+'jun_to_sept_2021/auto_merged_ratified+2021_oct_update.ftr').set_index('rec').sort_index()
auto_merged.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 63072 entries, 2020-01-01 00:15:00+00:00 to 2021-10-19 00:00:00+00:00
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index_s        63072 non-null  int64  
 1   sitecode_s     63072 non-null  object 
 2   name_s         63072 non-null  object 
 3   no_ppb_s       61781 non-null  float32
 4   no2_ppb_s      61781 non-null  float32
 5   pm10_ugg_s     63030 non-null  float32
 6   o3_ppb_s       35051 non-null  float32
 7   pm25_ugg_s     63030 non-null  float32
 8   fidas_t_s      61317 non-null  float32
 9   fidas_rh_s     61317 non-null  float32
 10  index_h        63072 non-null  int64  
 11  sitecode_h     63072 non-null  object 
 12  name_h         63072 non-null  object 
 13  no_ppb_h       62047 non-null  float32
 14  no2_ppb_h      61482 non-null  float32
 15  pm10_ugg_h     58935 non-null  float32
 16  o3_ppb_h       0 non-null      float32
 17  pm2

In [15]:
# Merge auto / reference data on to sensor data by timestamp
#------------------------------------------------------------
sebbes_train_s15_2021 = sebbes_train_s15_2021.merge(auto_merged,how='inner',left_index=True,right_index=True)
sebbes_train_s15_2021.reset_index().to_feather(folder0 + 'jun_to_sept_2021/sebbes_train_s15+2021_oct_update_transients.ftr')
sebbes_train_s15_2021.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 397302 entries, ('scs-bgx-538', Timestamp('2020-06-05 00:15:00+0000', tz='UTC')) to ('scs-bgx-538', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   val.no2.wev        397302 non-null  float32
 1   val.no2.cnc        397302 non-null  float32
 2   val.no2.aev        397302 non-null  float32
 3   val.no2.wec        397302 non-null  float32
 4   val.sht.hmd_g      397302 non-null  float32
 5   val.sht.tmp_g      397302 non-null  float32
 6   val.no2.cnc_1      394577 non-null  float32
 7   name               0 non-null       object 
 8   mag_hmd_s20_g      397302 non-null  float32
 9   mag_tmp_s20_g      397302 non-null  float32
 10  mean_hmd_s20_g     397302 non-null  float32
 11  mean_tmp_s20_g     397302 non-null  float32
 12  exg.vb20.no2.cnc   10790 non-null   float32
 13  val.hmd            397302 no

In [16]:
# Merge auto / reference data on to sensor data by timestamp
#------------------------------------------------------------
highs_train_s15_2021 = highs_train_s15_2021.merge(auto_merged,how='inner',left_index=True,right_index=True)
highs_train_s15_2021.reset_index().to_feather(folder0 + 'jun_to_sept_2021/highs_train_s15+2021_oct_update_transients.ftr')
highs_train_s15_2021.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 390084 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-536', Timestamp('2021-09-14 14:15:00+0000', tz='UTC'))
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   val.no2.wev        390084 non-null  float32
 1   val.no2.cnc        390084 non-null  float32
 2   val.no2.aev        390084 non-null  float32
 3   val.no2.wec        390084 non-null  float32
 4   val.sht.hmd_g      390084 non-null  float32
 5   val.sht.tmp_g      390084 non-null  float32
 6   val.no2.cnc_1      384356 non-null  float32
 7   name               0 non-null       object 
 8   mag_hmd_s20_g      390084 non-null  float32
 9   mag_tmp_s20_g      390084 non-null  float32
 10  mean_hmd_s20_g     390084 non-null  float32
 11  mean_tmp_s20_g     390084 non-null  float32
 12  exg.vb20.no2.cnc   4180 non-null    float32
 13  val.hmd            390084 no