In [2]:
# Imports
#---------
import sys
import pandas as pd
import numpy as np
import feather
import os
import gc
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter
from matplotlib.dates import DateFormatter
import matplotlib as mpl
import peakutils
from peakutils.plot import plot as pplot
import warnings
import pickle
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / alignment warnings raised
# by the cells below - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Register the pandas date/time converters with matplotlib.
register_matplotlib_converters()

# File locations
#----------------
# All paths are relative to the notebook's working directory.
# Not referenced by any cell visible in this notebook section.
pngs = './oxaria/data/pngs/'
# Prefix for the feather inputs/outputs and the pickled model used below.
folder0 = './oxaria/processed/batch_2_feb21_may21/'
# Prefix for the 'auto_merged' feather file loaded below.
aurn = './oxaria/data/raw/0oxaria/'


In [3]:
# Load baseline adjusted gases data
#-----------------------------------
# The frame is indexed by (tag, rec) and the two SHT columns get a '_g' suffix.
gases_path = folder0 + 'q12021/oxaria_gases_536_stable15_bl_adjusted_ratified+2021.ftr'
sht_renames = {'val.sht.tmp': 'val.sht.tmp_g', 'val.sht.hmd': 'val.sht.hmd_g'}
df0 = (
    pd.read_feather(gases_path)
    .set_index(['tag', 'rec'])
    .rename(columns=sht_renames)
)

auto_merged = pd.read_feather(aurn + 'auto_merged_ratified+2021.ftr')

# Summarise both frames (df0 first, then auto_merged).
for loaded in (df0, auto_merged):
    loaded.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 480256 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-06-01 00:00:00+0000', tz='UTC'))
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   val.no2.wev          480256 non-null  float32
 1   val.no2.cnc          479936 non-null  float32
 2   val.no2.aev          480256 non-null  float32
 3   val.no2.wec          479936 non-null  float32
 4   val.sht.hmd_g        479936 non-null  float32
 5   val.sht.tmp_g        479936 non-null  float32
 6   val.no2.cnc_1        470876 non-null  float32
 7   name                 480256 non-null  object 
 8   exg.vb20.no2.cnc     23240 non-null   float32
 9   val.no2.cnc_1_bl     470876 non-null  float64
 10  val.no2.cnc_1_c0     470876 non-null  float64
 11  val.no2.cnc_1_c0_bl  470876 non-null  float64
 12  no2_ppb_s_bl         465888 non-null  float32
 13

In [4]:
# Function to generate the remaining feature required by the correction model
#-----------------------------------------------------------------------------


def no2_feature_gen(df):
    """Append 1-step and 2-step percentage-change columns to ``df``.

    Every column is coerced to numeric (non-numeric values become NaN) and
    columns that end up entirely NaN are discarded. The percentage change of
    the remaining columns is computed over 1 row (``pc15_`` prefix) and over
    2 rows (``pc30_`` prefix) and merged back onto ``df`` by index.

    The prefixes refer to 15 and 30 minutes, which presumably assumes rows
    are consecutive 15-minute records - the code itself only counts rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tag`` and ``rec`` columns; they become the index of
        the returned frame.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the ``pc15_*`` / ``pc30_*`` columns, indexed by
        ``['tag', 'rec']``.
    """
    # Numeric view of the input, shared by both look-back windows.
    numeric = df.apply(lambda col: pd.to_numeric(col, errors='coerce'))
    numeric = numeric.dropna(axis=1, how='all')

    def _pct(steps, prefix):
        # Percentage change over `steps` rows, with prefixed column names.
        changed = numeric.pct_change(periods=steps)
        changed.columns = [prefix + label for label in list(changed.columns)]
        return changed

    pc15 = _pct(1, 'pc15_')
    pc30 = _pct(2, 'pc30_')

    # Left-merge on the index so every original row is kept.
    with_pc15 = df.merge(pc15, left_index=True, right_index=True, how='left')
    with_both = with_pc15.merge(pc30,
                                left_index=True,
                                right_index=True,
                                how='left')
    return with_both.set_index(['tag', 'rec'])


def rushhour(df):
    """Add ``hour``, ``day`` and ``rushhour`` calendar columns to ``df``.

    The timestamps are read from index level 1 of ``df``. The frame is
    modified in place and also returned.

    Columns written
    ---------------
    hour : hour of the observation.
    day : day of week (Monday=0 ... Sunday=6).
    rushhour : int32 flag -
        1 for weekday observations from 07:30 to 09:30 inclusive,
        2 for weekday observations from 16:00 to 18:00 inclusive,
        0 otherwise.

    The earlier version compared both ends of each window with ``>=``, so it
    flagged every weekday time after 09:30 as 1 and after 18:00 as 2 instead
    of the windows described in its own comments.

    NOTE(review): if the pickled model was fitted on features produced by the
    earlier flag logic, it should be refitted so training and prediction agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index level 1 holds datetimes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added.
    """
    rec = df.index.get_level_values(1)

    # calc integer flags for hour of observation
    df['hour'] = rec.hour
    # and day of week
    df['day'] = rec.dayofweek

    time_of_day = rec.time
    is_weekday = rec.dayofweek < 5
    morning_peak = ((time_of_day >= dt.time(7, 30, 0)) &
                    (time_of_day <= dt.time(9, 30, 0)) & is_weekday)
    evening_peak = ((time_of_day >= dt.time(16, 0, 0)) &
                    (time_of_day <= dt.time(18, 0, 0)) & is_weekday)

    # integer flag(s) for off-peak (0), morning (1) & evening (2) rush hours
    df['rushhour'] = np.select([morning_peak, evening_peak], [1, 2],
                               default=0).astype(np.int32)
    return df

In [5]:
# Add the features needed by the RF model
#---------------------------------------
# Features are generated one tag at a time so the row-wise percentage
# changes are computed within a single tag's records.
tmp_list = [
    rushhour(no2_feature_gen(per_tag))
    for _, per_tag in df0.reset_index().groupby('tag')
]
df1 = pd.concat(tmp_list)
df1.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 480256 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-06-01 00:00:00+0000', tz='UTC'))
Data columns (total 54 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   val.no2.wev               480256 non-null  float32
 1   val.no2.cnc               479936 non-null  float32
 2   val.no2.aev               480256 non-null  float32
 3   val.no2.wec               479936 non-null  float32
 4   val.sht.hmd_g             479936 non-null  float32
 5   val.sht.tmp_g             479936 non-null  float32
 6   val.no2.cnc_1             470876 non-null  float32
 7   name                      480256 non-null  object 
 8   exg.vb20.no2.cnc          23240 non-null   float32
 9   val.no2.cnc_1_bl          470876 non-null  float64
 10  val.no2.cnc_1_c0          470876 non-null  float64
 11  val.no2.cnc_1_c0_bl       470876 no

In [6]:
# Subset the df to include only cols needed by model
#----------------------------------------------------
# The order of this list is the column order of df2, which is passed to
# model.predict() further down - do not reorder.
model_cols = [
    # current readings
    'val.no2.wev', 'val.no2.aev', 'val.no2.wec',
    'val.sht.hmd_g', 'val.sht.tmp_g', 'val.no2.cnc_1_c1',
    # % change over 1 record
    'pc15_val.no2.wev', 'pc15_val.no2.aev', 'pc15_val.no2.wec',
    'pc15_val.sht.hmd_g', 'pc15_val.sht.tmp_g', 'pc15_val.no2.cnc_1_c1',
    # % change over 2 records
    # NOTE(review): 'pc30_val.no2.cnc' has no 'pc15_' counterpart in this
    # list - confirm this matches the features the model was fitted on.
    'pc30_val.no2.wev', 'pc30_val.no2.cnc', 'pc30_val.no2.aev',
    'pc30_val.no2.wec', 'pc30_val.sht.hmd_g', 'pc30_val.sht.tmp_g',
    'pc30_val.no2.cnc_1_c1',
    # calendar flags
    'hour', 'day', 'rushhour',
]

# Drop weirdness in some columns: +/-inf becomes NaN, then every row
# containing a NaN is removed and the result is sorted by index.
df2 = (
    df1.loc[:, model_cols]
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0)
    .sort_index()
)

df2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 454175 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:45:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-06-01 00:00:00+0000', tz='UTC'))
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   val.no2.wev            454175 non-null  float32
 1   val.no2.aev            454175 non-null  float32
 2   val.no2.wec            454175 non-null  float32
 3   val.sht.hmd_g          454175 non-null  float32
 4   val.sht.tmp_g          454175 non-null  float32
 5   val.no2.cnc_1_c1       454175 non-null  float64
 6   pc15_val.no2.wev       454175 non-null  float32
 7   pc15_val.no2.aev       454175 non-null  float32
 8   pc15_val.no2.wec       454175 non-null  float32
 9   pc15_val.sht.hmd_g     454175 non-null  float32
 10  pc15_val.sht.tmp_g     454175 non-null  float32
 11  pc15_val.no2.cnc_1_c1  454175 non-null  float64
 12  pc30_val.no2.wev     

In [8]:
# Load the model from disk & run
#--------------------------------
# NOTE(review): unpickling can execute arbitrary code - only load model
# files from a trusted source.
model_path = folder0 + 'RFR_model_no2_mln3500_e100_sebbes_356_ratified_mar_final.sav'
# Use a context manager so the file handle is closed once the model is read.
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# calc predicted vals & merge with input
# The predictions get a fresh 0..n-1 index, so they are lined up with the
# reset-index copy of df2 by row position before (tag, rec) is restored.
predicted = pd.DataFrame(model.predict(df2), columns=['val.no2.cnc_1_c2'])
corrected_no2_vals = pd.concat(
    [df2.reset_index(), predicted],
    axis=1,
    join='outer',
).set_index(['tag', 'rec'])



Unnamed: 0_level_0,Unnamed: 1_level_0,val.no2.wev,val.no2.aev,val.no2.wec,val.sht.hmd_g,val.sht.tmp_g,val.no2.cnc_1_c1,pc15_val.no2.wev,pc15_val.no2.aev,pc15_val.no2.wec,pc15_val.sht.hmd_g,...,pc30_val.no2.cnc,pc30_val.no2.aev,pc30_val.no2.wec,pc30_val.sht.hmd_g,pc30_val.sht.tmp_g,pc30_val.no2.cnc_1_c1,hour,day,rushhour,val.no2.cnc_1_c2
tag,rec,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
scs-bgx-536,2020-09-25 00:45:00+00:00,0.284208,0.271754,-0.00057,73.25222,9.592222,7.017048,0.000896,0.00043,-0.109258,0.00592,...,0.000614,0.000308,0.684626,0.000729,-0.009409,-0.136102,0,4,0,2.467971
scs-bgx-536,2020-09-25 01:30:00+00:00,0.28359,0.272055,-0.001675,72.738892,9.71,1.560887,-4e-05,-3e-06,0.010958,0.002875,...,0.003245,0.00265,2.710118,0.003622,0.005457,-0.777558,1,4,0,1.168745
scs-bgx-536,2020-09-25 01:45:00+00:00,0.28385,0.271968,-0.001281,72.91333,9.785556,3.543401,0.000916,-0.00032,-0.234822,0.002398,...,-0.00099,-0.000323,-0.226437,0.00528,0.016236,-0.49503,1,4,0,1.36384
scs-bgx-536,2020-09-25 02:00:00+00:00,0.284074,0.271392,-0.000144,73.111115,9.845555,9.237521,0.00079,-0.002118,-0.887964,0.002713,...,-0.00404,-0.002437,-0.914272,0.005117,0.01396,4.918121,2,4,0,2.934994
scs-bgx-536,2020-09-25 02:15:00+00:00,0.28413,0.271956,-0.000992,73.42556,9.907778,5.012818,0.000197,0.002078,5.913312,0.004301,...,-0.000763,-4.4e-05,-0.22546,0.007025,0.01249,0.414691,2,4,0,1.840474


In [11]:
# Load the fully corrected gases file that the new column is merged into.
full_corr_path = folder0 + 'q12021/oxaria_gases_536_stable15_full_corr_rat+2021.ftr'
dfin = pd.read_feather(full_corr_path)
dfin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480256 entries, 0 to 480255
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   tag                   480256 non-null  object             
 1   rec                   480256 non-null  datetime64[ns, UTC]
 2   val.no2.wev           480256 non-null  float32            
 3   val.no2.cnc           479936 non-null  float32            
 4   val.no2.aev           480256 non-null  float32            
 5   val.no2.wec           479936 non-null  float32            
 6   val.sht.hmd_g         479936 non-null  float32            
 7   val.sht.tmp_g         479936 non-null  float32            
 8   val.no2.cnc_1         470876 non-null  float32            
 9   name                  480256 non-null  object             
 10  exg.vb20.no2.cnc      23240 non-null   float32            
 11  val.no2.cnc_1_bl      470876 non-null  float64      

In [13]:
# Merge with subset
# This cell writes back to the same feather file that dfin was read from, so
# on a re-run dfin already carries 'val.no2.cnc_1_c2'. Drop any existing copy
# first; otherwise the merge would produce '_x' / '_y' duplicate columns.
dfout = (
    dfin.set_index(['tag', 'rec'])
    .drop(columns=['val.no2.cnc_1_c2'], errors='ignore')
    .merge(corrected_no2_vals['val.no2.cnc_1_c2'],
           on=['tag', 'rec'],
           how='left')
)
dfout.reset_index().to_feather(folder0+'q12021/oxaria_gases_536_stable15_full_corr_rat+2021.ftr')
dfout.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 480256 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-06-01 00:00:00+0000', tz='UTC'))
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   val.no2.wev           480256 non-null  float32
 1   val.no2.cnc           479936 non-null  float32
 2   val.no2.aev           480256 non-null  float32
 3   val.no2.wec           479936 non-null  float32
 4   val.sht.hmd_g         479936 non-null  float32
 5   val.sht.tmp_g         479936 non-null  float32
 6   val.no2.cnc_1         470876 non-null  float32
 7   name                  480256 non-null  object 
 8   exg.vb20.no2.cnc      23240 non-null   float32
 9   val.no2.cnc_1_bl      470876 non-null  float64
 10  val.no2.cnc_1_c0      470876 non-null  float64
 11  val.no2.cnc_1_c0_bl   470876 non-null  float64
 12  no2_ppb_s_bl          465888 non-nu