In [1]:
# Imports
#---------
import sys
import pandas as pd
import numpy as np
import feather
import os
import gc
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter
from matplotlib.dates import DateFormatter
import matplotlib as mpl
import peakutils
from peakutils.plot import plot as pplot
import warnings
import pickle
# NOTE(review): this suppresses every warning for the rest of the session,
# not just a specific noisy one -- consider narrowing the filter.
warnings.filterwarnings("ignore")

# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()

# File locations
#----------------
# Directory prefixes. Each ends with '/' because the cells below build full
# paths by plain string concatenation (prefix + file name).
# data_home: prefix for the sensor PM feather that is read and for the
#            corrected feather that is written at the end.
data_home = './oxaria/processed/batch_2_feb21_may21/'
# rf_home: prefix for the pickled correction model.
rf_home = './oxaria/data/rf/gap_filling/536_method/ratified/'
# aurn: prefix for the 'auto_merged' feather file.
aurn = './oxaria/data/raw/0oxaria/'


In [2]:
# Load baseline adjusted pm data
#--------------------------------

####
# Input : the baseline-adjusted PM feather under data_home (path built below).
# The corrected column derived from it is written out by the final
# "merge back & save" cell of this notebook.
####

pm_path = data_home + 'q12021/oxaria_pm_stable15_536_bl_adjusted+2021_sept_update_transients.ftr'
aurn_path = aurn + 'auto_merged_sept21_aurn_update.ftr'

# Index by (tag, rec) and give the two 'val.sht.*' columns a '_p' suffix,
# all in one chain so that re-running the cell always starts from the file.
df0 = (
    pd.read_feather(pm_path)
    .set_index(['tag', 'rec'])
    .rename(columns={
        'val.sht.tmp': 'val.sht.tmp_p',
        'val.sht.hmd': 'val.sht.hmd_p',
    })
)

auto_merged = pd.read_feather(aurn_path)

# Column / dtype summaries of both inputs
df0.info()
auto_merged.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 481227 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-06-01 00:00:00+0000', tz='UTC'))
Data columns (total 37 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   val.mtf1            481227 non-null  float32
 1   val.pm1             481227 non-null  float32
 2   val.mtf5            481227 non-null  float32
 3   val.pm2p5           481227 non-null  float32
 4   val.mtf3            481227 non-null  float32
 5   val.pm10            481227 non-null  float32
 6   val.mtf7            481227 non-null  float32
 7   val.per             481227 non-null  float32
 8   val.sfr             481227 non-null  float32
 9   val.sht.hmd_p       481227 non-null  float32
 10  val.sht.tmp_p       481227 non-null  float32
 11  val.pm10_1          347355 non-null  float32
 12  val.pm2p5_1         347355 non-null  float32
 13  val.pm1_1    

In [3]:
# Functions to generate the remaining features required by the correction model
#-----------------------------------------------------------------------------


def no2_feature_gen(df):
    """Append 1-row and 2-row percentage-change features to ``df``.

    Every column is coerced to numeric and columns that are entirely NaN
    after coercion (e.g. text columns) are discarded. For the remaining
    columns the percentage change relative to the previous row and to the
    row two steps back is computed and attached as ``pc15_<column>`` and
    ``pc30_<column>`` respectively.

    Despite the function name, nothing here is specific to NO2; it operates
    on whatever columns the frame carries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tag`` and ``rec`` columns, which become the index of
        the result. The changes are computed by row offset, not by
        timestamp, so the "15"/"30" in the feature names presumably assume
        consecutive rows 15 minutes apart for a single tag -- TODO confirm
        there are no gaps.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the original
        columns plus the ``pc15_*`` / ``pc30_*`` columns, indexed by
        ``(tag, rec)``.
    """
    # Coerce to numeric once and reuse the result for both lags; doing the
    # conversion per lag repeats an expensive whole-frame pass.
    numeric = df.apply(lambda col: pd.to_numeric(col, errors='coerce')).dropna(
        axis=1, how='all')

    # % change versus the previous row
    pc15 = numeric.pct_change(periods=1)
    pc15.columns = ['pc15_' + name for name in pc15.columns]

    # % change versus the row two steps back
    pc30 = numeric.pct_change(periods=2)
    pc30.columns = ['pc30_' + name for name in pc30.columns]

    # Attach both feature sets to the original rows by row label
    combined = df.merge(pc15, left_index=True, right_index=True, how='left')
    combined = combined.merge(pc30, left_index=True, right_index=True, how='left')
    return combined.set_index(['tag', 'rec'])


def rushhour(df):
    """Add ``hour``, ``day`` and ``rushhour`` columns derived from the index.

    The timestamps are read from level 1 of ``df``'s index.

    * ``hour``     -- hour of the observation.
    * ``day``      -- day of week, Monday = 0.
    * ``rushhour`` -- int32 flag: 1 for 07:30-09:30 on weekdays, 2 for
      16:00-18:00 on weekdays, 0 otherwise (both bounds inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index level 1 is datetime-like. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    stamps = df.index.get_level_values(1)

    # integer flags for hour of observation and day of week
    df['hour'] = stamps.hour
    df['day'] = stamps.dayofweek

    clock = stamps.time
    is_weekday = stamps.weekday < 5

    # Each window needs a lower AND an upper bound. Using '>=' for the upper
    # bound as well would flag everything after 09:30 / 18:00 instead of the
    # peak periods themselves.
    am_peak = ((clock >= dt.time(7, 30, 0)) &
               (clock <= dt.time(9, 30, 0)) & is_weekday)
    pm_peak = ((clock >= dt.time(16, 0, 0)) &
               (clock <= dt.time(18, 0, 0)) & is_weekday)

    # 0 = off-peak, 1 = morning rush hour, 2 = evening rush hour
    df['rushhour'] = np.select([am_peak, pm_peak], [1, 2],
                               default=0).astype(np.int32)
    return df

In [4]:
# Add the features needed by the RF model
#----------------------------------------
# The features are built one tag at a time so that the row-offset based
# percentage changes never span two different tags.
tmp_list = [
    rushhour(no2_feature_gen(tag_rows))
    for _, tag_rows in df0.reset_index().groupby('tag')
]
df1 = pd.concat(tmp_list)


In [6]:
# Subset the df to include only cols needed by model
#----------------------------------------------------
# Columns are listed in the order in which they are handed to the model;
# keep this order unchanged.
model_cols = [
    # raw readings
    'val.mtf1', 'val.mtf5', 'val.mtf3', 'val.mtf7',
    'val.sfr', 'val.hmd', 'val.tmp',
    # % change over 1 row
    'pc15_val.mtf1', 'pc15_val.mtf5', 'pc15_val.mtf3', 'pc15_val.mtf7',
    'pc15_val.sfr', 'pc15_val.hmd', 'pc15_val.tmp',
    # % change over 2 rows
    'pc30_val.mtf1', 'pc30_val.mtf5', 'pc30_val.mtf3', 'pc30_val.mtf7',
    'pc30_val.sfr', 'pc30_val.hmd', 'pc30_val.tmp',
    # time-of-observation flags
    'hour', 'day',
    # pm10 column and its % changes
    'val.pm10_1_c1', 'pc15_val.pm10_1_c1', 'pc30_val.pm10_1_c1',
]

# Infinite values (produced by % change from a zero base) are treated as
# missing, then every row with any missing value is dropped.
df2 = (
    df1.loc[:, model_cols]
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0)
    .sort_index()
)

df2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 313981 entries, ('scs-bgx-536', Timestamp('2020-09-25 01:00:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-05-31 11:00:00+0000', tz='UTC'))
Data columns (total 26 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   val.mtf1            313981 non-null  float64
 1   val.mtf5            313981 non-null  float64
 2   val.mtf3            313981 non-null  float64
 3   val.mtf7            313981 non-null  float64
 4   val.sfr             313981 non-null  float64
 5   val.hmd             313981 non-null  float64
 6   val.tmp             313981 non-null  float64
 7   pc15_val.mtf1       313981 non-null  float64
 8   pc15_val.mtf5       313981 non-null  float64
 9   pc15_val.mtf3       313981 non-null  float64
 10  pc15_val.mtf7       313981 non-null  float64
 11  pc15_val.sfr        313981 non-null  float64
 12  pc15_val.hmd        313981 non-null  float64
 13  pc15_val.tmp 

In [8]:
# Load the model from disk & run
#--------------------------------
# NOTE: unpickling executes arbitrary code, so only load model files that
# come from a trusted source. The 'with' block makes sure the file handle is
# closed once the model has been read.
model_path = rf_home + 'RFR_model_pm10_mln3000_val_pm10_1_c2_mar_final.sav'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# calc predicted vals & attach them to the input rows.
# Assumes model.predict returns exactly one value per row of df2, in row
# order -- TODO confirm for the pickled estimator. Assigning the column
# directly keeps the (tag, rec) index intact and cannot duplicate rows,
# which a key-based merge would do if an index pair were ever repeated.
corrected_pm10_vals = df2.assign(**{'val.pm10_1_c2': model.predict(df2)})

# Model inputs plus the corrected pm10 column
df3 = corrected_pm10_vals.copy()
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,val.mtf1,val.mtf5,val.mtf3,val.mtf7,val.sfr,val.hmd,val.tmp,pc15_val.mtf1,pc15_val.mtf5,pc15_val.mtf3,...,pc30_val.mtf7,pc30_val.sfr,pc30_val.hmd,pc30_val.tmp,hour,day,val.pm10_1_c1,pc15_val.pm10_1_c1,pc30_val.pm10_1_c1,val.pm10_1_c2
tag,rec,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
scs-bgx-536,2020-09-25 01:00:00+00:00,22.719101,31.842697,25.797752,15.292135,5.654494,72.466667,9.653334,0.021849,0.005559,-0.028943,...,-0.23369,-0.052531,-0.004852,0.005556,1,4,4.772219,-0.164802,-0.164802,11.209264
scs-bgx-536,2020-09-25 01:15:00+00:00,22.822222,31.577778,26.211111,10.2,5.590889,72.513336,9.633333,0.004539,-0.00832,0.016023,...,-0.408124,-0.052962,-0.010552,0.00417,1,4,4.194832,-0.120989,-0.265852,8.843861
scs-bgx-536,2020-09-25 01:30:00+00:00,22.188889,31.366667,26.366667,14.877778,5.921667,72.753334,9.693334,-0.027751,-0.006685,0.005935,...,-0.027096,0.04725,0.003956,0.004144,1,4,5.696597,0.358004,0.1937,12.414027
scs-bgx-536,2020-09-25 01:45:00+00:00,21.588888,30.233334,26.133333,15.988889,6.255111,72.926666,9.786667,-0.027041,-0.036132,-0.00885,...,0.567538,0.118804,0.0057,0.015917,1,4,7.159473,0.256798,0.706736,19.719891
scs-bgx-536,2020-09-25 02:00:00+00:00,21.299999,30.622223,25.955555,17.577778,6.422222,73.099998,9.833333,-0.013381,0.012863,-0.006803,...,0.181479,0.08453,0.004765,0.014443,2,4,8.570127,0.197033,0.504429,20.450957


In [9]:
# Merge back to origin & save
#-----------------------------
# Left-merge on the (tag, rec) index: every row of df0 is kept, and rows
# that have no prediction get NaN in the new column.
pm10_c2 = df3['val.pm10_1_c2']
oxaria_pm_full_corr_rat = df0.merge(
    pm10_c2,
    how='left',
    left_index=True,
    right_index=True,
)

# The index is reset before writing so that tag / rec are stored as columns.
out_path = data_home + 'q12021/oxaria_pm_stable15_full_corr_rat+2021_1.ftr'
oxaria_pm_full_corr_rat.reset_index().to_feather(out_path)


In [10]:
# Print the column / dtype / non-null summary of the merged output frame.
oxaria_pm_full_corr_rat.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 481227 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-06-01 00:00:00+0000', tz='UTC'))
Data columns (total 38 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   val.mtf1            481227 non-null  float32
 1   val.pm1             481227 non-null  float32
 2   val.mtf5            481227 non-null  float32
 3   val.pm2p5           481227 non-null  float32
 4   val.mtf3            481227 non-null  float32
 5   val.pm10            481227 non-null  float32
 6   val.mtf7            481227 non-null  float32
 7   val.per             481227 non-null  float32
 8   val.sfr             481227 non-null  float32
 9   val.sht.hmd_p       481227 non-null  float32
 10  val.sht.tmp_p       481227 non-null  float32
 11  val.pm10_1          347355 non-null  float32
 12  val.pm2p5_1         347355 non-null  float32
 13  val.pm1_1    