In [1]:
# Imports
#---------
import sys
import pandas as pd
import numpy as np
import feather
import os
import gc
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
import matplotlib.ticker as ticker
from matplotlib.dates import DateFormatter
import matplotlib as mpl
from datetime import timezone

register_matplotlib_converters()

# File locations
#----------------
the_gases = './oxaria/data/raw/1oxaria/json/gap_filling/'
pngs = './oxaria/data/pngs/gap_filling/'


In [2]:
# Load the df 
#-------------
oxaria1_pm = pd.read_feather(the_gases+'oxaria1_pm_gf.ftr').set_index(['tag','rec'])
oxaria1_pm.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23299961 entries, ('scs-bgx-536', Timestamp('2020-01-04 09:46:21+0000', tz='UTC')) to ('scs-bgx-543', Timestamp('2021-02-28 23:59:54+0000', tz='UTC'))
Data columns (total 44 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   val.mtf1                 float32
 1   val.pm1                  float32
 2   val.mtf5                 float32
 3   val.pm2p5                float32
 4   val.bin:0                float32
 5   val.bin:1                float32
 6   val.bin:2                float32
 7   val.bin:3                float32
 8   val.bin:4                float32
 9   val.bin:5                float32
 10  val.bin:6                float32
 11  val.bin:7                float32
 12  val.bin:8                float32
 13  val.bin:9                float32
 14  val.bin:10               float32
 15  val.bin:11               float32
 16  val.bin:12               float32
 17  val.bin:13               float32
 18  v

In [2]:
oxaria1_pm_lt = oxaria1_pm.iloc[:,[0,1,2,3,28,29,30,31,32,33,34,43]]
del oxaria1_pm

In [3]:
# Checking out the data types 
#-----------------------------
print('\n'+'Loading Oxaria1 PM...\n')
oxaria1_pm_lt.info()



Loading Oxaria1 PM...

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23299961 entries, ('scs-bgx-536', Timestamp('2020-01-04 09:46:21+0000', tz='UTC')) to ('scs-bgx-543', Timestamp('2021-02-28 23:59:54+0000', tz='UTC'))
Data columns (total 12 columns):
 #   Column       Dtype  
---  ------       -----  
 0   val.mtf1     float32
 1   val.pm1      float32
 2   val.mtf5     float32
 3   val.pm2p5    float32
 4   val.mtf3     float32
 5   val.pm10     float32
 6   val.mtf7     float32
 7   val.per      float32
 8   val.sfr      float32
 9   val.sht.hmd  float32
 10  val.sht.tmp  float32
 11  name         object 
dtypes: float32(11), object(1)
memory usage: 2.0+ GB


In [3]:
# Some cleaning incase have missed anything
#-------------------------------------------
oxaria1_pm = oxaria1_pm[~oxaria1_pm.index.duplicated(keep='last')].reset_index()
#oxaria1_pm.reset_index(inplace=True)
oxaria1_pm = oxaria1_pm.dropna(axis=0,subset=['tag','rec']).set_index(['tag','rec'])
oxaria1_pm.sort_index()


Unnamed: 0_level_0,Unnamed: 1_level_0,val.mtf1,val.pm1,val.mtf5,val.pm2p5,val.bin:0,val.bin:1,val.bin:2,val.bin:3,val.bin:4,val.bin:5,...,val.sht.tmp,src,exg.iseceen2v1.pm10,exg.iseceen2v1.pm1,exg.iseceen2v1.pm2p5,exg.islin/n3/vlgw.pm10,exg.islin/n3/vlgw.pm1,exg.islin/n3/vlgw.pm2p5,insert_date,name
tag,rec,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
scs-bgx-536,2020-01-04 09:46:21+00:00,25.0,0.4,34.0,0.7,57.0,24.0,5.0,2.0,0.0,1.0,...,26.600000,N3,,,,,,,25/03/2021 14:41:37,High St
scs-bgx-536,2020-01-04 09:46:31+00:00,24.0,0.4,0.0,1.0,65.0,22.0,4.0,0.0,3.0,0.0,...,26.799999,N3,,,,,,,25/03/2021 14:41:37,High St
scs-bgx-536,2020-01-04 09:46:41+00:00,24.0,0.4,39.0,1.1,60.0,15.0,10.0,0.0,3.0,2.0,...,26.900000,N3,,,,,,,25/03/2021 14:41:37,High St
scs-bgx-536,2020-01-04 09:46:51+00:00,23.0,0.5,28.0,0.8,49.0,24.0,10.0,4.0,0.0,1.0,...,27.000000,N3,,,,,,,25/03/2021 14:41:37,High St
scs-bgx-536,2020-01-04 09:47:01+00:00,23.0,0.5,28.0,1.4,64.0,18.0,6.0,3.0,5.0,2.0,...,27.100000,N3,,,,,,,25/03/2021 14:41:37,High St
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scs-bgx-543,2021-02-28 23:59:14+00:00,28.0,5.1,39.0,9.7,889.0,90.0,19.0,7.0,6.0,5.0,...,14.700000,N3,,,,,,,25/03/2021 14:41:37,John Radcliffe
scs-bgx-543,2021-02-28 23:59:24+00:00,26.0,4.9,39.0,10.9,858.0,82.0,22.0,5.0,12.0,8.0,...,14.700000,N3,,,,,,,25/03/2021 14:41:37,John Radcliffe
scs-bgx-543,2021-02-28 23:59:34+00:00,27.0,4.9,35.0,11.4,863.0,79.0,19.0,8.0,7.0,10.0,...,14.700000,N3,,,,,,,25/03/2021 14:41:37,John Radcliffe
scs-bgx-543,2021-02-28 23:59:44+00:00,26.0,4.9,38.0,10.4,843.0,100.0,22.0,6.0,10.0,7.0,...,14.700000,N3,,,,,,,25/03/2021 14:41:37,John Radcliffe


In [10]:
# Get a list of unique tags
#---------------------------
tags = oxaria1_pm.index.get_level_values(0).unique()
print('\n Unique tags loaded...\n'+tags)

Index(['\n Unique tags loaded...\nscs-bgx-536',
       '\n Unique tags loaded...\nscs-bgx-537',
       '\n Unique tags loaded...\nscs-bgx-538',
       '\n Unique tags loaded...\nscs-bgx-539',
       '\n Unique tags loaded...\nscs-bgx-540',
       '\n Unique tags loaded...\nscs-bgx-541',
       '\n Unique tags loaded...\nscs-bgx-542',
       '\n Unique tags loaded...\nscs-bgx-543'],
      dtype='object', name='tag')


In [11]:
# Define the start dates for stable operation
#---------------------------------------------
start_dates = ['2020-09-25T00:00:00','2020-08-01T00:00:00','2020-06-05T00:00:00','2020-01-25T00:00:00', \
               '2020-05-01T00:00:00','2020-03-05T00:00:00','2020-02-06T00:00:00','2020-12-07T00:00:00']
dates_list = [dt.datetime.fromisoformat(date).replace(tzinfo=timezone.utc) for date in start_dates]
dates_dict = dict(zip(tags,dates_list))


In [13]:
# Select periods of stable operation from the df of all gases sesnor data
#-------------------------------------------------------------------------
print('\n Applying start date filters...\n')

tmp = []

for k,v in dates_dict.items():
    df = oxaria1_pm.query('tag == @k & rec >= @v')
    tmp.append(df)
tmpdf = pd.concat(tmp)
print(tmpdf.info())

# Apply empirical filters
#-------------------------
print("\n Applying sensible empirical filters...\n condition =  ((tmpdf['val.sfr'] < 2.0) | \
             (tmpdf['val.sht.tmp'] < -10.0) | \
             (tmpdf['val.sht.tmp'] > 35.0) | \
             (tmpdf['val.sht.hmd'] < 35.0)) \n")

cols = ['val.pm10','val.pm2p5','val.pm1']

condition =  ((tmpdf['val.sfr'] < 2.0) | \
             (tmpdf['val.sht.tmp'] < -10.0) | \
             (tmpdf['val.sht.tmp'] > 35.0) | \
             (tmpdf['val.sht.hmd'] < 35.0))

for col in cols:
    tmpdf[col+'_1'] = np.where(condition, np.nan, tmpdf[col])
tmpdf.info(null_counts=True)


 Applying start date filters...

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 18678521 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:00:02+0000', tz='UTC')) to ('scs-bgx-543', Timestamp('2021-02-28 23:59:54+0000', tz='UTC'))
Data columns (total 44 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   val.mtf1                 float32
 1   val.pm1                  float32
 2   val.mtf5                 float32
 3   val.pm2p5                float32
 4   val.bin:0                float32
 5   val.bin:1                float32
 6   val.bin:2                float32
 7   val.bin:3                float32
 8   val.bin:4                float32
 9   val.bin:5                float32
 10  val.bin:6                float32
 11  val.bin:7                float32
 12  val.bin:8                float32
 13  val.bin:9                float32
 14  val.bin:10               float32
 15  val.bin:11               float32
 16  val.bin:12               float32
 17  val.b

In [7]:
# Save to feather
#-----------------
print('\n Writing to  '+the_gases+'oxaria1_pm_stable_536_2feb21/.ftr\n')
tmpdf.reset_index().to_feather(the_gases+'oxaria1_pm_stable_536_2feb21/.ftr')
print('All done! \U0001F600')


 Applying start date filters...



NameError: name 'dates_dict' is not defined