In [1]:
# Imports
#---------
import sys
import pandas as pd
import numpy as np
import feather
import os
import gc
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
import matplotlib.ticker as ticker
from matplotlib.dates import DateFormatter
import matplotlib as mpl
from datetime import timezone

register_matplotlib_converters()

# File locations
#----------------
the_gases = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/2oxaria/json/gap_filling/'
pngs = '/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/pngs/gap_filling/'


In [2]:
# Load the df 
#-------------
oxaria2_gases_lt = pd.read_feather(the_gases+'oxaria2_gases_gf.ftr').set_index(['tag','rec'])
print('\n'+'Loading Oxaria1 gases (no2 only)...\n \n')
oxaria2_gases_lt.info()



Loading Oxaria1 gases (no2 only)...
 

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25337336 entries, ('scs-bgx-550', Timestamp('2020-06-01 00:00:08+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2020-10-29 16:55:47+0000', tz='UTC'))
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   val.no2.wev  float32
 1   val.no2.cnc  float32
 2   val.no2.aev  float32
 3   val.no2.wec  float32
 4   val.sht.hmd  float32
 5   val.sht.tmp  float32
 6   name         object 
dtypes: float32(6), object(1)
memory usage: 1.6+ GB


In [3]:
# Some cleaning incase have missed anything
#-------------------------------------------
oxaria2_gases_lt = oxaria2_gases_lt[~oxaria2_gases_lt.index.duplicated(keep='last')]
oxaria2_gases_lt = oxaria2_gases_lt.reset_index()
oxaria2_gases_lt = oxaria2_gases_lt.dropna(axis=0,subset=['tag']).set_index(['tag','rec'])
oxaria2_gases_lt.sort_index()
print('\n Cleaning duplicates & NAs in index (if any)...\n')


 Cleaning duplicates & NAs in index (if any)...



In [4]:
# Get a list of unique tags
#---------------------------
tags = oxaria2_gases_lt.index.get_level_values(0).unique()
print('\n Unique tags loaded...\n'+tags)

Index(['\n Unique tags loaded...\nscs-bgx-550',
       '\n Unique tags loaded...\nscs-bgx-551',
       '\n Unique tags loaded...\nscs-bgx-552',
       '\n Unique tags loaded...\nscs-bgx-553',
       '\n Unique tags loaded...\nscs-bgx-554',
       '\n Unique tags loaded...\nscs-bgx-555',
       '\n Unique tags loaded...\nscs-bgx-556',
       '\n Unique tags loaded...\nscs-bgx-557',
       '\n Unique tags loaded...\nscs-bgx-558',
       '\n Unique tags loaded...\nscs-bgx-559'],
      dtype='object', name='tag')


In [5]:
# Define the start dates for stable operation
#---------------------------------------------
start_dates = ['2020-12-15T00:00:00','2020-07-20T00:00:00','2020-12-15T00:00:00','2020-07-01T00:00:00', \
               '2020-07-01T00:00:00','2020-12-15T00:00:00','2020-07-20T00:00:00','2020-07-01T00:00:00', \
               '2020-12-15T00:00:00','2020-12-15T00:00:00']

dates_list = [dt.datetime.fromisoformat(date).replace(tzinfo=timezone.utc) for date in start_dates]
dates_dict = dict(zip(tags,dates_list))
print('\n Applying variable start date filters of each sensor...\n \n'+str(dates_dict))


 Applying variable start date filters of each sensor...
 
{'scs-bgx-550': datetime.datetime(2020, 12, 15, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-551': datetime.datetime(2020, 7, 20, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-552': datetime.datetime(2020, 12, 15, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-553': datetime.datetime(2020, 7, 1, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-554': datetime.datetime(2020, 7, 1, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-555': datetime.datetime(2020, 12, 15, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-556': datetime.datetime(2020, 7, 20, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-557': datetime.datetime(2020, 7, 1, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-558': datetime.datetime(2020, 12, 15, 0, 0, tzinfo=datetime.timezone.utc), 'scs-bgx-559': datetime.datetime(2020, 12, 15, 0, 0, tzinfo=datetime.timezone.utc)}


In [6]:
# Select periods of stable operation from the df of all gases sesnor data
#-------------------------------------------------------------------------
print('\n Applying start date filters...\n')

tmp = []

for k,v in dates_dict.items():
    df = oxaria2_gases_lt.query('tag == @k & rec >= @v')
    tmp.append(df)
tmpdf = pd.concat(tmp)
print(tmpdf.info())

# Apply empirical filters
#-------------------------

print("\n Applying sensible empirical filters...\ncondition =  ((tmpdf['val.sht.tmp'] < -10.0) | \
             (tmpdf['val.sht.tmp'] > 35.0) | \
             (tmpdf['val.sht.hmd'] < 35.0))\n")

cols = ['val.no2.cnc']

condition =  ((tmpdf['val.sht.tmp'] < -10.0) | \
             (tmpdf['val.sht.tmp'] > 35.0) | \
             (tmpdf['val.sht.hmd'] < 35.0))

for col in cols:
    tmpdf[col+'_1'] = np.where(condition, np.nan, tmpdf[col])

    
# Save to feather
#-----------------
print('\n Writing to  '+the_gases+'oxaria2_gases_stable_536_2feb21.ftr\n')
tmpdf.reset_index().to_feather(the_gases+'oxaria2_gases_stable_536_2feb21.ftr')
print('All done! \U0001F600')


 Applying start date filters...

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11812689 entries, ('scs-bgx-550', Timestamp('2021-01-01 00:00:02+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2020-12-31 23:59:55+0000', tz='UTC'))
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   val.no2.wev  float32
 1   val.no2.cnc  float32
 2   val.no2.aev  float32
 3   val.no2.wec  float32
 4   val.sht.hmd  float32
 5   val.sht.tmp  float32
 6   name         object 
dtypes: float32(6), object(1)
memory usage: 805.3+ MB
None

 Applying sensible empirical filters...
condition =  ((tmpdf['val.sht.tmp'] < -10.0) |              (tmpdf['val.sht.tmp'] > 35.0) |              (tmpdf['val.sht.hmd'] < 35.0))


 Writing to  /home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/2oxaria/json/gap_filling/oxaria2_gases_stable_536_2feb21.ftr

All done! 😀
