In [1]:
# Imports
# ---------
import sys
import pandas as pd
import numpy as np
import feather
import os
import gc
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter
from matplotlib.dates import DateFormatter, MonthLocator, DayLocator
import matplotlib as mpl
import peakutils
from peakutils.plot import plot as pplot
import warnings
import pytz

warnings.filterwarnings("ignore")

register_matplotlib_converters()

# File locations
# ----------------
pngs = "/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/0oxaria/gap_filling/jun_to_sept_2021/"
data_home = "/home/tonyb/Gdrive/MinicondaProjects/oxaria/data/raw/0oxaria/gap_filling/jun_to_sept_2021/"


In [2]:
# Load the dfs
# --------------
oxaria_pm = pd.read_feather(
    data_home + "oxaria_pm_stable15_full_corr_oct_2021.ftr"
).set_index(["tag", "rec"])
oxaria_no2 = pd.read_feather(
    data_home + "oxaria_gases_536_stable15_rat_full_norm_corr_blc_202021.ftr"
).set_index(["tag", "rec"])
oxaria_no2["val.no2.cnc_1_lgbm_norm_cor_blc_ug"] = (
    oxaria_no2["val.no2.cnc_1_lgbm_norm_cor_blc"] * 1.9125
)
auto_merged = pd.read_feather(
    data_home + "auto_merged_ratified+2021_oct_update.ftr"
).set_index("rec")

display(oxaria_pm.info())

print('Number of sensor locations in PM dataframes: ', len(oxaria_pm["name"].unique()), '\n')
print(oxaria_pm["name"].unique(), '\n')

print('Number of sensor locations in NO2 dataframes: ',len(oxaria_no2["name"].unique()),'\n')
print(oxaria_no2["name"].unique())


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 646022 entries, ('scs-bgx-536', Timestamp('2020-09-25 00:15:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 45 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   val.mtf1                 646022 non-null  float32
 1   val.pm1                  646022 non-null  float32
 2   val.mtf5                 646022 non-null  float32
 3   val.pm2p5                646022 non-null  float32
 4   val.mtf3                 646022 non-null  float32
 5   val.pm10                 646022 non-null  float32
 6   val.mtf7                 646022 non-null  float32
 7   val.per                  646022 non-null  float32
 8   val.sfr                  646022 non-null  float32
 9   val.sht.hmd_p            646022 non-null  float32
 10  val.sht.tmp_p            646022 non-null  float32
 11  val.pm10_1               470637 non-null  float3

None

Number of sensor locations in PM dataframes:  17 

['High St' 'South Parks Rd' 'St Ebbes' 'Jesus College' 'New Marsten'
 'The Plain' 'Worcester College' 'John Radcliffe' 'Windmill School'
 'Said Business School' 'County Hall' 'Divinity Road' 'Jahlul Bayt Mosque'
 'St Giles' 'Warneford Hospital' 'Speedwell St' 'Spare'] 

Number of sensor locations in NO2 dataframes:  17 

['High St' 'South Parks Rd' 'St Ebbes' 'Jesus College' 'New Marston'
 'The Plain' 'Worcester College' 'John Radcliffe' 'Windmill School'
 'Said Business School' 'County Hall' 'Divinity Rd' 'Ahlul Bayt Centre'
 'St Giles' 'Warneford Hospital' 'Spare' 'Speedwell St']


In [3]:
# Get tid of the typos
# ----------------------
mydict = {
    "Jahlul": "Ahlul",
    "Road": "Rd",
    "Street": "St",
    "Mosque": "Centre",
    "Marsten": "Marston",
}
oxaria_pm["name"].replace(mydict, inplace=True, regex=True)
oxaria_no2["name"].replace(mydict, inplace=True, regex=True)

print(len(oxaria_pm["name"].unique()), len(oxaria_no2["name"].unique()))
print(oxaria_pm["name"].unique())
print(oxaria_no2["name"].unique())


17 17
['High St' 'South Parks Rd' 'St Ebbes' 'Jesus College' 'New Marston'
 'The Plain' 'Worcester College' 'John Radcliffe' 'Windmill School'
 'Said Business School' 'County Hall' 'Divinity Rd' 'Ahlul Bayt Centre'
 'St Giles' 'Warneford Hospital' 'Speedwell St' 'Spare']
['High St' 'South Parks Rd' 'St Ebbes' 'Jesus College' 'New Marston'
 'The Plain' 'Worcester College' 'John Radcliffe' 'Windmill School'
 'Said Business School' 'County Hall' 'Divinity Rd' 'Ahlul Bayt Centre'
 'St Giles' 'Warneford Hospital' 'Spare' 'Speedwell St']


In [4]:
# Re-organise & prep the data
# -----------------------------
dates = pd.read_csv(data_home + "sensing_dates.csv", na_values=["na"])

dates["sensorid2"] = dates["sensorid2"].astype(str).replace("\.0", "", regex=True)
dates["startdate1"] = pd.to_datetime(
    dates["startdate1"], utc=True, dayfirst=True, format="%d/%m/%Y"
)
dates["startdate2"] = pd.to_datetime(
    dates["startdate2"], utc=True, dayfirst=True, format="%d/%m/%Y"
)
dates["enddate1"] = pd.to_datetime(
    dates["enddate1"], utc=True, dayfirst=True, format="%d/%m/%Y"
)
dates["enddate2"] = pd.to_datetime(
    dates["enddate2"], utc=True, dayfirst=True, format="%d/%m/%Y"
)
dates["sensorid1"] = dates["sensorid1"].astype("int64", errors="ignore")
dates["sensorid2"] = dates["sensorid2"].astype("int64", errors="ignore")
dates["sensorid1"] = "scs-bgx-" + dates["sensorid1"].astype(str)
dates["sensorid2"] = "scs-bgx-" + dates["sensorid2"].astype(str)

dates.sort_values(by="sensorid1", inplace=True)

# Set start & end dates where NaT
# ---------------------------------
s1 = pd.to_datetime("2020-01-23 00:00:00+00:00", utc=True)
e1 = pd.to_datetime("2021-10-02 00:00:00+00:00", utc=True)
s2 = pd.to_datetime("2021-10-02 00:00:00+00:00", utc=True)
e2 = pd.to_datetime("2021-10-02 00:00:00+00:00", utc=True)

out_dates = []
for idx, row in dates.iterrows():
    if row[1] is pd.NaT:
        row[1] = s1
    if row[2] is pd.NaT:
        row[2] = e1
    if row[4] is pd.NaT:
        row[4] = s2
    if row[5] is pd.NaT:
        row[5] = e2
    out_dates.append(row)
dates = pd.DataFrame(out_dates)

# Add "final" sensor name
# -------------------------
sensor_names = (
    oxaria_pm["name"]
    .reset_index()
    .drop("rec", axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)
dates = pd.merge(
    left=dates, right=sensor_names, left_on="sensorid1", right_on="tag", how="left"
)
dates.rename(columns={"name": "fname"}, inplace=True)

dates


Unnamed: 0,sensorid1,startdate1,enddate1,sensorid2,startdate2,enddate2,tag,fname
0,scs-bgx-536,2020-10-14 00:00:00+00:00,2021-08-11 00:00:00+00:00,scs-bgx-550,2021-08-11 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-536,High St
1,scs-bgx-537,2020-07-31 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-nan,2021-10-02 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-537,South Parks Rd
2,scs-bgx-538,2020-06-04 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-nan,2021-10-02 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-538,St Ebbes
3,scs-bgx-539,2020-01-23 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-nan,2021-10-02 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-539,Jesus College
4,scs-bgx-540,2020-07-29 00:00:00+00:00,2021-06-11 00:00:00+00:00,scs-bgx-nan,2021-10-02 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-540,New Marston
5,scs-bgx-541,2020-02-21 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-nan,2021-10-02 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-541,The Plain
6,scs-bgx-542,2020-01-23 00:00:00+00:00,2021-06-11 00:00:00+00:00,scs-bgx-540,2021-06-11 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-542,Worcester College
7,scs-bgx-543,2020-12-07 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-nan,2021-10-02 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-543,John Radcliffe
8,scs-bgx-550,2021-03-08 00:00:00+00:00,2021-05-16 00:00:00+00:00,scs-bgx-555,2021-05-25 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-550,Windmill School
9,scs-bgx-551,2020-07-29 00:00:00+00:00,2021-04-09 00:00:00+00:00,scs-bgx-558,2021-04-21 00:00:00+00:00,2021-10-02 00:00:00+00:00,scs-bgx-551,Said Business School


## Manual checks
Some manual checks on what we think the final time series should look like.

Here using params on 1st row of `dates` dataframe to select & combine what the High St time series will look like - sensor 536 & 550. 

After working with Windmill & talking to Bruno, dropping Windmill as usable data. :(

In [5]:
# Running the "final" reselect - pm
# ------------------------------

chk536 = oxaria_pm.reset_index().query(
    '"2020-10-14 00:00:00+00:00" <= rec < "2021-08-11 00:00:00+00:00" and tag == "scs-bgx-536"'
)
chk550 = oxaria_pm.reset_index().query(
    '"2021-08-11 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-550"'
)
print("Length of 536 timeseries @ High St is  " + str(len(chk536)))
print("Length of 550 timeseries @ High St is  " + str(len(chk550)))
print(
    "Length of final, combined timeseries @ High St  "
    + str(len(pd.concat([chk536, chk550])))
    + "\n"
)


chk557 = oxaria_pm.reset_index().query(
    '"2020-09-24 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-557"'
)
chknan = oxaria_pm.reset_index().query(
    '"2021-10-02 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-nan"'
)

print("Length of 557 timeseries @ Warneford is  " + str(len(chk557)))
print("Length of nan (not existing) timeseries @ Warneford is  " + str(len(chknan)))
print(
    "Length of final, combined timeseries @ Warneford  "
    + str(len(pd.concat([chk557, chknan])))
    + "\n"
)

chk542 = oxaria_pm.reset_index().query(
    '"2020-01-23 00:00:00+00:00" <= rec < "2021-06-11 00:00:00+00:00" and tag == "scs-bgx-542"'
)
chk540 = oxaria_pm.reset_index().query(
    '"2021-06-11 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-540"'
)
print("Length of 542 timeseries @ Worcester College is  " + str(len(chk542)))
print("Length of 540 timeseries @ Worcester College is  " + str(len(chk540)))
print(
    "Length of final, combined timeseries @ Worcester College  "
    + str(len(pd.concat([chk542, chk540])))
    + "\n"
)

chk550 = oxaria_pm.reset_index().query(
    '"2020-03-08 00:00:00+00:00" <= rec < "2021-06-16 00:00:00+00:00" and tag == "scs-bgx-550"'
)
chk555 = oxaria_pm.reset_index().query(
    '"2021-05-25 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-555"'
)
print("Length of 550 timeseries @ Windmill School is  " + str(len(chk550)))
print("Length of 555 timeseries @ Windmill School is  " + str(len(chk555)))
print(
    "Length of final, combine timeseries @ Windmill School  "
    + str(len(pd.concat([chk550, chk555])))
    + "\n"
)
print("Unfortunately dropping Windmill School data as unusable.\n")

chk551 = oxaria_pm.reset_index().query(
    '"2020-07-29 00:00:00+00:00" <= rec < "2021-04-09 00:00:00+00:00" and tag == "scs-bgx-551"'
)
chk558 = oxaria_pm.reset_index().query(
    '"2021-04-21 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-558"'
)
print("Length of 551 timeseries @ Said BS is  " + str(len(chk551)))
print("Length of 558 timeseries @ Said BS is  " + str(len(chk558)))
print(
    "Length of final, combined timeseries @ Said BS  "
    + str(len(pd.concat([chk551, chk558])))
    + "\n"
)

chk540 = oxaria_pm.reset_index().query(
    '"2020-07-29 00:00:00+00:00" <= rec < "2021-11-06 00:00:00+00:00" and tag == "scs-bgx-540"'
)
print("Length of 540 timeseries @ Said BS is  " + str(len(chk540)))
print(
    "Length of final, combined timeseries @ New Marsten  "
    + str(len(pd.concat([chk540])))
    + "\n"
)


Length of 536 timeseries @ High St is  26163
Length of 550 timeseries @ High St is  4867
Length of final, combined timeseries @ High St  31030

Length of 557 timeseries @ Warneford is  34907
Length of nan (not existing) timeseries @ Warneford is  0
Length of final, combined timeseries @ Warneford  34907

Length of 542 timeseries @ Worcester College is  45527
Length of 540 timeseries @ Worcester College is  10658
Length of final, combined timeseries @ Worcester College  56185

Length of 550 timeseries @ Windmill School is  11960
Length of 555 timeseries @ Windmill School is  0
Length of final, combine timeseries @ Windmill School  11960

Unfortunately dropping Windmill School data as unusable.

Length of 551 timeseries @ Said BS is  21665
Length of 558 timeseries @ Said BS is  15648
Length of final, combined timeseries @ Said BS  37313

Length of 540 timeseries @ Said BS is  41089
Length of final, combined timeseries @ New Marsten  41089



In [6]:
# Running the "final" reselect - no2
# ------------------------------

chk536 = oxaria_no2.reset_index().query(
    '"2020-10-14 00:00:00+00:00" <= rec < "2021-08-11 00:00:00+00:00" and tag == "scs-bgx-536"'
)
chk550 = oxaria_no2.reset_index().query(
    '"2021-08-11 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-550"'
)
print("Length of 536 timeseries @ High St is  " + str(len(chk536)))
print("Length of 550 timeseries @ High St is  " + str(len(chk550)))
print(
    "Length of final, combined timeseries @ High St  "
    + str(len(pd.concat([chk536, chk550])))
    + "\n"
)


chk557 = oxaria_no2.reset_index().query(
    '"2020-09-24 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-557"'
)
chknan = oxaria_no2.reset_index().query(
    '"2021-10-02 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-nan"'
)

print("Length of 557 timeseries @ Warneford is  " + str(len(chk557)))
print("Length of nan (not existing) timeseries @ Warneford is  " + str(len(chknan)))
print(
    "Length of final, combined timeseries @ Warneford  "
    + str(len(pd.concat([chk557, chknan])))
    + "\n"
)

chk542 = oxaria_no2.reset_index().query(
    '"2020-01-23 00:00:00+00:00" <= rec < "2021-06-11 00:00:00+00:00" and tag == "scs-bgx-542"'
)
chk540 = oxaria_no2.reset_index().query(
    '"2021-06-11 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-540"'
)
print("Length of 542 timeseries @ Worcester College is  " + str(len(chk542)))
print("Length of 540 timeseries @ Worcester College is  " + str(len(chk540)))
print(
    "Length of final, combined timeseries @ Worcester College  "
    + str(len(pd.concat([chk542, chk540])))
    + "\n"
)

chk550 = oxaria_no2.reset_index().query(
    '"2020-03-08 00:00:00+00:00" <= rec < "2021-06-16 00:00:00+00:00" and tag == "scs-bgx-550"'
)
chk555 = oxaria_no2.reset_index().query(
    '"2021-05-25 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-555"'
)
print("Length of 550 timeseries @ Windmill School is  " + str(len(chk550)))
print("Length of 555 timeseries @ Windmill School is  " + str(len(chk555)))
print(
    "Length of final, combine timeseries @ Windmill School  "
    + str(len(pd.concat([chk550, chk555])))
    + "\n"
)
print("Unfortunately dropping Windmill School data as unusable.\n")

chk551 = oxaria_no2.reset_index().query(
    '"2020-07-29 00:00:00+00:00" <= rec < "2021-04-09 00:00:00+00:00" and tag == "scs-bgx-551"'
)
chk558 = oxaria_no2.reset_index().query(
    '"2021-04-21 00:00:00+00:00" <= rec < "2021-10-02 00:00:00+00:00" and tag == "scs-bgx-558"'
)
print("Length of 551 timeseries @ Said BS is  " + str(len(chk551)))
print("Length of 558 timeseries @ Said BS is  " + str(len(chk558)))
print(
    "Length of final, combined timeseries @ Said BS  "
    + str(len(pd.concat([chk551, chk558])))
    + "\n"
)

chk540 = oxaria_no2.reset_index().query(
    '"2020-07-29 00:00:00+00:00" <= rec < "2021-11-06 00:00:00+00:00" and tag == "scs-bgx-540"'
)
print("Length of 540 timeseries @ Said BS is  " + str(len(chk540)))
print(
    "Length of final, combined timeseries @ New Marsten  "
    + str(len(pd.concat([chk540])))
    + "\n"
)


Length of 536 timeseries @ High St is  26878
Length of 550 timeseries @ High St is  4867
Length of final, combined timeseries @ High St  31745

Length of 557 timeseries @ Warneford is  34944
Length of nan (not existing) timeseries @ Warneford is  0
Length of final, combined timeseries @ Warneford  34944

Length of 542 timeseries @ Worcester College is  45196
Length of 540 timeseries @ Worcester College is  10726
Length of final, combined timeseries @ Worcester College  55922

Length of 550 timeseries @ Windmill School is  11889
Length of 555 timeseries @ Windmill School is  0
Length of final, combine timeseries @ Windmill School  11889

Unfortunately dropping Windmill School data as unusable.

Length of 551 timeseries @ Said BS is  21984
Length of 558 timeseries @ Said BS is  15649
Length of final, combined timeseries @ Said BS  37633

Length of 540 timeseries @ Said BS is  41158
Length of final, combined timeseries @ New Marsten  41158



In [7]:
# Deploy the reselect to get final dataset for stats calcs
# ----------------------------------------------------------
tmpdf = []
for idx, row in dates.iterrows():
    df0 = oxaria_pm.reset_index().query("@row[1] <= rec < @row[2] and tag == @row[0]")
    df0["fname"] = row[7]
    df1 = oxaria_pm.reset_index().query("@row[4] <= rec < @row[5] and tag == @row[3]")
    df1["fname"] = row[7]
    df2 = pd.concat([df0, df1])
    tmpdf.append(df2)
oxaria_pm_f = pd.concat(tmpdf).set_index(["tag", "rec"])
# Remove Windmill School
oxaria_pm_f = oxaria_pm_f[oxaria_pm_f["fname"] != "Windmill School"]
oxaria_pm_f.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 573994 entries, ('scs-bgx-536', Timestamp('2020-10-14 00:00:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 46 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   val.mtf1                 573994 non-null  float32
 1   val.pm1                  573994 non-null  float32
 2   val.mtf5                 573994 non-null  float32
 3   val.pm2p5                573994 non-null  float32
 4   val.mtf3                 573994 non-null  float32
 5   val.pm10                 573994 non-null  float32
 6   val.mtf7                 573994 non-null  float32
 7   val.per                  573994 non-null  float32
 8   val.sfr                  573994 non-null  float32
 9   val.sht.hmd_p            573994 non-null  float32
 10  val.sht.tmp_p            573994 non-null  float32
 11  val.pm10_1               430081 non-null  float3

In [8]:
# Deploy the reselect to get final dataset for stats calcs
# ----------------------------------------------------------
tmpdf = []
for idx, row in dates.iterrows():
    df0 = oxaria_no2.reset_index().query("@row[1] <= rec < @row[2] and tag == @row[0]")
    df0["fname"] = row[7]
    df1 = oxaria_no2.reset_index().query("@row[4] <= rec < @row[5] and tag == @row[3]")
    df1["fname"] = row[7]
    df2 = pd.concat([df0, df1])
    tmpdf.append(df2)
oxaria_no2_f = pd.concat(tmpdf)  # .set_index(['tag','rec'])
# Remove Windmill School & 558 which does not correct well for no2 (only)
oxaria_no2_f = oxaria_no2_f[oxaria_no2_f["fname"] != "Windmill School"]
oxaria_no2_f = oxaria_no2_f[oxaria_no2_f["tag"] != "scs-bgx-558"]
oxaria_no2_f.set_index(["tag", "rec"], inplace=True)
oxaria_no2_f.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 568438 entries, ('scs-bgx-536', Timestamp('2020-10-14 00:00:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 26 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   val.no2.wev                         568438 non-null  float32
 1   val.no2.cnc                         567226 non-null  float32
 2   val.no2.aev                         568438 non-null  float32
 3   val.no2.wec                         567226 non-null  float32
 4   val.sht.hmd_g                       567226 non-null  float32
 5   val.sht.tmp_g                       567226 non-null  float32
 6   val.no2.cnc_1                       556605 non-null  float32
 7   name                                568438 non-null  object 
 8   mag_hmd_s20                         567224 non-null  float32
 9   mag_tmp_s20                         5

In [9]:
print(oxaria_pm_f["fname"].unique(), "\n")
print(oxaria_no2_f["fname"].unique(), "\n")

print(
    'Note, "558" is has already been dropped from the no2 time series in "10c_no2_model_deployment_2020_SH_lgbm.ipynb"'
)


['High St' 'South Parks Rd' 'St Ebbes' 'Jesus College' 'New Marston'
 'The Plain' 'Worcester College' 'John Radcliffe' 'Said Business School'
 'County Hall' 'Divinity Rd' 'Ahlul Bayt Centre' 'St Giles'
 'Warneford Hospital' 'Speedwell St'] 

['High St' 'South Parks Rd' 'St Ebbes' 'Jesus College' 'New Marston'
 'The Plain' 'Worcester College' 'John Radcliffe' 'Said Business School'
 'County Hall' 'Divinity Rd' 'Ahlul Bayt Centre' 'St Giles'
 'Warneford Hospital' 'Speedwell St'] 

Note, "558" is has already been dropped from the no2 time series in "10c_no2_model_deployment_2020_SH_lgbm.ipynb"


In [10]:
# Save to feather
# -----------------
oxaria_pm_f.reset_index().to_feather(
    data_home + "oxaria_pm_stable15_full_corr_oct_2021_final_ts.ftr"
)
oxaria_no2_f.reset_index().to_feather(
    data_home + "oxaria_no2_stable15_full_corr_oct_2021_final_ts.ftr"
)



## Getting the public facing data files

oxaria_no2_f.info()


In [12]:
oxaria_pm_f.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 573994 entries, ('scs-bgx-536', Timestamp('2020-10-14 00:00:00+0000', tz='UTC')) to ('scs-bgx-559', Timestamp('2021-10-01 00:00:00+0000', tz='UTC'))
Data columns (total 46 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   val.mtf1                 573994 non-null  float32
 1   val.pm1                  573994 non-null  float32
 2   val.mtf5                 573994 non-null  float32
 3   val.pm2p5                573994 non-null  float32
 4   val.mtf3                 573994 non-null  float32
 5   val.pm10                 573994 non-null  float32
 6   val.mtf7                 573994 non-null  float32
 7   val.per                  573994 non-null  float32
 8   val.sfr                  573994 non-null  float32
 9   val.sht.hmd_p            573994 non-null  float32
 10  val.sht.tmp_p            573994 non-null  float32
 11  val.pm10_1               430081 non-null  float3

In [13]:
# Clean & tidy it to crteated a file for public dissemination
# -------------------------------------------------------------
oxaria_no2_processed = oxaria_no2_f.iloc[:, [0, 1, 2, 3, 4, 5, 23, 24, 25]]
oxaria_no2_processed = oxaria_no2_processed.rename(
    columns={"val.sht.hmd_g": "val.sht.hmd", "val.sht.tmp_g": "val.sht.tmp"}
)
oxaria_no2_processed["creation_date"] = dt.datetime.now(dt.timezone.utc)
oxaria_no2_processed.sort_index(inplace=True)
oxaria_no2_processed.reset_index().to_feather(
    data_home + "oxaria_no2_processed_20062022.ftr"
)

oxaria_pm_processed = oxaria_pm_f.iloc[
    :, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 42, 44, 45]
]
oxaria_pm_processed = oxaria_pm_processed.rename(
    columns={"val.sht.hmd_p": "val.sht.hmd", "val.sht.tmp_p": "val.sht.tmp"}
)
oxaria_pm_processed["creation_date"] = dt.datetime.now(dt.timezone.utc)
oxaria_pm_processed.sort_index(inplace=True)
oxaria_pm_processed.reset_index().to_feather(
    data_home + "oxaria_pm_processed_20062022.ftr"
)

display(oxaria_no2_processed.head(), oxaria_pm_processed.head())


Unnamed: 0_level_0,Unnamed: 1_level_0,val.no2.wev,val.no2.cnc,val.no2.aev,val.no2.wec,val.sht.hmd,val.sht.tmp,val.no2.cnc_1_lgbm_norm_cor_blc,val.no2.cnc_1_lgbm_norm_cor_blc_ug,fname,creation_date
tag,rec,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
scs-bgx-536,2020-10-14 00:00:00+00:00,0.282168,-1750.422485,0.253825,0.026914,77.253929,12.678652,2.697931,5.159793,High St,2022-06-20 14:07:46.006311+00:00
scs-bgx-536,2020-10-14 00:15:00+00:00,0.282168,-1750.079956,0.253778,0.026983,77.23333,12.65,4.33778,8.296004,High St,2022-06-20 14:07:46.006311+00:00
scs-bgx-536,2020-10-14 00:30:00+00:00,0.282128,-1740.844482,0.252628,0.028832,77.013336,12.617778,6.210564,11.877705,High St,2022-06-20 14:07:46.006311+00:00
scs-bgx-536,2020-10-14 00:45:00+00:00,0.282309,-1747.987793,0.253602,0.027402,77.263336,12.60889,4.810743,9.200546,High St,2022-06-20 14:07:46.006311+00:00
scs-bgx-536,2020-10-14 01:00:00+00:00,0.282333,-1739.540039,0.252596,0.029091,77.354446,12.62,,,High St,2022-06-20 14:07:46.006311+00:00


Unnamed: 0_level_0,Unnamed: 1_level_0,val.mtf1,val.pm1,val.mtf5,val.pm2p5,val.mtf3,val.pm10,val.mtf7,val.per,val.sfr,val.sht.hmd,val.sht.tmp,val.pm10_1_c2_SH_xt_cor,val.pm2p5_1_c2_S_xt_cor,fname,creation_date
tag,rec,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
scs-bgx-536,2020-10-14 00:00:00+00:00,20.555555,2.871111,29.922222,5.732222,25.422222,15.905556,22.344444,4.9,6.903222,46.772221,20.061111,10.552414,12.175938,High St,2022-06-20 14:07:47.220007+00:00
scs-bgx-536,2020-10-14 00:15:00+00:00,20.355556,3.155555,29.5,6.136667,25.788889,15.164445,26.211111,4.9,7.031111,46.73,20.0,,15.846164,High St,2022-06-20 14:07:47.220007+00:00
scs-bgx-536,2020-10-14 00:30:00+00:00,20.444445,3.013333,30.077778,5.808889,25.388889,15.043333,21.655556,4.9,6.982333,46.612221,20.0,11.083782,12.60954,High St,2022-06-20 14:07:47.220007+00:00
scs-bgx-536,2020-10-14 00:45:00+00:00,20.588888,2.907778,29.822222,5.808889,25.388889,15.42,20.811111,4.9,6.875333,46.711113,19.967777,11.362556,12.361898,High St,2022-06-20 14:07:47.220007+00:00
scs-bgx-536,2020-10-14 01:00:00+00:00,20.544445,2.776667,29.977777,5.643333,25.211111,13.9,23.444445,4.9,6.903222,46.843334,19.947777,10.382067,13.250626,High St,2022-06-20 14:07:47.220007+00:00
