# NSW Temperature missing-value fill

- Author: Mathew Traini
- Date: 21/03/22
- Purpose: Proof-of-concept notebook to fill as many missing temperature values as possible (in an expedient manner)
Uses a combination of left-joining using the demand dataset to remove unnecessary temperature observations, 
Meteostat lookups, and short range Pandas forward- and backfilling. Results in a date/time matched dataset (to the demand data)
with only 3 missing temperature values for the entire range 2010-2021.

Only deals with NSW data for the moment.

In [32]:
import pandas as pd
from datetime import datetime, timedelta
from meteostat import Hourly


DATA_PATH = "../../data/H03-2021/"

# Set the MeteoStat ID for Bankstown
bankstown_id = "94765"

# Timezone for Sydney
tz = "Australia/Sydney"

In [51]:
temp = pd.read_csv(DATA_PATH+"temperature_nsw.csv")
demand = pd.read_csv(DATA_PATH+"totaldemand_nsw.csv")

temp.DATETIME = pd.to_datetime(temp.DATETIME)
demand.DATETIME = pd.to_datetime(demand.DATETIME)


In [59]:
temp.dtypes

LOCATION               object
DATETIME       datetime64[ns]
TEMPERATURE           float64
dtype: object

In [5]:
temp.shape

(220326, 3)

In [6]:
demand.shape


(196513, 3)

In [66]:
# Using datetime as the 'key', left-join on demand.

result = pd.merge(demand, temp, left_on=['DATETIME'], right_on=['DATETIME'], how='left')
result.head(10)

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID,LOCATION,TEMPERATURE
0,2010-01-01 00:00:00,8038.0,NSW1,Bankstown,23.1
1,2010-01-01 00:30:00,7809.31,NSW1,Bankstown,22.9
2,2010-01-01 01:00:00,7483.69,NSW1,Bankstown,22.6
3,2010-01-01 01:30:00,7117.23,NSW1,Bankstown,22.5
4,2010-01-01 02:00:00,6812.03,NSW1,Bankstown,22.5
5,2010-01-01 02:30:00,6544.33,NSW1,Bankstown,22.4
6,2010-01-01 03:00:00,6377.32,NSW1,Bankstown,22.3
7,2010-01-01 03:30:00,6282.85,NSW1,Bankstown,22.3
8,2010-01-01 04:00:00,6211.49,NSW1,Bankstown,22.1
9,2010-01-01 04:30:00,6248.31,NSW1,Bankstown,22.2


In [91]:
# Find any cases where there's missing temperature values
result[result['TEMPERATURE'].isnull()]

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID,LOCATION,TEMPERATURE
440,2010-10-01 04:00:00,6717.56,NSW1,,
514,2010-11-01 17:00:00,11380.63,NSW1,,
651,2010-01-14 13:30:00,10061.15,NSW1,,
693,2010-01-15 10:30:00,10214.32,NSW1,,
741,2010-01-16 10:30:00,9534.90,NSW1,,
...,...,...,...,...,...
147020,2018-05-21 16:30:00,8387.82,NSW1,,
147021,2018-05-21 17:00:00,8823.71,NSW1,,
169906,2019-10-09 11:00:00,7931.07,NSW1,,
188057,2020-09-22 14:00:00,6648.85,NSW1,,


In [92]:
# There are more rows in the merged dataframe than we started with. Check for duplicated values which have arisen from the merge
result[result.duplicated()]

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID,LOCATION,TEMPERATURE
17521,2011-01-01 00:00:00,8063.36,NSW1,Bankstown,21.0
31079,2011-10-10 10:30:00,9039.47,NSW1,Bankstown,18.9
31096,2011-10-10 18:30:00,9220.89,NSW1,Bankstown,16.1
31099,2011-10-10 19:30:00,9021.03,NSW1,Bankstown,15.5
35045,2012-01-01 00:00:00,7079.08,NSW1,Bankstown,15.4
52614,2013-01-01 00:00:00,7359.19,NSW1,Bankstown,21.0
70135,2014-01-01 00:00:00,7009.91,NSW1,Bankstown,20.4
87656,2015-01-01 00:00:00,7057.51,NSW1,Bankstown,20.9
105177,2016-01-01 00:00:00,7139.95,NSW1,Bankstown,16.9
122746,2017-01-01 00:00:00,7431.17,NSW1,Bankstown,22.6


In [68]:
# Drop the duplicated rows
clean = result.drop_duplicates()
clean.shape

(196513, 5)

In [60]:
clean.dtypes

DATETIME       datetime64[ns]
TOTALDEMAND           float64
REGIONID               object
LOCATION               object
TEMPERATURE           float64
dtype: object

In [69]:
# Identify all the missing temperature values from the final clean version of the merged dataframe
clean[clean['TEMPERATURE'].isnull()]

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID,LOCATION,TEMPERATURE
440,2010-10-01 04:00:00,6717.56,NSW1,,
514,2010-11-01 17:00:00,11380.63,NSW1,,
651,2010-01-14 13:30:00,10061.15,NSW1,,
693,2010-01-15 10:30:00,10214.32,NSW1,,
741,2010-01-16 10:30:00,9534.90,NSW1,,
...,...,...,...,...,...
147020,2018-05-21 16:30:00,8387.82,NSW1,,
147021,2018-05-21 17:00:00,8823.71,NSW1,,
169906,2019-10-09 11:00:00,7931.07,NSW1,,
188057,2020-09-22 14:00:00,6648.85,NSW1,,


In [71]:
# Get the row indices of all rows with missing temp values
missing_idx = clean[clean['TEMPERATURE'].isnull()].index
len(missing_idx)

579

In [72]:
# First iterate through the missing values (by row index) and attempt to find replacements in the MeteoStat database
# Not all timepoints will be present, but try to fill as many gaps as possible.
# MeteoStat only has readings on the hour...there are no :30 readings. Skip anything that is on the half-hour.

found = 0
missed = 0

for idx in missing_idx:
    # Set the start and end dates for the Meteostat query
    start_date = clean.iloc[idx]['DATETIME'].to_pydatetime()

    # Only send the query if the missing value is on the hour
    if (start_date.minute == 0):
        end_date = start_date + timedelta(hours = 1)
        
        #print("On the hour, getting ",start_date, end_date)
        meteo_data = Hourly(loc=bankstown_id, start=start_date, end=end_date, timezone=tz)
        meteo_data = meteo_data.fetch()
        if (meteo_data.empty is False):
            clean.at[idx,'TEMPERATURE'] = meteo_data.iloc[0]['temp']
            found = found+1
        else:
            missed = missed + 1
            
print("Found/missed/total: ", found, missed, found+missed )
        


Found/missed/total:  232 70 302


In [81]:
# We now filled in as many missing values as MeteoStat can provide.
# Fill in as many remaining empty values using the built-in Pandas fillna method with a limit of 3 (ie: backfilling a maximum of 3 missed row)
backfilled = clean.fillna(method='bfill', limit=3)

# Now forward fill any missing values (up to a limit of 3)
backfilled = backfilled.fillna(method='ffill', limit=3)

# Finally set all LOCATION to 'Bankstown' (this is invariant for the entire NSW dataset)
backfilled['LOCATION'] = 'Bankstown'

In [82]:
backfilled[backfilled['TEMPERATURE'].isnull()]

Unnamed: 0,DATETIME,TOTALDEMAND,REGIONID,LOCATION,TEMPERATURE
3862,2010-03-22 11:00:00,10233.91,NSW1,Bankstown,
3863,2010-03-22 11:30:00,10219.71,NSW1,Bankstown,
3864,2010-03-22 12:00:00,10273.89,NSW1,Bankstown,


In [85]:
# Write out the cleaned up temperature data in the same format as was originally supplied. This can then be used in pre-existing processing notebooks  
final = backfilled[['LOCATION','DATETIME','TEMPERATURE']]
final.to_csv('temperature_nsw_cleaned.csv', index=False)

In [86]:
final.shape

(196513, 3)

In [89]:
final.isna().sum()

LOCATION       0
DATETIME       0
TEMPERATURE    3
dtype: int64