In [1]:
# Script Name: Combination of Temperature and Energy Demand
# Author: Rahul Kumar
# Date: 4/3/22
# Description: The purpose of the script is to combine the temperature and energy demand data into a unified dataset that 
# can be used to work with

import pandas as pd
import numpy as np
from math import sqrt
import seaborn as sns
import matplotlib.pyplot as plt

# This actually sets the pandas display to show all rows and columns 
# when you are showing a dataframe, without skipping the center
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
filepath = '../../H06-2021/' # If you want to run the code, change the file path to where you store the files. This reference
                                # is to where I store the files outside the github clone
    
outpath = '../../UNSW-PROJECT-DATA/' #I store it on my comp and the copy the file to the server
# filename = 'totaldemand_nsw'
ext = '.csv'

states = ['nsw','qld','sa','vic']

In [3]:
# 'temprature', 'totaldemand', 'forecastdemand'

filename = 'temprature' #<-correct your NSW temperature filename to temprature
temp = pd.DataFrame(columns=['STATE','LOCATION','DATETIME','TEMPERATURE'])

for state in states:
    df = pd.read_csv(filepath+filename+'_'+state+ext)
    df['STATE'] = state.upper()
    temp = pd.concat([temp,df], ignore_index=True)
    
temp = temp[['STATE','LOCATION','DATETIME','TEMPERATURE']]
temp.to_csv(outpath+filename+ext)
temp.head(15)

Unnamed: 0,STATE,LOCATION,DATETIME,TEMPERATURE
0,NSW,Bankstown,1/1/2010 0:00,23.1
1,NSW,Bankstown,1/1/2010 0:01,23.1
2,NSW,Bankstown,1/1/2010 0:30,22.9
3,NSW,Bankstown,1/1/2010 0:50,22.7
4,NSW,Bankstown,1/1/2010 1:00,22.6
5,NSW,Bankstown,1/1/2010 1:30,22.5
6,NSW,Bankstown,1/1/2010 2:00,22.5
7,NSW,Bankstown,1/1/2010 2:30,22.4
8,NSW,Bankstown,1/1/2010 3:00,22.3
9,NSW,Bankstown,1/1/2010 3:30,22.3


In [4]:
filename = 'totaldemand'
demand = pd.DataFrame(columns=['STATE','DATETIME','TOTALDEMAND','REGIONID'])

for state in states:
    df = pd.read_csv(filepath+filename+'_'+state+ext)
    df['STATE'] = state.upper()
    demand = pd.concat([demand,df], ignore_index=True)

demand = demand[['STATE','DATETIME','TOTALDEMAND']]
demand.to_csv(outpath+filename+ext)

demand.head(15)

Unnamed: 0,STATE,DATETIME,TOTALDEMAND
0,NSW,1/1/2010 0:00,8038.0
1,NSW,1/1/2010 0:30,7809.31
2,NSW,1/1/2010 1:00,7483.69
3,NSW,1/1/2010 1:30,7117.23
4,NSW,1/1/2010 2:00,6812.03
5,NSW,1/1/2010 2:30,6544.33
6,NSW,1/1/2010 3:00,6377.32
7,NSW,1/1/2010 3:30,6282.85
8,NSW,1/1/2010 4:00,6211.49
9,NSW,1/1/2010 4:30,6248.31


In [5]:
# fd_nsw = fd_nsw[['FORECASTDEMAND','DATETIME']]
# fd_nsw = fd_nsw.groupby('DATETIME').mean().reset_index()

filename = 'forecastdemand'
fdemand = pd.DataFrame(columns=['STATE','FORECASTDEMAND','DATETIME'])

for state in states:
    df = pd.read_csv(filepath+filename+'_'+state+ext)
    df = df.groupby('DATETIME').mean().reset_index()
    df['STATE'] = state.upper()
    df = df[['STATE','FORECASTDEMAND','DATETIME']]
    fdemand = pd.concat([fdemand,df], ignore_index=True)
    
fdemand.to_csv(outpath+'mean_'+filename+ext)
fdemand.head(15)


Unnamed: 0,STATE,FORECASTDEMAND,DATETIME
0,NSW,7824.411831,2010-01-01 00:00:00
1,NSW,7680.510417,2010-01-01 00:30:00
2,NSW,7428.97137,2010-01-01 01:00:00
3,NSW,7084.802162,2010-01-01 01:30:00
4,NSW,6751.3696,2010-01-01 02:00:00
5,NSW,6484.270789,2010-01-01 02:30:00
6,NSW,6304.12974,2010-01-01 03:00:00
7,NSW,6158.749744,2010-01-01 03:30:00
8,NSW,6104.542405,2010-01-01 04:00:00
9,NSW,6021.6175,2010-01-01 04:30:00


In [6]:
# Knowing that Total Demand has the most rows, we will merge on that
# DF STATE Date rows - 418,012
# Temp STATE Date rows - 778,177
# TD STATE Date rows - 786,051

filename = 'combined'
combined = demand.merge(temp, left_on=['STATE','DATETIME'], right_on = ['STATE','DATETIME'],how = 'outer')
combined_fdavg = combined.merge(fdemand, left_on=['STATE','DATETIME'], right_on = ['STATE','DATETIME'],how = 'outer')

# Based on the Climate Glossary - http://www.bom.gov.au/climate/glossary/seasons.shtml the seasons are broken down as
# follows: 9,10,11 are Spring, 12,1,2 are Summer, 3,4,5 are Autumn and 6,7,8 are Winter

def Season(item):
    if item == 12 or item == 1 or item == 2:
        return 'Summer'
    elif item == 3 or item == 4 or item == 5:
        return 'Autumn'
    elif item == 6 or item == 7 or item == 8:
        return 'Winter'
    else:
        return 'Spring'
    

combined['DATETIME'] = pd.to_datetime(combined['DATETIME'])
combined['Weekday'] = combined['DATETIME'].dt.day_name()
combined['Quarter'] = combined['DATETIME'].dt.quarter
combined['Month'] = combined['DATETIME'].dt.month
combined['Season'] = combined['Month'].apply(Season)

combined_fdavg['DATETIME'] = pd.to_datetime(combined_fdavg['DATETIME'])
combined_fdavg['Weekday'] = combined_fdavg['DATETIME'].dt.day_name()
combined_fdavg['Quarter'] = combined_fdavg['DATETIME'].dt.quarter
combined_fdavg['Month'] = combined_fdavg['DATETIME'].dt.month
combined_fdavg['Season'] = combined_fdavg['Month'].apply(Season)



combined.to_csv(outpath+filename+ext)
combined_fdavg.to_csv(outpath+filename+'_fd_Avg'+ext)
combined_fdavg.head(15)

Unnamed: 0,STATE,DATETIME,TOTALDEMAND,LOCATION,TEMPERATURE,FORECASTDEMAND,Weekday,Quarter,Month,Season
0,NSW,2010-01-01 00:00:00,8038.0,Bankstown,23.1,,Friday,1,1,Summer
1,NSW,2010-01-01 00:30:00,7809.31,Bankstown,22.9,,Friday,1,1,Summer
2,NSW,2010-01-01 01:00:00,7483.69,Bankstown,22.6,,Friday,1,1,Summer
3,NSW,2010-01-01 01:30:00,7117.23,Bankstown,22.5,,Friday,1,1,Summer
4,NSW,2010-01-01 02:00:00,6812.03,Bankstown,22.5,,Friday,1,1,Summer
5,NSW,2010-01-01 02:30:00,6544.33,Bankstown,22.4,,Friday,1,1,Summer
6,NSW,2010-01-01 03:00:00,6377.32,Bankstown,22.3,,Friday,1,1,Summer
7,NSW,2010-01-01 03:30:00,6282.85,Bankstown,22.3,,Friday,1,1,Summer
8,NSW,2010-01-01 04:00:00,6211.49,Bankstown,22.1,,Friday,1,1,Summer
9,NSW,2010-01-01 04:30:00,6248.31,Bankstown,22.2,,Friday,1,1,Summer


In [7]:
combined.head(50)

Unnamed: 0,STATE,DATETIME,TOTALDEMAND,LOCATION,TEMPERATURE,Weekday,Quarter,Month,Season
0,NSW,2010-01-01 00:00:00,8038.0,Bankstown,23.1,Friday,1,1,Summer
1,NSW,2010-01-01 00:30:00,7809.31,Bankstown,22.9,Friday,1,1,Summer
2,NSW,2010-01-01 01:00:00,7483.69,Bankstown,22.6,Friday,1,1,Summer
3,NSW,2010-01-01 01:30:00,7117.23,Bankstown,22.5,Friday,1,1,Summer
4,NSW,2010-01-01 02:00:00,6812.03,Bankstown,22.5,Friday,1,1,Summer
5,NSW,2010-01-01 02:30:00,6544.33,Bankstown,22.4,Friday,1,1,Summer
6,NSW,2010-01-01 03:00:00,6377.32,Bankstown,22.3,Friday,1,1,Summer
7,NSW,2010-01-01 03:30:00,6282.85,Bankstown,22.3,Friday,1,1,Summer
8,NSW,2010-01-01 04:00:00,6211.49,Bankstown,22.1,Friday,1,1,Summer
9,NSW,2010-01-01 04:30:00,6248.31,Bankstown,22.2,Friday,1,1,Summer


In [9]:
combined.isnull().sum(axis = 0)

STATE               0
DATETIME            0
TOTALDEMAND    386506
LOCATION       394393
TEMPERATURE    394393
Weekday             0
Quarter             0
Month               0
Season              0
dtype: int64

In [12]:
combined.shape

(1172570, 9)

In [None]:
# https://meteostat.net/en/blog/obtain-weather-data-any-location-python

# Can be potentially used to fill any missing temps