In [1]:
#!pip3 install pandas
#!pip3 install seaborn
#!pip3 install holidays
#!pip3 install xgboost
#!pip3 install holidays
#!pip3 install sklearn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
import datetime

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

holidays_AT = holidays.country_holidays('AT')

## Loading data

Loading the weather dataset

In [3]:
graz_weather_df = pd.read_csv("../../data/graz_weather.csv",delimiter=',',header=9,encoding='UTF-8')

Keeping only the relative weather data

In [4]:
print(graz_weather_df.columns)

Index(['timestamp', 'Graz Temperature [2 m elevation corrected]',
       'Graz Sunshine Duration', 'Graz Shortwave Radiation',
       'Graz Direct Shortwave Radiation', 'Graz Diffuse Shortwave Radiation',
       'Graz Precipitation Total', 'Graz Snowfall Amount',
       'Graz Relative Humidity [2 m]', 'Graz Cloud Cover Total',
       'Graz Wind Speed [10 m]', 'Graz Wind Direction [10 m]'],
      dtype='object')


In [5]:
graz_weather_df = graz_weather_df[['timestamp', 'Graz Temperature [2 m elevation corrected]','Graz Shortwave Radiation',
'Graz Direct Shortwave Radiation', 'Graz Diffuse Shortwave Radiation','Graz Relative Humidity [2 m]']]

Converting the timestamp to the same format as that used in the energy datasets

In [6]:
def parse_weather_timestamp(timestamp):
    return(timestamp[6:8]+ '.' + timestamp[4:6] + '.' + timestamp[2:4] + ' ' + timestamp[9:11] + ':' + timestamp[11:])

In [7]:
def timestamp_to_datetime(timestamp):
    date_str = timestamp.split(' ')[0]
    time_str = timestamp.split(' ')[1]
    datetime_obj = datetime.datetime(int('20'+date_str.split('.')[2]), int(date_str.split('.')[1] ), int(date_str.split('.')[0]),int(time_str[:2]), int(time_str[-2:]))
    return datetime_obj
    

In [8]:
graz_weather_df

Unnamed: 0,timestamp,Graz Temperature [2 m elevation corrected],Graz Shortwave Radiation,Graz Direct Shortwave Radiation,Graz Diffuse Shortwave Radiation,Graz Relative Humidity [2 m]
0,20180101T0000,4.321370,0.00,0.000000,0.000000,73.0
1,20180101T0100,4.261370,0.00,0.000000,0.000000,65.0
2,20180101T0200,3.101370,0.00,0.000000,0.000000,69.0
3,20180101T0300,3.801370,0.00,0.000000,0.000000,66.0
4,20180101T0400,4.311370,0.00,0.000000,0.000000,67.0
...,...,...,...,...,...,...
41635,20221001T1900,18.801370,18.69,9.772658,8.917342,60.0
41636,20221001T2000,18.121370,0.00,0.000000,0.000000,66.0
41637,20221001T2100,16.971370,0.00,0.000000,0.000000,68.0
41638,20221001T2200,15.491369,0.00,0.000000,0.000000,66.0


In [9]:
graz_weather_df['timestamp'] = graz_weather_df['timestamp'].apply(parse_weather_timestamp).apply(timestamp_to_datetime)

In [10]:
#graz_weather_df.index = graz_weather_df['timestamp']

Loading the Energy Usage datasets and combining them into 1 dataframe


In [11]:
#Coloumn names english translation:
fernwarme = 'district_heating'


In [12]:
fernwarme_df = pd.read_csv("../../data/WMZFernwärmeWärme_Wertebericht_220801091521.csv",delimiter=';',names=['timestamp',fernwarme,'',' '], skiprows=9,encoding='unicode_escape',on_bad_lines='skip')


In [13]:
#Dropping the last n=9 rows as they are not part of the dataset
n = 9
fernwarme_df.drop(fernwarme_df.tail(n).index, inplace = True)


Removing bad dataframe coloumns

In [14]:
fernwarme_df = fernwarme_df[['timestamp',fernwarme]]

In [15]:
df = pd.DataFrame([])
df['timestamp'] = fernwarme_df['timestamp']
df['timestamp'] = df['timestamp'].apply(timestamp_to_datetime)
df[fernwarme] = pd.to_numeric(fernwarme_df[fernwarme].str.replace(',','.'))

In [16]:
graz_weather_df = graz_weather_df.set_index(graz_weather_df['timestamp'])
df = df.set_index(df['timestamp'])

In [17]:
graz_weather_df_upsampled = graz_weather_df[['Graz Temperature [2 m elevation corrected]',
       'Graz Shortwave Radiation', 'Graz Direct Shortwave Radiation',
       'Graz Diffuse Shortwave Radiation', 'Graz Relative Humidity [2 m]']].resample('15T').interpolate()
#graz_weather_df_upsampled['timestamp'] = graz_weather_df_upsampled.index.to_series()

### Interpolating weather data to be sampled every 15 mins

In [18]:
'''
It might make sense to interpolate weather data such that we have a reading for every 15 mins same as the energy data
a good source might be: https://www.numpyninja.com/post/interpolation-using-pandas 
'''
graz_weather_df_upsampled.reset_index(drop = True, inplace = True)
df.reset_index(drop = True, inplace = True)

df = pd.merge(df,graz_weather_df_upsampled,how='inner',left_index=True, right_index=True)

In [19]:
df = df.dropna()

In [20]:
df

Unnamed: 0,timestamp,district_heating,Graz Temperature [2 m elevation corrected],Graz Shortwave Radiation,Graz Direct Shortwave Radiation,Graz Diffuse Shortwave Radiation,Graz Relative Humidity [2 m]
0,2021-06-14 00:15:00,0.0,4.321370,0.000000,0.00000,0.000000,73.00
1,2021-06-14 00:30:00,0.0,4.306370,0.000000,0.00000,0.000000,71.00
2,2021-06-14 00:45:00,0.0,4.291370,0.000000,0.00000,0.000000,69.00
3,2021-06-14 01:00:00,0.0,4.276370,0.000000,0.00000,0.000000,67.00
4,2021-06-14 01:15:00,0.0,4.261370,0.000000,0.00000,0.000000,65.00
...,...,...,...,...,...,...,...
39614,2022-07-31 15:45:00,0.0,13.126369,448.114990,266.19872,181.916270,39.00
39615,2022-07-31 16:00:00,0.0,13.188869,426.532485,251.79243,174.740055,39.00
39616,2022-07-31 16:15:00,0.0,13.251369,404.949980,237.38614,167.563840,39.00
39617,2022-07-31 16:30:00,0.0,13.178869,373.577485,217.52228,156.055203,39.25


### Accounting for holidays and weekends

In [21]:
df.index = df['timestamp']

In [22]:
def get_is_schoolday(date_arg):
    '''
    arg: datetime object
    returns 1: if it is a school day
    returns 0: if date is either in the weekend, a public holiday or during school break in styria
    '''
    #initilising to bad dates
    semester_break_start = datetime.date(1999,1,1)
    semester_break_end = datetime.date(1999,1,1)
    easter_start = datetime.date(1999,1,1)
    easter_end = datetime.date(1999,1,1)
    pentecost_start = datetime.date(1999,1,1)
    pentecost_end = datetime.date(1999,1,1)
    summer_start = datetime.date(1999,1,1)
    summer_end = datetime.date(1999,1,1)
    autumn_start = datetime.date(1999,1,1)
    autumn_end = datetime.date(1999,1,1)
    christmas_start = datetime.date(1999,1,1)
    christmas_end = datetime.date(1999,1,1)

    if date_arg.year == 2021:
        semester_break_start = datetime.date(2021,2,15)
        semester_break_end = datetime.date(2021,2,21)
        easter_start = datetime.date(2021,3,27)
        easter_end = datetime.date(2021,4,5)
        pentecost_start = datetime.date(2021,5,22)
        pentecost_end = datetime.date(2021,5,24)
        summer_start = datetime.date(2021,7,10)
        summer_end = datetime.date(2021,9,12)
        autumn_start = datetime.date(2021,10,27)
        autumn_end = datetime.date(2021,10,31)
        christmas_start = datetime.date(2021,12,24)
        christmas_end = datetime.date(2022,1,6)
       
    elif date_arg.year == 2022:
        semester_break_start = datetime.date(2022,2,21)
        semester_break_end = datetime.date(2022,2,21)
        easter_start = datetime.date(2022,4,9)
        easter_end = datetime.date(2022,4,18)
        pentecost_start = datetime.date(2022,6,4)
        pentecost_end = datetime.date(2022,6,6)
        summer_start = datetime.date(2022,7,9)
        summer_end = datetime.date(2022,9,11)
        autumn_start = datetime.date(2022,10,27)
        autumn_end = datetime.date(2022,10,31)
        christmas_start = datetime.date(2022,12,24)
        christmas_end = datetime.date(2023,1,7)

    
    if semester_break_start <= date_arg.date() <= semester_break_end:
        return 0
    elif easter_start <= date_arg.date() <= easter_end:
        return 0
    elif pentecost_start <= date_arg.date() <= pentecost_end:
        return 0
    elif summer_start <= date_arg.date() <=  summer_end:
        return 0
    elif autumn_start <= date_arg.date() <= autumn_end:
        return 0
    elif christmas_start <= date_arg.date() <= christmas_end:
        return 0
    elif date_arg.date() in holidays_AT:
        return 0
    elif 5 <= date_arg.weekday() <= 6:
        
        return 0

    else:
        return 1
    

In [23]:
df['is_schoolday'] = df['timestamp'].apply(get_is_schoolday)

In [24]:
df['day'] = [x.day for x in df['timestamp']]
df['month'] = [x.month for x in df['timestamp']]
df['year'] = [x.year for x in df['timestamp']]
df['hour'] = [x.hour for x in df['timestamp']]
#minute not imp if we're sampling hourly
#df['minute'] = [x.minute for x in df['timestamp']] 

In [25]:
df.to_csv('../../data/preprocessed/fernwarme_weather.csv',index=False)