In [22]:
# One-time environment setup: uncomment the lines below to install the
# third-party packages this notebook imports.
#!pip3 install pandas
#!pip3 install seaborn
#!pip3 install holidays
#!pip3 install xgboost
#!pip3 install scikit-learn

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from datetime import date

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Current seaborn colour palette (presumably for later plotting cells; it is
# not referenced again in the cells shown here).
color_pal = sns.color_palette()
# Apply matplotlib's 'fivethirtyeight' style sheet to all subsequent figures.
plt.style.use('fivethirtyeight')

## Loading data

Loading the weather dataset

In [24]:
# Read the Graz weather export. header=9 tells pandas that the column names
# are on row 9 (0-based) of the file; the rows above it are not parsed as data.
graz_weather_df = pd.read_csv(
    "../data/graz_weather.csv",
    delimiter=',',
    header=9,
    encoding='UTF-8',
)

Keeping only the relevant weather data

In [25]:
# List the columns of the weather export before choosing which ones to keep.
print(graz_weather_df.columns)

Index(['timestamp', 'Graz Temperature [2 m elevation corrected]',
       'Graz Sunshine Duration', 'Graz Shortwave Radiation',
       'Graz Direct Shortwave Radiation', 'Graz Diffuse Shortwave Radiation',
       'Graz Precipitation Total', 'Graz Snowfall Amount',
       'Graz Relative Humidity [2 m]', 'Graz Cloud Cover Total',
       'Graz Wind Speed [10 m]', 'Graz Wind Direction [10 m]'],
      dtype='object')


In [26]:
# Keep the timestamp plus the temperature, shortwave-radiation and humidity
# columns; the remaining weather columns are discarded.
# .copy() makes the selection an independent frame: the 'timestamp' column is
# overwritten in a later cell, and assigning into a bare column selection
# raises SettingWithCopyWarning in pandas versions without copy-on-write.
graz_weather_df = graz_weather_df[[
    'timestamp',
    'Graz Temperature [2 m elevation corrected]',
    'Graz Shortwave Radiation',
    'Graz Direct Shortwave Radiation',
    'Graz Diffuse Shortwave Radiation',
    'Graz Relative Humidity [2 m]',
]].copy()

Converting the timestamp to the same format as that used in the energy datasets

In [27]:
def parse_timestamp(timestamp):
    """Reformat a compact weather timestamp as 'DD.MM.YY HH:MM'.

    The input is sliced by fixed character positions: the year occupies
    indices 0-3 (only its last two digits are used), the month 4-5, the day
    6-7, the character at index 8 is skipped, the hour is 9-10 and everything
    from index 11 onwards is taken as the minutes.

    Example: '20210305T1430' -> '05.03.21 14:30'.

    Parameters
    ----------
    timestamp : str
        Timestamp text laid out as described above.

    Returns
    -------
    str
        The same instant written as 'DD.MM.YY HH:MM'.
    """
    year_short = timestamp[2:4]
    month = timestamp[4:6]
    day = timestamp[6:8]
    hour = timestamp[9:11]
    minute = timestamp[11:]

    date_part = '.'.join((day, month, year_short))
    time_part = ':'.join((hour, minute))
    return date_part + ' ' + time_part

In [28]:
# Rewrite every weather timestamp into the 'DD.MM.YY HH:MM' text form produced
# by parse_timestamp, so it can be matched against the energy timestamps.
graz_weather_df['timestamp'] = graz_weather_df['timestamp'].apply(parse_timestamp)

Loading the energy usage datasets and combining them into one DataFrame


In [29]:
# English column names used for the value column of each energy report.
# These strings become the column headers of the combined frame and of the
# CSV written at the end of the notebook.
# NOTE(review): the two 'fbh_*' names carry a '_df' suffix, unlike the others —
# confirm whether that is intended before relying on these headers downstream.
electricity_usage = 'electricity_usage'
fbh_kalte = 'fbh_kalte_df'
fbh_warme = 'fbh_warme_df'
fernwarme = 'district_heating'
warm_wasser = 'water_heating'
luftung_kalte = 'vent_cooling'
luftung_warme = 'vent_heating'
pv = 'pv_production'
turnsaal_warme = 'gym_heating'

In [30]:
def load_energy_report(filename, value_column, data_dir="../data/"):
    """Read one semicolon-separated energy report export into a DataFrame.

    All nine reports share the same layout, so they are read with identical
    options: the first 9 lines of the file are skipped, fields are split on
    ';', and the four parsed columns are named 'timestamp', `value_column`,
    '' and ' ' (the last two are placeholders that are not used afterwards).
    Lines that pandas cannot parse (too many fields) are skipped silently
    because of on_bad_lines='skip'.

    Parameters
    ----------
    filename : str
        File name of the report inside `data_dir`.
    value_column : str
        Name to give the report's value column.
    data_dir : str
        Directory prefix (including the trailing slash) holding the reports.

    Returns
    -------
    pandas.DataFrame
        The raw report; the trailing non-data rows are still present.
    """
    # NOTE(review): 'unicode_escape' is an unusual codec for a CSV export —
    # confirm the files' real encoding (possibly latin-1 / cp1252).
    return pd.read_csv(
        data_dir + filename,
        delimiter=';',
        names=['timestamp', value_column, '', ' '],
        skiprows=9,
        encoding='unicode_escape',
        on_bad_lines='skip',
    )


electricity_usage_df = load_energy_report("WAAGNER-BIRO-STRASSE-99--8020-GRAZ-VSLEOPOLDINUM_Wertebericht_220801091637.csv", electricity_usage)
fbh_kalte_df = load_energy_report("WKZFBHKälte_Wertebericht_220801091739.csv", fbh_kalte)
fbh_warme_df = load_energy_report("WKZFBHWärme_Wertebericht_220801091812.csv", fbh_warme)
fernwarme_warme_df = load_energy_report("WMZFernwärmeWärme_Wertebericht_220801091521.csv", fernwarme)
warm_wasser_df = load_energy_report("WMZWarmwasserWärme_Wertebericht_220801092011.csv", warm_wasser)
luftung_kalte_df = load_energy_report("WKZLüftungKälte_Wertebericht_220801091709.csv", luftung_kalte)
luftung_warme_df = load_energy_report("WKZLüftungWärme_Wertebericht_220801091924.csv", luftung_warme)
pv_df = load_energy_report("WAAGNER-BIRO-STRASSE-99--8020-GRAZ-VSLEOPOLDINUM-PV_Wertebericht_220801092034.csv", pv)
turnsaal_warme_df = load_energy_report("WMZTurnsaalWärme_Wertebericht_220801091948.csv", turnsaal_warme)


In [31]:
# The last n=9 rows of every report are not part of the dataset, so they are
# removed from each frame in place.
n = 9
for report_df in (
    electricity_usage_df,
    fbh_kalte_df,
    fbh_warme_df,
    fernwarme_warme_df,
    warm_wasser_df,
    luftung_kalte_df,
    luftung_warme_df,
    pv_df,
    turnsaal_warme_df,
):
    trailing_labels = report_df.tail(n).index
    report_df.drop(trailing_labels, inplace=True)

Removing unneeded DataFrame columns

In [32]:
# English column names used for the value column of each energy report.
# NOTE(review): this cell repeats, value for value, the assignments made in the
# cell that precedes the CSV loading; on a top-to-bottom run it is redundant.
electricity_usage = 'electricity_usage'
fbh_kalte = 'fbh_kalte_df'
fbh_warme = 'fbh_warme_df'
fernwarme = 'district_heating'
warm_wasser = 'water_heating'
luftung_kalte = 'vent_cooling'
luftung_warme = 'vent_heating'
pv = 'pv_production'
turnsaal_warme = 'gym_heating'

In [33]:
# Keep only the timestamp and the value column of each report; the two
# placeholder columns ('' and ' ') named when the CSVs were read are discarded.
electricity_usage_df = electricity_usage_df[['timestamp',electricity_usage]]
fbh_kalte_df = fbh_kalte_df[['timestamp',fbh_kalte]]
fbh_warme_df = fbh_warme_df[['timestamp',fbh_warme]]
fernwarme_warme_df = fernwarme_warme_df[['timestamp',fernwarme]]
warm_wasser_df = warm_wasser_df[['timestamp',warm_wasser]]
luftung_kalte_df = luftung_kalte_df[['timestamp',luftung_kalte]]
luftung_warme_df = luftung_warme_df[['timestamp',luftung_warme]]
pv_df = pv_df[['timestamp',pv]]
turnsaal_warme_df = turnsaal_warme_df[['timestamp',turnsaal_warme]]

In [34]:
def decimal_comma_to_float(values):
    """Convert a Series of decimal-comma strings (e.g. '12,5') to numbers.

    Parameters
    ----------
    values : pandas.Series
        String-typed Series whose entries use ',' as the decimal separator.

    Returns
    -------
    pandas.Series
        The parsed numeric values, as returned by pd.to_numeric.

    Raises
    ------
    ValueError
        If an entry still cannot be parsed as a number after the replacement
        (pd.to_numeric is called with its default errors='raise').
    """
    # regex=False: ',' and '.' are plain characters here, not patterns.
    return pd.to_numeric(values.str.replace(',', '.', regex=False))


# (column name, source frame) for every energy report, in output column order.
energy_reports = [
    (electricity_usage, electricity_usage_df),
    (fbh_kalte, fbh_kalte_df),
    (fbh_warme, fbh_warme_df),
    (fernwarme, fernwarme_warme_df),
    (warm_wasser, warm_wasser_df),
    (luftung_kalte, luftung_kalte_df),
    (luftung_warme, luftung_warme_df),
    (pv, pv_df),
    (turnsaal_warme, turnsaal_warme_df),
]

# The combined frame takes its timestamps (and row index) from the electricity
# report; each value column is then attached by row index.
# NOTE(review): the reports are combined by row index, not by timestamp value —
# this assumes every report lists the same timestamps in the same order. Confirm.
df = pd.DataFrame([])
df['timestamp'] = electricity_usage_df['timestamp']
for column_name, report_df in energy_reports:
    df[column_name] = decimal_comma_to_float(report_df[column_name])

In [35]:
# Inner-join the energy and weather rows. No `on=` is given, so pandas joins on
# the columns both frames share — here that is 'timestamp'. Rows without a
# match in the other frame are dropped.
df = pd.merge(df,graz_weather_df,how='inner')

The get_date() and get_time() functions defined below are used to convert the timestamp into more meaningful date and time objects

In [36]:
def get_date(date_and_time):
    """Extract the calendar date from a 'DD.MM.YY HH:MM' timestamp string.

    The text before the first space is split on '.' into day, month and
    two-digit year; the year is prefixed with '20' and the result is parsed
    as an ISO date.

    Parameters
    ----------
    date_and_time : str
        Timestamp such as '05.03.21 14:30'.

    Returns
    -------
    datetime.date
        The date part of the timestamp, in the 2000s.

    Raises
    ------
    IndexError
        If the date part has fewer than three '.'-separated components.
    ValueError
        If the components do not form a valid ISO date.
    """
    components = date_and_time.split(' ')[0].split('.')
    iso_text = '20' + components[2] + '-' + components[1] + '-' + components[0]
    return date.fromisoformat(iso_text)

def get_time(date_and_time):
    """Return the time-of-day text from a 'DD.MM.YY HH:MM' timestamp string.

    Parameters
    ----------
    date_and_time : str
        Timestamp such as '05.03.21 14:30'.

    Returns
    -------
    str
        The second space-separated field, e.g. '14:30'.

    Raises
    ------
    IndexError
        If the input contains no space.
    """
    fields = date_and_time.split(' ')
    return fields[1]

### Removing Na values
Here we drop every row that has a missing value in any column.
To keep more records, we should first decide which features we will use and only then drop the rows that are missing values in those columns.
Also, it is not yet clear whether this step should instead come after the train/test split.

In [37]:
# Drop every row that has a missing value in any column.
# To keep more records, first decide which features will be used and only then
# drop the rows that are missing values in those columns.
# TODO: decide whether this step should instead come after the train/test split.
df = df.dropna()

In [38]:
# Disabled: would index the frame by timestamp.
# NOTE(review): as written the call discards its result (set_index returns a
# new frame by default), so re-enabling it unchanged would not alter df.
#df.set_index('timestamp')

In [39]:

# Write the merged energy + weather table to CSV, omitting the row index.
df.to_csv('../data/energy_and_weather.csv',index=False)