In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the hourly price spreadsheets, one Excel file per year, into a dict
# keyed by year.

start = 2013
stop = 2021

# `stop` is inclusive, hence the +1.
years = range(start, stop+1)
str_form = 'xlsx/elspot-prices_{}_hourly_eur.xlsx'

# The year is substituted into the file-name template to get each path.
files = {year: pd.read_excel(str_form.format(year)) for year in years}

In [3]:
# Find the columns that are present in every yearly sheet, so the sheets can
# later be reduced to a common set of columns.
#
# To inspect the raw sheets while exploring, display `files[year].head()` for
# the year of interest in a scratch cell.

# One set of column labels per year.
column_sets = [set(frame.columns) for frame in files.values()]

# Intersect all sets in one call. `prices` stays None when no files were
# loaded, which is what the previous flag-based loop produced in that case.
prices = set.intersection(*column_sets) if column_sets else None



In [4]:
# Keep only the columns that are present in all the files.

for year, frame in files.items():
    # Collect every column of this year that is missing from the shared set
    # and remove them in a single drop, instead of deleting one at a time.
    extra_columns = [col for col in frame.columns if col not in prices]
    # Rebinding an existing key does not resize the dict, so this is safe
    # while iterating over it.
    files[year] = frame.drop(columns=extra_columns)


In [5]:
# Stack the yearly frames into one joint DataFrame, in the order the years
# were inserted into `files`.
#
# A single concat call copies each yearly frame once; concatenating inside a
# loop would re-copy the accumulated frame for every additional year.
#
# NOTE: each year's own row index is kept, so index labels repeat from year to
# year. Use positional access (.iloc) rather than label lookups on this frame.

all_prices = pd.concat(list(files.values()))


all_prices.head()

Unnamed: 0.1,Unnamed: 0,Hours,SYS,SE1,SE2,SE3,SE4,FI,DK1,DK2,Oslo,Kr.sand,Bergen,Molde,Tr.heim,Tromsø,EE,LV,LT
0,2013-01-01,00 - 01,31.05,31.04,31.04,31.04,31.04,31.04,14.03,14.03,32.98,32.98,32.98,31.04,31.04,31.04,31.12,,24.42
1,2013-01-01,01 - 02,30.47,27.51,27.51,27.51,27.51,27.51,11.06,11.06,32.97,32.97,32.97,30.81,30.81,30.81,30.61,,23.62
2,2013-01-01,02 - 03,28.92,24.44,24.44,24.44,24.44,24.44,8.5,8.5,32.59,32.59,32.59,30.77,30.77,30.77,24.44,,23.93
3,2013-01-01,03 - 04,27.88,21.81,21.81,21.81,21.81,21.81,0.1,0.1,31.53,31.53,31.53,30.71,30.71,30.71,21.81,,23.85
4,2013-01-01,04 - 05,26.96,22.37,22.37,22.37,22.37,22.37,2.01,2.01,30.54,30.54,30.54,30.63,30.63,30.63,22.37,,23.26


In [6]:
# Sanity check for merging the date column and the hour column into a single
# 'YYYY-MM-DD HH:00' string, tried on one arbitrary row.

# Per an earlier type check, the values in 'Unnamed: 0' are pandas timestamps,
# so they have to be converted to text before slicing out the date part.
sample_row = 1354
sample_date = str(all_prices['Unnamed: 0'].iloc[sample_row])[:10]
# The first two characters of the 'Hours' value are used as the hour.
sample_hour = all_prices['Hours'].iloc[sample_row][:2]
print(sample_date + ' ' + sample_hour + ':' + '00')

2013-02-26 10:00


In [7]:
from datetime import datetime  # not used in this cell; kept in case later cells rely on it

# Combine the date column and the hour column into one datetime column.

# 'datestring' is '<date> <HH>': the date column rendered as text, a space,
# and the first two characters of the 'Hours' value (e.g. '00 - 01' -> '00').
# The .str accessor slices the whole column at once instead of calling a
# Python lambda per row.
all_prices['datestring'] = all_prices['Unnamed: 0'].astype(str) + ' ' + all_prices['Hours'].str[:2]

# Parse the whole column in one vectorised call with an explicit format,
# rather than invoking pd.to_datetime once per row.
all_prices['datetime'] = pd.to_datetime(all_prices['datestring'], format='%Y-%m-%d %H')

# Show a bit more than one full day to check the roll-over to the next date.
all_prices['datetime'].head(25)

0    2013-01-01 00:00:00
1    2013-01-01 01:00:00
2    2013-01-01 02:00:00
3    2013-01-01 03:00:00
4    2013-01-01 04:00:00
5    2013-01-01 05:00:00
6    2013-01-01 06:00:00
7    2013-01-01 07:00:00
8    2013-01-01 08:00:00
9    2013-01-01 09:00:00
10   2013-01-01 10:00:00
11   2013-01-01 11:00:00
12   2013-01-01 12:00:00
13   2013-01-01 13:00:00
14   2013-01-01 14:00:00
15   2013-01-01 15:00:00
16   2013-01-01 16:00:00
17   2013-01-01 17:00:00
18   2013-01-01 18:00:00
19   2013-01-01 19:00:00
20   2013-01-01 20:00:00
21   2013-01-01 21:00:00
22   2013-01-01 22:00:00
23   2013-01-01 23:00:00
24   2013-01-02 00:00:00
Name: datetime, dtype: datetime64[ns]

In [8]:
# Remove price areas with many missing (N/A) prices.
# To inspect per-column value counts, run `all_prices.describe()` as the last
# expression of a separate cell so that its result is actually displayed.

areas = ['SYS', 'SE1', 'SE2', 'SE3', 'SE4', 'FI', 'DK1',
       'DK2', 'Oslo', 'Kr.sand', 'Bergen', 'Molde', 'Tr.heim', 'Tromsø', 'EE', 'LV', 'LT']

# These have approx. 59 000 to 63 000 values and are suggested removed.
# They are Estonia, Latvia and Lithuania, which are only connected to Sweden
# and Finland.
areas_with_missing_values = ['EE', 'LV', 'LT']

# errors='ignore' makes the cell safe to re-run: columns that were already
# dropped are skipped instead of raising a KeyError.
all_prices = all_prices.drop(columns=areas_with_missing_values, errors='ignore')


In [9]:
# Display the column labels currently present in the joint price frame.
all_prices.columns

Index(['Unnamed: 0', 'Hours', 'SYS', 'SE1', 'SE2', 'SE3', 'SE4', 'FI', 'DK1',
       'DK2', 'Oslo', 'Kr.sand', 'Bergen', 'Molde', 'Tr.heim', 'Tromsø',
       'datestring', 'datetime'],
      dtype='object')

In [10]:
# Covariance matrix between the price columns of the listed areas.
price_columns = [
    'SYS', 'SE1', 'SE2', 'SE3', 'SE4', 'FI', 'DK1',
    'DK2', 'Oslo', 'Kr.sand', 'Bergen', 'Molde', 'Tr.heim', 'Tromsø',
]
all_prices[price_columns].cov()

Unnamed: 0,SYS,SE1,SE2,SE3,SE4,FI,DK1,DK2,Oslo,Kr.sand,Bergen,Molde,Tr.heim,Tromsø
SYS,447.6536,261.394881,261.584704,495.00209,539.362448,496.701733,551.441492,564.2789,491.448141,485.314362,485.374694,257.007096,257.007096,239.250807
SE1,261.394881,260.946451,261.076276,287.018762,287.265975,286.610445,261.728773,280.802136,249.698535,240.883127,242.429159,237.600076,237.600076,214.800517
SE2,261.584704,261.076276,261.481942,287.448662,287.846605,287.026358,262.364073,281.425099,249.855243,241.041502,242.588167,237.991413,237.991413,214.730303
SE3,495.00209,287.018762,287.448662,675.563412,709.726313,672.276415,680.810084,726.82947,539.594057,526.916391,528.524002,258.796113,258.796113,236.416555
SE4,539.362448,287.265975,287.846605,709.726313,856.011428,717.76204,830.866323,884.570398,602.119386,593.740107,591.189827,257.814045,257.814045,229.020318
FI,496.701733,286.610445,287.026358,672.276415,717.76204,831.056668,682.416559,738.449897,525.540575,514.379457,515.217565,253.721442,253.721442,228.655428
DK1,551.441492,261.728773,262.364073,680.810084,830.866323,682.416559,1193.079773,942.823036,646.464476,648.32622,640.968633,234.20279,234.20279,203.065252
DK2,564.2789,280.802136,281.425099,726.82947,884.570398,738.449897,942.823036,1002.928795,641.514733,634.366625,630.590764,248.958546,248.958546,217.167979
Oslo,491.448141,249.698535,249.855243,539.594057,602.119386,525.540575,646.464476,641.514733,615.150499,603.637476,604.790885,250.721529,250.721529,233.607546
Kr.sand,485.314362,240.883127,241.041502,526.916391,593.740107,514.379457,648.32622,634.366625,603.637476,603.83155,600.016271,242.261711,242.261711,224.856766


In [11]:
# Extract the combined timestamp column. Per the author's note, these dates
# are intended for use with the hydro reservoir levels.

# Despite the `_df` suffix this is a single column (a Series), not a DataFrame.
dates_df = all_prices['datetime']
dates_df

0      2013-01-01 00:00:00
1      2013-01-01 01:00:00
2      2013-01-01 02:00:00
3      2013-01-01 03:00:00
4      2013-01-01 04:00:00
               ...        
8756   2021-12-31 19:00:00
8757   2021-12-31 20:00:00
8758   2021-12-31 21:00:00
8759   2021-12-31 22:00:00
8760   2021-12-31 23:00:00
Name: datetime, Length: 78897, dtype: datetime64[ns]

In [12]:
# Write the timestamp column to 'dates.csv' in the working directory.
dates_df.to_csv('dates.csv')

In [13]:
# Spot-check one parsed timestamp, selected by row position (not index label).
print(all_prices['datetime'].iloc[11242])

2014-04-14 09:00:00


In [14]:
# Write the joint price frame to 'area_prices.csv' in the working directory.
all_prices.to_csv('area_prices.csv')