# Prepare data from pse.pl for use in ML

In [37]:
import os
import pandas as pd
import numpy as np
import time
from datetime import datetime
from datetime import timedelta
import itertools

import warnings

In [38]:
# Notebook-wide pandas display settings: no column or cell-width truncation, up to 300 rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 300),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# Directory holding both the raw input parquet files and the prepared ml_* outputs.
PATH_TGE='../../data/tge/'

# Read data

In [356]:
%%time
# Load every raw parquet input from PATH_TGE; each frame is cleaned in its own
# "Prepare ..." section below. Note that weather_ml.parquet is read again at the
# start of the weather section.
df_tge=pd.read_parquet(f'{PATH_TGE}tge_full.parquet')
df_b_moc=pd.read_parquet(f'{PATH_TGE}b_moc.parquet')
df_g_moc=pd.read_parquet(f'{PATH_TGE}g_moc.parquet')
df_g_wiatr=pd.read_parquet(f'{PATH_TGE}g_wiatr.parquet')
df_i_net=pd.read_parquet(f'{PATH_TGE}i_net.parquet')
df_w_kse=pd.read_parquet(f'{PATH_TGE}w_kse.parquet')
df_w_ubytki=pd.read_parquet(f'{PATH_TGE}w_ubytki.parquet')
df_w_wym=pd.read_parquet(f'{PATH_TGE}w_wym.parquet')
df_weather_ml=pd.read_parquet(f'{PATH_TGE}weather_ml.parquet')
df_z_kse=pd.read_parquet(f'{PATH_TGE}z_kse.parquet')

CPU times: total: 1.97 s
Wall time: 1.09 s


# Helper functions

In [322]:
def create_datetime(d,f,h=0):
    """Parse a date value and optionally shift it to a 1-based hour of the day.

    Parameters
    ----------
    d : value whose ``str()`` form matches ``f``.
    f : ``datetime.strptime`` format string.
    h : hour number. ``0`` (the default) returns the parsed datetime unchanged;
        any other value adds ``h - 1`` hours, so hour 1 maps to 00:00.

    Returns
    -------
    datetime.datetime
    """
    parsed = datetime.strptime(str(d), f)
    # h == 0 acts as the "no hour supplied" sentinel, so no offset is applied.
    return parsed if h == 0 else parsed + timedelta(hours=(h-1))

In [323]:
def info_dataset(df):
    """Print the frame's shape and the smallest and largest index values.

    Parameters
    ----------
    df : pandas.DataFrame whose index supports ``min()`` / ``max()``.

    Returns
    -------
    None; the three summary lines are written to stdout.
    """
    idx = df.index
    print('Count rows: {}'.format(df.shape))
    print('Min date: {}'.format(idx.min()))
    print('Max date: {}'.format(idx.max()))

In [324]:
def check_missing_datetime_index(df,resize='H'):
    """Report gaps in a datetime index at the given frequency.

    The frame is resampled to ``resize`` and averaged; if that produces a
    different number of index entries than ``df`` has, the index is reported
    as having gaps.

    Parameters
    ----------
    df : pandas.DataFrame indexed by datetimes.
    resize : resample frequency alias (default ``'H'``).

    Returns
    -------
    pandas.DataFrame or None
        When the sizes differ: the rows of the resampled means that contain
        at least one null value. Otherwise ``None``.
    """
    # Only numeric/bool columns can be averaged; text columns would make
    # mean() raise on pandas >= 2.0 (older versions dropped them silently).
    numeric_part=df.select_dtypes(include=['number','bool'])
    # Resample once and reuse the result for both the size test and the report.
    df_err=numeric_part.resample(resize).mean()
    if df.index.size != df_err.index.size:
        print("There are missing dates in the datetime index.")
        # `axis` must be passed by keyword: positional `any(1)` was removed in pandas 2.0.
        return df_err[df_err.isnull().any(axis=1)]
    print("There are no missing dates in the datetime index.")
    return None

In [355]:
def prepare_column(df,list_col):
    """Clean the 'Data'/'Godzina' columns and convert text columns to floats.

    Parameters
    ----------
    df : pandas.DataFrame with 'Godzina' and 'Data' columns plus the columns
        named in ``list_col`` stored as text with a decimal comma.
    list_col : iterable of column names to convert to float.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the rows whose
        'Godzina' equals '2A', with 'Godzina' as int, 'Data' as str, and each
        column of ``list_col`` as float with missing cells set to 0.
    """
    # Rows labelled '2A' are removed (presumably the duplicated hour of the
    # autumn DST switch - TODO confirm). The explicit copy makes the
    # assignments below target an independent frame instead of a slice of the
    # caller's data, which avoids SettingWithCopyWarning.
    df=df[df['Godzina']!='2A'].copy()
    df['Godzina']=df['Godzina'].astype(int)
    df['Data']=df['Data'].astype(str)
    for x in list_col:
        # Decimal comma -> dot. Series.replace matches whole cell values, so
        # only a cell that is exactly '-' becomes '0'; negative numbers keep
        # their sign.
        as_float=df[x].str.replace(',','.',regex=False).replace('-','0').astype(float)
        df[x]=np.where(df[x].isna(),0,as_float)
    return df

def prepare_index(df,date_format):
    """Index a frame by hourly timestamps and fill gaps in the hourly grid.

    Parameters
    ----------
    df : pandas.DataFrame with a 'Data' column (date matching ``date_format``)
        and a 'Godzina' column (hour number passed to ``create_datetime``).
    date_format : ``strptime`` format of the 'Data' values.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'date_hour', resampled to hourly means, with
        missing hours back-filled from the next available row. 'Data' and
        'Godzina' are removed. The input frame is left unchanged.
    """
    # Work on a copy so the caller's frame keeps its columns and index.
    df=df.copy()
    df['date_hour']=df.apply(lambda x:create_datetime(x.Data,date_format,x.Godzina), axis=1)
    df.index=df['date_hour']
    df=df.drop(['Data','Godzina','date_hour'],axis=1)
    # Suppress warnings only for the pandas calls below instead of installing
    # a process-wide "ignore" filter as a side effect of calling this function.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        df_res=check_missing_datetime_index(df,'H')
        # The checker returns None when there are no gaps; len(None) would raise.
        missing_rows=0 if df_res is None else len(df_res)
        print(f'count row with missing index: {missing_rows}')
        # .bfill() replaces the deprecated fillna(method='backfill').
        df=df.resample('H').mean().bfill()
    return df

In [163]:
#df_date=pd.DataFrame(pd.date_range(start='2016-01-01', end='2023-01-01', freq="1H"), columns=['date_time'])

# Prepare TGE

In [357]:
# A fixing_1_kurs of 0 is treated as a missing quote and replaced with the next
# non-zero value. .bfill() is the supported spelling of the deprecated
# fillna(method='backfill').
df_tge['fixing_1_kurs']=df_tge['fixing_1_kurs'].replace(0, np.nan).bfill()
df_tge.head()

Unnamed: 0_level_0,date,czas,fixing_1_kurs,fixing_1_wolumen,fixing_2_kurs,fixing_2_wolumen,notowania_kurs,notowania_wolumen,date_hour
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-06-28 00:00:00,2019-06-28,1,220.54,1754.3,226.53,1245.7,0.0,5.0,2019-06-28 00:00:00
2019-06-28 01:00:00,2019-06-28,2,203.85,1674.0,191.84,1277.3,0.0,0.0,2019-06-28 01:00:00
2019-06-28 02:00:00,2019-06-28,3,203.85,1786.2,201.12,1079.6,0.0,0.0,2019-06-28 02:00:00
2019-06-28 03:00:00,2019-06-28,4,203.85,1689.4,211.05,844.5,0.0,0.0,2019-06-28 03:00:00
2019-06-28 04:00:00,2019-06-28,5,203.85,1793.9,222.5,602.3,0.0,0.0,2019-06-28 04:00:00


In [358]:
# Print shape and index range of the TGE frame.
info_dataset(df_tge)

Count rows: (29328, 9)
Min date: 2019-06-28 00:00:00
Max date: 2022-10-31 23:00:00


# Prepare b_moc

In [388]:
# Text columns that are not measurements; they are excluded from the numeric parsing.
b_moc_label_cols=['Kwadrans szczytowy (typ)','Kwadrans szczytowy']

# Build the numeric table in a separate frame so the raw `df_b_moc` is left
# untouched and this cell can be re-run without reloading the data.
b_moc_num=df_b_moc.drop(b_moc_label_cols+['Data'],axis=1)
for x in b_moc_num.columns.to_list():
    # Decimal comma -> dot, then strip non-breaking spaces before the float cast.
    b_moc_num[x]=b_moc_num[x].str.replace(',','.',regex=False).str.replace(u'\xa0','',regex=False).astype(float)
b_moc_num['date_hour']=df_b_moc['Data'].map(lambda d:create_datetime(d,"%Y%m%d"))

# One row per date: rows sharing the same date are averaged. groupby() already
# yields a frame indexed by 'date_hour', so no reset_index/set-index round trip is needed.
dfbm=b_moc_num.groupby('date_hour').mean()
dfbm.head()

Unnamed: 0_level_0,Moc osi¹galna elektrowni krajowych,Elektrownie zawodowe,JWCD,pozosta³e,elektrownie przemys³owe,Ubytki mocy elektrowni przemys³owych,Ubytki mocy elektrowni zawodowych,spowodowane remontami kapitalnymi,JWCD.1,pozosta³e.1,spowodowane remontami rednimi,JWCD.2,pozosta³e.2,spowodowane remontami bie¿¹cymi,JWCD.3,pozosta³e.3,spowodowane remontami awaryjnymi,JWCD.4,pozosta³e.4,spowodowane warunkami eksploatacyjnymi,JWCD.5,pozosta³e.5,ze wzglêdu na ciep³ownictwo,JWCD.6,pozosta³e.6,na JW bedacych w okresie oswajania z inwestycji,JWCD.7,pozosta³e.7,Moc dyspozycyjna elektrowni krajowych,elektrownie zawodowe,JWCD.8,pozosta³e.8,elektrownie przemys³owe.1,Obci¹¿enie elektrowni krajowych,elektrownie zawodowe.1,JWCD.9,pozosta³e.9,elektrownie przemys³owe.2,Krajowe zapotrzebowanie na moc,Krajowe saldo wymiany miêdzysystemowej *,Ubytki mocy z uwagi na warunki pracy sieci,Rezerwa mocy w elektrowniach zawodowych:**,Rezerwa mocy w JWCD,JWCD cieplne:,rezerwa wiruj¹ca,rezerwa zimna,JWCD wodne,Rezerwa mocy pozosta³a
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
2016-01-01,39535.15,37084.35,24901.0,12183.35,2450.8,1144.2,8661.69,1351.9,1155.0,196.9,231.3,225.0,6.3,273.0,225.0,48.0,453.0,179.0,274.0,5444.06,570.0,4874.06,908.43,8.3,900.13,0.0,0.0,0.0,29733.56,28426.96,22542.1,5884.86,1306.6,16518.2,15211.6,9658.2,5553.4,1306.6,16662.526,150.05,0.0,13215.36,12883.9,11509.85,2258.85,9251.0,1374.05,331.46
2016-01-02,39535.15,37084.35,24901.0,12183.35,2450.8,1127.35,6707.09,1351.9,1155.0,196.9,231.3,225.0,6.3,273.0,225.0,48.0,485.3,205.0,280.3,3335.71,610.0,2725.71,1029.88,35.8,994.08,0.0,0.0,0.0,31716.41,30392.96,22452.85,7940.11,1323.45,19539.05,18215.6,10604.0,7611.6,1323.45,19683.1585,141.2,112.0,12065.36,11736.85,10338.5,2131.0,8207.5,1398.35,328.51
2016-01-03,39535.15,37084.35,24901.0,12183.35,2450.8,1188.8,9440.44,1351.9,1155.0,196.9,231.3,225.0,6.3,498.0,450.0,48.0,1288.4,931.3,357.1,5007.66,764.5,4243.16,1063.18,74.25,988.93,0.0,0.0,0.0,28940.06,27678.06,21334.45,6343.61,1262.0,20120.0,18858.0,12863.35,5994.65,1262.0,19792.17,-322.3,112.0,8708.06,8359.1,7188.4,340.6,6847.8,1170.7,348.96
2016-01-04,39535.15,37084.35,24901.0,12183.35,2450.8,1191.95,9150.74,1376.9,1180.0,196.9,231.3,225.0,6.3,273.0,225.0,48.0,1235.2,825.9,409.3,5049.36,862.05,4187.31,984.98,77.9,907.08,0.0,0.0,0.0,29217.41,27958.56,21528.15,6430.41,1258.85,24929.0,23670.15,17541.05,6129.1,1258.85,24590.0335,-348.35,112.0,4176.41,3875.1,2506.5,1081.2,1425.3,1368.6,301.31
2016-01-05,39535.15,37084.35,24901.0,12183.35,2450.8,1157.65,10948.54,1376.9,1180.0,196.9,231.3,225.0,6.3,290.2,225.0,65.2,1090.8,686.0,404.8,6994.81,1749.05,5245.76,964.53,63.3,901.23,0.0,0.0,0.0,27473.46,26180.31,20802.65,5377.66,1293.15,24646.15,23353.0,18298.6,5054.4,1293.15,24761.071,112.85,0.0,2827.31,2504.05,1239.4,1216.9,22.5,1264.65,323.26


In [389]:
# Installs a process-wide "ignore" filter: every warning is silenced from here on,
# not just in this cell.
warnings.filterwarnings("ignore")
# dfbm holds one row per date, so gaps are checked at daily ('D') frequency.
df_res=check_missing_datetime_index(dfbm,'D')
df_res

There are no missing dates in the datetime index.


In [390]:
# Print shape and index range of the daily b_moc frame.
info_dataset(dfbm)

Count rows: (2496, 48)
Min date: 2016-01-01 00:00:00
Max date: 2022-10-31 00:00:00


In [391]:
# Preview only: the first two rows of the daily frame forward-filled to hourly frequency.
# The result is not assigned, so `dfbm` itself stays at daily resolution.
dfbm.resample('H').ffill()[:2]

Unnamed: 0_level_0,Moc osi¹galna elektrowni krajowych,Elektrownie zawodowe,JWCD,pozosta³e,elektrownie przemys³owe,Ubytki mocy elektrowni przemys³owych,Ubytki mocy elektrowni zawodowych,spowodowane remontami kapitalnymi,JWCD.1,pozosta³e.1,spowodowane remontami rednimi,JWCD.2,pozosta³e.2,spowodowane remontami bie¿¹cymi,JWCD.3,pozosta³e.3,spowodowane remontami awaryjnymi,JWCD.4,pozosta³e.4,spowodowane warunkami eksploatacyjnymi,JWCD.5,pozosta³e.5,ze wzglêdu na ciep³ownictwo,JWCD.6,pozosta³e.6,na JW bedacych w okresie oswajania z inwestycji,JWCD.7,pozosta³e.7,Moc dyspozycyjna elektrowni krajowych,elektrownie zawodowe,JWCD.8,pozosta³e.8,elektrownie przemys³owe.1,Obci¹¿enie elektrowni krajowych,elektrownie zawodowe.1,JWCD.9,pozosta³e.9,elektrownie przemys³owe.2,Krajowe zapotrzebowanie na moc,Krajowe saldo wymiany miêdzysystemowej *,Ubytki mocy z uwagi na warunki pracy sieci,Rezerwa mocy w elektrowniach zawodowych:**,Rezerwa mocy w JWCD,JWCD cieplne:,rezerwa wiruj¹ca,rezerwa zimna,JWCD wodne,Rezerwa mocy pozosta³a
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
2016-01-01 00:00:00,39535.15,37084.35,24901.0,12183.35,2450.8,1144.2,8661.69,1351.9,1155.0,196.9,231.3,225.0,6.3,273.0,225.0,48.0,453.0,179.0,274.0,5444.06,570.0,4874.06,908.43,8.3,900.13,0.0,0.0,0.0,29733.56,28426.96,22542.1,5884.86,1306.6,16518.2,15211.6,9658.2,5553.4,1306.6,16662.526,150.05,0.0,13215.36,12883.9,11509.85,2258.85,9251.0,1374.05,331.46
2016-01-01 01:00:00,39535.15,37084.35,24901.0,12183.35,2450.8,1144.2,8661.69,1351.9,1155.0,196.9,231.3,225.0,6.3,273.0,225.0,48.0,453.0,179.0,274.0,5444.06,570.0,4874.06,908.43,8.3,900.13,0.0,0.0,0.0,29733.56,28426.96,22542.1,5884.86,1306.6,16518.2,15211.6,9658.2,5553.4,1306.6,16662.526,150.05,0.0,13215.36,12883.9,11509.85,2258.85,9251.0,1374.05,331.46


# Prepare g_moc

- Example raw values: `Doba`="20220101" (parsed below with `%Y%m%d`), `Data publikacji`="20220102122138"

In [362]:
%%time
df_g_moc['kod_tryb']=df_g_moc['Tryb pracy'].map(lambda x: x[:3])+'_'+df_g_moc['Kod'].map(lambda x: x[:3])
df_g_moc.drop(['Data publikacji','Kod','Nazwa','Tryb pracy'],axis=1,inplace=True)

for x in df_g_moc.drop(['Doba','kod_tryb','2'],axis=1).columns.to_list():
    df_g_moc[x]=df_g_moc[x].str.replace(',','.')
    df_g_moc[x]=df_g_moc[x].str.replace(u'\xa0','').astype(float)
df_g_moc['2']=np.where(df_g_moc['2'].isna(),(df_g_moc['1']+df_g_moc['3'])/2,df_g_moc['2'].str.replace(',','.').astype(float))

col=list(set(df_g_moc.columns.to_list())-set(['Doba','kod_tryb']))
dfgm=pd.melt(df_g_moc, id_vars=['Doba','kod_tryb'], value_vars=col)
dfgm['variable']=dfgm['variable'].astype(int)

df_pt=pd.pivot_table(dfgm, values='value', index=['Doba','variable'],columns='kod_tryb', aggfunc=np.sum, fill_value=0, margins=False)#.reset_index()
df_pt.columns = [''.join(str(s).strip() for s in col if s) for col in df_pt.columns ]
df_pt=df_pt.reset_index()

df_pt['Doba']=df_pt['Doba'].astype(str)
df_pt['date_hour']=df_pt.apply(lambda x:create_datetime(x.Doba, "%Y%m%d", x.variable), axis=1)
df_pt.index=df_pt['date_hour']
df_pt.drop(['Doba','variable'],axis=1,inplace=True)
df_pt.head()

CPU times: total: 6.53 s
Wall time: 6.54 s


Unnamed: 0_level_0,Gen_ADM,Gen_BEL,Gen_CHZ,Gen_DCH,Gen_DOD,Gen_JW2,Gen_JW3,Gen_KAR,Gen_KAT,Gen_KLE,Gen_KOZ,Gen_LD4,Gen_LEC,Gen_LGA,Gen_LZA,Gen_OPL,Gen_OSB,Gen_PAT,Gen_PLO,Gen_POL,Gen_PZR,Gen_REC,Gen_RYB,Gen_SIA,Gen_SNA,Gen_SOL,Gen_STW,Gen_TUR,Gen_WLC,Gen_WRO,Gen_WSI,Gen_WZE,Gen_ZGR,Gen_ZRN,Gen_ZYD,Pom_DCH,Pom_PZR,Pom_SOL,Pom_ZRN,Pom_ZYD,date_hour
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
2018-01-01 00:00:00,0,1738.76,150.055,0.0,271.15,0.0,147.7,107.765,88.217,141.434,631.68,55.1,120.945,71.9,284.39,409.21,160.64,446.47,0.0,373.57,35.3,82.803,401.72,71.9,68.5,0.0,0.0,520.35,0.0,120.2,360.0,0.0,98.58,0.0,0.0,0.0,2.9,0.41,17.65,3.24,2018-01-01 00:00:00
2018-01-01 01:00:00,0,1568.03,150.154,0.0,256.57,0.0,143.5,107.26,88.137,125.224,717.15,55.4,120.78,71.9,304.22,390.62,153.16,446.77,0.0,377.51,1.4,82.324,419.22,72.4,68.2,0.0,0.0,500.62,0.0,112.7,292.0,0.0,98.46,0.0,0.0,0.0,3.7,0.36,17.42,3.84,2018-01-01 01:00:00
2018-01-01 02:00:00,0,1451.86,149.955,0.0,245.93,0.0,139.1,107.365,88.255,123.499,725.39,55.1,120.835,71.7,211.76,375.77,148.28,439.85,0.0,368.2,0.0,82.238,414.04,73.4,69.34,0.0,0.0,482.66,0.0,113.0,267.0,0.0,98.47,0.0,0.0,0.0,3.4,0.38,17.45,56.74,2018-01-01 02:00:00
2018-01-01 03:00:00,0,1374.38,150.059,0.0,254.71,0.0,145.7,107.024,88.187,123.57,613.33,55.1,120.945,71.9,246.66,388.09,152.09,435.29,0.0,340.43,0.0,82.205,420.72,72.3,67.39,0.0,0.0,510.62,0.0,111.6,262.0,0.0,98.43,0.0,0.0,0.0,20.6,0.36,17.19,66.04,2018-01-01 03:00:00
2018-01-01 04:00:00,0,1398.97,150.054,0.0,256.73,0.0,146.1,108.595,88.024,123.13,639.64,55.4,120.945,71.9,296.27,390.97,152.57,441.8,0.0,364.01,0.0,82.183,422.65,72.3,67.39,0.0,0.0,513.57,0.0,107.1,268.0,0.0,98.35,0.0,0.0,0.0,137.4,0.38,16.99,112.72,2018-01-01 04:00:00


In [363]:
# Process-wide "ignore" filter: silences all warnings, not only those raised in this cell.
warnings.filterwarnings("ignore")
# Hourly gap check on the pivoted g_moc frame.
df_res=check_missing_datetime_index(df_pt,'H')
df_res

There are no missing dates in the datetime index.


In [364]:
# Print shape and index range of the pivoted g_moc frame.
info_dataset(df_pt)

Count rows: (42360, 41)
Min date: 2018-01-01 00:00:00
Max date: 2022-10-31 23:00:00


# Prepare g_wiatr

In [365]:
# Give the raw columns short names; the last two are the measurements to convert.
df_g_wiatr.columns=['Data','Godzina','gen_wiatr','gen_fotowolt']
col=list(df_g_wiatr.columns[2:])
# Text -> float cleaning, then hourly 'date_hour' index with gaps back-filled.
df_g_wiatr=prepare_index(prepare_column(df_g_wiatr,col),"%Y-%m-%d")
df_g_wiatr.head()

There are missing dates in the datetime index.
count row with missing index: 5


Unnamed: 0_level_0,gen_wiatr,gen_fotowolt
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,3893.613,0.0
2018-01-01 01:00:00,3828.699,0.0
2018-01-01 02:00:00,3982.8,0.0
2018-01-01 03:00:00,4083.114,0.0
2018-01-01 04:00:00,4089.188,0.0


In [366]:
# Print shape and index range of the prepared g_wiatr frame.
info_dataset(df_g_wiatr)

Count rows: (42360, 2)
Min date: 2018-01-01 00:00:00
Max date: 2022-10-31 23:00:00


# Prepare i_net

In [367]:
i_net_value_cols=['energia_frr_export','energia_frr_import','frr_export','frr_import','frr_export_eur','frr_import_eur']
df_i_net.columns=['Data','Czas od','Czas do']+i_net_value_cols
for x in i_net_value_cols:
    # Decimal comma -> dot; missing cells become 0.
    df_i_net[x]=np.where(df_i_net[x].isna(),0,df_i_net[x].str.replace(',','.').astype(float))
# Timestamp of each row = date + period start time.
df_i_net.index=pd.to_datetime(df_i_net['Data']+' '+df_i_net['Czas od'])
# Aggregate only the numeric columns. The text columns ('Data', 'Czas od', 'Czas do')
# used to be dropped silently by sum(); pandas >= 2.0 no longer does that.
# NOTE(review): every value column is summed per hour; if the source has several
# rows per hour, confirm that a sum (rather than a mean) is intended for the frr_* columns.
df_i_net=df_i_net[i_net_value_cols].resample('H').sum()
df_i_net.head()

Unnamed: 0,energia_frr_export,energia_frr_import,frr_export,frr_import,frr_export_eur,frr_import_eur
2020-02-18 00:00:00,0.0,0.0,284.96,284.96,66.712,66.712
2020-02-18 01:00:00,0.0,0.0,284.96,284.96,66.712,66.712
2020-02-18 02:00:00,0.0,0.0,284.88,284.88,66.696,66.696
2020-02-18 03:00:00,0.0,0.0,284.96,284.96,66.712,66.712
2020-02-18 04:00:00,0.0,0.0,284.96,284.96,66.712,66.712


In [368]:
# Process-wide "ignore" filter: silences all warnings, not only those raised in this cell.
warnings.filterwarnings("ignore")
# Hourly gap check on the resampled i_net frame.
df_res=check_missing_datetime_index(df_i_net,'H')
df_res

There are no missing dates in the datetime index.


In [369]:
# Print shape and index range of the hourly i_net frame.
info_dataset(df_i_net)

Count rows: (23688, 6)
Min date: 2020-02-18 00:00:00
Max date: 2022-10-31 23:00:00


# Prepare w_kse

In [370]:
# Every column other than the date/hour keys is a measurement to convert.
col=[name for name in df_w_kse.columns if name not in ('Data','Godzina')]
# Text -> float cleaning, then hourly 'date_hour' index with gaps back-filled.
df_w_kse=prepare_index(prepare_column(df_w_kse,col),"%Y-%m-%d")
df_w_kse.head()

There are missing dates in the datetime index.
count row with missing index: 9


Unnamed: 0_level_0,Krajowe zapotrzebowanie na moc,Sumaryczna generacja JWCD,Generacja PI,Generacja IRZ,Sumaryczna generacja nJWCD,Krajowe saldo wymiany miêdzysystemowej równoleg³ej,Krajowe saldo wymiany miêdzysystemowej nierównoleg³ej
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01 00:00:00,14978.538,6227.75,33.75,0.0,8611.925,162.95,-17.313
2018-01-01 01:00:00,14397.65,6046.013,-0.2,0.0,8430.263,-19.5,-45.638
2018-01-01 02:00:00,13789.463,5713.75,-65.313,0.0,8573.775,4.063,-436.95
2018-01-01 03:00:00,13434.45,5540.725,-91.388,0.0,8664.188,-12.013,-664.763
2018-01-01 04:00:00,13285.238,5685.225,-250.463,0.0,8672.025,41.875,-863.625


In [371]:
# Print shape and index range of the prepared w_kse frame.
info_dataset(df_w_kse)

Count rows: (42360, 7)
Min date: 2018-01-01 00:00:00
Max date: 2022-10-31 23:00:00


# Prepare w_ubytki

In [372]:
df_w_ubytki.columns=['Data','Godzina','Elektrownia','Kod JW','elektro','siec','dostep']
# Pivot key: 'ubytki_' + first three characters of the unit code.
df_w_ubytki['kod_jw']='ubytki_'+df_w_ubytki['Kod JW'].map(lambda x: x[:3])
col=['elektro','siec','dostep']
df_w_ubytki=prepare_column(df_w_ubytki,col)

# One column per (measure, kod_jw) pair, summed per (date, hour); combinations with
# no rows are filled with 0. The string 'sum' selects the pandas aggregation
# explicitly; passing the np.sum callable is deprecated.
df_ptu=pd.pivot_table(
    df_w_ubytki,
    values=col,
    index=['Data','Godzina'],
    columns='kod_jw',
    aggfunc='sum', fill_value=0, margins=False).reset_index()
# Flatten the two-level column labels to '<measure>_<kod_jw>'; the empty second
# level of 'Data'/'Godzina' is skipped so those names stay unchanged.
df_ptu.columns = ['_'.join(str(s).strip() for s in levels if s) for levels in df_ptu.columns ]

df_ptu=prepare_index(df_ptu,"%Y%m%d")
df_ptu.head()

There are missing dates in the datetime index.
count row with missing index: 5


Unnamed: 0_level_0,dostep_ubytki_ADM,dostep_ubytki_BEL,dostep_ubytki_DOD,dostep_ubytki_JW2,dostep_ubytki_JW3,dostep_ubytki_KAR,dostep_ubytki_KOZ,dostep_ubytki_LGA,dostep_ubytki_LZA,dostep_ubytki_OPL,dostep_ubytki_OSB,dostep_ubytki_PAT,dostep_ubytki_PLO,dostep_ubytki_POL,dostep_ubytki_RYB,dostep_ubytki_SIA,dostep_ubytki_STW,dostep_ubytki_TUR,dostep_ubytki_WLC,dostep_ubytki_WZE,elektro_ubytki_ADM,elektro_ubytki_BEL,elektro_ubytki_DOD,elektro_ubytki_JW2,elektro_ubytki_JW3,elektro_ubytki_KAR,elektro_ubytki_KOZ,elektro_ubytki_LGA,elektro_ubytki_LZA,elektro_ubytki_OPL,elektro_ubytki_OSB,elektro_ubytki_PAT,elektro_ubytki_PLO,elektro_ubytki_POL,elektro_ubytki_RYB,elektro_ubytki_SIA,elektro_ubytki_STW,elektro_ubytki_TUR,elektro_ubytki_WLC,elektro_ubytki_WZE,siec_ubytki_ADM,siec_ubytki_BEL,siec_ubytki_DOD,siec_ubytki_JW2,siec_ubytki_JW3,siec_ubytki_KAR,siec_ubytki_KOZ,siec_ubytki_LGA,siec_ubytki_LZA,siec_ubytki_OPL,siec_ubytki_OSB,siec_ubytki_PAT,siec_ubytki_PLO,siec_ubytki_POL,siec_ubytki_RYB,siec_ubytki_SIA,siec_ubytki_STW,siec_ubytki_TUR,siec_ubytki_WLC,siec_ubytki_WZE
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
2018-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,125.0,0.0,0.0,140.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,370.0,0.0,0.0,0.0,87.0,1863.0,460.0,85.0,0.0,0.0,200.0,630.0,242.0,450.0,276.0,0.0,235.0,485.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-01 01:00:00,0.0,0.0,0.0,0.0,0.0,125.0,0.0,0.0,140.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,370.0,0.0,0.0,0.0,87.0,1863.0,460.0,85.0,0.0,0.0,200.0,630.0,242.0,450.0,276.0,0.0,235.0,485.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-01 02:00:00,0.0,0.0,0.0,0.0,0.0,125.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,370.0,0.0,0.0,0.0,87.0,1863.0,460.0,225.0,0.0,0.0,200.0,630.0,242.0,450.0,276.0,0.0,235.0,485.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-01 03:00:00,0.0,0.0,0.0,0.0,0.0,125.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,370.0,0.0,0.0,0.0,87.0,1863.0,460.0,225.0,0.0,0.0,200.0,630.0,242.0,450.0,276.0,0.0,235.0,485.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-01 04:00:00,0.0,0.0,0.0,0.0,0.0,125.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,370.0,0.0,0.0,0.0,87.0,1863.0,460.0,225.0,0.0,0.0,200.0,630.0,242.0,450.0,276.0,0.0,235.0,485.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [373]:
# Print shape and index range of the pivoted w_ubytki frame.
info_dataset(df_ptu)

Count rows: (42360, 60)
Min date: 2018-01-01 00:00:00
Max date: 2022-10-31 23:00:00


# Prepare w_wym

In [374]:
# Every column other than the date/hour keys is a measurement to convert.
col=[name for name in df_w_wym.columns if name not in ('Data','Godzina')]
# Text -> float cleaning, then hourly 'date_hour' index with gaps back-filled.
df_w_wym=prepare_index(prepare_column(df_w_wym,col),"%Y%m%d")
df_w_wym.head()

There are missing dates in the datetime index.
count row with missing index: 5


Unnamed: 0_level_0,CEPS_EXP,CEPS_IMP,SEPS_EXP,SEPS_IMP,50HzT_EXP,50HzT_IMP,SVK_EXP,SVK_IMP,UA_EXP,UA_IMP,LIT_EXP,LIT_IMP
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-01 00:00:00,-616.4,4.0,-348.8,0.0,0.0,1117.8,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-01 01:00:00,-719.7,0.0,-431.6,0.0,0.0,1130.55,-3.74,0.0,0.0,0.0,-2.35,0.0
2018-01-01 02:00:00,-735.3,0.0,-431.7,0.0,0.0,1203.9,-173.49,0.0,0.0,0.0,-232.46,0.0
2018-01-01 03:00:00,-819.8,0.4,-424.2,0.0,0.0,1240.05,-249.4,0.0,0.0,0.0,-377.81,0.0
2018-01-01 04:00:00,-834.0,1.6,-435.4,0.0,0.0,1259.85,-371.58,0.0,0.0,0.0,-466.6,0.0


In [375]:
# Print shape and index range of the prepared w_wym frame.
info_dataset(df_w_wym)

Count rows: (42360, 12)
Min date: 2018-01-01 00:00:00
Max date: 2022-10-31 23:00:00


# Prepare z_kse

In [376]:
df_z_kse.columns=['Data','Godzina','demand_forecast','demand_fact']
# Only demand_fact goes through the text -> float cleaning; demand_forecast is
# passed on with whatever dtype it was loaded with.
col=['demand_fact']
# NOTE(review): the recorded output range for this frame (2017-12-31 .. 2022-10-30)
# is one day earlier than the other hourly frames - verify against the source data.
df_z_kse=prepare_index(prepare_column(df_z_kse,col),"%Y%m%d")
df_z_kse.head()

There are missing dates in the datetime index.
count row with missing index: 5


Unnamed: 0_level_0,demand_forecast,demand_fact
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12-31 00:00:00,15600.0,15280.925
2017-12-31 01:00:00,14800.0,14513.975
2017-12-31 02:00:00,14400.0,14086.7
2017-12-31 03:00:00,14300.0,13818.15
2017-12-31 04:00:00,14300.0,13792.45


In [377]:
# Print shape and index range of the prepared z_kse frame.
info_dataset(df_z_kse)

Count rows: (42360, 2)
Min date: 2017-12-31 00:00:00
Max date: 2022-10-30 23:00:00


# Prepare weather

In [398]:
# Re-read the weather file (it is also loaded in the "Read data" cell), so the
# transformation below always starts from the on-disk data.
df_weather_ml=pd.read_parquet(f'{PATH_TGE}weather_ml.parquet')

In [399]:
def create_datetime_weather(y,m,d,h):
    """Combine year/month/day parts and an hour offset into a timestamp.

    The date parts are joined as ``"y/m/d"`` and parsed with
    ``pd.to_datetime``; ``h`` hours are then added to that midnight value.
    """
    midnight = pd.to_datetime("/".join(str(part) for part in (y, m, d)))
    return midnight + timedelta(hours=h)

# Build one timestamp per row from the year/month/day/hour columns and use it as
# the index (named 'date_hour'); the component columns stay in the frame.
weather_stamps=df_weather_ml.apply(lambda row:create_datetime_weather(row.year,row.month,row.day,row.hour), axis=1)
df_weather_ml.index=weather_stamps.rename('date_hour')
df_weather_ml.head()

Unnamed: 0_level_0,year,month,day,hour,Main,Description,wday,wind_deg,clouds,ID,wind_speed,dew_point,humidity,feels_like,yday,pressure,temp
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-01-01 00:00:00,2018,1,1,0,Clouds,broken clouds: 51-84%,0,220,75,803,2.1,5.52,86,6.44,1,1008,7.71
2018-01-01 01:00:00,2018,1,1,1,Clear,clear sky,0,0,0,800,1.03,6.31,93,7.37,1,1007,7.37
2018-01-01 02:00:00,2018,1,1,2,Clear,clear sky,0,0,0,800,0.51,4.45,86,6.62,1,1007,6.62
2018-01-01 03:00:00,2018,1,1,3,Clear,clear sky,0,0,0,800,0.51,5.26,93,6.31,1,1007,6.31
2018-01-01 04:00:00,2018,1,1,4,Mist,mist,0,50,0,701,2.1,5.37,100,3.7,1,1006,5.37


In [400]:
# Process-wide "ignore" filter: silences all warnings, not only those raised in this cell.
warnings.filterwarnings("ignore")
# Hourly gap check using the helper defined in this notebook; the previously
# called `check_missing_datetime_index_hour` is not defined here and raises
# NameError on a fresh kernel.
df_res=check_missing_datetime_index(df_weather_ml,'H')
df_res

There are no missing dates in the datetime index.


In [401]:
# Print shape and index range of the weather frame.
info_dataset(df_weather_ml)

Count rows: (44689, 17)
Min date: 2018-01-01 00:00:00
Max date: 2023-02-06 00:00:00


# Save datasets

In [405]:
# Prepared frames keyed by the suffix of their output file; each one is written
# to '<PATH_TGE>ml_<suffix>.parquet' in the order listed.
ml_outputs={
    'tge': df_tge,
    'b_moc': dfbm,
    'g_moc': df_pt,
    'g_wiatr': df_g_wiatr,
    'w_ubytki': df_ptu,
    'w_wym': df_w_wym,
    'z_kse': df_z_kse,
    'w_kse': df_w_kse,
    'i_net': df_i_net,
    'weather': df_weather_ml,
}
for out_suffix, out_frame in ml_outputs.items():
    out_frame.to_parquet(f'{PATH_TGE}ml_{out_suffix}.parquet')