Here we split the whole dataset into 5 datasets for each country (now they contain only their country related columns). In addition we **truncate outliers** and normalize the data (for ARIMA, because other models do that within the code). We perform scaling here to analyse the values. In such a way we can then choose appropriate evaluation metrics etc.

# 1. Split data into 5 datasets

In [2]:
import pandas as pd
import numpy as np
import os
from datetime import date
from utils.helper import split_scale_dataset, add_exog_vars

In [3]:
# Create a folder named "datasets" if it doesn't exist
folder_name = "datasets"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

df = pd.read_csv("./datasets/top_5_countries.csv", index_col=0, parse_dates=True)

For each country revert UTC time to local time. Then make so that each dataset starts and ends at the same time.

In [4]:
top_5_countries = ['DE', 'GB', 'ES', 'FR', 'IT']
utc_converter = {'DE': 'Europe/Berlin', 
                 'GB': 'Europe/London', 
                 'ES': 'Europe/Madrid', 
                 'FR': 'Europe/Paris', 
                 'IT': 'Europe/Rome'}

# Split and save the datasets
for country_prefix in top_5_countries:
    # Filter columns with the specified prefix
    country_columns = [col for col in df.columns if col.startswith(country_prefix)]
    country_df = df[country_columns].copy()

    # Convert UTC to local time
    country_df.index = country_df.index.tz_convert(utc_converter[country_prefix]).tz_localize(None)
    
    # We drop first day and last day, because of missing values
    first_day = country_df.index.date.min()
    last_day = date(2020, 9, 30)

    # Drop all observations from the first and last days
    country_df = country_df[(country_df.index.date > first_day) & (country_df.index.date < last_day)]

    # Make 24 hours in each day (after daylight savings time etc) we have 23/25 hours
    # in other words - irregular intervals.
    # We will change it to have 24 hours in each day (simplest method). Then we do not break 
    # seasonalities and patterns, because at night load is small.
    # Time changes: https://stats.stackexchange.com/questions/45046/daylight-saving-time-in-time-series-modelling-e-g-load-data
    
    # Remove duplicated timestamps
    country_df = country_df[~country_df.index.duplicated(keep='first')]

    # Resample to hourly frequency to ensure all hours are included
    country_df = country_df.resample('H').asfreq()

    # Fill missing values with average of the previous and next value
    country_df = country_df.interpolate(method='linear', limit=1)
    
    # Save the DataFrame to a CSV file
    file_name = f"./datasets/{country_prefix}_data.csv"
    country_df.to_csv(file_name)


# 2. Split and scale data

Because of summer and winter time changes, we have not full days now.

In [5]:
de_df, gb_df, es_df, fr_df, it_df = [pd.read_csv(f"./datasets/{country}_data.csv", index_col=0, parse_dates=True) for country in top_5_countries] 

In [6]:
for df_country in [de_df, gb_df, es_df, fr_df, it_df]: 
    print(len(df_country))
del df_country

41616
41616
41616
41616
41616


In [7]:
# Check whether our data preparation was correct
# There were different indices in the datasets
# Because Britain has British summer time, others Central European summer time
# British: change time from 01:00 to 02:00
# Central European: change time from 02:00 to 03:00
# Source: https://www.timeanddate.com/time/europe/

if de_df.index.isin(gb_df.index).all():
    print("Indices are identical!")
else:
    print("Indices are not identical...")

    # Find non-matching elements
    non_matching_elements_in_de_df = de_df.index[~de_df.index.isin(gb_df.index)]
    non_matching_elements_in_gb_df = gb_df.index[~gb_df.index.isin(de_df.index)]
    print("Non-matching elements in de_df:", non_matching_elements_in_de_df)
    print("Non-matching elements in gb_df:", non_matching_elements_in_gb_df)

Indices are identical!


In [8]:
# days in the datasets
for df_country in [de_df, gb_df, es_df, fr_df, it_df]:
    days_in_data = len(df_country)/24
    print(f'{days_in_data:.0f} days in the dataset.')

del df_country

1734 days in the dataset.
1734 days in the dataset.
1734 days in the dataset.
1734 days in the dataset.
1734 days in the dataset.


In [9]:
# We combine dfs, because it will be easier to work with them
combined_df = pd.concat([de_df, gb_df, es_df, fr_df, it_df], axis=1)

print('Index frequency before:', combined_df.index.freq)

# Set frequency
combined_df = combined_df.asfreq('H')
print('Index frequency after:', combined_df.index.freq)

Index frequency before: None
Index frequency after: <Hour>


In [10]:
# Divide data into days

train_size = int(round(len(country_df)/24*0.7, 0))
test_size = int(round(len(country_df)/24*0.15, 0))
val_size = int(len(country_df)/24) - train_size - test_size
print(f'{train_size:.0f} days in the train dataset.\n {test_size:.0f} days in the test dataset.\n {val_size:.0f} days in the validation dataset.')

assert(days_in_data == train_size + test_size + val_size)

1214 days in the train dataset.
 260 days in the test dataset.
 260 days in the validation dataset.


In [11]:
num_train = train_size*24
num_test = test_size*24
num_vali = val_size*24
print(f'{num_train} observations in the train dataset.\n {num_test} observations in the test dataset.\n {num_vali} observations in the validation dataset.')

assert(len(country_df) == num_train + num_test + num_vali)

29136 observations in the train dataset.
 6240 observations in the test dataset.
 6240 observations in the validation dataset.


Split and scale datasets.

In [12]:
train, vali, test = split_scale_dataset(combined_df, train_split=0.7, test_split=0.15)

29136 observations in the train dataset.
6240 observations in the validation dataset. 
6240 observations in the test dataset.


In [13]:
# No zero values
tolerance = 1e-4 # change to 1e-5 and see that there is no such values
print(f"{'Column name':<40} {'Number of close to zero':>15}")

for col in train.columns[:-2]:
    print(f'{col:<40} {len(train[np.isclose(train[col], 0.0, rtol=tolerance, atol=tolerance)]):>15}')

Column name                              Number of close to zero
DE_load_actual_entsoe_transparency                     0
DE_solar_generation_actual                             0
DE_wind_generation_actual                              2
DE_wind_offshore_generation_actual                     3
DE_wind_onshore_generation_actual                      1
GB_UKM_load_actual_entsoe_transparency                 3
GB_UKM_solar_generation_actual                         2
GB_UKM_wind_generation_actual                          3
GB_UKM_wind_offshore_generation_actual                 0
GB_UKM_wind_onshore_generation_actual                  7
ES_load_actual_entsoe_transparency                     1
ES_solar_generation_actual                             4
ES_wind_onshore_generation_actual                      0
FR_load_actual_entsoe_transparency                     2
FR_solar_generation_actual                             0
FR_wind_onshore_generation_actual                      0
IT_load_actual_entsoe_t

In [14]:
train.describe().loc[['min', '25%', '50%', '75%', 'max']].round(2)

Unnamed: 0,DE_load_actual_entsoe_transparency,DE_solar_generation_actual,DE_wind_generation_actual,DE_wind_offshore_generation_actual,DE_wind_onshore_generation_actual,GB_UKM_load_actual_entsoe_transparency,GB_UKM_solar_generation_actual,GB_UKM_wind_generation_actual,GB_UKM_wind_offshore_generation_actual,GB_UKM_wind_onshore_generation_actual,ES_load_actual_entsoe_transparency,ES_solar_generation_actual,ES_wind_onshore_generation_actual,FR_load_actual_entsoe_transparency,FR_solar_generation_actual,FR_wind_onshore_generation_actual,IT_load_actual_entsoe_transparency,IT_solar_generation_actual,IT_wind_onshore_generation_actual
min,-2.5,-0.64,-1.27,-1.4,-1.19,-4.31,-0.65,-1.58,-1.46,-1.58,-2.36,-0.84,-1.71,-2.04,-0.72,-1.16,-2.19,-0.72,-1.37
25%,-0.84,-0.64,-0.78,-0.92,-0.76,-0.78,-0.65,-0.83,-0.85,-0.84,-0.85,-0.8,-0.78,-0.75,-0.72,-0.71,-0.87,-0.72,-0.83
50%,-0.03,-0.63,-0.27,-0.1,-0.3,0.1,-0.62,-0.17,-0.19,-0.17,0.04,-0.49,-0.19,-0.14,-0.63,-0.32,-0.07,-0.7,-0.24
75%,0.89,0.36,0.5,0.79,0.46,0.72,0.39,0.71,0.73,0.71,0.78,0.67,0.6,0.72,0.6,0.38,0.86,0.66,0.64
max,2.1,3.95,3.71,2.85,3.83,2.73,4.37,3.22,3.19,3.44,2.66,2.6,3.61,3.32,3.75,4.52,2.55,2.92,3.18


In [15]:
combined_df.to_csv("./datasets/combined_data.csv")

# ARIMA
# wind - not seasonal, load, solar -> seasonal

In [16]:
train_little = train.iloc[-720:, :]

vali_little = vali.iloc[:24, :]


In [17]:
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
true = vali_little['DE_solar_generation_actual']

model_2 = auto_arima(train_little['DE_solar_generation_actual'], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model_2.order} {model_2.seasonal_order}")
forecasts_2, confidence_2 = model_2.predict(n_periods=24, return_conf_int=True)
print(mean_squared_error(true, forecasts_2))
print(mean_absolute_error(true, forecasts_2))

Best ARIMA parameters: (2, 0, 2) (1, 0, 1, 24)
0.06484039722236747
0.18460830429700179


In [18]:
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
true = vali_little['DE_load_actual_entsoe_transparency']

model_2 = auto_arima(train_little['DE_load_actual_entsoe_transparency'], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model_2.order} {model_2.seasonal_order}")
forecasts_2, confidence_2 = model_2.predict(n_periods=24, return_conf_int=True)
print(mean_squared_error(true, forecasts_2))
print(mean_absolute_error(true, forecasts_2))

Best ARIMA parameters: (0, 1, 3) (2, 0, 2, 24)
1.1614016937403366
0.9508203784814379


In [None]:
# 10 cols
# 3 min
# 12

In [37]:
10*3*12/60

6.0

In [16]:
exog = train[['DE_wind_offshore_generation_actual',
       'DE_wind_onshore_generation_actual', 'HourOfDay', 'DayOfWeek']].copy()


exog_val = vali[['DE_wind_offshore_generation_actual',
       'DE_wind_onshore_generation_actual', 'HourOfDay', 'DayOfWeek']].copy()


In [17]:
train_little = train.iloc[-720:, :]
exog_little = exog.iloc[-720:, :]

vali_little = vali.iloc[:24, :]
exog_val_little = exog_val.iloc[:24, :]

In [1]:
from pmdarima.arima import auto_arima
model = auto_arima(train_little['DE_load_actual_entsoe_transparency'], X=train_little[['HourOfDay', 'DayOfWeek']], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model.order} {model.seasonal_order}")
forecasts, confidence = model.predict(n_periods=24, X=vali_little[['HourOfDay', 'DayOfWeek']], return_conf_int=True)

NameError: name 'train_little' is not defined

In [26]:
model = auto_arima(train_little['DE_wind_generation_actual'], X=train_little[['HourOfDay', 'DayOfWeek']], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model.order} {model.seasonal_order}")
forecasts, confidence = model.predict(n_periods=24, X=vali_little[['HourOfDay', 'DayOfWeek']], return_conf_int=True)

Best ARIMA parameters: (2, 1, 0) (2, 0, 2, 24)


In [32]:
model = auto_arima(train_little['DE_solar_generation_actual'], X=train_little[['HourOfDay', 'DayOfWeek']], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model.order} {model.seasonal_order}")
forecasts, confidence = model.predict(n_periods=24, X=vali_little[['HourOfDay', 'DayOfWeek']], return_conf_int=True)

Best ARIMA parameters: (3, 0, 1) (1, 0, 1, 24)


In [33]:
# true = vali_little['DE_load_actual_entsoe_transparency']
#true = vali_little['DE_wind_generation_actual']
true = vali_little['DE_solar_generation_actual']
from sklearn.metrics import mean_squared_error
print(mean_squared_error(true, forecasts))
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(true, forecasts))

0.10596604835547496
0.2553308275949591


In [28]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(true, forecasts))
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(true, forecasts))
# load
# 1.392177232965409
# 1.0642319788594152

1.2567079646334218
0.9953259997538365


In [29]:
model_2 = auto_arima(train_little['DE_wind_generation_actual'], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model_2.order} {model_2.seasonal_order}")
forecasts_2, confidence_2 = model_2.predict(n_periods=24, return_conf_int=True)
print(mean_squared_error(true, forecasts_2))
print(mean_absolute_error(true, forecasts_2))

Best ARIMA parameters: (2, 1, 1) (2, 0, 2, 24)
1.2490368013501036
0.9923866808261081


In [34]:
model_2 = auto_arima(train_little['DE_solar_generation_actual'], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model_2.order} {model_2.seasonal_order}")
forecasts_2, confidence_2 = model_2.predict(n_periods=24, return_conf_int=True)
print(mean_squared_error(true, forecasts_2))
print(mean_absolute_error(true, forecasts_2))

Best ARIMA parameters: (2, 0, 2) (1, 0, 1, 24)
0.06484039722236747
0.18460830429700179


In [35]:
model_3 = auto_arima(train_little['DE_solar_generation_actual'], maxiter=10)
print(f"Best ARIMA parameters: {model_3.order} {model_3.seasonal_order}")
forecasts_3, confidence_3 = model_3.predict(n_periods=24, return_conf_int=True)
print(mean_squared_error(true, forecasts_3))
print(mean_absolute_error(true, forecasts_3))

Best ARIMA parameters: (2, 0, 3) (0, 0, 0, 0)
0.5133865127341192
0.6258638083755167


In [31]:
model_3 = auto_arima(train_little['DE_wind_generation_actual'], maxiter=10)
print(f"Best ARIMA parameters: {model_3.order} {model_3.seasonal_order}")
forecasts_3, confidence_3 = model_3.predict(n_periods=24, return_conf_int=True)
print(mean_squared_error(true, forecasts_3))
print(mean_absolute_error(true, forecasts_3))


Best ARIMA parameters: (2, 1, 1) (0, 0, 0, 0)
0.6958425163681312
0.7342671018513087


In [22]:
model_2 = auto_arima(train_little['DE_load_actual_entsoe_transparency'], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model_2.order} {model_2.seasonal_order}")
forecasts_2, confidence_2 = model_2.predict(n_periods=24, return_conf_int=True)

Best ARIMA parameters: (0, 1, 3) (2, 0, 2, 24)


In [23]:
print(mean_squared_error(true, forecasts_2))
print(mean_absolute_error(true, forecasts_2))

1.1614016937403366
0.9508203784814379


In [24]:
model_3 = auto_arima(train_little['DE_load_actual_entsoe_transparency'], maxiter=10)
print(f"Best ARIMA parameters: {model_3.order} {model_3.seasonal_order}")
forecasts_3, confidence_3 = model_3.predict(n_periods=24, return_conf_int=True)

Best ARIMA parameters: (4, 1, 4) (0, 0, 0, 0)


In [25]:
print(mean_squared_error(true, forecasts_3))
print(mean_absolute_error(true, forecasts_3))

2.398619931292829
1.395907471337363


In [82]:
from pmdarima.arima import auto_arima
model = auto_arima(train_little['DE_wind_generation_actual'], X=exog_little, stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model.order} {model.seasonal_order}")
forecasts, confidence = model.predict(n_periods=24, X=exog_val_little, return_conf_int=True)
# Best ARIMA parameters: (1, 0, 3) (2, 0, 0, 24)
# Best ARIMA parameters: (1, 1, 1) (2, 0, 1, 24)
# 1 min 23.8 s

Best ARIMA parameters: (0, 0, 0) (0, 0, 0, 24)


In [85]:
true = vali_little['DE_load_actual_entsoe_transparency']

In [87]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(true, forecasts))
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(true, forecasts))
#0.25075572437818033
#0.4445616873286631
 

0.9644889942407472
0.8984540151195102


In [88]:
model_2 = auto_arima(train_little['DE_wind_generation_actual'], stepwise=True, seasonal=True, m=24, maxiter=10)
print(f"Best ARIMA parameters: {model_2.order} {model_2.seasonal_order}")
forecasts_2, confidence_2 = model_2.predict(n_periods=24, return_conf_int=True)

Best ARIMA parameters: (1, 1, 1) (2, 0, 1, 24)


In [89]:
print(mean_squared_error(true, forecasts_2))
print(mean_absolute_error(true, forecasts_2))

0.7899957914056622
0.8281696675591402


In [90]:
model_3 = auto_arima(train_little['DE_wind_generation_actual'], maxiter=10)
print(f"Best ARIMA parameters: {model_3.order} {model_3.seasonal_order}")
forecasts_3, confidence_3 = model_3.predict(n_periods=24, return_conf_int=True)

Best ARIMA parameters: (1, 1, 1) (0, 0, 0, 0)


In [91]:
print(mean_squared_error(true, forecasts_3))
print(mean_absolute_error(true, forecasts_3))

0.6997970089410469
0.7787759402746929


In [None]:
model.summary()

In [None]:
model_2.summary()

Add exogenous variables.

In [None]:
train_exog, vali_exog, test_exog = add_exog_vars(time_series.iloc[:, -6:], train_size, val_size)


In [None]:
from pmdarima.arima import auto_arima
model = auto_arima(train['DE_load_actual_entsoe_transparency'], stepwise=True, seasonal=True, m=24, maxiter=10, exogenous= )
print(f"Best ARIMA parameters: {model.order} {model.seasonal_order}")
forecasts, confidence = model.predict(n_periods=24, return_conf_int=True)

# Distribution

In [None]:
train=28321
val=6577
test=8713

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
def split_scale_dataset(data, train_size, val_size, test_size=None):

    """
    data (pd.DataFrame): Dataframe with time series data.
    train_size, test_size, val_size (int): number of days in train, 
                                            test and validation datasets.

    return: Scaled datasets
   
    
    num_train = train_size*24
    if test_size is not None:
        num_test = test_size*24
    num_vali = val_size*24
"""
    num_train = train_size
    num_vali = val_size
    train_data = data.iloc[:num_train] # 0, a-1
    vali_data = data.iloc[num_train: num_train + num_vali] # a, a+b-1
    test_data = data.iloc[num_train + num_vali:] # a+b

    assert(len(data) == len(train_data) + len(test_data) + len(vali_data))

    print(f'{len(train_data)} observations in the train dataset.\n {len(test_data)} observations in the test dataset.\n {len(vali_data)} observations in the validation dataset.')

    # initialize scaler object
    scaler = StandardScaler()

    # scale data
    train_data_sc = scaler.fit_transform(train_data)
    vali_data_sc = scaler.transform(vali_data)
    test_data_sc = scaler.transform(test_data)

    train_data_sc = pd.DataFrame(train_data_sc, columns=train_data.columns, index=train_data.index)
    vali_data_sc = pd.DataFrame(vali_data_sc, columns=vali_data.columns, index=vali_data.index)
    test_data_sc = pd.DataFrame(test_data_sc, columns=test_data.columns, index=test_data.index)

    return train_data_sc, vali_data_sc, test_data_sc

#time_series = pd.read_csv("./datasets/df_most_important_columns.csv", index_col=0, parse_dates=True)
#train, vali, test = split_scale_dataset(data=time_series, train_size=train, val_size=val)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(train["DE_load_actual_entsoe_transparency"])

In [None]:
train.describe().round(2)

In [None]:
sns.histplot(vali["DE_load_actual_entsoe_transparency"])

In [None]:
vali.describe().round(2)

In [None]:
sns.histplot(test["DE_load_actual_entsoe_transparency"])

In [None]:
test.describe().round(2)

# Distribution ETTH1

In [None]:
import pandas as pd
time_series = pd.read_csv("./datasets/ETTh1.csv", index_col=0, parse_dates=True)
train, vali, test = split_scale_dataset(data=time_series, train_size=8521, val_size=2857)

In [None]:
import os
current_path = os.getcwd() + "/datasets/"
dataset = 'ETTh1.csv'

!python -u ./TSLibrary/run.py \
  --task_name long_term_forecast \
  --is_training 1 \
  --root_path $current_path \
  --data_path $dataset \
  --model_id 1 \
  --model "Informer" \
  --data custom \
  --features M \
  --seq_len 96 \
  --label_len 48 \
  --pred_len 24 \
  --e_layers 2 \
  --d_layers 1 \
  --factor 3 \
  --enc_in 7 \
  --dec_in 7 \
  --c_out 7 \
  --des 'Exp' \
  --itr 2