---

# Legacy Linear Model: benchmarks

By: Tomás Urdiales

Notebook for benchmarking the legacy linear model using the updated pipeline and data. It also contains experiments with different cross-validation training-set and test-set sizes.

##### Libraries

In [None]:
# Change notebook CWD to the project's root, so python can read from src.
# The move is guarded to keep this cell idempotent: re-running it (or starting
# the kernel from the root already) must not climb one more directory up,
# which would break every later import and data load.
import os
if not os.path.isdir("src"):
    os.chdir("..")

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

from src import utils, my_plots
from src.join_qh_min_data import join_qh_min_data
from src.cross_validation import backtesting_CV

##### Parameters

In [4]:
# Get root directory:
CWD = utils.get_root_dir()

# To not use darts plotting style:
plt.style.use('default')

DPI = 150 # (pixel density for figures)

# Brand colour as an RGB tuple of floats in [0, 1], the form matplotlib expects.
# 8-bit channel values (0..255) are normalised by 255, not 256, so that a
# channel value of 255 maps to exactly 1.0.
ELIA_ORANGE = (250/255, 115/255, 1/255) # RGB

##### Load data

In [5]:
# Load the three historical datasets (quarter-hourly, minute-level, EMS) and keep
# only the 2021-2022 label range of each one.
# NOTE(review): `ems` is loaded but not passed to any call visible in this notebook.
qh, minute, ems = (
    load_history().loc["2021":"2022"]
    for load_history in (
        utils.load_qh_historical_data,
        utils.load_min_historical_data,
        utils.load_ems_historical_data,
    )
)

---

## Prepare training dataframe

In [4]:
# Minute handed to join_qh_min_data (the resulting frame is indexed at hh:03, hh:18, ...).
MINUTE = 3

# Lags requested for each quarter-hourly variable. Lag 0 of the cumulative
# imbalance becomes the "..._current_qh" column that is used as the target.
qh_parameters = {
    "system_imbalance_cum15": {"lags": [0]},
    "si_mw": {"lags": list(range(-1, -5, -1))},        # -1 .. -4
    "load_id_mw": {"lags": list(range(3, -5, -1))},    # +3 .. -4
    "nrv_rt": {"lags": list(range(-1, -5, -1))},       # -1 .. -4
}

# Lags requested for each minute-level variable.
minute_parameters = {
    "system_imbalance": {"lags": list(range(0, -4, -1))},        # 0 .. -3
    "net_regulation_volume": {"lags": list(range(0, -4, -1))},   # 0 .. -3
}

In [5]:
# Assemble the training frame (one column per requested variable/lag pair) and
# drop every row that still contains a missing value. EMS inputs are not used.
df = join_qh_min_data(
    qh_data=qh,
    minute_data=minute,
    qh_parameters=qh_parameters,
    minute_parameters=minute_parameters,
    minute=MINUTE,
    ems_data=None,
    ems_parameters=None,
).dropna()

# Compact summary: index range, number of columns, dtypes and real memory footprint.
df.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 70037 entries, 2021-01-01 01:03:00+01:00 to 2022-12-31 23:03:00+01:00
Columns: 25 entries, system_imbalance_cum15_current_qh to net_regulation_volume_from_minute_minus_3
dtypes: float32(25)
memory usage: 7.2 MB


---

## Linear Regression Model: current quarter hour

In [6]:
# Regression target, and every remaining column of the frame as a predictor:
TARGET = "system_imbalance_cum15_current_qh"
FEATURES = [column for column in df.columns if column != TARGET]

# Time-series splits, sized in rows (4 * 24 quarter-hour rows per day):
# 91 folds, each tested on 4 days and trained on at most the 8 weeks before it.
# This configuration covers one entire year (2022).
tscv = TimeSeriesSplit(
    n_splits=52 * 7 // 4,
    max_train_size=4 * 24 * 7 * 8,
    test_size=4 * 24 * 4,
    gap=0,
)

In [7]:
# Linear regression with scikit-learn's default settings:
model_linear = LinearRegression()

# Backtest the model on the current-quarter-hour target over every split of `tscv`:
results_linear = backtesting_CV(
    model=model_linear,
    data=df,
    time_splits=tscv,
    features=FEATURES,
    target=TARGET,
    progress_bar=True,
    verbose=False,
)

MODEL: LinearRegression
Time configuration: 91 splits, 4 testing days, 8 training weeks. Total predicted time: 364 days.



100%|██████████| 91/91 [00:14<00:00,  6.29it/s]


Train set average error:
MAE: 56.63MW | RMSE: 72.89MW
Test set average error:
MAE: 57.46MW | RMSE: 73.98MW | MASE: 0.4897 | P90 Error: 119.91MW | Max Error: 513.3 (2022-02-19 10:03:00+01:00)
Time elapsed: 14.51s | Time per split: ~0.16s






---

## Linear Regression Model: next quarter hour

In [8]:
# Same inputs as in the current-quarter-hour section, except that the cumulative
# imbalance is now requested at lag +1, which yields the "..._from_qh_plus_1" column.
qh_parameters = {
    "system_imbalance_cum15": {"lags": [1]},
    "si_mw": {"lags": [-1, -2, -3, -4]},
    "load_id_mw": {"lags": [3, 2, 1, 0, -1, -2, -3, -4]},
    "nrv_rt": {"lags": [-1, -2, -3, -4]},
}

minute_parameters = {
    "system_imbalance": {"lags": [0, -1, -2, -3]},
    "net_regulation_volume": {"lags": [0, -1, -2, -3]},
}

# Rebuild the training frame with the new lags and drop rows with missing values.
# EMS inputs are not used.
df = join_qh_min_data(
    qh_data=qh,
    minute_data=minute,
    qh_parameters=qh_parameters,
    minute_parameters=minute_parameters,
    minute=MINUTE,
    ems_data=None,
    ems_parameters=None,
).dropna()

# Compact summary: index range, number of columns, dtypes and real memory footprint.
df.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 70026 entries, 2021-01-01 01:03:00+01:00 to 2022-12-31 23:03:00+01:00
Columns: 25 entries, system_imbalance_cum15_from_qh_plus_1 to net_regulation_volume_from_minute_minus_3
dtypes: float32(25)
memory usage: 7.2 MB


In [9]:
# Regression target (next quarter hour), and every other column as a predictor:
TARGET = "system_imbalance_cum15_from_qh_plus_1"
FEATURES = [name for name in df.columns if name != TARGET]

# Time-series splits, sized in rows (4 * 24 quarter-hour rows per day):
# 91 folds, each tested on 4 days and trained on at most the 8 weeks before it.
# This configuration covers one entire year (2022).
tscv = TimeSeriesSplit(
    n_splits=52 * 7 // 4,
    max_train_size=4 * 24 * 7 * 8,
    test_size=4 * 24 * 4,
    gap=0,
)

In [10]:
# Linear regression with scikit-learn's default settings:
model_linear = LinearRegression()

# Backtest the model on the next-quarter-hour target over every split of `tscv`:
results_linear = backtesting_CV(
    model=model_linear,
    data=df,
    time_splits=tscv,
    features=FEATURES,
    target=TARGET,
    progress_bar=True,
    verbose=False,
)

MODEL: LinearRegression
Time configuration: 91 splits, 4 testing days, 8 training weeks. Total predicted time: 364 days.



100%|██████████| 91/91 [00:15<00:00,  6.02it/s]


Train set average error:
MAE: 110.50MW | RMSE: 146.84MW
Test set average error:
MAE: 111.99MW | RMSE: 148.95MW | MASE: 0.7471 | P90 Error: 237.22MW | Max Error: 1089.3 (2022-09-06 21:18:00+02:00)
Time elapsed: 15.12s | Time per split: ~0.17s






---

## Experimenting with training set size

In [11]:
# Larger training window: at most 12 WEEKS of rows per fold, 52 weekly test folds.
# Reuses df, FEATURES and TARGET from the next-quarter-hour section.
tscv = TimeSeriesSplit(
    n_splits=52,
    max_train_size=4 * 24 * 7 * 12,
    test_size=4 * 24 * 7,
    gap=0,
)

# Fresh model, same backtesting call:
model_linear = LinearRegression()
results_linear = backtesting_CV(
    model=model_linear,
    data=df,
    time_splits=tscv,
    features=FEATURES,
    target=TARGET,
    progress_bar=True,
    verbose=False,
)

MODEL: LinearRegression
Time configuration: 52 splits, 7 testing days, 12 training weeks. Total predicted time: 364 days.



100%|██████████| 52/52 [00:10<00:00,  4.89it/s]


Train set average error:
MAE: 110.13MW | RMSE: 146.37MW
Test set average error:
MAE: 112.14MW | RMSE: 149.03MW | MASE: 0.7480 | P90 Error: 237.90MW | Max Error: 1085.9 (2022-09-06 21:18:00+02:00)
Time elapsed: 10.64s | Time per split: ~0.20s






In [12]:
# Smaller training window: at most 1 WEEK of rows per fold, 52 weekly test folds.
# Reuses df, FEATURES and TARGET from the next-quarter-hour section.
tscv = TimeSeriesSplit(
    n_splits=52,
    max_train_size=4 * 24 * 7 * 1,
    test_size=4 * 24 * 7,
    gap=0,
)

# Fresh model, same backtesting call:
model_linear = LinearRegression()
results_linear = backtesting_CV(
    model=model_linear,
    data=df,
    time_splits=tscv,
    features=FEATURES,
    target=TARGET,
    progress_bar=True,
    verbose=False,
)

MODEL: LinearRegression
Time configuration: 52 splits, 7 testing days, 1 training weeks. Total predicted time: 364 days.



100%|██████████| 52/52 [00:08<00:00,  5.91it/s]


Train set average error:
MAE: 108.41MW | RMSE: 143.38MW
Test set average error:
MAE: 114.90MW | RMSE: 152.71MW | MASE: 0.7665 | P90 Error: 243.62MW | Max Error: 1127.0 (2022-06-19 07:03:00+02:00)
Time elapsed: 8.81s | Time per split: ~0.17s






---

## Experimenting with different test-set sizes

In [13]:
# Much SHORTER test window: 364 folds of a single day each, training on at most 4 weeks.
# Reuses df, FEATURES and TARGET from the next-quarter-hour section.
tscv = TimeSeriesSplit(
    n_splits=52 * 7,
    max_train_size=4 * 24 * 7 * 4,
    test_size=4 * 24,
    gap=0,
)

# Fresh model, same backtesting call:
model_linear = LinearRegression()
results_linear = backtesting_CV(
    model=model_linear,
    data=df,
    time_splits=tscv,
    features=FEATURES,
    target=TARGET,
    progress_bar=True,
    verbose=False,
)


MODEL: LinearRegression
Time configuration: 364 splits, 1 testing days, 4 training weeks. Total predicted time: 364 days.



100%|██████████| 364/364 [00:53<00:00,  6.75it/s]


Train set average error:
MAE: 110.12MW | RMSE: 146.25MW
Test set average error:
MAE: 112.02MW | RMSE: 148.96MW | MASE: 0.7473 | P90 Error: 237.32MW | Max Error: 1091.7 (2022-09-06 21:18:00+02:00)
Time elapsed: 53.91s | Time per split: ~0.15s






In [14]:
# Much LONGER test window: 13 folds of 4 weeks each, training on at most 4 weeks.
# Reuses df, FEATURES and TARGET from the next-quarter-hour section.
tscv = TimeSeriesSplit(
    n_splits=52 // 4,
    max_train_size=4 * 24 * 7 * 4,
    test_size=4 * 24 * 7 * 4,
    gap=0,
)

# Fresh model, same backtesting call:
model_linear = LinearRegression()
results_linear = backtesting_CV(
    model=model_linear,
    data=df,
    time_splits=tscv,
    features=FEATURES,
    target=TARGET,
    progress_bar=True,
    verbose=False,
)


MODEL: LinearRegression
Time configuration: 13 splits, 28 testing days, 4 training weeks. Total predicted time: 364 days.



100%|██████████| 13/13 [00:02<00:00,  5.22it/s]


Train set average error:
MAE: 110.10MW | RMSE: 146.09MW
Test set average error:
MAE: 112.72MW | RMSE: 149.91MW | MASE: 0.7519 | P90 Error: 238.57MW | Max Error: 1091.6 (2022-09-06 21:18:00+02:00)
Time elapsed: 2.50s | Time per split: ~0.19s






---

## Best so far

In [16]:
# Minute handed to join_qh_min_data:
MINUTE = 3

# Quarter-hourly inputs. Lag +1 of the cumulative imbalance is the prediction
# target; its remaining lags and all other variables act as predictors.
qh_parameters = {
    "system_imbalance_cum15": {"lags": [1, -1, -2, -3, -5, -95, -671]},
    "total_load_last_mw": {"lags": [3, 2, 1, 0, -1, -2, -3, -4]},
    "nrv_rt": {"lags": [-1, -2, -3, -4]},
    "load_rt_mw": {"lags": [-1, -2, -3, -4]},
    "wind_rt_mw": {"lags": [-1, -2, -3, -4]},
}

# Minute-level inputs:
minute_parameters = {
    "system_imbalance": {"lags": [0, -1, -4]},
    "net_regulation_volume": {"lags": [0, -1, -2, -3]},
}

# Prepare training dataframe and drop rows with missing values.
# EMS data is deliberately not passed: ems_data / ems_parameters keep their defaults.
df = join_qh_min_data(
    qh_data=qh,
    minute_data=minute,
    qh_parameters=qh_parameters,
    minute_parameters=minute_parameters,
    minute=MINUTE,
).dropna()

# Regression target (next quarter hour), and every other column as a predictor:
TARGET = "system_imbalance_cum15_from_qh_plus_1"
FEATURES = [column for column in df.columns if column != TARGET]

# Time-series splits: 91 folds, 4 test days each, at most 8 training weeks.
tscv = TimeSeriesSplit(
    n_splits=52 * 7 // 4,
    max_train_size=4 * 24 * 7 * 8,
    test_size=4 * 24 * 4,
    gap=0,
)

# Cross-validation:
results_linear = backtesting_CV(
    model=LinearRegression(),
    data=df,
    time_splits=tscv,
    features=FEATURES,
    target=TARGET,
    progress_bar=True,
    verbose=False,
)

MODEL: LinearRegression
Time configuration: 91 splits, 4 testing days, 8 training weeks. Total predicted time: 364 days.



100%|██████████| 91/91 [00:28<00:00,  3.20it/s]


Train set average error:
MAE: 108.27MW | RMSE: 144.08MW
Test set average error:
MAE: 109.83MW | RMSE: 146.51MW | MASE: 0.7326 | P90 Error: 232.11MW | Max Error: 1062.4 (2022-09-06 21:18:00+02:00)
Time elapsed: 28.45s | Time per split: ~0.31s




