In [1]:
! pip install openstef==3.4.72 jupyter==1.0



In [2]:
# Import all required packages.
from openstef.data_classes.prediction_job import PredictionJobDataClass
from openstef.pipeline.train_model import train_model_pipeline
from IPython.display import IFrame
import pandas as pd

# Make plotly the default backend used by pandas plotting.
pd.set_option("plotting.backend", "plotly")

# Detect whether the notebook runs in Google Colab by probing for the
# `google.colab` package. Only ImportError is treated as "not in Colab", so
# unrelated failures (and KeyboardInterrupt) are no longer silently swallowed.
try:
  import google.colab  # noqa: F401 -- imported only to test availability
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

In [18]:
# Define the prediction job. `model_kwargs` holds the model hyperparameters.
# The job is built directly from keyword arguments, so `pj` is only ever bound
# to a PredictionJobDataClass instance.
pj = PredictionJobDataClass(
    id=101,
    model='lgb',
    forecast_type="demand",
    horizon_minutes=120,
    resolution_minutes=60,
    name="lgb_poc_1",
    save_train_forecasts=True,
    ignore_existing_models=True,
    model_kwargs={
        "learning_rate": 0.01,
        "early_stopping_rounds": 10,
        "n_estimators": 500,
        "num_leaves": 40,
    },
    quantiles=[0.1, 0.5, 0.9],
)

In [19]:
# Inspect your prediction job here: render `pj` so that all of its fields can
# be reviewed before the job is used for training.
display(pj)

PredictionJobDataClass(id=101, model='lgb', model_kwargs={'learning_rate': 0.01, 'early_stopping_rounds': 10, 'n_estimators': 500, 'num_leaves': 40}, forecast_type='demand', horizon_minutes=120, resolution_minutes=60, lat=52.132633, lon=5.291266, name='lgb_poc_1', electricity_bidding_zone=<BiddingZone.NL: 'NL'>, train_components=None, description=None, quantiles=[0.1, 0.5, 0.9], train_split_func=None, backtest_split_func=None, train_horizons_minutes=None, default_modelspecs=None, save_train_forecasts=True, completeness_threshold=0.5, minimal_table_length=100, flatliner_threshold_minutes=1440, detect_non_zero_flatliner=False, data_balancing_ratio=None, rolling_aggregate_features=[], depends_on=[], sid=None, turbine_type=None, n_turbines=None, hub_height=None, pipelines_to_run=[<PipelineType.TRAIN: 'train'>, <PipelineType.HYPER_PARMATERS: 'hyper_parameters'>, <PipelineType.FORECAST: 'forecast'>], alternative_forecast_model_pid=None, data_prep_class=None)

In [20]:
# Load the input CSV. Only the file location depends on whether the notebook
# runs in Colab; the first column becomes the index and dates are parsed.
csv_path = (
    "/content/master_data_with_forecasted.csv"
    if IN_COLAB
    else "../data/master_data_with_forecasted.csv"
)
input_data = pd.read_csv(csv_path, index_col=0, parse_dates=True)

In [21]:
# Inspect all column names of the input data.
print("columns in csv")
print(input_data.columns)

# Drop the columns we do not want to keep. `errors="ignore"` makes this cell
# safe to re-run: on a second run the columns are already gone and a plain
# drop would raise a KeyError. The printout below shows what actually remains.
input_data = input_data.drop(
    columns=["date_time_com", "forecasted_load"],
    errors="ignore",
)
print("remaining columns after dropping")
print(input_data.columns)

columns in csv
Index(['load', 'date_time_com', 'Holiday', 'Holiday_Type', 'temp', 'rhum',
       'prcp', 'wdir', 'wspd', 'pres', 'cldc', 'coco', 'forecasted_load'],
      dtype='object')
remaining columns after dropping
Index(['load', 'Holiday', 'Holiday_Type', 'temp', 'rhum', 'prcp', 'wdir',
       'wspd', 'pres', 'cldc', 'coco'],
      dtype='object')


In [22]:
# Lift the column display limit, then show the first rows of the input data.
pd.set_option("display.max_columns", None)
display(input_data.head())

Unnamed: 0_level_0,load,Holiday,Holiday_Type,temp,rhum,prcp,wdir,wspd,pres,cldc,coco
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-01 06:00:00+00:00,834.0,0.0,0.0,22.0,60.0,0.0,340.0,7.6,1020.2,1.0,1.0
2023-01-01 07:00:00+00:00,736.0,0.0,0.0,22.7,53.0,0.0,9.0,1.8,1018.2,1.0,1.0
2023-01-01 08:00:00+00:00,720.0,0.0,0.0,23.4,49.0,0.0,354.0,1.8,1017.3,1.0,1.0
2023-01-01 09:00:00+00:00,690.0,0.0,0.0,23.7,51.0,0.0,0.0,0.0,1017.2,0.0,1.0
2023-01-01 10:00:00+00:00,668.0,0.0,0.0,22.0,59.0,0.0,302.0,1.8,1016.9,0.0,1.0


In [23]:
# Define the extent of the training data: look up the positions of the first
# and last training timestamps in the index and print them.
print(input_data.index.get_loc('2023-01-01 06:00:00+00:00'))
traing_data_last_index = input_data.index.get_loc('2025-06-15 23:00:00+00:00')
print(traing_data_last_index)

# Keep every row up to and including the last training timestamp.
train_data = input_data.iloc[: traing_data_last_index + 1]

0
21521


In [24]:
# Check that the training data starts and ends at the expected hours.
first_train_hour = train_data.head(1).index
last_train_hour = train_data.tail(1).index
print(f"starting hour of training_data {first_train_hour}")
print(f"ending hour of training_data {last_train_hour}")

starting hour of training_data DatetimeIndex(['2023-01-01 06:00:00+00:00'], dtype='datetime64[ns, UTC]', name='date_time', freq=None)
ending hour of training_data DatetimeIndex(['2025-06-15 23:00:00+00:00'], dtype='datetime64[ns, UTC]', name='date_time', freq=None)


In [25]:
# Clean the training data index: keep only the first row for each index value
# and drop rows whose index is NaT.
index_is_first_occurrence = ~train_data.index.duplicated(keep='first')
index_is_valid_timestamp = train_data.index.notna()
train_data = train_data.loc[index_is_first_occurrence & index_is_valid_timestamp]

In [26]:
# Train the model. The MLflow tracking location is passed as an absolute path.
import os

mlflow_dir = "./mlflow_trained_models"
mlflow_tracking_uri = os.path.abspath(mlflow_dir)

training_options = dict(
    check_old_model_age=False,
    mlflow_tracking_uri=mlflow_tracking_uri,
    artifact_folder="./mlflow_artifacts",
)

# NOTE(review): the first returned frame is bound back to `train_data`, so
# re-running this cell passes the pipeline's own output in as input.
train_data, validation_data, test_data = train_model_pipeline(
    pj, train_data, **training_options
)

2025-11-05 03:54:36 [info     ] Model successfully loaded with MLflow
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013827 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11314
[LightGBM] [Info] Number of data points in the train set: 36460, number of used features: 78
[LightGBM] [Info] Start training from score 1268.377131
2025-11-05 03:54:43 [info     ] Fitted a new model, not yet stored
2025-11-05 03:54:48 [info     ] New model is better than old model, continuing with training procces
2025-11-05 03:54:56 [info     ] Model saved with MLflow        experiment_name=101
2025-11-05 03:54:58 [info     ] Logged figures to MLflow.
2025-11-05 03:54:58 [info     ] Writing reports to ./mlflow_artifacts/101


In [29]:
# Take the 24 rows directly after the training window as test data and check
# that the window starts and ends at the expected hours.
test_start = traing_data_last_index + 1
test_data = input_data.iloc[test_start:test_start + 24]

first_test_hour = test_data.head(1).index
last_test_hour = test_data.tail(1).index
print(f"starting hour of test_data {first_test_hour}")
print(f"ending hour of test_data {last_test_hour}")

starting hour of test_data DatetimeIndex(['2025-06-16 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='date_time', freq=None)
ending hour of test_data DatetimeIndex(['2025-06-16 23:00:00+00:00'], dtype='datetime64[ns, UTC]', name='date_time', freq=None)


In [30]:
import numpy as np
from openstef.pipeline.create_forecast import create_forecast_pipeline

# Prepare data to make the forecast: keep a copy of the realised load for the
# test window, then blank that window out in a copy of the input data.
forecast_window = test_data.index
realised = input_data.loc[forecast_window, 'load'].copy()
to_forecast_data = input_data.copy()
# Clear the load data for the part you want to forecast.
to_forecast_data.loc[forecast_window, 'load'] = np.nan

In [31]:
# Clean the forecast input index: keep only the first row for each index value
# and drop rows whose index is NaT.
usable_rows = ~to_forecast_data.index.duplicated(keep='first') & to_forecast_data.index.notna()
to_forecast_data = to_forecast_data[usable_rows]

# Location of the MLflow store (same directory as used by the training cell).
mlflow_tracking_uri="./mlflow_trained_models"

forecast = create_forecast_pipeline(pj, to_forecast_data, mlflow_tracking_uri)

2025-11-05 03:55:59 [info     ] Model successfully loaded with MLflow
2025-11-05 03:55:59 [info     ] Found 24 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.001000541960228457 num_values=24 pj_id=101
2025-11-05 03:56:02 [info     ] Postproces in preparation of storing


In [32]:
# Put the realised load next to the first 24 forecast values and derive the
# absolute and percentage differences between them.
forecast_first_24 = forecast['forecast'].head(24)
comparison_df = pd.DataFrame({'realised': realised, 'forecast': forecast_first_24})

absolute_error = (comparison_df['forecast'] - comparison_df['realised']).abs()
comparison_df = comparison_df.assign(
    absolute_difference=absolute_error,
    percentage_difference=(absolute_error / comparison_df['realised']) * 100,
)

display(comparison_df)

Unnamed: 0_level_0,realised,forecast,absolute_difference,percentage_difference
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-06-16 00:00:00+00:00,1481.0,1443.437607,37.562393,2.536286
2025-06-16 01:00:00+00:00,1503.0,1383.288346,119.711654,7.964847
2025-06-16 02:00:00+00:00,1446.0,1387.921382,58.078618,4.016502
2025-06-16 03:00:00+00:00,1427.0,1409.921101,17.078899,1.196839
2025-06-16 04:00:00+00:00,1373.0,1404.196957,31.196957,2.272175
2025-06-16 05:00:00+00:00,1398.0,1441.861681,43.861681,3.137459
2025-06-16 06:00:00+00:00,1424.0,1450.282102,26.282102,1.845653
2025-06-16 07:00:00+00:00,1389.0,1395.749745,6.749745,0.485943
2025-06-16 08:00:00+00:00,1315.0,1308.284562,6.715438,0.51068
2025-06-16 09:00:00+00:00,1275.0,1228.057589,46.942411,3.681758
