In [14]:
! pip install openstef==3.4.72 jupyter==1.0



In [15]:
# Import all required packages.
from openstef.data_classes.prediction_job import PredictionJobDataClass
from openstef.pipeline.train_model import train_model_pipeline
from IPython.display import IFrame
import pandas as pd

# Make plotly the backend behind the pandas .plot() calls in this notebook.
pd.set_option("plotting.backend", "plotly")

# Check if running in Google Colab.
# The import only succeeds when the google.colab package is available.
# Catching ImportError specifically (instead of a bare except) avoids hiding
# unrelated problems such as a KeyboardInterrupt.
try:
  import google.colab  # imported only to probe for its presence
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

## Define the prediction job

OpenSTEF uses prediction jobs to define the properties of training and prediction.

- model: xgboost
    - This is the (opensource) machine learning model type that we train to make the forecasts.
- quantile: 10, 30, 50, 70 and 90 percent
    - This provides a confidence interval within OpenSTEF, based on the standard deviation.
- forecast_type: demand
    - What we are actually forecasting. This can be demand (load on the grid), wind or basecase.
- latitude: 52.0, longitude: 5.0
    - This is used to calculate the derived solar features (direct normal irradiance and the global tilted irradiance).*
    - Also used to retrieve weather data in openstef-dbc (database connector).
- horizon minutes: 15
    - The horizon of the desired forecast in minutes. It entails how far into the future we want to predict. The value of 15 entails that at the moment of prediction, you predict 15 minutes into the future. So let's say you make a prediction at one o'clock, then the prediction is for 13:15.
- resolution minutes: 15 minutes
    - Resolution of the forecasts in minutes: how many minutes there are between each sample in the prediction.
- name: workshop_exercise_1
    - Name you give to the prediction job.
- save_train_forecasts: true
    - Indicates whether the forecasts produced during the training process should be saved.


Bonus: look at the documentation [here](https://openstef.github.io/openstef/openstef.data_classes.html#module-openstef.data_classes.prediction_job).

*Curious about how the latitude and longitude are used to calculate derived weather features? See [here](https://github.com/OpenSTEF/openstef/blob/main/openstef/feature_engineering/weather_features.py)

In [95]:
# Describe the forecasting task. Fields that are not passed here keep the
# defaults of PredictionJobDataClass.
pj = PredictionJobDataClass(
    id=288,
    model='xgb',
    quantiles=[0.10, 0.30, 0.50, 0.70, 0.90],
    forecast_type="demand",
    horizon_minutes=120,
    resolution_minutes=60,
    name="workshop_exercise_1",
    save_train_forecasts=True,
)

In [96]:
# Inspect your prediction job here.
# The rendered object also lists the fields that were left at their defaults.
display(pj)

PredictionJobDataClass(id=288, model='xgb', model_kwargs=None, forecast_type='demand', horizon_minutes=120, resolution_minutes=60, lat=52.132633, lon=5.291266, name='workshop_exercise_1', electricity_bidding_zone=<BiddingZone.NL: 'NL'>, train_components=None, description=None, quantiles=[0.1, 0.3, 0.5, 0.7, 0.9], train_split_func=None, backtest_split_func=None, train_horizons_minutes=None, default_modelspecs=None, save_train_forecasts=True, completeness_threshold=0.5, minimal_table_length=100, flatliner_threshold_minutes=1440, detect_non_zero_flatliner=False, data_balancing_ratio=None, rolling_aggregate_features=[], depends_on=[], sid=None, turbine_type=None, n_turbines=None, hub_height=None, pipelines_to_run=[<PipelineType.TRAIN: 'train'>, <PipelineType.HYPER_PARMATERS: 'hyper_parameters'>, <PipelineType.FORECAST: 'forecast'>], alternative_forecast_model_pid=None, data_prep_class=None)

## Prepare and analyse the input data
OpenSTEF requires a certain input format: a dataframe with specific columns.

Exercise: look at the table and plots below and try to answer the following questions:
- What type of features do you see in the input data?
- How much time is there between two data points?
- Look at the plots for radiation and windspeed, do you see any patterns?
    - Hint: do you see something happening to the load when there is a peak in either radiation or wind speed? Can you explain why?
    - Note: in these plots we zoomed in on a random week, for visibility purposes.

Hint: you can zoom in on the plots to see more details.
Hint 2: the 'load' is the target that we want to forecast. So it is not a feature.

If you are working with Google Colab, just upload the data in the 'Files' section on Google Colab. You can find this at the left toolbar, the fifth item from the top.

In [97]:
# Read the CSV from the Colab upload folder when running in Colab and from the
# local data folder otherwise. The first column becomes the (parsed) index.
input_data = pd.read_csv(
    "/content/master_data_with_forecasted.csv" if IN_COLAB else "../data/master_data_with_forecasted.csv",
    index_col=0,
    parse_dates=True,
)

In [98]:
# Sanity check on the loaded data: a message is printed only when input_data
# is a DataFrame; nothing is printed otherwise.
if isinstance(input_data, pd.DataFrame):
    print("The variable is a Pandas DataFrame.")

The variable is a Pandas DataFrame.


In [99]:
# Inspect all column names of the input data.
# The first CSV column was used as the index, so it is not listed here.
print(input_data.columns)

Index(['load', 'date_time_com', 'Holiday', 'Holiday_Type', 'temp', 'rhum',
       'prcp', 'wdir', 'wspd', 'pres', 'cldc', 'coco', 'forecasted_load'],
      dtype='object')


In [100]:
# Remove the columns that are not passed on to OpenSTEF; the holiday columns
# ("Holiday", "Holiday_Type") are kept.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise a KeyError.
input_data = input_data.drop(columns=["date_time_com", "forecasted_load"], errors="ignore")

print(input_data.columns)

Index(['load', 'Holiday', 'Holiday_Type', 'temp', 'rhum', 'prcp', 'wdir',
       'wspd', 'pres', 'cldc', 'coco'],
      dtype='object')


In [101]:
# Lift the column limit so that the preview shows every column.
pd.set_option("display.max_columns", None)
display(input_data.head())

Unnamed: 0_level_0,load,Holiday,Holiday_Type,temp,rhum,prcp,wdir,wspd,pres,cldc,coco
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-01 06:00:00+00:00,834.0,0.0,0.0,22.0,60.0,0.0,340.0,7.6,1020.2,1.0,1.0
2023-01-01 07:00:00+00:00,736.0,0.0,0.0,22.7,53.0,0.0,9.0,1.8,1018.2,1.0,1.0
2023-01-01 08:00:00+00:00,720.0,0.0,0.0,23.4,49.0,0.0,354.0,1.8,1017.3,1.0,1.0
2023-01-01 09:00:00+00:00,690.0,0.0,0.0,23.7,51.0,0.0,0.0,0.0,1017.2,0.0,1.0
2023-01-01 10:00:00+00:00,668.0,0.0,0.0,22.0,59.0,0.0,302.0,1.8,1016.9,0.0,1.0


In [102]:
# Report the size of the data set and the row positions of two timestamps.
print(input_data.shape)
print(input_data.index.get_loc('2023-01-01 06:00:00+00:00'))
# Position of the last training timestamp: looked up once, stored for the
# split below and printed from the stored value.
traing_data_last_index = input_data.index.get_loc('2025-06-15 23:00:00+00:00')
print(traing_data_last_index)

(24066, 11)
0
21521


In [103]:
# The model should be only trained on the training part of the input data. Therefore, the input data should be split.
# iloc excludes the stop position, hence the +1 to keep the row at traing_data_last_index itself.
train_data=input_data.iloc[:traing_data_last_index+1]

In [104]:
# Show the last training row to verify where the split ends.
print(train_data.tail(1))

                             load  Holiday  Holiday_Type  temp  rhum  prcp  \
date_time                                                                    
2025-06-15 23:00:00+00:00  1579.0      0.0           0.0  27.8  89.0   0.0   

                           wdir  wspd    pres  cldc  coco  
date_time                                                  
2025-06-15 23:00:00+00:00  86.0   1.8  1001.0   8.0   4.0  


In [105]:
# Plot the load for the rows at positions 57 up to (not including) 729.
# NOTE(review): with hourly rows these 672 samples span four weeks, while the
# exercise text mentions one week -- confirm the intended window.
fig_load = input_data.iloc[57:729]["load"].plot()
fig_load.update_layout(xaxis_title='Timestamp', yaxis_title="Load [MW]")
fig_load.show()

In [106]:
# Plot the wind speed column ('wspd') so that the plotted data matches the
# figure name and the y-axis label. Same row window as the load plot.
fig_windspeed=input_data["wspd"].iloc[57:729].plot()
fig_windspeed.update_layout(
    xaxis_title='Timestamp',
    yaxis_title="Windspeed"
)
fig_windspeed.show()

## Training the model
After defining the prediction job and preparing the input data, the model can be trained.

Exercise:
- Find out what happens in the 'train_model_pipeline'. More specifically, what are the inputs and outputs?
- Why do we only use the train_data?

Hint: find pipeline in the list provided on the OpenSTEF website, and look at the documentation [here](https://openstef.github.io/openstef/user_guides.html). Click on the pipeline openstef.pipeline.train_model to look at the documentation.

In [107]:
import os

# Clean the index before training: keep only the first row of every duplicated
# timestamp and discard rows whose timestamp is missing (NaT).
train_data = train_data[
    ~train_data.index.duplicated(keep='first') & train_data.index.notna()
]

# The MLflow tracking location is handed over as an absolute path.
mlflow_dir = "./mlflow_trained_models"
mlflow_tracking_uri = os.path.abspath(mlflow_dir)

# NOTE(review): the pipeline result is assigned back to train_data, so running
# this cell a second time feeds the previous result in again -- confirm that
# this is intended.
train_data, validation_data, test_data = train_model_pipeline(
    pj,
    train_data,
    check_old_model_age=False,
    mlflow_tracking_uri=mlflow_tracking_uri,
    artifact_folder="./mlflow_artifacts",
)

2025-11-04 07:52:33 [info     ] Model successfully loaded with MLflow
[0]	validation_0-rmse:236.25871	validation_1-rmse:207.69461
[1]	validation_0-rmse:191.50596	validation_1-rmse:165.07445
[2]	validation_0-rmse:157.79554	validation_1-rmse:136.95818
[3]	validation_0-rmse:136.19151	validation_1-rmse:119.18746
[4]	validation_0-rmse:118.99135	validation_1-rmse:107.53555
[5]	validation_0-rmse:107.45215	validation_1-rmse:100.49162
[6]	validation_0-rmse:98.65690	validation_1-rmse:95.00617
[7]	validation_0-rmse:92.69398	validation_1-rmse:90.81742
[8]	validation_0-rmse:87.72513	validation_1-rmse:88.76102
[9]	validation_0-rmse:84.07820	validation_1-rmse:86.75683
[10]	validation_0-rmse:80.17959	validation_1-rmse:85.26305
[11]	validation_0-rmse:78.00006	validation_1-rmse:84.61493
[12]	validation_0-rmse:75.90753	validation_1-rmse:84.35151
[13]	validation_0-rmse:74.97944	validation_1-rmse:84.01239
[14]	validation_0-rmse:73.67082	validation_1-rmse:84.09357
[15]	validation_0-rmse:72.22394	validation_

## Analyse the trained model
Now that the model has been trained, you can inspect the results.

Exercise: answer the following questions.
- Are all of the features in the feature importance plot in the input data? Why?
    - What are the most important features?
- Which time horizon is more accurate?
    - Hint: zoom in on the same day for both the Predictor0.25 and Predictor47.0 and examine them next to each other.
- Where is my trained model?




The first two plots are the 'predictor in action' plots for the two time horizons (0.25 means fifteen minutes ahead, 47.0 means 47 hours ahead). In these plots you can see three different data outputs: train, validation and test. For each of these, you can see an '_actual' and '_predict'. This means that for every one of these data outputs, the measured value and the value predicted by OpenSTEF are plotted. Thus 'train_predict' is the prediction by OpenSTEF based on the train data.  

The last plot is the feature importance. This plot shows all of your input features (radiation, windspeed, lagged load, etc.) and how much they influence the forecast. If a block is relatively large, this means the feature is relatively important for the forecast. Thus, large changes in the value of this feature result in a large difference in the forecast.

Note: These IFrames do not work in Google Colab. The images can be found in the folder `mlflow_artifacts` and opened in your browser.

In [48]:
# Embed the HTML reports in the notebook. This is skipped in Google Colab.
# The reports are looked up in ./mlflow_artifacts/<id of the prediction job>/.
if not IN_COLAB:
    # Inspect local files.
    display(IFrame('./mlflow_artifacts/{}/Predictor0.25.html'.format(pj['id']), width=900, height=400))
    display(IFrame('./mlflow_artifacts/{}/Predictor47.0.html'.format(pj['id']), width=800, height=400))
    display(IFrame('./mlflow_artifacts/{}/weight_plot.html'.format(pj['id']), width=800, height=400))


## Visual Studio Code has difficulties with displaying HTML files. If you are working with VSC and are not able to inspect the plots, uncomment the code below
## to open the plots in your browser.

# import webbrowser
# webbrowser.open('./mlflow_artifacts/{}/Predictor0.25.html'.format(pj['id']))
# webbrowser.open('./mlflow_artifacts/{}/Predictor47.0.html'.format(pj['id']))
# webbrowser.open('./mlflow_artifacts/{}/weight_plot.html'.format(pj['id']))

In [108]:
# Hold-out window: the 24 rows that directly follow the training data
# (the stop position of iloc is exclusive, hence +25).
# This replaces the test_data that train_model_pipeline returned earlier.
test_data=input_data.iloc[traing_data_last_index+1:traing_data_last_index+25]
print(test_data)

                             load  Holiday  Holiday_Type  temp  rhum  prcp  \
date_time                                                                    
2025-06-16 00:00:00+00:00  1481.0      0.0           0.0  28.5  81.0   0.0   
2025-06-16 01:00:00+00:00  1503.0      0.0           0.0  28.4  86.0   0.0   
2025-06-16 02:00:00+00:00  1446.0      0.0           0.0  29.1  82.0   0.1   
2025-06-16 03:00:00+00:00  1427.0      0.0           0.0  31.0  70.0   0.1   
2025-06-16 04:00:00+00:00  1373.0      0.0           0.0  31.6  68.0   0.3   
2025-06-16 05:00:00+00:00  1398.0      0.0           0.0  32.2  66.0   0.3   
2025-06-16 06:00:00+00:00  1424.0      0.0           0.0  33.2  62.0   0.1   
2025-06-16 07:00:00+00:00  1389.0      0.0           0.0  32.5  66.0   0.5   
2025-06-16 08:00:00+00:00  1315.0      0.0           0.0  32.2  67.0   0.1   
2025-06-16 09:00:00+00:00  1275.0      0.0           0.0  31.2  70.0   0.8   
2025-06-16 10:00:00+00:00  1250.0      0.0           0.0  30.9  

In [109]:
import numpy as np
from openstef.pipeline.create_forecast import create_forecast_pipeline

# Prepare data to make the forecast.
# Keep the measured load of the hold-out window for the comparison later on.
realised = input_data.loc[test_data.index, 'load'].copy()
# Work on a copy so that input_data keeps its measurements, then clear the
# load in the window that has to be forecast.
to_forecast_data = input_data.copy()
to_forecast_data.loc[test_data.index, 'load'] = np.nan

In [110]:
# Timestamps of the hold-out window, i.e. the rows whose load was cleared in to_forecast_data.
print(test_data.index)

DatetimeIndex(['2025-06-16 00:00:00+00:00', '2025-06-16 01:00:00+00:00',
               '2025-06-16 02:00:00+00:00', '2025-06-16 03:00:00+00:00',
               '2025-06-16 04:00:00+00:00', '2025-06-16 05:00:00+00:00',
               '2025-06-16 06:00:00+00:00', '2025-06-16 07:00:00+00:00',
               '2025-06-16 08:00:00+00:00', '2025-06-16 09:00:00+00:00',
               '2025-06-16 10:00:00+00:00', '2025-06-16 11:00:00+00:00',
               '2025-06-16 12:00:00+00:00', '2025-06-16 13:00:00+00:00',
               '2025-06-16 14:00:00+00:00', '2025-06-16 15:00:00+00:00',
               '2025-06-16 16:00:00+00:00', '2025-06-16 17:00:00+00:00',
               '2025-06-16 18:00:00+00:00', '2025-06-16 19:00:00+00:00',
               '2025-06-16 20:00:00+00:00', '2025-06-16 21:00:00+00:00',
               '2025-06-16 22:00:00+00:00', '2025-06-16 23:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='date_time', freq=None)


In [111]:
# Measured load of the hold-out window, copied before it was cleared in to_forecast_data.
print(realised)

date_time
2025-06-16 00:00:00+00:00    1481.0
2025-06-16 01:00:00+00:00    1503.0
2025-06-16 02:00:00+00:00    1446.0
2025-06-16 03:00:00+00:00    1427.0
2025-06-16 04:00:00+00:00    1373.0
2025-06-16 05:00:00+00:00    1398.0
2025-06-16 06:00:00+00:00    1424.0
2025-06-16 07:00:00+00:00    1389.0
2025-06-16 08:00:00+00:00    1315.0
2025-06-16 09:00:00+00:00    1275.0
2025-06-16 10:00:00+00:00    1250.0
2025-06-16 11:00:00+00:00    1226.0
2025-06-16 12:00:00+00:00    1203.0
2025-06-16 13:00:00+00:00    1222.0
2025-06-16 14:00:00+00:00    1277.0
2025-06-16 15:00:00+00:00    1431.0
2025-06-16 16:00:00+00:00    1534.0
2025-06-16 17:00:00+00:00    1604.0
2025-06-16 18:00:00+00:00    1649.0
2025-06-16 19:00:00+00:00    1649.0
2025-06-16 20:00:00+00:00    1597.0
2025-06-16 21:00:00+00:00    1624.0
2025-06-16 22:00:00+00:00    1595.0
2025-06-16 23:00:00+00:00    1526.0
Name: load, dtype: float64


In [112]:
# Show the hold-out rows of the forecast input; their load should now be NaN.
print(to_forecast_data.iloc[traing_data_last_index+1:traing_data_last_index+25])

                           load  Holiday  Holiday_Type  temp  rhum  prcp  \
date_time                                                                  
2025-06-16 00:00:00+00:00   NaN      0.0           0.0  28.5  81.0   0.0   
2025-06-16 01:00:00+00:00   NaN      0.0           0.0  28.4  86.0   0.0   
2025-06-16 02:00:00+00:00   NaN      0.0           0.0  29.1  82.0   0.1   
2025-06-16 03:00:00+00:00   NaN      0.0           0.0  31.0  70.0   0.1   
2025-06-16 04:00:00+00:00   NaN      0.0           0.0  31.6  68.0   0.3   
2025-06-16 05:00:00+00:00   NaN      0.0           0.0  32.2  66.0   0.3   
2025-06-16 06:00:00+00:00   NaN      0.0           0.0  33.2  62.0   0.1   
2025-06-16 07:00:00+00:00   NaN      0.0           0.0  32.5  66.0   0.5   
2025-06-16 08:00:00+00:00   NaN      0.0           0.0  32.2  67.0   0.1   
2025-06-16 09:00:00+00:00   NaN      0.0           0.0  31.2  70.0   0.8   
2025-06-16 10:00:00+00:00   NaN      0.0           0.0  30.9  71.0   1.0   
2025-06-16 1

In [113]:
# Clean the index of to_forecast_data: keep only the first row of every
# duplicated timestamp and discard rows whose timestamp is missing (NaT).
to_forecast_data = to_forecast_data[
    ~to_forecast_data.index.duplicated(keep='first') & to_forecast_data.index.notna()
]

# Location where the model was stored in the last exercise.
mlflow_tracking_uri = "./mlflow_trained_models"

# Create the forecast for the prediction job from the prepared input data.
forecast = create_forecast_pipeline(pj, to_forecast_data, mlflow_tracking_uri)

2025-11-04 07:53:17 [info     ] Model successfully loaded with MLflow
2025-11-04 07:53:17 [info     ] Found 24 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.001000541960228457 num_values=24 pj_id=288
2025-11-04 07:53:20 [info     ] Postproces in preparation of storing


In [114]:
# Show the 'forecast' column for the first 24 rows of the result.
display(forecast[['forecast']].head(24))

Unnamed: 0_level_0,forecast
date_time,Unnamed: 1_level_1
2025-06-16 00:00:00+00:00,1485.327393
2025-06-16 01:00:00+00:00,1321.171631
2025-06-16 02:00:00+00:00,1351.788086
2025-06-16 03:00:00+00:00,1353.3927
2025-06-16 04:00:00+00:00,1366.372314
2025-06-16 05:00:00+00:00,1370.308228
2025-06-16 06:00:00+00:00,1412.314087
2025-06-16 07:00:00+00:00,1390.793945
2025-06-16 08:00:00+00:00,1277.286865
2025-06-16 09:00:00+00:00,1191.430298


In [115]:
# Calculate absolute and percentage differences between realised and forecast.
# The forecast is aligned on the timestamps of `realised` instead of taking the
# first 24 rows by position, so the comparison stays limited to the hold-out
# window even when the forecast starts elsewhere or has a different length.
comparison_df = pd.DataFrame({
    'realised': realised,
    'forecast': forecast['forecast'].reindex(realised.index),
})

# Absolute error, and the same error relative to the realised load in percent.
comparison_df['absolute_difference'] = (comparison_df['forecast'] - comparison_df['realised']).abs()
comparison_df['percentage_difference'] = (comparison_df['absolute_difference'] / comparison_df['realised']) * 100

display(comparison_df)

Unnamed: 0_level_0,realised,forecast,absolute_difference,percentage_difference
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-06-16 00:00:00+00:00,1481.0,1485.327393,4.327393,0.292194
2025-06-16 01:00:00+00:00,1503.0,1321.171631,181.828369,12.097696
2025-06-16 02:00:00+00:00,1446.0,1351.788086,94.211914,6.515347
2025-06-16 03:00:00+00:00,1427.0,1353.3927,73.6073,5.158185
2025-06-16 04:00:00+00:00,1373.0,1366.372314,6.627686,0.482716
2025-06-16 05:00:00+00:00,1398.0,1370.308228,27.691772,1.980813
2025-06-16 06:00:00+00:00,1424.0,1412.314087,11.685913,0.82064
2025-06-16 07:00:00+00:00,1389.0,1390.793945,1.793945,0.129154
2025-06-16 08:00:00+00:00,1315.0,1277.286865,37.713135,2.867919
2025-06-16 09:00:00+00:00,1275.0,1191.430298,83.569702,6.554486
