In [1]:
# Use the %pip magic (not `! pip`) so the pinned packages are installed into the
# environment of the running kernel rather than whatever `pip` the subshell finds.
%pip install openstef==3.4.72 jupyter==1.0

Collecting openstef==3.4.72
  Downloading openstef-3.4.72-py3-none-any.whl.metadata (8.8 kB)
Collecting jupyter==1.0
  Downloading jupyter-1.0.0-py2.py3-none-any.whl.metadata (995 bytes)
Collecting holidays==0.21 (from openstef==3.4.72)
  Downloading holidays-0.21-py3-none-any.whl.metadata (15 kB)
Collecting joblib==1.3.2 (from openstef==3.4.72)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting mlflow~=2.3 (from openstef==3.4.72)
  Downloading mlflow-2.22.2-py3-none-any.whl.metadata (30 kB)
Collecting optuna~=3.1 (from openstef==3.4.72)
  Downloading optuna-3.6.2-py3-none-any.whl.metadata (17 kB)
Collecting optuna-integration~=3.6 (from openstef==3.4.72)
  Downloading optuna_integration-3.6.0-py3-none-any.whl.metadata (10 kB)
Collecting pvlib==0.10.5 (from openstef==3.4.72)
  Downloading pvlib-0.10.5-py3-none-any.whl.metadata (2.8 kB)
Collecting pymsteams~=0.2.2 (from openstef==3.4.72)
  Downloading pymsteams-0.2.5-py3-none-any.whl.metadata (22 kB)
Collecting sci

In [2]:
# Imports and settings
from openstef.data_classes.prediction_job import PredictionJobDataClass
from openstef.pipeline.train_model import train_model_pipeline
from openstef.pipeline.create_forecast import create_forecast_pipeline
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go

# Route DataFrame.plot() / Series.plot() through Plotly instead of the default backend
pd.options.plotting.backend = "plotly"

# Runtime detection: the `google.colab` module is only importable on Google Colab.
# Any failure while importing it is treated as "not on Colab".
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True


#### Read and prepare master data for training

In [15]:
# Resolve the CSV location for the current runtime: Colab's /content directory,
# or the repository's data folder when running locally.
csv_path = (
    "/content/master_data_with_forecasted.csv"
    if IN_COLAB
    else "../data/master_data_with_forecasted.csv"
)

# The first CSV column becomes the index and is parsed as datetimes
input_data = pd.read_csv(csv_path, index_col=0, parse_dates=True)

# Discard the columns that prior notebooks also removed; absent columns are skipped
cols_to_drop = ["date_time_com", "forecasted_load"]
input_data = input_data.drop(columns=cols_to_drop, errors="ignore")

# Index hygiene: keep only the first row per index value, then remove rows
# whose index value is missing
input_data = input_data.loc[~input_data.index.duplicated(keep="first")]
input_data = input_data.loc[input_data.index.notna()]

#### Define boundaries for training and testing

In [16]:
# Training window bounds (UTC offsets are spelled out in the timestamps)
train_start, train_end = map(
    pd.Timestamp,
    ('2023-01-01 06:00:00+00:00', '2025-06-15 23:00:00+00:00'),
)
# Forecast window bounds: starts one hour after the training window ends
forecast_start, forecast_end = map(
    pd.Timestamp,
    ('2025-06-16 00:00:00+00:00', '2025-06-16 23:00:00+00:00'),
)

In [17]:
# Label-based slice of the master data; both boundary timestamps are included
train_data = input_data.loc[train_start:train_end].copy()
# Fail early with a clear message instead of printing NaT bounds and training on nothing
assert not train_data.empty, "No rows found between train_start and train_end"

# Hourly timestamps of the forecast window. The lowercase 'h' alias replaces the
# uppercase 'H', which recent pandas versions deprecate (it emitted a warning here).
forecast_index = pd.date_range(forecast_start, forecast_end, freq='h', tz=train_data.index.tz)

print(f"Training from {train_data.index.min()} to {train_data.index.max()}")
print(f"Forecast window from {forecast_index.min()} to {forecast_index.max()}")

# Display a quick peek
display(input_data.head())


Training from 2023-01-01 06:00:00+00:00 to 2025-06-15 23:00:00+00:00
Forecast window from 2025-06-16 00:00:00+00:00 to 2025-06-16 23:00:00+00:00



'H' is deprecated and will be removed in a future version, please use 'h' instead.



Unnamed: 0_level_0,load,Holiday,Holiday_Type,temp,rhum,prcp,wdir,wspd,pres,cldc,coco
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-01 06:00:00+00:00,834.0,0.0,0.0,22.0,60.0,0.0,340.0,7.6,1020.2,1.0,1.0
2023-01-01 07:00:00+00:00,736.0,0.0,0.0,22.7,53.0,0.0,9.0,1.8,1018.2,1.0,1.0
2023-01-01 08:00:00+00:00,720.0,0.0,0.0,23.4,49.0,0.0,354.0,1.8,1017.3,1.0,1.0
2023-01-01 09:00:00+00:00,690.0,0.0,0.0,23.7,51.0,0.0,0.0,0.0,1017.2,0.0,1.0
2023-01-01 10:00:00+00:00,668.0,0.0,0.0,22.0,59.0,0.0,302.0,1.8,1016.9,0.0,1.0


In [5]:
# Helper: train and forecast for a given model type

def train_and_forecast(model_type: str, pj_id: int, model_name: str, model_kwargs: dict):
    """Train an OpenSTEF model of the given type and forecast the hold-out window.

    Besides its arguments, this helper reads the notebook-level variables
    ``train_data``, ``input_data`` and ``forecast_index``, so the cells defining
    them must have been executed first.

    Args:
        model_type: Model identifier stored on the prediction job; must be
            ``"xgb"`` or ``"lgb"``.
        pj_id: Identifier of the prediction job.
        model_name: Name stored on the prediction job.
        model_kwargs: Hyperparameters stored on the prediction job as
            ``model_kwargs``.

    Returns:
        A ``(pj, realised, forecast)`` tuple: the prediction job object, a copy of
        the ``load`` values of ``input_data`` at ``forecast_index``, and the result
        of ``create_forecast_pipeline``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported identifiers.
    """
    supported_model_types = ("xgb", "lgb")
    if model_type not in supported_model_types:
        # Explicit exception rather than `assert`, which is removed under `python -O`
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_model_types}"
        )

    pj = PredictionJobDataClass(
        id=pj_id,
        model=model_type,
        forecast_type="demand",
        horizon_minutes=120,
        resolution_minutes=60,
        name=model_name,
        save_train_forecasts=True,
        ignore_existing_models=True,
        model_kwargs=model_kwargs,
        quantiles=[0.1, 0.5, 0.9],
    )

    # MLflow location: a local directory, passed as an absolute path
    mlflow_dir = "./mlflow_trained_models"
    mlflow_tracking_uri = os.path.abspath(mlflow_dir)

    # Train. The pipeline's return value is not needed here; the forecast step
    # below is pointed at the same MLflow location.
    train_model_pipeline(
        pj,
        train_data,
        check_old_model_age=False,
        mlflow_tracking_uri=mlflow_tracking_uri,
        artifact_folder="./mlflow_artifacts",
    )

    # Prepare forecast data: remember the realised load, then blank it out in a
    # copy of the full data set so the forecast window has no load values
    realised = input_data.loc[forecast_index, 'load'].copy(deep=True)
    to_forecast = input_data.copy(deep=True)
    to_forecast.loc[forecast_index, 'load'] = np.nan

    # Forecast
    forecast = create_forecast_pipeline(
        pj,
        to_forecast,
        mlflow_tracking_uri,
    )

    return pj, realised, forecast


## XGBoost (XGB) model: train with extensive hyperparameters and forecast


In [21]:
# XGB hyperparameters, grouped by purpose; train, forecast and build the comparison frame
xgb_params = dict(
    # Booster type
    booster="gbtree",
    # Tree growth / complexity
    max_depth=8,
    min_child_weight=3,
    gamma=0.0,
    max_delta_step=0,
    # Row and column sampling
    subsample=0.9,
    colsample_bytree=0.9,
    colsample_bylevel=1.0,
    colsample_bynode=1.0,
    # Regularization
    reg_alpha=0.0,
    reg_lambda=1.0,
    # Optimization
    learning_rate=0.03,
    n_estimators=200,
    early_stopping_rounds=50,
    # Miscellaneous
    tree_method="hist",
    grow_policy="depthwise",
    max_leaves=0,
    n_jobs=-1,
    random_state=42,
)

xgb_pj, xgb_realised, xgb_forecast = train_and_forecast(
    "xgb",
    pj_id=201,
    model_name="xgb_full_params",
    model_kwargs=xgb_params,
)

# Comparison frame: realised load next to the first 24 forecast rows
xgb_comp = pd.DataFrame(
    dict(
        realised=xgb_realised,
        forecast=xgb_forecast["forecast"].iloc[:24],
    )
)

2025-11-12 04:56:10 [info     ] Model successfully loaded with MLflow
[0]	validation_0-rmse:234.06221	validation_1-rmse:216.02836
[1]	validation_0-rmse:187.63510	validation_1-rmse:186.70937
[2]	validation_0-rmse:156.51518	validation_1-rmse:179.72614
[3]	validation_0-rmse:134.99543	validation_1-rmse:182.31626
[4]	validation_0-rmse:117.59023	validation_1-rmse:175.31729
[5]	validation_0-rmse:107.09031	validation_1-rmse:183.23035
[6]	validation_0-rmse:98.87529	validation_1-rmse:191.24735
[7]	validation_0-rmse:93.61122	validation_1-rmse:198.01193
[8]	validation_0-rmse:88.19907	validation_1-rmse:196.67043
[9]	validation_0-rmse:84.86192	validation_1-rmse:197.87720
[10]	validation_0-rmse:82.13195	validation_1-rmse:197.13268
[11]	validation_0-rmse:78.75018	validation_1-rmse:196.51875
[12]	validation_0-rmse:76.03010	validation_1-rmse:201.50794
[13]	validation_0-rmse:73.46461	validation_1-rmse:203.34691
[14]	validation_0-rmse:71.89786	validation_1-rmse:203.33118
2025-11-12 04:56:17 [info     ] Fi

#### Plot actual vs forecast for XGB

In [22]:

# Two line traces sharing the comparison frame's index: realised load and XGB forecast
fig_xgb = go.Figure(
    data=[
        go.Scatter(x=xgb_comp.index, y=xgb_comp["realised"], mode="lines", name="Actual load"),
        go.Scatter(x=xgb_comp.index, y=xgb_comp["forecast"], mode="lines", name="XGB forecast"),
    ]
)
fig_xgb.update_layout(
    title="XGB: Actual vs Forecast",
    xaxis_title="Datetime",
    yaxis_title="Load",
)
fig_xgb.show()

#### Table of XGB forecast vs actual

In [14]:
# Add the absolute error and the error as a percentage of the realised load
display(
    xgb_comp.assign(
        absolute_difference=lambda df: (df["forecast"] - df["realised"]).abs(),
        # Reuses the column created on the previous line
        percentage_difference=lambda df: df["absolute_difference"] / df["realised"] * 100,
    )
)

Unnamed: 0,realised,forecast,absolute_difference,percentage_difference
2025-06-16 00:00:00+00:00,1481.0,1500.133667,19.133667,1.291942
2025-06-16 01:00:00+00:00,1503.0,1331.314209,171.685791,11.422874
2025-06-16 02:00:00+00:00,1446.0,1340.506836,105.493164,7.295516
2025-06-16 03:00:00+00:00,1427.0,1342.394165,84.605835,5.92893
2025-06-16 04:00:00+00:00,1373.0,1319.149536,53.850464,3.922102
2025-06-16 05:00:00+00:00,1398.0,1383.259277,14.740723,1.054415
2025-06-16 06:00:00+00:00,1424.0,1383.071289,40.928711,2.874207
2025-06-16 07:00:00+00:00,1389.0,1357.614136,31.385864,2.259601
2025-06-16 08:00:00+00:00,1315.0,1264.73938,50.26062,3.8221
2025-06-16 09:00:00+00:00,1275.0,1252.748535,22.251465,1.745213


## LightGBM (LGB) model: train with extensive hyperparameters and forecast


In [10]:
# LGB hyperparameters, grouped by purpose; train, forecast and build the comparison frame
lgb_params = dict(
    # Boosting type
    boosting_type="gbdt",
    # Tree / leaf complexity
    num_leaves=64,
    max_depth=-1,
    min_child_samples=20,
    min_sum_hessian_in_leaf=1e-3,
    min_gain_to_split=0.0,
    # Row and feature sampling
    bagging_fraction=0.9,
    bagging_freq=1,
    feature_fraction=0.9,
    # Regularization
    lambda_l1=0.0,
    lambda_l2=1.0,
    # Histogram bins
    max_bin=255,
    # Optimization
    learning_rate=0.03,
    n_estimators=800,
    early_stopping_rounds=50,
    # System
    num_threads=-1,
    verbosity=-1,
    deterministic=True,
)

lgb_pj, lgb_realised, lgb_forecast = train_and_forecast(
    "lgb",
    pj_id=202,
    model_name="lgb_full_params",
    model_kwargs=lgb_params,
)

# Comparison frame: realised load next to the first 24 forecast rows
lgb_comp = pd.DataFrame(
    dict(
        realised=lgb_realised,
        forecast=lgb_forecast["forecast"].iloc[:24],
    )
)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11311
[LightGBM] [Info] Number of data points in the train set: 36458, number of used features: 78
[LightGBM] [Info] Start training from score 1271.034176
2025-11-12 04:45:47 [info     ] Fitted a new model, not yet stored


2025/11/12 04:46:07 INFO mlflow.tracking.fluent: Experiment with name '202' does not exist. Creating a new experiment.


2025-11-12 04:46:07 [info     ] No previous model found in MLflow experiment_name=202
2025-11-12 04:46:16 [info     ] Model saved with MLflow        experiment_name=202
2025-11-12 04:46:17 [info     ] Logged figures to MLflow.
2025-11-12 04:46:17 [info     ] Writing reports to ./mlflow_artifacts/202
2025-11-12 04:46:19 [info     ] Model successfully loaded with MLflow
2025-11-12 04:46:19 [info     ] Found 24 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.001000541960228457 num_values=24 pj_id=202
2025-11-12 04:46:21 [info     ] Postproces in preparation of storing


Unnamed: 0,realised,forecast,absolute_difference,percentage_difference
2025-06-16 00:00:00+00:00,1481.0,1486.644531,5.644531,0.38113
2025-06-16 01:00:00+00:00,1503.0,1411.193463,91.806537,6.108219
2025-06-16 02:00:00+00:00,1446.0,1367.906986,78.093014,5.400623
2025-06-16 03:00:00+00:00,1427.0,1376.128487,50.871513,3.564927
2025-06-16 04:00:00+00:00,1373.0,1365.52901,7.47099,0.544136
2025-06-16 05:00:00+00:00,1398.0,1413.848956,15.848956,1.133688
2025-06-16 06:00:00+00:00,1424.0,1422.143735,1.856265,0.130356
2025-06-16 07:00:00+00:00,1389.0,1352.428315,36.571685,2.632951
2025-06-16 08:00:00+00:00,1315.0,1296.887734,18.112266,1.377359
2025-06-16 09:00:00+00:00,1275.0,1235.214652,39.785348,3.120419


#### Plot actual vs forecast for LGB

In [11]:
# Two line traces sharing the comparison frame's index: realised load and LGB forecast
fig_lgb = go.Figure(
    data=[
        go.Scatter(x=lgb_comp.index, y=lgb_comp["realised"], mode="lines", name="Actual load"),
        go.Scatter(x=lgb_comp.index, y=lgb_comp["forecast"], mode="lines", name="LGB forecast"),
    ]
)
fig_lgb.update_layout(
    title="LGB: Actual vs Forecast",
    xaxis_title="Datetime",
    yaxis_title="Load",
)
fig_lgb.show()


#### Table of LGB forecast vs actual

In [12]:
# Add the absolute error and the error as a percentage of the realised load
display(
    lgb_comp.assign(
        absolute_difference=lambda df: (df["forecast"] - df["realised"]).abs(),
        # Reuses the column created on the previous line
        percentage_difference=lambda df: df["absolute_difference"] / df["realised"] * 100,
    )
)

Unnamed: 0,realised,forecast,absolute_difference,percentage_difference
2025-06-16 00:00:00+00:00,1481.0,1486.644531,5.644531,0.38113
2025-06-16 01:00:00+00:00,1503.0,1411.193463,91.806537,6.108219
2025-06-16 02:00:00+00:00,1446.0,1367.906986,78.093014,5.400623
2025-06-16 03:00:00+00:00,1427.0,1376.128487,50.871513,3.564927
2025-06-16 04:00:00+00:00,1373.0,1365.52901,7.47099,0.544136
2025-06-16 05:00:00+00:00,1398.0,1413.848956,15.848956,1.133688
2025-06-16 06:00:00+00:00,1424.0,1422.143735,1.856265,0.130356
2025-06-16 07:00:00+00:00,1389.0,1352.428315,36.571685,2.632951
2025-06-16 08:00:00+00:00,1315.0,1296.887734,18.112266,1.377359
2025-06-16 09:00:00+00:00,1275.0,1235.214652,39.785348,3.120419


#### Exploratory data analysis for master data

In [20]:
# EDA: Explore trends (daily, weekly, monthly, seasonal) in the load column.
# Every profile below is the mean load per group of the datetime index.
load_series = input_data["load"].copy()

# Basic timeline preview (the index is expected to be named 'date_time')
fig_ts = px.line(load_series.reset_index(), x='date_time', y='load', title='Load over time')
fig_ts.show()

# Hour-of-day average profile (hours are taken from the index as stored)
hod = load_series.groupby(load_series.index.hour).mean().rename('avg_load')
fig_hod = px.line(hod, title='Average load by hour of day')
fig_hod.update_layout(xaxis_title='Hour of day', yaxis_title='Average load')
fig_hod.show()

# Day-of-week average profile (0=Mon)
dow = load_series.groupby(load_series.index.dayofweek).mean().rename('avg_load')
fig_dow = px.bar(dow, title='Average load by day of week')
fig_dow.update_layout(xaxis_title='Day of week (0=Mon)', yaxis_title='Average load')
fig_dow.show()

# Monthly averages
mon = load_series.groupby(load_series.index.month).mean().rename('avg_load')
fig_mon = px.bar(mon, title='Average load by month')
fig_mon.update_layout(xaxis_title='Month', yaxis_title='Average load')
fig_mon.show()

# Seasonal averages (DJF, MAM, JJA, SON)
month_to_season = {12:'DJF', 1:'DJF', 2:'DJF', 3:'MAM', 4:'MAM', 5:'MAM', 6:'JJA', 7:'JJA', 8:'JJA', 9:'SON', 10:'SON', 11:'SON'}
# groupby sorts its keys, which would order the seasons alphabetically
# (DJF, JJA, MAM, SON); reindex to calendar order so the bars read chronologically.
# A season absent from the data becomes NaN.
season_order = ['DJF', 'MAM', 'JJA', 'SON']
seasons = (
    load_series.groupby(load_series.index.month.map(month_to_season))
    .mean()
    .rename('avg_load')
    .reindex(season_order)
)
fig_season = px.bar(seasons, title='Average load by season')
fig_season.update_layout(xaxis_title='Season', yaxis_title='Average load')
fig_season.show()
