In [1]:
import pandas as pd
import duckdb as ddb
import optuna
import numpy as np

from darts import TimeSeries
from darts.models import LinearRegressionModel
from darts.dataprocessing.transformers import Scaler
from darts.metrics import rmse

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

Support for Torch based models not available. To enable them, install "darts", "u8darts[torch]" or "u8darts[all]" (with pip); or "u8darts-torch" or "u8darts-all" (with conda).


In [2]:
con = ddb.connect("./kalam_hydropower.db", read_only=True)

In [3]:
# These are devices with weak signals which we can exclude from the analysis - these were mentioned in the samplesubmission notebook
devices_to_drop = ["3", "5", "11", "14", "15", "17", "24", "25", "27", "33", "4", "9"]

## Modelling and Submission

Now that we have an idea of features and reasonable data structures, this next section aims to:
- Structure the data to be used with the Darts package - this depends on a `TimeSeries` data type rather than dataframes
- Splitting and scaling the data appropriately to avoid leakage in our test sets
- Creating an appropriate scoring and prediction function to easily format our results

In [4]:
# It is clear from our sub-plots that there is a long period of 'no signal' in the data, so I'll use data from 2024-07-01 onward
# for simplicity and to avoid too much zero biasing. Ofcourse, the fillna(0) adds some degree of biasing as well
filtered_to_forecast_df = con.sql(
    """select
        date, source, consumer_device, data_user, kwh 
    from prepared.daily_hydropower_production
""").to_df()

filtered_to_forecast_df = filtered_to_forecast_df[~(filtered_to_forecast_df["consumer_device"].isin(devices_to_drop))]

filtered_to_forecast_pivotted_df = filtered_to_forecast_df.pivot(columns="source", index="date", values="kwh").copy()
filtered_to_forecast_pivotted_df.fillna(0, inplace=True)

filtered_to_forecast_df = filtered_to_forecast_pivotted_df.melt(value_vars=list(filtered_to_forecast_pivotted_df.columns), var_name="source", value_name="kwh", ignore_index=False)

filtered_to_forecast_df.reset_index(inplace=True, drop=False)


filtered_to_forecast_df = filtered_to_forecast_df[filtered_to_forecast_df["date"] >= pd.Timestamp("2024-07-01")]

filtered_to_forecast_df.head()


Unnamed: 0,date,source,kwh
373,2024-07-01,consumer_device_10_data_user_1,0.0
374,2024-07-02,consumer_device_10_data_user_1,0.0
375,2024-07-03,consumer_device_10_data_user_1,0.0
376,2024-07-04,consumer_device_10_data_user_1,0.0
377,2024-07-05,consumer_device_10_data_user_1,0.0


In [5]:
filtered_to_forecast_df.shape

(35785, 3)

In [6]:
# Now I acutally want to use the device, user and source as static covariates in darts, this is a bit of a roundabout
# way of getting to the values but this gets the consumer device and data user numbers back
filtered_to_forecast_df[['consumer_device', 'data_user']] = filtered_to_forecast_df['source'].str.extract(r'consumer_device_(\d+)_data_user_(\d+)')
filtered_to_forecast_df["consumer_device"] = filtered_to_forecast_df["consumer_device"].astype(int)
filtered_to_forecast_df["data_user"] = filtered_to_forecast_df["data_user"].astype(int)

In [7]:
filtered_to_forecast_df.shape

(35785, 5)

In [8]:
features_df = con.sql("select * from prepared.daily_features").to_df()

features_df.head()

Unnamed: 0,date,avg_temperature,avg_dewpoint_temperature,avg_u_wind_component,avg_v_wind_component,total_precipitation,total_snowfall,avg_snow_cover_perc,temp_dew_diff,wind_speed,precip_snow_ratio,precip_minus_snow,rolling_precip_7d,rolling_snow_30d,melt_potential,wind_variability_3d,dayofyear,month,sin_doy,cos_doy
0,2023-06-03,1.86028,-3.348664,0.025054,-0.657726,0.004557,0.0,99.972656,5.208944,0.658203,4557.0,0.004557,,,185.977159,,154,6,0.47116,-0.882048
1,2023-06-04,3.99274,-1.905203,-0.180909,-0.505298,0.024096,0.0,99.972656,5.897942,0.536706,24096.402,0.024096,,,399.164785,,155,6,0.455907,-0.890028
2,2023-06-05,4.794523,-3.781657,-0.145114,-0.498183,0.01158,1.166e-06,99.972656,8.576181,0.518888,5346.08338,0.011578,,,479.321249,0.075815,156,6,0.440519,-0.897743
3,2023-06-06,6.30439,-4.670615,0.018054,-0.478623,0.008914,5.22e-08,99.972656,10.975005,0.478963,8471.506368,0.008914,,,630.266635,0.029568,157,6,0.425,-0.905193
4,2023-06-07,7.003922,-3.965763,0.083701,-0.516598,0.00865,0.0,99.972656,10.969686,0.523335,8649.5,0.00865,,,700.200726,0.024435,158,6,0.409356,-0.912375


In [None]:
# vector_ts = TimeSeries.from_group_dataframe(
#     filtered_to_forecast_df,
#     time_col="date",
#     group_cols="source",
#     static_cols=["consumer_device", "data_user"],
#     value_cols=["kwh"]
# )

In [None]:
# vector_ts[0].static_covariates_values()[0][0]

In [9]:
# I setup a dictionary with a key per series, and a value containing the darts timeseries' incl. training, testing and scaled data
# as well as the individual scalers used to invert the transformation
ts_dict = {}

for series_name in filtered_to_forecast_df["source"].unique():
        
    try:

        series_ts = TimeSeries.from_dataframe(
            df=filtered_to_forecast_df[filtered_to_forecast_df["source"] == series_name],
            time_col="date",
            value_cols=["kwh"]
        )

        scaler = Scaler(StandardScaler())

        series_scaled_ts = scaler.fit_transform(series_ts)

        train_ts, test_ts = series_ts.split_after(len(series_ts) - 31)

        train_scaler = Scaler(StandardScaler())

        train_scaled_ts = train_scaler.fit_transform(train_ts)

    except Exception as e:
        print(f"Failed to process series {series_name} due to {e}")
        continue

    ts_dict[series_name] = {
        "ts": series_ts,
        "scaled_ts": series_scaled_ts,
        "scaler": scaler,
        "train_ts": train_ts,
        "train_scaled_ts": train_scaled_ts,
        "train_scaler": train_scaler,
        "test_ts": test_ts,
    }


In [10]:
# In darts you can pass a list of timeseries to a model's .fit function so that the model is trained on multiple series at once
train_ts_list = [ts_dict[series_name]["train_scaled_ts"] for series_name in ts_dict.keys()]

# These are the full series which we will use to train the final model
ts_list = [ts_dict[series_name]["scaled_ts"] for series_name in ts_dict.keys()]

In [11]:
# The features we want to use need to also be scaled so that they can be used with a model like LinearRegression
features_ts = TimeSeries.from_dataframe(features_df, time_col="date")

features_scaler = Scaler(StandardScaler())

scaled_features_ts = features_scaler.fit_transform(features_ts)

In [12]:
# I need the sample submission file to produce an 'accurate score' i.e. how well am I doing on the series that my model will
# actually be validated on
ss_df = pd.read_csv("./data/SampleSubmission.csv")
ss_df[["date", "source"]] = ss_df["ID"].str.split("_", expand=True, n=1)

ss_df["date"] = pd.to_datetime(ss_df["date"])

ss_df.head()

Unnamed: 0,ID,kwh,date,source
0,2024-09-24_consumer_device_12_data_user_1,0,2024-09-24,consumer_device_12_data_user_1
1,2024-09-25_consumer_device_12_data_user_1,0,2024-09-25,consumer_device_12_data_user_1
2,2024-09-26_consumer_device_12_data_user_1,0,2024-09-26,consumer_device_12_data_user_1
3,2024-09-27_consumer_device_12_data_user_1,0,2024-09-27,consumer_device_12_data_user_1
4,2024-09-28_consumer_device_12_data_user_1,0,2024-09-28,consumer_device_12_data_user_1


In [13]:
def score_model(model, ts_dict, ss_df, future_covs=None, forecast_horizon=31, scaled=False):
    """A utility function to score a model based on the data in the submission set"""

    rmse_scores = []

    series_to_forecast = set(ts_dict.keys()).intersection(set(ss_df["source"].unique()))

    for index, series_name in enumerate(series_to_forecast):

        predictions = model.predict(forecast_horizon, series=ts_dict[series_name]["train_scaled_ts"], future_covariates=future_covs[index], show_warnings=False)

        if scaled:
            predictions = ts_dict[series_name]["train_scaler"].inverse_transform(predictions)
        
        rmse_scores.append(rmse(ts_dict[series_name]["test_ts"], predictions))
    
    return np.mean(rmse_scores)

In [14]:
def create_predictions(model, ts_dict, ss_df, future_covs, forecast_horizon=31, scaled=False):
    """A utility function which can easily create a submission based on the sample submission file"""

    predictions_df = pd.DataFrame()

    series_to_forecast = set(ts_dict.keys()).intersection(set(ss_df["source"].unique()))

    for index, series_name in enumerate(series_to_forecast):

        predictions = model.predict(forecast_horizon, series=ts_dict[series_name]["scaled_ts"], future_covariates=future_covs[index])

        if scaled:
            predictions = ts_dict[series_name]["scaler"].inverse_transform(predictions)

        pred_df = predictions.to_dataframe()
        pred_df["source"] = series_name

        predictions_df = pd.concat([predictions_df, pred_df])


        predictions_df.sort_values(by=["source", "date"], inplace=True)

        predictions_df["ID"] = predictions_df.index.astype(str) + "_" + predictions_df["source"]

    return predictions_df

In [None]:
model_kwargs = {
    "lags": [-1],
    "output_chunk_length": 9,
    "lags_future_covariates": [-30],
    "use_static_covariates": False,
    "random_state": 42
}

In [16]:
## Setting up my model
linear_model = LinearRegressionModel(
    **model_kwargs
)

future_covs = [scaled_features_ts[['precip_snow_ratio']] for _ in range(len(train_ts_list))]

linear_model.fit(train_ts_list, future_covariates=future_covs)

LinearRegressionModel(lags=[-1], lags_past_covariates=None, lags_future_covariates=[-30], output_chunk_length=9, output_chunk_shift=0, add_encoders=None, likelihood=None, quantiles=None, random_state=42, multi_models=True, use_static_covariates=False)

In [17]:
score_model(linear_model, ts_dict, ss_df, future_covs, scaled=True)

4.910320028837711

In [18]:
## Full model - now we can use all the data available to us to make the best model possible
full_model = LinearRegressionModel(
    **model_kwargs
)

future_covs = [scaled_features_ts[['precip_snow_ratio']] for _ in range(len(ts_list))]

full_model.fit(ts_list, future_covariates=future_covs)

LinearRegressionModel(lags=[-1], lags_past_covariates=None, lags_future_covariates=[-30], output_chunk_length=9, output_chunk_shift=0, add_encoders=None, likelihood=None, quantiles=None, random_state=42, multi_models=True, use_static_covariates=False)

In [19]:
forecast_df = create_predictions(full_model, ts_dict, ss_df, future_covs, scaled=True)

forecast_df.head()

component,kwh,source,ID
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-09-24,0.16582,consumer_device_12_data_user_1,2024-09-24_consumer_device_12_data_user_1
2024-09-25,0.156552,consumer_device_12_data_user_1,2024-09-25_consumer_device_12_data_user_1
2024-09-26,0.150046,consumer_device_12_data_user_1,2024-09-26_consumer_device_12_data_user_1
2024-09-27,0.147673,consumer_device_12_data_user_1,2024-09-27_consumer_device_12_data_user_1
2024-09-28,0.146711,consumer_device_12_data_user_1,2024-09-28_consumer_device_12_data_user_1


In [20]:
forecast_df[["ID","kwh"]].to_csv("./submissions/my_forecast.csv", index=False)

In [21]:
con.close()

# But Wait, How Did You Get Those Parameters?

To find the optimal parameters for my LinearRegressor I leveraged `optuna` to test many variants of parameter combinations. Then tried using the top X results for submissions on Zindi to find the best validation score I could.

In [22]:
def covariate_objective(trial: optuna.Trial):
    lags = trial.suggest_categorical("lags", [[-1], [-1, -7], [-1, -14]])
    output_chunk_length = trial.suggest_int("output_chunk_length", 5, 20)
    future_cov_lags = trial.suggest_categorical("future_cov_lags", [[0], [-30], [-60], [-90]])

    future_cov_options = [
        ["avg_temperature"],
        ["avg_dewpoint_temperature"],
        ["wind_speed"],
        ["precip_snow_ratio"],
        ["dayofyear"],
        ["avg_temperature", "avg_dewpoint_temperature", "wind_speed", "precip_snow_ratio", "dayofyear"],
    ]
    selected_covariates = trial.suggest_categorical("future_covs", future_cov_options)
    
    future_covs = [scaled_features_ts[selected_covariates]] * len(train_ts_list)

    # Create and fit model
    linear_model = LinearRegressionModel(
        lags=lags,
        output_chunk_length=output_chunk_length,
        use_static_covariates=False, ## In my original code I tried out using the consumer devices and users as static covs
        lags_future_covariates=future_cov_lags,
        random_state=42,
    )

    linear_model.fit(train_ts_list, future_covariates=future_covs)

    mean_rmse = score_model(linear_model, ts_dict, ss_df, future_covs=future_covs, scaled=True)

    return mean_rmse


In [23]:
study = optuna.create_study(direction="minimize")

study.optimize(covariate_objective, n_trials=50, n_jobs=8, show_progress_bar=True)

[I 2025-05-19 10:26:29,357] A new study created in memory with name: no-name-2a9909f9-65c7-43be-8372-61ce9c162ef7


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-19 10:26:44,852] Trial 7 finished with value: 5.1698520936852335 and parameters: {'lags': [-1, -7], 'output_chunk_length': 17, 'future_cov_lags': [-60], 'future_covs': ['precip_snow_ratio']}. Best is trial 7 with value: 5.1698520936852335.
[I 2025-05-19 10:26:45,389] Trial 3 finished with value: 4.9538173832747185 and parameters: {'lags': [-1, -14], 'output_chunk_length': 9, 'future_cov_lags': [-60], 'future_covs': ['avg_temperature', 'avg_dewpoint_temperature', 'wind_speed', 'precip_snow_ratio', 'dayofyear']}. Best is trial 3 with value: 4.9538173832747185.
[I 2025-05-19 10:26:48,578] Trial 1 finished with value: 4.654956614360072 and parameters: {'lags': [-1], 'output_chunk_length': 8, 'future_cov_lags': [-90], 'future_covs': ['avg_temperature']}. Best is trial 1 with value: 4.654956614360072.
[I 2025-05-19 10:26:49,054] Trial 0 finished with value: 4.978743943408184 and parameters: {'lags': [-1, -7], 'output_chunk_length': 11, 'future_cov_lags': [-30], 'future_covs': ['pr

In [24]:
study.best_params

{'lags': [-1],
 'output_chunk_length': 7,
 'future_cov_lags': [-90],
 'future_covs': ['avg_temperature']}