In [None]:
import pandas as pd
import duckdb as ddb
import optuna
import numpy as np

from darts import TimeSeries
from darts.models import LinearRegressionModel
from darts.dataprocessing.transformers import Scaler
from darts.metrics import rmse

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Open the DuckDB database file in read-only mode; the queries below run through this connection.
con = ddb.connect("./kalam_hydropower.db", read_only=True)

In [None]:
# Consumer devices with weak signals, excluded from the analysis
# (they were called out in the sample-submission notebook).
devices_to_drop = ["3", "5", "11", "14", "15", "17", "24", "25", "27", "33", "4", "9"]

## Modelling and Submission

Now that we have an idea of the features and of reasonable data structures, this next section aims to:
- Structure the data for use with the Darts package, which works with a `TimeSeries` data type rather than dataframes
- Split and scale the data appropriately to avoid leakage into our test sets
- Create scoring and prediction functions that make it easy to evaluate models and format our results

In [None]:
# The sub-plots show a long period of 'no signal' in the data, so only data from 2024-07-01 onward is used,
# for simplicity and to avoid too much zero biasing. Of course, the fillna(0) below adds some zero bias as well.
raw_production_df = con.sql(
    """select
        date, source, consumer_device, data_user, kwh 
    from prepared.daily_hydropower_production
""").to_df()

# Wide format: one row per date, one column per source. The weak-signal devices are dropped first,
# and any date on which a source has no value is filled with 0 kWh.
filtered_to_forecast_pivotted_df = (
    raw_production_df[~raw_production_df["consumer_device"].isin(devices_to_drop)]
    .pivot(columns="source", index="date", values="kwh")
    .fillna(0)
)

# Back to long format (date, source, kwh), restricted to the period that has signal.
# The trailing .copy() makes the result an independent frame rather than a boolean-mask slice,
# so that columns can be added to it afterwards without a chained-assignment problem.
filtered_to_forecast_df = (
    filtered_to_forecast_pivotted_df
    .melt(
        value_vars=list(filtered_to_forecast_pivotted_df.columns),
        var_name="source",
        value_name="kwh",
        ignore_index=False,
    )
    .reset_index(drop=False)
    .loc[lambda long_df: long_df["date"] >= pd.Timestamp("2024-07-01")]
    .copy()
)

filtered_to_forecast_df.head()


In [None]:
# Quick size check (rows, columns) of the long-format frame.
filtered_to_forecast_df.shape

In [None]:
# The consumer device and data user are meant to be usable as static covariates in darts.
# Both numbers are encoded in the source name, so they are parsed back out of it - a slightly
# roundabout route, but it recovers the values.
source_id_parts = filtered_to_forecast_df["source"].str.extract(r'consumer_device_(\d+)_data_user_(\d+)')
filtered_to_forecast_df[["consumer_device", "data_user"]] = source_id_parts

# The extracted groups are strings; store them as integers.
for id_column in ("consumer_device", "data_user"):
    filtered_to_forecast_df[id_column] = filtered_to_forecast_df[id_column].astype(int)

In [None]:
# Size check again after (re)assigning the consumer_device and data_user columns.
filtered_to_forecast_df.shape

In [None]:
# Load the prepared daily feature table; it is turned into a darts TimeSeries further down.
features_df = con.sql("select * from prepared.daily_features").to_df()

features_df.head()

In [None]:
# vector_ts = TimeSeries.from_group_dataframe(
#     filtered_to_forecast_df,
#     time_col="date",
#     group_cols="source",
#     static_cols=["consumer_device", "data_user"],
#     value_cols=["kwh"]
# )

In [None]:
# vector_ts[0].static_covariates_values()[0][0]

In [None]:
# Build a dictionary with one entry per series. Each entry holds the darts TimeSeries for the full
# history, the train/test split, the scaled versions, and the scalers needed to invert the scaling.

# Number of points held out for testing; equal to the default forecast_horizon (31) of the scoring helper.
HOLDOUT_DAYS = 31

ts_dict = {}

for series_name in filtered_to_forecast_df["source"].unique():

    try:

        series_ts = TimeSeries.from_dataframe(
            df=filtered_to_forecast_df[filtered_to_forecast_df["source"] == series_name],
            time_col="date",
            value_cols=["kwh"]
        )

        # Scaler fitted on the full history: used for the final model and its predictions.
        scaler = Scaler(StandardScaler())
        series_scaled_ts = scaler.fit_transform(series_ts)

        # split_before(i) puts exactly i points in the first part, so the test part has HOLDOUT_DAYS points.
        # split_after(len - 31) would keep the split point in the training part and leave only 30 test
        # points, one fewer than the 31-step forecasts they are compared against.
        train_ts, test_ts = series_ts.split_before(len(series_ts) - HOLDOUT_DAYS)

        # Separate scaler fitted on the training part only, so the test window does not influence the scaling.
        train_scaler = Scaler(StandardScaler())
        train_scaled_ts = train_scaler.fit_transform(train_ts)

    except Exception as e:
        # Best effort: a series that cannot be built or split (e.g. too short) is reported and skipped.
        print(f"Failed to process series {series_name} due to {e}")
        continue

    ts_dict[series_name] = {
        "ts": series_ts,
        "scaled_ts": series_scaled_ts,
        "scaler": scaler,
        "train_ts": train_ts,
        "train_scaled_ts": train_scaled_ts,
        "train_scaler": train_scaler,
        "test_ts": test_ts,
    }


In [None]:
# A darts model's .fit accepts a list of series, so a single model is trained on all of them at once.
# Scaled training portions, one per series:
train_ts_list = [entry["train_scaled_ts"] for entry in ts_dict.values()]

# Scaled full-length series, used to train the final model:
ts_list = [entry["scaled_ts"] for entry in ts_dict.values()]

In [None]:
# The covariate features are scaled as well, so that they suit a model like LinearRegression.
features_scaler = Scaler(StandardScaler())

features_ts = TimeSeries.from_dataframe(features_df, time_col="date")
scaled_features_ts = features_scaler.fit_transform(features_ts)

In [None]:
# The sample submission file is needed to produce an 'accurate score', i.e. one measured on the
# series that the model will actually be validated on.
ss_df = pd.read_csv("./data/SampleSubmission.csv")

# Split each ID on its first underscore only: the left part is the date, the rest is the source name.
id_parts = ss_df["ID"].str.split("_", expand=True, n=1)
ss_df["date"] = pd.to_datetime(id_parts[0])
ss_df["source"] = id_parts[1]

ss_df.head()

In [None]:
def score_model(model, ts_dict, ss_df, future_covs=None, forecast_horizon=31, scaled=False):
    """Return the mean hold-out RMSE of ``model`` over the series in the submission set.

    Only series present both in ``ts_dict`` and in ``ss_df["source"]`` are scored. For each of them
    the model forecasts ``forecast_horizon`` steps from the series' ``"train_scaled_ts"`` and the
    forecast is compared with the series' ``"test_ts"`` using ``rmse``.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(n, series=..., future_covariates=..., show_warnings=...)``.
    ts_dict : dict
        Maps series name to a dict with the keys ``"train_scaled_ts"``, ``"train_scaler"`` and ``"test_ts"``.
    ss_df : pandas.DataFrame
        Sample submission frame with a ``"source"`` column.
    future_covs : None, list/tuple, or a single covariate series
        ``None`` means no future covariates. A list/tuple must hold one entry per key of ``ts_dict``,
        in the same order as the keys (the order the training lists are built in). Any other object
        is used as-is for every series.
    forecast_horizon : int
        Number of steps to forecast.
    scaled : bool
        If True, predictions are passed through the series' ``"train_scaler"`` ``inverse_transform``
        before being scored.

    Returns
    -------
    float
        Mean RMSE over the scored series, or NaN when no series qualifies.

    Raises
    ------
    ValueError
        If ``future_covs`` is a list/tuple whose length differs from ``len(ts_dict)``.
    """
    # Pair covariates with series by name. Indexing the list by the position in an unordered set
    # (as enumerate() over a set intersection does) would not match the order used when fitting.
    if isinstance(future_covs, (list, tuple)):
        if len(future_covs) != len(ts_dict):
            raise ValueError(
                f"future_covs has {len(future_covs)} entries but ts_dict has {len(ts_dict)} series"
            )
        covs_by_series = dict(zip(ts_dict, future_covs))
    else:
        covs_by_series = dict.fromkeys(ts_dict, future_covs)

    # Keep ts_dict's own order so that repeated runs visit the series in the same sequence.
    submission_sources = set(ss_df["source"].unique())
    series_to_forecast = [name for name in ts_dict if name in submission_sources]

    rmse_scores = []

    for series_name in series_to_forecast:
        entry = ts_dict[series_name]

        predictions = model.predict(
            forecast_horizon,
            series=entry["train_scaled_ts"],
            future_covariates=covs_by_series[series_name],
            show_warnings=False,
        )

        if scaled:
            predictions = entry["train_scaler"].inverse_transform(predictions)

        rmse_scores.append(rmse(entry["test_ts"], predictions))

    if not rmse_scores:
        # Nothing to score: report NaN explicitly instead of taking the mean of an empty list.
        return float("nan")

    return np.mean(rmse_scores)

In [None]:
def create_predictions(model, ts_dict, ss_df, future_covs, forecast_horizon=31, scaled=False):
    """Forecast every series of the submission set and return the results as one long DataFrame.

    Only series present both in ``ts_dict`` and in ``ss_df["source"]`` are forecast. Each forecast
    starts from the series' ``"scaled_ts"``.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(n, series=..., future_covariates=...)`` whose result has a
        ``to_dataframe()`` method.
    ts_dict : dict
        Maps series name to a dict with the keys ``"scaled_ts"`` and ``"scaler"``.
    ss_df : pandas.DataFrame
        Sample submission frame with a ``"source"`` column.
    future_covs : None, list/tuple, or a single covariate series
        ``None`` means no future covariates. A list/tuple must hold one entry per key of ``ts_dict``,
        in the same order as the keys. Any other object is used as-is for every series.
    forecast_horizon : int
        Number of steps to forecast.
    scaled : bool
        If True, predictions are passed through the series' ``"scaler"`` ``inverse_transform``.

    Returns
    -------
    pandas.DataFrame
        The prediction columns plus ``"source"`` and ``"ID"`` (``"<index value>_<source>"``), sorted by
        source and date. An empty DataFrame when no series qualifies.

    Raises
    ------
    ValueError
        If ``future_covs`` is a list/tuple whose length differs from ``len(ts_dict)``.
    """
    # Pair covariates with series by name. Indexing the list by the position in an unordered set
    # (as enumerate() over a set intersection does) would not match the order used when fitting.
    if isinstance(future_covs, (list, tuple)):
        if len(future_covs) != len(ts_dict):
            raise ValueError(
                f"future_covs has {len(future_covs)} entries but ts_dict has {len(ts_dict)} series"
            )
        covs_by_series = dict(zip(ts_dict, future_covs))
    else:
        covs_by_series = dict.fromkeys(ts_dict, future_covs)

    submission_sources = set(ss_df["source"].unique())
    series_to_forecast = [name for name in ts_dict if name in submission_sources]

    # Collect the per-series frames and concatenate once, instead of growing a frame inside the loop.
    prediction_frames = []

    for series_name in series_to_forecast:
        entry = ts_dict[series_name]

        predictions = model.predict(
            forecast_horizon,
            series=entry["scaled_ts"],
            future_covariates=covs_by_series[series_name],
        )

        if scaled:
            predictions = entry["scaler"].inverse_transform(predictions)

        pred_df = predictions.to_dataframe()
        pred_df["source"] = series_name
        prediction_frames.append(pred_df)

    if not prediction_frames:
        return pd.DataFrame()

    # "date" is the name of the time index of each prediction frame; sort and build the IDs once.
    predictions_df = pd.concat(prediction_frames).sort_values(by=["source", "date"])
    predictions_df["ID"] = predictions_df.index.astype(str) + "_" + predictions_df["source"]

    return predictions_df

In [None]:
# Keyword arguments shared by the LinearRegressionModel instances created below,
# so that the validation model and the full model are configured identically.
model_kwargs = {
    "lags": [-1],
    "output_chunk_length": 9,
    "lags_future_covariates": [-30],
    "use_static_covariates": False,
    "random_state": 42
}

In [None]:
## Setting up my model
# Validation model: fitted on the scaled training portions only, so it can be scored on the held-out data.
linear_model = LinearRegressionModel(
    **model_kwargs
)

# One covariate series per training series; every entry is the same scaled 'precip_snow_ratio' component.
future_covs = [scaled_features_ts[['precip_snow_ratio']] for _ in range(len(train_ts_list))]

linear_model.fit(train_ts_list, future_covariates=future_covs)

In [None]:
# Mean RMSE against each series' test_ts; scaled=True inverse-transforms the predictions before scoring.
score_model(linear_model, ts_dict, ss_df, future_covs, scaled=True)

In [None]:
## Full model - now we can use all the data available to us to make the best model possible
# Same configuration as the validation model, but fitted on the full-length scaled series.
full_model = LinearRegressionModel(
    **model_kwargs
)

# Rebuilt here so that there is one covariate entry per full-length series (this rebinds future_covs).
future_covs = [scaled_features_ts[['precip_snow_ratio']] for _ in range(len(ts_list))]

full_model.fit(ts_list, future_covariates=future_covs)

In [None]:
# Forecast the submission series with the full model; scaled=True inverse-transforms the predictions.
forecast_df = create_predictions(full_model, ts_dict, ss_df, future_covs, scaled=True)

forecast_df.head()

In [None]:
# Write only the ID and kwh columns, without the index.
# NOTE(review): to_csv does not create missing folders - confirm ./submissions exists before running.
forecast_df[["ID","kwh"]].to_csv("./submissions/my_forecast.csv", index=False)

In [None]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()

# But Wait, How Did You Get Those Parameters?

To find good parameters for my `LinearRegressionModel`, I used `optuna` to test many parameter combinations. I then submitted the top few results to Zindi to find the best validation score I could.

In [None]:
def covariate_objective(trial: optuna.Trial):
    """Optuna objective: fit a LinearRegressionModel for the trial's parameters and return its mean RMSE.

    The trial chooses the target lags, the output chunk length, the future-covariate lags and which
    feature columns are used as future covariates. The model is fitted on ``train_ts_list`` and
    scored with ``score_model`` (predictions inverse-transformed, ``scaled=True``).

    The list-valued options are offered to Optuna under their string form (e.g. ``"[-1, -7]"``),
    so the recorded parameters - and therefore ``study.best_params`` - hold those strings.

    Parameters
    ----------
    trial : optuna.Trial
        The trial that supplies the parameter suggestions.

    Returns
    -------
    float
        Mean RMSE over the scored series (lower is better).
    """

    def suggest_option(name, options):
        # Optuna expects categorical choices to be None, bool, int, float or str. List-valued choices
        # trigger a warning and are not supported for persistent storage, so each option is suggested
        # by its string form and decoded back to the list here.
        option_by_label = {str(option): option for option in options}
        return option_by_label[trial.suggest_categorical(name, list(option_by_label))]

    lags = suggest_option("lags", [[-1], [-1, -7], [-1, -14]])
    output_chunk_length = trial.suggest_int("output_chunk_length", 5, 20)
    future_cov_lags = suggest_option("future_cov_lags", [[0], [-30], [-60], [-90]])

    future_cov_options = [
        ["avg_temperature"],
        ["avg_dewpoint_temperature"],
        ["wind_speed"],
        ["precip_snow_ratio"],
        ["dayofyear"],
        ["avg_temperature", "avg_dewpoint_temperature", "wind_speed", "precip_snow_ratio", "dayofyear"],
    ]
    selected_covariates = suggest_option("future_covs", future_cov_options)

    # The same covariate series is shared by every training series.
    future_covs = [scaled_features_ts[selected_covariates]] * len(train_ts_list)

    # Create and fit model
    linear_model = LinearRegressionModel(
        lags=lags,
        output_chunk_length=output_chunk_length,
        use_static_covariates=False, ## In my original code I tried out using the consumer devices and users as static covs
        lags_future_covariates=future_cov_lags,
        random_state=42,
    )

    linear_model.fit(train_ts_list, future_covariates=future_covs)

    mean_rmse = score_model(linear_model, ts_dict, ss_df, future_covs=future_covs, scaled=True)

    return mean_rmse


In [None]:
# Minimise the value returned by covariate_objective (the mean RMSE) over 50 trials, with n_jobs=8.
# NOTE(review): no sampler seed is passed to create_study, so the search will presumably differ between runs.
study = optuna.create_study(direction="minimize")

study.optimize(covariate_objective, n_trials=50, n_jobs=8, show_progress_bar=True)

In [None]:
# Parameters of the best trial, i.e. the one with the lowest objective value.
study.best_params