In [1]:
# Run name: passed to wandb.init(name=...) and used as the model_name when the
# final forecast figure is displayed.
MODEL_NAME = "test_plot"

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from multiprocessing.dummy import freeze_support
import os
import sys
# Put the parent of the current working directory on the import path so that
# modules stored there can be imported (presumably the local `helper` module —
# confirm).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)


from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
os.environ['WANDB_NOTEBOOK_NAME'] = 'n-beats.ipynb'
# Do not copy os.getenv('WANDB_API_KEY') back into os.environ: that is a no-op
# when the key is set and raises TypeError when it is missing, because
# os.environ only accepts strings. Report the missing key instead of crashing.
if not os.getenv('WANDB_API_KEY'):
    print("WANDB_API_KEY is not set; wandb.init may fall back to a stored login or ask for credentials.")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

from darts import TimeSeries
from darts.models import NBEATSModel, NaiveDrift
from darts.dataprocessing.transformers import Scaler, MissingValuesFiller
from darts.metrics import mape, r2_score, rmse, mse

from darts import TimeSeries

from darts.datasets import EnergyDataset

import helper
import glob
import wandb

from pytorch_lightning.loggers import WandbLogger

from tqdm.contrib.concurrent import process_map
import tqdm


# Hardware visible to this kernel. AVAILABLE_CPUS is later passed to
# fit(..., num_loader_workers=...), so it must always be an integer:
# os.cpu_count() returns None when the count cannot be determined, hence the
# fallback to a single worker.
AVAILABLE_GPUS = torch.cuda.device_count()
AVAILABLE_CPUS = os.cpu_count() or 1

print(f"Available GPUs: {AVAILABLE_GPUS}")
print(f"Available CPUs: {AVAILABLE_CPUS}")

# Start the Weights & Biases run that the later wandb.log calls report to.
wandb.init(project="Digital-Energy", name=MODEL_NAME)




Available GPUs: 2
Available CPUs: 32


[34m[1mwandb[0m: Currently logged in as: [33mtimmermansjoy[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Data

The first 1,000 households of the London dataset are loaded, and each one is converted to a Darts `TimeSeries`.

In [None]:
# my_time_series_dataset = []
# for x in tqdm.tqdm(sorted(glob.glob("../../../Data/london_clean/*.csv"))[:1000]):
#     df = pd.read_csv(f'{x}')
#     df["DateTime"] = pd.to_datetime(df['DateTime'])
#     # df = df.groupby(pd.Grouper(key='DateTime', freq='1D')).max("KWHhh").round(3).reset_index()
#     series = TimeSeries.from_dataframe(df, time_col='DateTime', value_cols='KWHhh').astype(np.float32)
#     #plot series and save
#     series.plot()
#     plt.savefig(f'../../../Plots/testing/{x.split("/")[-1]}.png')
#     plt.close()


#     my_time_series_dataset.append(series)


In [None]:
def reader(x):
    """Load one household CSV file as a float32 Darts ``TimeSeries``.

    Args:
        x: Path of a CSV file containing a ``DateTime`` column and a
            ``KWHhh`` value column.

    Returns:
        The series built with ``freq="30min"`` and ``fill_missing_dates=True``,
        cast to ``np.float32``.
    """
    # NOTE(review): `fillna_value` is documented as a float; passing `True`
    # presumably fills the gaps with 1.0 — confirm whether 0.0 was intended.
    household_series = TimeSeries.from_csv(
        x,
        time_col='DateTime',
        value_cols='KWHhh',
        fill_missing_dates=True,
        fillna_value=True,
        freq="30min",
    )
    return household_series.astype(np.float32)


def splitter(pattern="../../../Data/london_clean/*.csv", limit=1000):
    """Load household CSV files in parallel, one ``reader`` call per file.

    Args:
        pattern: Glob pattern selecting the CSV files to load. Defaults to the
            London dataset location used throughout this notebook.
        limit: Maximum number of files to load, taken from the start of the
            alphabetically sorted matches. ``None`` loads every match.

    Returns:
        The list produced by ``process_map``: one ``reader`` result per file.

    Raises:
        FileNotFoundError: If no file is selected (the pattern matches nothing
            or ``limit`` is 0). Subclass of ``Exception``, so existing
            ``except Exception`` handlers keep working.
    """
    # Sorting makes the selected subset independent of filesystem ordering.
    file_list = sorted(glob.glob(pattern))[:limit]
    if not file_list:
        raise FileNotFoundError(f"No files found matching {pattern!r}")
    return process_map(reader, file_list, chunksize=5)

# Load the dataset once; `my_time_series_dataset` is the list of series that
# the following cells plot and split.
# NOTE(review): the __main__ guard and freeze_support() (imported here from
# multiprocessing.dummy) look like leftovers from a script version — confirm
# they are still needed when running as a notebook.
if __name__ == "__main__":
    freeze_support()
    my_time_series_dataset = splitter()

In [None]:
import random

# Plot one randomly chosen household as a sanity check of the loaded data.
# random.choice replaces random.randint(0, len(...)): randint includes its
# upper bound, so it could return len(...) and raise an IndexError.
random.choice(my_time_series_dataset).plot()

In [None]:
## sets
# Split every household series chronologically: split_after(0.75) keeps the
# first 75 % for training and the remainder for validation. The two lists stay
# index-aligned (training_sets[i] and validation_sets[i] come from the same
# household).
training_sets = []
validation_sets = []
for x in my_time_series_dataset:
    train, val = x.split_after(0.75)
    training_sets.append(train)
    validation_sets.append(val)
# NOTE: after the loop `train` and `val` remain bound to the split of the last
# household; the NaiveDrift cell further down fits on `train`.

In [None]:
# for i in my_time_series_dataset:
#     helper.find_gaps(i.pd_dataframe(), delta=60*24)

# Model

We create an N-BEATS model that trains on the GPU, logs to Weights & Biases, and uses an early-stopping callback.

## Early stopping

An early stopping callback is used to stop the training if the validation loss does not improve after a certain number of epochs.


In [None]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
# Stop training once the monitored validation loss stops improving.
early_stop_callback = EarlyStopping(
    monitor="val_loss",  # metric watched after each validation run
    min_delta=0.005,  # changes smaller than this do not count as an improvement
    patience=15,  # validation checks without improvement before stopping
    verbose=True,
    mode="min"  # lower val_loss is better
    )

In [None]:
# Covariate-encoder configuration for Darts models.
# NOTE: currently unused — the `add_encoders=encoders` argument is commented
# out in the NBEATSModel cell.
encoders = {
    # "datetime_attribute": {"future": ["DateTime"], "past": ["DateTime"]},
    "position": {"past": ["absolute"]},
    "transformer": Scaler(),
}

In [None]:
# Lightning logger that sends training metrics to the "Digital-Energy" project
# on Weights & Biases.
wandb_logger = WandbLogger(project="Digital-Energy", log_model=True)


# input chunk = The length of the input sequence fed to the model
# output chunk = The length of the output sequence predicted by the model

model_nbeats = NBEATSModel(
    input_chunk_length=3,
    output_chunk_length=1,
    generic_architecture=False,
    #num_stacks=10,
    num_blocks=3,
    num_layers=5,
    layer_widths=512,
    n_epochs=25,
    nr_epochs_val_period=1,
    batch_size=256,
    work_dir="../../../Models",
    save_checkpoints=False,
    # model_name=MODEL_NAME,
    pl_trainer_kwargs={
        "enable_progress_bar": True,
        "enable_model_summary": True,
        # Use a GPU when one is visible and fall back to the CPU otherwise, so
        # the cell also runs on machines without CUDA instead of failing.
        "accelerator": "gpu" if AVAILABLE_GPUS > 0 else "cpu",
        "devices": 1,
        "logger": wandb_logger,
        "callbacks": [early_stop_callback],
    },
    # loss_fn=torch.nn.CrossEntropyLoss() # custom loss function
    # optimizer_cls=torch.optim.Adam,
    # add_encoders=encoders,
    random_state=42  # fixed seed for reproducible weight initialisation
)

In [None]:
# Visual check of one fixed household (index 551 of the loaded list); requires
# at least 552 series to have been loaded.
my_time_series_dataset[551].plot()

In [None]:
#wandb_logger.watch(model_nbeats) # sadly this feature does not work for Darts models
# Train on all household training splits at once and validate on the matching
# validation splits; AVAILABLE_CPUS is passed as the number of loader workers.
model_nbeats.fit(series=training_sets, val_series=validation_sets, num_loader_workers=AVAILABLE_CPUS)

In [None]:
# Evaluate on one household that was not part of the training subset.
# The file list is sorted so that index 2000 always refers to the same file and
# lies outside the first 1000 sorted files used for training; an unsorted glob
# returns files in arbitrary, filesystem-dependent order.
files = sorted(glob.glob("../../../Data/london_clean/*.csv"))
df = pd.read_csv(files[2000])
df["DateTime"] = pd.to_datetime(df['DateTime'])
# Daily maximum of the consumption column. The column is selected explicitly:
# `.max("KWHhh")` would pass the column name as the positional `numeric_only`
# flag rather than choosing a column.
# NOTE(review): `reader` builds the training series at freq="30min" while this
# cell aggregates to 1D — confirm the model is meant to be evaluated on daily
# data.
df = df.groupby(pd.Grouper(key='DateTime', freq='1D'))["KWHhh"].max().round(3).reset_index()
series = TimeSeries.from_dataframe(df, value_cols=['KWHhh'], time_col="DateTime").astype(np.float32)
# Keep only the most recent 200 points.
series = series[-200:]


# One-step-ahead forecasts over the series using the already trained model
# (retrain=False).
pred_series = model_nbeats.historical_forecasts(
    series,
    forecast_horizon=1,
    stride=1,
    retrain=False,
    verbose=True,
)

In [None]:
# Overlay the forecasts on the actual series. Explicit labels let the legend
# distinguish the two curves.
series.plot(label="actual")
pred_series.plot(label="forecast")
plt.legend()
plt.show()



In [None]:
# Naive baseline: NaiveDrift extends the straight line between the first and
# last value of the series it was fitted on.
# - NaiveDrift takes no constructor arguments (`K` belongs to NaiveSeasonal).
# - predict() expects the number of steps to forecast, not a TimeSeries; the
#   length of `series` is used as the horizon.
# NOTE(review): `train` is the variable left over from the train/validation
# split loop (the last household's training split) — confirm this is the series
# the baseline should be fitted on.
naive_model = NaiveDrift()
naive_model.fit(train)
naive_forecast = naive_model.predict(len(series))
naive_forecast.plot()


In [None]:
# Smoke test for figure logging: draw a trivial line plot and pass the pyplot
# module to Weights & Biases under the key "chart".
plt.plot([1, 2, 3, 4])
plt.ylabel("some interesting numbers")
wandb.log({"chart": plt})

In [None]:
# plot train and validation series with matplotlib

plt.figure(figsize=(20,15))
plt.ylim(0, 3)
# Training and validation split of household 1 on the same axes.
training_sets[1].plot(label=("train"))
validation_sets[1].plot(label=("val"))
# show plot
# NOTE(review): this passes the training splits of two different households
# (1 and 2) to helper.display_forecast, apparently only to exercise figure
# logging — confirm. Whether the helper draws on the current figure or opens a
# new one cannot be seen from here.
fig = helper.display_forecast(training_sets[1], training_sets[2], "1day", save=False)
plt.legend()
# Log both the pyplot state and the helper's return value to Weights & Biases.
wandb.log({"chart_test": plt,
           "helper": fig})
plt.show()


In [None]:
# Evaluate the trained model on 10 households starting at sorted index START,
# i.e. outside the first 1000 sorted files used for training.
START = 3000
for i, x in enumerate(sorted(glob.glob("../../../Data/london_clean/*.csv"))[START:START+10]):

    df = pd.read_csv(x)
    df["DateTime"] = pd.to_datetime(df['DateTime'])
    series = TimeSeries.from_dataframe(df, value_cols=['KWHhh'], time_col="DateTime").astype(np.float32)
    # Keep only the most recent 400 points.
    series = series[-400:]


    # One-step-ahead forecasts without retraining.
    pred_series = model_nbeats.historical_forecasts(
        series,
        forecast_horizon=1,
        stride=1,
        retrain=False,
        verbose=True,
    )

    # RMSE and R2 are computed once and reused for both the printout and the
    # W&B log instead of being recomputed.
    rmse_value = rmse(series, pred_series)
    r2_value = r2_score(series, pred_series)
    print(f"rmse: {rmse_value}.")
    print(f"R2 score: {r2_value}.")

    fig = helper.display_forecast(pred_series, series, "1 day", save=True, fig_name=f"{i}", model_name="test", fig_size=(20,10))

    wandb.log({
            "mape": mape(series, pred_series),
            "mse": mse(series, pred_series),
            "rmse": rmse_value,
            "r2": r2_value,
            "result": fig
    })

# Loading checkpoints of the model

Load the best checkpoint of a previously trained model so that its results can be compared with those of the model trained above.

In [None]:
# load the model
# Rebinds `model_nbeats`, replacing the model trained earlier in this notebook.
# NOTE: the checkpoint "n_beats_biggysmall" is not written by this notebook
# (the model above is built with save_checkpoints=False and no model_name), so
# it must already exist under ../../../Models/.
model_nbeats = NBEATSModel.load_from_checkpoint(work_dir="../../../Models/", model_name="n_beats_biggysmall", best=True)

In [None]:
# Evaluate the checkpointed model on 10 households starting at sorted index
# START.
START = 3000
# The data directory is "../../../Data/london_clean", as in every other cell;
# with "../../Data/..." the glob matched nothing and the loop body never ran.
for i, x in enumerate(sorted(glob.glob("../../../Data/london_clean/*.csv"))[START:START+10]):

    df = pd.read_csv(x)
    df["DateTime"] = pd.to_datetime(df['DateTime'])
    series = TimeSeries.from_dataframe(df, value_cols=['KWHhh'], time_col="DateTime", fill_missing_dates=True, freq="30min").astype(np.float32)
    # Keep only the most recent 600 points.
    series = series[-600:]


    # One-step-ahead forecasts over the series, matching the evaluation of the
    # freshly trained model. `predict(1, series)` would forecast the single
    # step after the end of `series`, which has no timestamp in common with
    # `series`, so the metrics below could not be computed on it.
    pred_series = model_nbeats.historical_forecasts(
        series,
        forecast_horizon=1,
        stride=1,
        retrain=False,
        verbose=True,
    )

    print(f"rmse: {rmse(series, pred_series)}.")
    print(f"R2 score: {r2_score(series, pred_series)}.")

    helper.display_forecast(pred_series, series, "1 day", save=True, fig_name=f"{i}", fig_size=(20,10))

In [None]:
# Display, without saving, the forecast for whatever `series` / `pred_series`
# were last bound to by the cells above.
helper.display_forecast(
    pred_series,
    series,
    "1 day",
    save=False,
    fig_name="test",
    model_name=MODEL_NAME,
    fig_size=(20, 10),
)