In [None]:
!pip3 install darts plotly wandb python-dotenv

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
from dotenv import load_dotenv
load_dotenv()

os.environ['WANDB_NOTEBOOK_NAME'] = 'pytorch_stats_own_data.ipynb'

# os.getenv() reads from os.environ, so copying WANDB_API_KEY back into os.environ
# is a no-op when the key is present and raises TypeError (value is None) when it
# is absent. Only report the missing key instead of crashing the setup cell.
if not os.getenv('WANDB_API_KEY'):
    print("WANDB_API_KEY is not set (checked the environment and .env); "
          "Weights & Biases logging may prompt for a login or fail.")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

from darts import TimeSeries
from darts.models import NBEATSModel
from darts.dataprocessing.transformers import Scaler, MissingValuesFiller
from darts.metrics import mape, r2_score

from darts import TimeSeries

from darts.datasets import EnergyDataset

import helper
import glob

from pytorch_lightning.loggers import WandbLogger
# Logger instance for the "Digital-Energy" W&B project; it is handed to the
# model's trainer through `pl_trainer_kwargs` further down in the notebook.
wandb_logger = WandbLogger(project="Digital-Energy")


# Hardware visible to this kernel: CUDA devices as reported by torch, and the
# CPU count as reported by the OS (os.cpu_count() may return None).
AVAILABLE_GPUS = torch.cuda.device_count()
AVAILABLE_CPUS = os.cpu_count()

for device_kind, device_count in (("GPUs", AVAILABLE_GPUS), ("CPUs", AVAILABLE_CPUS)):
    print(f"Available {device_kind}: {device_count}")

In [None]:
# Read a pandas DataFrame
# df = pd.read_parquet('../../Data/ldn_df2.parquet')
# df

In [None]:
# Load one cleaned household file and show it as the cell output.
single_household_csv = '../../Data/london_clean/cleaned_household_MAC000002.csv'
df = pd.read_csv(single_household_csv)
df

In [None]:
# Tidy the single-household frame:
#   1. drop metadata columns that are not used for forecasting,
#   2. parse DateTime into real timestamps,
#   3. keep only the first row for every timestamp,
#   4. give the consumption column a short name.
# Written as one chain without `inplace=True`, and with `errors='ignore'` on the
# drop, so that re-running the cell does not raise KeyError on columns that were
# already removed by a previous run.
#df.drop(['Unnamed: 0'], axis=1, inplace=True)
df = (
    df
    .drop(columns=['StdorToU', 'TimeOfDay', 'LCLid'], errors='ignore')
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
    .drop_duplicates(subset='DateTime', keep='first')
    # NOTE(review): other cells read a 'KWHhh' column from the same folder; if the
    # cleaned files no longer carry the raw header below, this rename does nothing.
    .rename(columns={'KWH/hh (per half hour) ': 'KWH'})
)
df

In [None]:
import glob

# Stack the first 20 cleaned household files (in sorted path order) into one frame.
first_household_csvs = sorted(glob.glob("../../Data/london_clean/*.csv"))[:20]
household_frames = [pd.read_csv(csv_path) for csv_path in first_household_csvs]
df = pd.concat(household_frames)

In [None]:

# Remove the leading "MAC" from each household id.
# The anchored pattern makes the cell idempotent: the previous `x[3:]` slice cut
# three more characters off every time the cell was re-run.
# NOTE(review): assumes every LCLid starts with "MAC", as the file names suggest — confirm.
df['LCLid'] = df['LCLid'].str.replace(r'^MAC', '', regex=True)

In [None]:
## Check for missing values in the dataset
import glob

# Scan every cleaned household file, print the ones for which helper.find_gaps()
# returns a non-empty result, and write those results to ../../missing.txt.
# - The counter is no longer called `sum`, which shadowed the builtin for the
#   rest of the kernel session.
# - The report file is opened with `with`, so it is closed even if a file fails to load.
# - find_gaps() is evaluated once per file instead of twice.
# NOTE: `df` is still rebound on every iteration, so after this cell it holds the
# last file that was scanned (later cells read `df`).
files_with_gaps = 0
with open('../../missing.txt', 'w') as report:
    for i, x in enumerate(sorted(glob.glob("../../Data/london_clean/*.csv"))):
        df = pd.read_csv(x)
        gaps = helper.find_gaps(df)
        if not gaps.empty:
            print(f'{i}: {x} has gaps')
            report.write(f'{x}\n{gaps}\n-----------------------\n')
            files_with_gaps += 1
print(files_with_gaps)

In [None]:
# Show the result of helper.find_gaps() for whatever frame `df` currently holds.
# NOTE(review): `df` is reassigned by several earlier cells, so the frame inspected
# here depends on which of them ran last.
helper.find_gaps(df)

In [None]:
# Rebuild DateTime as <DateTime at label 0> + index * 30 minutes for every row.
# NOTE(review): no visible cell creates an "index" column — presumably it comes from
# a reset_index() or from `helper`; confirm before relying on this cell.
df["DateTime"] = df["index"].apply(lambda x: df["DateTime"][0] + (x * pd.Timedelta(minutes=30)))
# Replace each timestamp with its "YYYY-MM-DD" string (the time of day is dropped).
df["DateTime"] = df["DateTime"].apply(lambda x: x.strftime("%Y-%m-%d"))

In [None]:
# Per-day maximum: group rows by the date part of DateTime (the text before the first space).
day_key = df["DateTime"].astype(str).str.split(" ").str[0]
df_max = df.groupby(day_key).max()

filler = MissingValuesFiller()
scaler = Scaler()

# DataFrame -> TimeSeries -> fill missing values -> scale -> float32
daily_peak_raw = TimeSeries.from_dataframe(df_max, "DateTime", ["KWH"])
daily_peak_filled = filler.transform(daily_peak_raw)
daily_peak_scaled = scaler.fit_transform(daily_peak_filled)
series = daily_peak_scaled.astype(np.float32)

series.plot()
plt.title("Daily peak usage in household")

# Creating training data

Darts only accepts its own `TimeSeries` objects as input, so we have to build those first.

In [None]:
## Create one Darts TimeSeries per household file (first 20 files, sorted by path)

training_csv_paths = sorted(glob.glob("../../Data/london_clean/*.csv"))[:20]

my_time_series_dataset = []
# Kept as an explicit loop: `series` stays bound to the last file's series after the cell.
for x in training_csv_paths:
    series = TimeSeries.from_csv(x, time_col='DateTime', value_cols='KWHhh')
    my_time_series_dataset += [series]

In [None]:
## old
# series = TimeSeries.from_dataframe(df_max, value_cols=['KWH'], time_col="index", fill_missing_dates=True)

## 2 different ways of splitting train and test
# train, val = series.split_after(0.85)
#train, val = series[:-48], series[-48:]

In [None]:
# Split every household series into a training part (first 85 %) and a validation
# part (remainder).
# Fix: the loop used to call `series.split_after(...)`, i.e. it split whatever the
# global `series` happened to be on every iteration, so all entries of
# training_sets / validation_sets were identical. Each item is now split itself.
training_sets = []
validation_sets = []
for household_series in my_time_series_dataset:
    train, val = household_series.split_after(0.85)
    training_sets.append(train)
    validation_sets.append(val)
    

In [None]:
# Visual sanity check of the split for the first household.
first_train = training_sets[0]
first_train.plot(label="training")
first_val = validation_sets[0]
first_val.plot(label="validation")

In [None]:
# N-BEATS model (generic architecture) that reads 30 past steps and predicts 7.
# Checkpoints are written below ../../Models/nbeats_run and metrics go to W&B.
# The accelerator now follows AVAILABLE_GPUS from the setup cell: the previous
# hard-coded "gpu" made the trainer fail on machines without a CUDA device.
model_nbeats = NBEATSModel(
    input_chunk_length=30,
    output_chunk_length=7,
    generic_architecture=True,
    num_stacks=10,
    num_blocks=1,
    num_layers=3,
    layer_widths=512,
    n_epochs=50,
    nr_epochs_val_period=1,
    batch_size=400,
    work_dir="../../Models",
    save_checkpoints=True,
    model_name="nbeats_run",
    pl_trainer_kwargs={
      "accelerator": "gpu" if AVAILABLE_GPUS > 0 else "cpu",
      "devices": 1,
      "logger": wandb_logger
    },
)

In [None]:
# Train on the per-household training series, validating on the matching
# validation series; one data-loader worker per CPU reported in the setup cell.
model_nbeats.fit(
    series=training_sets,
    val_series=validation_sets,
    verbose=True,
    num_loader_workers=AVAILABLE_CPUS,
)
# model_nbeats.save_model("../../Models/test.pth.tar")

## (optional) load model from file

In [None]:
# Replace the in-memory model with one loaded from disk. The path sits under the
# work_dir / model_name used when the model was constructed above.
model_nbeats = NBEATSModel.load_model("../../Models/nbeats_run/_model.pth.tar")

## Validate

We create unseen data and then do a historical forecast to see how well the model does.

In [None]:
# Plot the last 150 samples of households 600-604 and save one PNG per household.
# - The output folder is created up front, so savefig() does not fail on a fresh checkout.
# - The stray bare `plt` expression at the end of the loop body was a no-op and is removed.
# NOTE: `df` and `series` are rebound on every iteration; after the loop `series`
# holds the last household's 150-sample window, which later cells read.
os.makedirs('../../Plots', exist_ok=True)

for i in range(600, 605):
    plt.figure(figsize=(20, 10))
    df = pd.read_csv(f'../../Data/london_clean/cleaned_household_MAC000{i}.csv')
    series = TimeSeries.from_dataframe(df, value_cols=['KWHhh'], time_col="DateTime", fill_missing_dates=True)
    series = series[-150:]
    # save the plot
    series.plot(label=f"Household {i}")
    plt.legend()
    plt.savefig(f'../../Plots/Household_{i}.png')


In [None]:
# Household 600: reduce the readings to one row per calendar day holding the
# column-wise maximum, rounded to 3 decimals.
household_600_csv = '../../Data/london_clean/cleaned_household_MAC000600.csv'
df = pd.read_csv(household_600_csv)
df["DateTime"] = pd.to_datetime(df["DateTime"])
daily_bins = pd.Grouper(key='DateTime', freq='1D')
daily_max = df.groupby(daily_bins).max()
df = daily_max.round(3).reset_index()
df

In [None]:
# Walk over `series` with the already-trained model (no retraining), producing a
# 1-step-ahead forecast at every second time step.
# NOTE(review): the previous cell only rebuilds `df`; `series` is still the one left
# behind by the household plotting loop — confirm that this is the intended input.
forecast_options = dict(
    forecast_horizon=1,
    stride=2,
    retrain=False,
    verbose=True,
)
pred_series = model_nbeats.historical_forecasts(series, **forecast_options)

In [None]:
from darts.metrics import mape, r2_score, rmse

# Score the historical forecast against the actual series.
# print(f"Mean absolute percentage error: {mape(series, pred_series)}.")
rmse_value = rmse(series, pred_series)
print(f"rmse: {rmse_value}.")
r2_value = r2_score(series, pred_series)
print(f"R2 score: {r2_value}.")

In [None]:
# Hand the forecast and the actual series to the project helper for display.
# NOTE(review): "1 day" is presumably a horizon label used by the helper — confirm in helper.py.
helper.display_forecast(pred_series, series, "1 day")