In [2]:
import os, sys
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import darts
from darts.utils.statistics import check_seasonality, plot_acf, stationarity_tests
from darts.dataprocessing.transformers.missing_values_filler import MissingValuesFiller
from darts.dataprocessing.transformers.boxcox import BoxCox
from darts.dataprocessing.transformers.diff import Diff
from darts.utils.statistics import plot_hist
from darts.models import LightGBMModel, XGBModel, LinearRegressionModel, TFTModel, NHiTSModel, RNNModel, TFTModel
from darts.metrics import smape, mape, mase, mse, rmse, r2_score, mae
from darts.dataprocessing.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler   
from darts.dataprocessing.transformers.scaler import Scaler
from darts.utils.missing_values import extract_subseries

from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_lightning.callbacks import ModelCheckpoint
import torch
from wandb.xgboost import WandbCallback


from utils import *
import wandb
# Authenticate against Weights & Biases before any sweep/run is created further down.
wandb.login()


import warnings
# Silences every Python warning for the rest of the session (including deprecation
# warnings emitted by the imported libraries).
warnings.filterwarnings('ignore')

# Set seed
# Only NumPy's global RNG is seeded here; the models below receive `random_state=42` explicitly.
np.random.seed(42)



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnikolaushouben[0m ([33mwattcast[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Set working directory
# The notebook is expected to be started one level below the git repo root.
# Only move up when we are not already at the root: an unconditional `os.chdir("..")`
# climbs one more level on every re-execution of this cell and then trips the assert.
if not os.getcwd().endswith("WattCast"):
    os.chdir(r"..") # should be the git repo root directory, checking below:
print("Current working directory: " + os.getcwd())
assert os.getcwd()[-8:] == "WattCast", "Working directory is not the WattCast repo root: " + os.getcwd()

# Locations of the cleaned input data and of the stored models, relative to the repo root.
dir_path = os.path.join(os.getcwd(), 'data', 'clean_data')
model_dir = os.path.join(os.getcwd(), 'models')

Current working directory: c:\Users\nik\Desktop\Berkeley_Projects\WattCast


In [4]:
# run parameters

# Dataset selection: these values build the HDF5 file name and keys when loading data.
# `temp_resolution` is in minutes (it is turned into a '<n>min' frequency string below).
config_dataset = {
    'spatial_scale': '1_county',
    'temp_resolution': 60,
    'location': 'Los_Angeles',
}

# Model-design switches shared by all models in this notebook.
config_modeldesign = {'boxcox': True,  # apply a Box-Cox transform to the target before scaling
                    'horizon_in_hours': 24, # in hours
                    'lookback_in_hours': 24, # in hours
                    'liklihood': None,  # (sic) this spelling is the key read downstream as `config.liklihood`
                    'weather': True,  # load weather covariates
                    'holiday': True,
                    'datetime_encodings': True,  # pass `config_encoders` to the models as `add_encoders`
                    }


# Datetime attributes used for the cyclic encoding of the time step within a day.
# Hourly data only needs the hour. Every sub-hourly resolution (including 15 min) uses
# hour + minute: the pandas attribute 'quarter' is the quarter of the *year*, not the
# quarter-hour, so it cannot distinguish the four 15-minute steps within an hour.
if config_dataset['temp_resolution'] == 60:
    timestep_encoding = ["hour"]
else:
    timestep_encoding = ["hour", "minute"]


# Encoder specification handed to the darts models via `add_encoders`.
# The "position" entry is given exactly once: the original literal listed the key twice
# and the first occurrence was silently overwritten by the second.
config_encoders =  {
                    "cyclic": {"future": timestep_encoding},
                    "position": {"past": ["relative"], "future": ["relative"]},
                    "datetime_attribute": {"future": ["dayofweek", "week"]},
            }



In [5]:
# calculate derived parameters
datetime_encoders = None
if config_modeldesign['datetime_encodings']:
    datetime_encoders = config_encoders

timesteps_per_hour = int(60 / config_dataset['temp_resolution'])
# lookback and horizon expressed in time steps rather than hours
n_lags = timesteps_per_hour * config_modeldesign['lookback_in_hours']
n_ahead = timesteps_per_hour * config_modeldesign['horizon_in_hours']
list_metrics = [smape, mape, rmse, r2_score, mae] # evaluation metrics
# evaluation stride handed to `predict_testset`: the integer part of sqrt(n_ahead)
eval_stride = int(np.sqrt(n_ahead))

# Loading Data
# All splits live in one HDF5 file per spatial scale, under '<location>/<resolution>min/<split>_<kind>'.
h5_file = os.path.join(dir_path, f'{config_dataset["spatial_scale"]}.h5')
h5_group = f'{config_dataset["location"]}/{config_dataset["temp_resolution"]}min'

df_train, df_val, df_test = (
    pd.read_hdf(h5_file, key=f'{h5_group}/{split}_target')
    for split in ('train', 'val', 'test')
)

# Covariate frames are only read (and only defined) when weather features are enabled.
if config_modeldesign['weather']:
    df_cov_train, df_cov_val, df_cov_test = (
        pd.read_hdf(h5_file, key=f'{h5_group}/{split}_cov')
        for split in ('train', 'val', 'test')
    )

In [6]:
# into darts format
# Every series shares the same pandas frequency string, e.g. '60min'.
resolution_str = str(config_dataset['temp_resolution']) + 'min'

# Targets: convert each split, then split it into subseries with `extract_subseries`.
ts_train = extract_subseries(darts.TimeSeries.from_dataframe(df_train, freq=resolution_str))
ts_val = extract_subseries(darts.TimeSeries.from_dataframe(df_val, freq=resolution_str))
ts_test = extract_subseries(darts.TimeSeries.from_dataframe(df_test, freq=resolution_str))

# Covariates
if not config_modeldesign['weather']:
    ts_cov_train = ts_cov_val = ts_cov_test = None
else:
    ts_cov_train = darts.TimeSeries.from_dataframe(df_cov_train, freq=resolution_str)
    ts_cov_val = darts.TimeSeries.from_dataframe(df_cov_val, freq=resolution_str)
    ts_cov_test = darts.TimeSeries.from_dataframe(df_cov_test, freq=resolution_str)

# Reviewing subseries to make sure they are long enough
ts_train, ts_cov_train = review_subseries(ts_train, n_lags, n_ahead, ts_cov_train)
ts_val, ts_cov_val = review_subseries(ts_val, n_lags, n_ahead, ts_cov_val)
ts_test, ts_cov_test = review_subseries(ts_test, n_lags, n_ahead, ts_cov_test)

# getting the index of the longest subseries, to be used for evaluation later
longest_ts_val_idx = get_longest_subseries_idx(ts_val)
longest_ts_test_idx = get_longest_subseries_idx(ts_test)

In [7]:
# Load pipeline
# missing values have been filled in the 'data_prep.ipynb'
# The transformer list is assembled step by step so that a disabled Box-Cox step is
# simply left out; placing `None` in the list makes `Pipeline` reject the whole list.
target_transformers = []
if config_modeldesign['boxcox']:
    target_transformers.append(BoxCox())
target_transformers.append(Scaler(MinMaxScaler()))

pipeline = Pipeline(target_transformers)

# Fit on the training split only; validation and test are transformed with the fitted pipeline.
ts_train_piped = pipeline.fit_transform(ts_train)
ts_val_piped = pipeline.transform(ts_val)
ts_test_piped = pipeline.transform(ts_test)

# Weather Pipeline
if config_modeldesign['weather']:
    pipeline_weather = Pipeline([Scaler(RobustScaler())])
    ts_train_weather_piped = pipeline_weather.fit_transform(ts_cov_train)
    ts_val_weather_piped = pipeline_weather.transform(ts_cov_val)
    ts_test_weather_piped = pipeline_weather.transform(ts_cov_test)
else:
    ts_train_weather_piped = None
    ts_val_weather_piped = None
    ts_test_weather_piped = None

In [8]:
# Inverse transform the targets: the original (untransformed) values are needed for the evaluation.
# For validation and test only the longest subseries is kept.
trg_train_inversed = pipeline.inverse_transform(ts_train_piped, partial=True)

val_inversed_all = pipeline.inverse_transform(ts_val_piped, partial=True)
trg_val_inversed = val_inversed_all[longest_ts_val_idx]

test_inversed_all = pipeline.inverse_transform(ts_test_piped, partial=True)
trg_test_inversed = test_inversed_all[longest_ts_test_idx]

## Hyperparameter Tuning with wandb sweep

### XGBoost

In [10]:
def train_xgb():
    """Train and evaluate one XGBoost configuration as a single wandb (sweep) run.

    The hyperparameters ``n_estimators``, ``max_depth``, ``learning_rate``,
    ``min_child_weight``, ``objective`` and ``reg_lambda`` are read from ``wandb.config``,
    which is additionally updated with ``config_modeldesign``. The model is fitted on the
    piped training target with the validation split as ``val_series``, evaluated on the
    longest test subseries through ``predict_testset``, and the score is logged as
    ``eval_loss`` together with a plot of predictions vs. the inverse-transformed target.

    Relies on notebook-level names (``timesteps_per_hour``, ``n_ahead``, ``eval_stride``,
    ``datetime_encoders``, ``pipeline``, the ``ts_*`` series and ``trg_test_inversed``).
    Takes no arguments and returns nothing, as required by ``wandb.agent``.
    """

    # Using the run as a context manager closes it even when training or evaluation raises.
    with wandb.init():
        wandb.config.update(config_modeldesign)
        config = wandb.config

        # the lookback comes from the run config, so it is recomputed locally
        n_lags = config.lookback_in_hours * timesteps_per_hour

        # NOTE(review): the recorded output of this cell shows every run failing with
        # ValueError("2 different `early_stopping_rounds` are provided..."). The constructor
        # argument below is the only place this notebook sets it -- confirm how the installed
        # xgboost/darts versions expect early stopping to be configured.
        xgb_kwargs = {
            'n_estimators': config.n_estimators,
            'max_depth': config.max_depth,
            'learning_rate': config.learning_rate,
            'min_child_weight': config.min_child_weight,
            'objective': config.objective,
            'reg_lambda': config.reg_lambda,
            'early_stopping_rounds': 10
        }

        xgb_model = XGBModel(lags=n_lags,
                        lags_future_covariates=[0],
                        add_encoders=datetime_encoders,
                        output_chunk_length=n_ahead,
                        likelihood=config.liklihood,
                        random_state=42,
                        **xgb_kwargs
                        )

        print("Training model...")
        # NOTE(review): the unscaled covariates (`ts_cov_*`) are used here, not the
        # `ts_*_weather_piped` series produced by the weather pipeline -- confirm this is intended.
        xgb_model.fit(ts_train_piped, future_covariates = ts_cov_train, val_series=ts_val_piped, val_future_covariates=ts_cov_val,
                       verbose=False
                       )

        print("Evaluating model...")
        # Covariates are None when weather features are disabled; indexing None would raise.
        test_covariates = ts_cov_test[longest_ts_test_idx] if ts_cov_test is not None else None
        predictions, score = predict_testset(xgb_model,
                                      ts_test_piped[longest_ts_test_idx],
                                      test_covariates,
                                      n_lags, n_ahead, eval_stride, pipeline,
                                      )

        print("Plotting predictions...")
        # align target and prediction on the time index and drop steps without a prediction
        df_compare = pd.concat([trg_test_inversed.pd_dataframe(), predictions], axis=1).dropna()
        df_compare.columns = ['target', 'prediction']
        fig = px.line(df_compare, title='Predictions vs. Test Set')

        wandb.log({'eval_loss': score})
        wandb.log({'predictions': fig})



In [11]:

# Candidate values per XGBoost hyperparameter; expanded into wandb's
# {'<param>': {'values': [...]}} layout below.
xgb_search_space = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 6, 12],
    'min_child_weight': [1, 5, 10],
    'objective': ['reg:squarederror', 'reg:pseudohubererror'],
    'reg_lambda': [0.1, 0.3, 0.5, 0.7, 1],
}

config_sweep_xgb = {
    'name': f"XGBoost sweep{config_dataset['spatial_scale']}_{config_dataset['location']}_{config_dataset['temp_resolution']}",
    'method': 'bayes', #grid, random
    # `eval_loss` is the value logged by `train_xgb` after evaluating on the test subseries
    'metric': {'name': 'eval_loss', 'goal': 'minimize'},
    'parameters': {
        param_name: {'values': candidates}
        for param_name, candidates in xgb_search_space.items()
    },
}


# Register the sweep and run 10 trials of `train_xgb` in this kernel.
sweep_id = wandb.sweep(config_sweep_xgb, project="WattCast_tuning")
wandb.agent(sweep_id, train_xgb, count=10)

Create sweep with ID: 59wg3kb6
Sweep URL: https://wandb.ai/wattcast/Wattcast_tuning/sweeps/59wg3kb6


[34m[1mwandb[0m: Agent Starting Run: 1czzjmb1 with config:
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	min_child_weight: 5
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	objective: reg:pseudohubererror
[34m[1mwandb[0m: 	reg_lambda: 0.3
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Specified past encoders in `add_encoders` at model creation but model does not accept past covariates. past encoders will be ignored.


Training model...


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Run 1czzjmb1 errored: ValueError('2 different `early_stopping_rounds` are provided.  Use the one in constructor or `set_params` instead.')
[34m[1mwandb[0m: [32m[41mERROR[0m Run 1czzjmb1 errored: ValueError('2 different `early_stopping_rounds` are provided.  Use the one in constructor or `set_params` instead.')
[34m[1mwandb[0m: Agent Starting Run: vwdsst38 with config:
[34m[1mwandb[0m: 	learning_rate: 0.3
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_child_weight: 10
[34m[1mwandb[0m: 	n_estimators: 1000
[34m[1mwandb[0m: 	objective: reg:pseudohubererror
[34m[1mwandb[0m: 	reg_lambda: 0.7
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666899498, max=1.0…

### LightGBM

### GRU Model

In [10]:
def train_gru():
    """Train and evaluate one GRU configuration as a single wandb (sweep) run.

    The hyperparameters ``lr``, ``hidden_dim``, ``n_rnn_layers``, ``dropout``,
    ``batch_size`` and ``lookback_in_hours`` are read from ``wandb.config``. The model is
    trained with early stopping on ``val_loss`` and a ``ReduceLROnPlateau`` schedule, then
    evaluated on the longest test subseries through ``predict_testset`` with the same call
    signature as in ``train_xgb``. The score is logged as ``eval_loss`` together with a
    plot of predictions vs. the inverse-transformed target.

    Relies on notebook-level names (``timesteps_per_hour``, ``n_ahead``, ``eval_stride``,
    ``datetime_encoders``, ``pipeline``, the ``ts_*`` series and ``trg_test_inversed``).
    Takes no arguments and returns nothing, as required by ``wandb.agent``.
    """

    # Using the run as a context manager closes it even when training or evaluation raises.
    with wandb.init():
        config = wandb.config

        optimizer_kwargs = {'lr': config.lr}

        # the lookback comes from the sweep config, so it is recomputed locally
        n_lags = config.lookback_in_hours * timesteps_per_hour

        pl_trainer_kwargs = {
            'max_epochs': 50,
            'accelerator': 'gpu',
            'devices': [0],
            'callbacks': [EarlyStopping(monitor='val_loss', patience=5, mode='min')],
            'logger': WandbLogger(log_model='all'),
        }

        # arguments for the ReduceLROnPlateau learning-rate scheduler
        schedule_kwargs = {
            'patience': 2,
            'factor': 0.5,
            'min_lr': 1e-5,
            'verbose': True
            }

        # NOTE(review): confirm that RNNModel honours `output_chunk_length` in the installed
        # darts version.
        model = RNNModel(
                        model = 'GRU',
                        input_chunk_length=n_lags,
                        output_chunk_length=n_ahead,
                        hidden_dim=config.hidden_dim,
                        n_rnn_layers=config.n_rnn_layers,
                        batch_size=config.batch_size,
                        dropout=config.dropout,
                        add_encoders=datetime_encoders,
                        likelihood=None,
                        pl_trainer_kwargs=pl_trainer_kwargs,
                        optimizer_kwargs=optimizer_kwargs,
                        lr_scheduler_cls=ReduceLROnPlateau,
                        lr_scheduler_kwargs=schedule_kwargs,
                        random_state=42,
                    )

        # NOTE(review): the unscaled covariates (`ts_cov_*`) are used here, not the
        # `ts_*_weather_piped` series produced by the weather pipeline -- confirm this is intended.
        model.fit(ts_train_piped, future_covariates = ts_cov_train, val_series=ts_val_piped, val_future_covariates=ts_cov_val, verbose=True)

        # Covariates are None when weather features are disabled; indexing None would raise.
        test_covariates = ts_cov_test[longest_ts_test_idx] if ts_cov_test is not None else None
        # Same arguments and (predictions, score) return value as the call in `train_xgb`.
        predictions, score = predict_testset(model,
                                      ts_test_piped[longest_ts_test_idx],
                                      test_covariates,
                                      n_lags, n_ahead, eval_stride, pipeline,
                                      )

        # align target and prediction on the time index and drop steps without a prediction
        df_compare = pd.concat([trg_test_inversed.pd_dataframe(), predictions], axis=1).dropna()
        df_compare.columns = ['target', 'prediction']
        fig = px.line(df_compare, title='Predictions vs. Test Set')

        wandb.log({'eval_loss': score})
        wandb.log({'predictions': fig})


# Candidate values per GRU hyperparameter; expanded into wandb's
# {'<param>': {'values': [...]}} layout below.
gru_search_space = {
    'lr': [5e-3, 1e-3, 3e-4],
    'hidden_dim': [64, 512, 1024],
    'n_rnn_layers': [1, 2, 3],
    'dropout': [0.1, 0.2, 0.3],
    'batch_size': [32, 64, 128],
    'lookback_in_hours': [24],
}

config_sweep_gru = {
    'name': f"GRU sweep{config_dataset['spatial_scale']}_{config_dataset['location']}_{config_dataset['temp_resolution']}",
    'method': 'bayes', #grid, random
    # the sweep optimises the metric named `val_loss`
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        param_name: {'values': candidates}
        for param_name, candidates in gru_search_space.items()
    },
}


# Register the sweep and run a single trial of `train_gru` in this kernel.
sweep_id = wandb.sweep(config_sweep_gru, project="WattCast_tuning")
wandb.agent(sweep_id, train_gru, count=1)

Create sweep with ID: bx6jp0c2
Sweep URL: https://wandb.ai/wattcast/Wattcast_tuning/sweeps/bx6jp0c2


[34m[1mwandb[0m: Agent Starting Run: ifj5lzca with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	hidden_dim: 1024
[34m[1mwandb[0m: 	lookback_in_hours: 24
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	n_rnn_layers: 3
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333332417533, max=1.0…

Specified past encoders in `add_encoders` at model creation but model does not accept past covariates. past encoders will be ignored.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params
---------------------------------------------------
0 | criterion     | MSELoss          | 0     
1 | train_metrics | MetricCollection | 0     
2 | val_metrics   | MetricCollection | 0     
3 | rnn           | GRU              | 15.8 M
4 | V             | Linear           | 1.0 K 
---------------------------------------------------
15.8 M    Trainable params
0         Non-trainable params
15.8 M    Total params
126.157   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting test set...


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁
trainer/global_step,▁▅█

0,1
epoch,0.0
train_loss,
trainer/global_step,136.0
val_loss,


Run ifj5lzca errored: NameError("name 'ts_test_piped' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run ifj5lzca errored: NameError("name 'ts_test_piped' is not defined")


### Transformer Model