# Multi step model (encoder-decoder with teacher forcing)

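In this notebook, we will demonstrate how to implement an RNN model to predict multiple time steps into the future using an encoder-decoder architecture. The decoder part of the model is trained with teacher forcing: at each decoder time step, the input is the actual (ground-truth) value of the previous time step rather than the model's own prediction. At inference time, when the actual future values are not available, each prediction is fed back as the input to the next time step.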

In [None]:
import os
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from collections import UserDict
%matplotlib inline

from common.utils import load_data, mape, TimeSeriesTensor, create_evaluation_df

# Notebook-wide output settings: pandas floats are shown with thousands
# separators and two decimals, numpy arrays print with two decimals, and
# all Python warnings are silenced to keep the cell output readable.
pd.set_option('display.float_format', '{:,.2f}'.format)
np.set_printoptions(precision=2)
warnings.simplefilter("ignore")

Load data into Pandas dataframe

In [None]:
# Fetch and extract the dataset only when data/energy.csv is not already present.
if not os.path.exists(os.path.join('data', 'energy.csv')):
    # Download and move the zip file
    # NOTE(review): `mv` assumes the ./data directory already exists — confirm.
    !wget https://www.dropbox.com/s/pqenrr2mcvl0hk9/GEFCom2014.zip
    !mv GEFCom2014.zip ./data
    # If not done already, extract zipped data and save as csv
    %run common/extract_data.py
# load_data comes from common.utils; presumably it returns a time-indexed
# DataFrame that includes a 'load' column — verify against common/utils.py.
energy = load_data()
# Show the first rows as the cell output.
energy.head()

In [None]:
# Timestamps that split the series chronologically: rows before valid_start_dt
# are used for training, rows from valid_start_dt up to (not including)
# test_start_dt for validation, and rows from test_start_dt onwards for testing.
valid_start_dt = '2014-09-01 00:00:00'
test_start_dt = '2014-11-01 00:00:00'

# T: number of past time steps fed to the encoder (offsets -T+1 .. 0).
T = 6
# HORIZON: number of future time steps to forecast.
HORIZON = 3

Create training set containing only the model features

In [None]:
# Training rows are everything strictly before the validation start; only the
# 'load' column is kept. The explicit copy keeps later in-place scaling from
# touching the original `energy` frame.
train = energy.loc[energy.index < valid_start_dt, ['load']].copy()

Scale data to be in range (0, 1). This transformation should be calibrated on the training set only. This is to prevent information from the validation or test sets leaking into the training data.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# y_scaler must be fitted BEFORE `train` is overwritten with scaled values,
# so that it captures the range of the original (unscaled) load. It is passed
# to create_evaluation_df later in the notebook.
y_scaler = MinMaxScaler().fit(train[['load']])

# X_scaler is fitted on the training inputs and reused to scale later splits.
X_scaler = MinMaxScaler()
train[['load']] = X_scaler.fit(train).transform(train)

Use the TimeSeriesTensor convenience class to:
1. Shift the values of the time series to create a Pandas dataframe containing all the data for a single training example
2. Discard any samples with missing values
3. Transform this Pandas dataframe into a numpy array of shape (samples, time steps, features) for input into Keras

The class takes the following parameters:

- **dataset**: original time series
- **H**: the forecast horizon
- **tensor_structure**: a dictionary describing the tensor structure in the form { 'tensor_name' : (range(max_backward_shift, max_forward_shift), [feature, feature, ...] ) }
- **freq**: time series frequency
- **drop_incomplete**: (Boolean) whether to drop incomplete samples

In [None]:
# Layout of the two model inputs, both built from the 'load' series:
#   encoder_input - observations at offsets -T+1 .. 0 (the T most recent values)
#   decoder_input - observations at offsets 0 .. HORIZON-1
tensor_structure = {
    'encoder_input': (range(1 - T, 1), ['load']),
    'decoder_input': (range(HORIZON), ['load']),
}
train_inputs = TimeSeriesTensor(
    train,
    'load',
    HORIZON,
    tensor_structure,
)
# Preview the shifted dataframe behind the tensors.
train_inputs.dataframe.head()

In [None]:
# Start the validation window T-1 hours before valid_start_dt so that the
# first validation sample has a complete encoder history.
look_back_dt = (
    dt.datetime.strptime(valid_start_dt, '%Y-%m-%d %H:%M:%S')
    - dt.timedelta(hours=T - 1)
)
valid = energy.loc[
    (energy.index >= look_back_dt) & (energy.index < test_start_dt),
    ['load'],
].copy()
# Reuse the scaler fitted on the training data; it is not refitted here.
valid[['load']] = X_scaler.transform(valid)
valid_inputs = TimeSeriesTensor(
    valid,
    'load',
    HORIZON,
    tensor_structure,
)

## Implement training model

We will implement an RNN forecasting model with the following structure:

![Encoder-decoder RNN model with teacher forcing](./images/encoder_decoder_teacher_forcing.png "Encoder-decoder RNN model with teacher forcing")

In [None]:
from keras.models import Model, Sequential
from keras.layers import GRU, Dense, RepeatVector, TimeDistributed, Flatten, Input
from keras.callbacks import EarlyStopping

In [None]:
# Training hyperparameters.
# BATCH_SIZE: number of samples per gradient update (passed to model.fit).
BATCH_SIZE = 32
# LATENT_DIM: number of units in both the encoder and the decoder GRU layers.
LATENT_DIM = 5
# EPOCHS: upper bound on training epochs (early stopping may end training sooner).
EPOCHS = 50

In [None]:
# define training encoder
encoder_input = Input(shape=(None, 1))
encoder = GRU(LATENT_DIM, return_state=True)
encoder_output, state_h = encoder(encoder_input)
encoder_states = [state_h]

# define training decoder
decoder_input = Input(shape=(None, 1))
decoder_GRU = GRU(LATENT_DIM, return_state=True, return_sequences=True)
decoder_output, _ = decoder_GRU(decoder_input, initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(1))
decoder_output = decoder_dense(decoder_output)

model = Model([encoder_input, decoder_input], decoder_output)

In [None]:
# Minimise mean squared error using the RMSprop optimiser.
model.compile(
    loss='mse',
    optimizer='RMSprop',
)

In [None]:
# Stop training once val_loss has gone 5 epochs without improving.
earlystop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    min_delta=0,
)

In [None]:
# Append a trailing axis of size 1 to the targets so they line up with the
# model output, which has one value per decoder time step.
train_target = train_inputs['target'].reshape(train_inputs['target'].shape[:2] + (1,))
valid_target = valid_inputs['target'].reshape(valid_inputs['target'].shape[:2] + (1,))

In [None]:
# Fit on the training tensors and monitor the validation tensors each epoch;
# the early-stopping callback may end training before EPOCHS is reached.
model.fit(
    x=[train_inputs['encoder_input'], train_inputs['decoder_input']],
    y=train_target,
    validation_data=(
        [valid_inputs['encoder_input'], valid_inputs['decoder_input']],
        valid_target,
    ),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[earlystop],
    verbose=1,
)

## Implement inference model

In [None]:
# build inference encoder model
# Maps an input sequence to the encoder's final state, reusing the tensors of
# the trained encoder.
encoder_model = Model(encoder_input, encoder_states)

# build inference decoder model
# At inference time the decoder state is supplied as an explicit input so the
# decoder can be run one time step at a time.
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_states_input = [decoder_state_input_h]

# Reuse the already-defined decoder_GRU and decoder_dense layers; note that
# decoder_output and state_h are rebound here.
decoder_output, state_h = decoder_GRU(decoder_input, initial_state=decoder_states_input)
decoder_states = [state_h]
decoder_output = decoder_dense(decoder_output)
# Inputs: [decoder input, previous state]; outputs: [prediction, updated state].
decoder_model = Model([decoder_input] + decoder_states_input, [decoder_output] + decoder_states)

In [None]:
# Define the function to make single sequence prediction
# based on scoring encoder-decoder
def predict_single_sequence(single_input_seq, horizon, n_features, encoder=None, decoder=None):
    """Forecast `horizon` steps ahead for one input sequence.

    The encoder turns the input sequence into an initial decoder state. The
    decoder is then run one step at a time: each step receives the previous
    prediction as input and the state returned by the previous step.

    Parameters
    ----------
    single_input_seq : array indexed as [0, time step, feature]
        A batch containing exactly one encoder input sequence.
    horizon : int
        Number of future steps to predict.
    n_features : int
        Size of the decoder input's feature axis.
    encoder, decoder : objects with a ``predict`` method, optional
        Inference models to use. They default to the notebook-level
        ``encoder_model`` and ``decoder_model``.

    Returns
    -------
    numpy.ndarray
        One row per predicted step (``horizon`` rows); empty if ``horizon``
        is 0.
    """
    if encoder is None:
        encoder = encoder_model
    if decoder is None:
        decoder = decoder_model

    # apply encoder model to the input_seq to get the initial decoder state
    states_value = encoder.predict(single_input_seq)

    # the decoder's first input is the last observed encoder value (time t)
    dec_input = np.zeros((1, 1, n_features))
    dec_input[0, 0, :] = single_input_seq[0, -1, :]

    output = []
    for _ in range(horizon):
        # Predict the next value and carry the updated state forward.
        # The state MUST be propagated: otherwise every step would restart
        # from the encoder state and ignore what the decoder has seen so far.
        yhat, states_value = decoder.predict([dec_input, states_value])
        output.append(yhat[0, 0, :])
        # Feed the prediction back as the next decoder input. The predicted
        # value is written to feature 0 (the only feature when n_features == 1).
        dec_input[0, 0, 0] = yhat[0, 0, 0]

    return np.array(output)

In [None]:
# example of single sequence prediction
print(predict_single_sequence(valid_inputs['encoder_input'][0:1], HORIZON, 1))

In [None]:
# Define the function to make multiple sequence prediction
# based on scoring encoder-decoder
def predict_multi_sequence(input_seq_multi, horizon, n_features):
    """Forecast every sequence in `input_seq_multi`, one sequence at a time.

    Each sequence is passed to `predict_single_sequence` together with
    `horizon` and `n_features`; the per-sequence results are stacked into a
    single numpy array whose first axis follows the input order.
    """
    n_sequences = input_seq_multi.shape[0]
    # Slicing with i:i+1 (instead of indexing with i) keeps the batch axis,
    # so every call receives a batch holding exactly one sequence.
    return np.array([
        predict_single_sequence(input_seq_multi[i:i + 1], horizon, n_features)
        for i in range(n_sequences)
    ])

## Evaluate the model

In [None]:
# Start the test window T-1 hours before test_start_dt so that the first test
# sample (t = test_start_dt) has a complete encoder history, in the same way
# the validation set is built. Previously look_back_dt was computed but not
# used, so the first T-1 test timestamps had no complete encoder history.
look_back_dt = dt.datetime.strptime(test_start_dt, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=T-1)
test = energy.copy()[energy.index >= look_back_dt][['load']]
# Scale the inputs with X_scaler, as for the training and validation sets.
# X_scaler and y_scaler are fitted on the same unscaled training load, so the
# scaled values are unchanged by this switch.
test[['load']] = X_scaler.transform(test)
test_inputs = TimeSeriesTensor(test, 'load', HORIZON, tensor_structure)

In [None]:
# Forecast HORIZON steps ahead for every sample of the test set
# (n_features is 1 because 'load' is the only feature).
test_predictions_all = predict_multi_sequence(test_inputs['encoder_input'], HORIZON, 1)
# Display the shape of the predictions array as the cell output.
test_predictions_all.shape

In [None]:
# Keep only the first two axes of the predictions, dropping the trailing
# feature axis, so the array is two-dimensional for evaluation.
test_predictions_all_eval = test_predictions_all.reshape(test_predictions_all.shape[:2])
# Display the resulting shape as the cell output.
test_predictions_all_eval.shape

In [None]:
# create_evaluation_df comes from common.utils. The cells that follow read the
# columns 'timestamp', 'h', 'prediction' and 'actual' from its result.
# NOTE(review): presumably it uses y_scaler to map predictions and actuals back
# to the original load units — confirm in common/utils.py.
eval_df = create_evaluation_df(test_predictions_all_eval, test_inputs, HORIZON, y_scaler)
# Preview the first rows as the cell output.
eval_df.head()

In [None]:
# Absolute percentage error of every individual forecast: |prediction - actual| / actual.
eval_df['APE'] = eval_df['prediction'].sub(eval_df['actual']).abs().div(eval_df['actual'])
# Average the error separately for each forecast step (t+1, t+2, ...).
eval_df.groupby('h')['APE'].mean()

In [None]:
# Overall error across all forecast steps, computed with the mape helper from
# common.utils (presumably mean absolute percentage error — confirm there).
mape(eval_df['prediction'], eval_df['actual'])

In [None]:
# Plot the start of the test period (timestamps before 2014-11-08): actual
# load against the forecast made for each horizon step.
first_week = eval_df.timestamp < '2014-11-08'
# .copy() so that adding the forecast columns below writes to an independent
# frame instead of a slice of eval_df.
plot_df = eval_df[first_week & (eval_df.h == 't+1')][['timestamp', 'actual']].copy()
for t in range(1, HORIZON+1):
    plot_df['t+'+str(t)] = eval_df[first_week & (eval_df.h == 't+'+str(t))]['prediction'].values

# Create the axes first and draw every series on it, so that the actual and
# predicted lines are guaranteed to share one axes object.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(plot_df['timestamp'], plot_df['actual'], color='red', linewidth=4.0, label='actual')
for t in range(1, HORIZON+1):
    # Forecasts further ahead are drawn thinner and fainter
    # (for HORIZON=3: widths 4/3/2 and alphas 0.75/0.5/0.25).
    ax.plot(plot_df['timestamp'], plot_df['t+'+str(t)], color='blue',
            linewidth=max(5.0 - t, 1.0), alpha=1.0 - t / (HORIZON + 1),
            label='t+'+str(t))
ax.set_xlabel('timestamp', fontsize=12)
ax.set_ylabel('load', fontsize=12)
# Explicit labels above give the legend entries to display.
ax.legend(loc='best')
plt.show()