# Time Series Neural Networks

# Setup

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.ndimage.filters import convolve1d

from pprint import pprint
from IPython.display import Image
from IPython.core.display import HTML 

import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Activation
from keras.layers import Dense
from keras.layers import Flatten

# Combinations
* There are several combinations of approaches to handling time series
* Time Series
    * Univariate 
    * Multivariate
* Model Choice
    * ETS
    * SARIMAX
    * Prophet
    * Machine Learning
    * RNN (LSTM, GRU)
    * CNN
* Output 
    * One Step
    * Multi Step
    * Parallel 

# Neural Networks
* A brief, high-level overview of neural networks

## Neuron

In [120]:
Image(url="https://miro.medium.com/max/2739/1*L9xLcwKhuZ2cuS8fF0ZjwA.png")

## Non-Linearity

In [122]:
Image(url="https://www.researchgate.net/profile/Muhammad_Hamdan9/publication/327435257/figure/fig4/AS:742898131812354@1554132125449/Activation-Functions-ReLU-Tanh-Sigmoid.ppm")

## Network

In [95]:
Image(url="https://miro.medium.com/max/2622/1*eJ36Jpf-DE9q5nKk67xT0Q.jpeg")

## Back Propagation

In [123]:
Image(url="https://miro.medium.com/max/2313/1*DcLWqOojI1b9jzQaLibUkQ.png")

# Univariate - One Step Ahead

## Problem
* Univariate, One Step Prediction
* What number is next in the sequence?
* We want to turn this into a supervised learning problem

In [92]:
series = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

## Univariate to Supervised

In [93]:
def subseries(series, steps=3):
    """Slice a univariate series into supervised (window, next value) pairs.

    Each sample pairs `steps` consecutive observations with the single
    observation that immediately follows them.

    Parameters
    ----------
    series : sequence
        Ordered observations; must support `len`, slicing and indexing.
    steps : int, default 3
        Number of consecutive observations in each input window.

    Returns
    -------
    tuple of np.ndarray
        `(X, y)` where row `i` of `X` is `series[i:i + steps]` and `y[i]`
        is `series[i + steps]`.
    """
    total = len(series)
    # A window is kept only while the value right after it still exists;
    # the start position never runs past the end of the series.
    n_windows = min(total, total - steps)
    starts = range(n_windows)
    windows = [series[start:start + steps] for start in starts]
    targets = [series[start + steps] for start in starts]
    return np.array(windows), np.array(targets)

## Data

In [94]:
steps = 3
# Pass `steps` explicitly so the window length used to build X always matches
# the `input_dim=steps` given to the model, instead of relying on the default.
X, y = subseries(series, steps=steps)
for window, target in zip(X, y):
    print(window, target)

[10 20 30] 40
[20 30 40] 50
[30 40 50] 60
[40 50 60] 70
[50 60 70] 80
[60 70 80] 90
[70 80 90] 100


## Model
* This next code block is dense in terms of learning
    * Sequential
    * Dense
    * Activation - Relu
    * Input Dimension
    * Compile
    * Optimizer - Adam
    * Loss - mse
    * Fit - Epochs

In [98]:
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=steps))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
history = model.fit(X, y, epochs=2000, verbose=0)

In [97]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 501
Trainable params: 501
Non-trainable params: 0
_________________________________________________________________


## Predict

In [46]:
X_pred = np.array([80, 90, 100]) # --> 110
X_pred = X_pred.reshape((1, X.shape[1]))
model.predict(X_pred)

array([[110.84832]], dtype=float32)

# Univariate - Multiple Step

## Problem
* Similar to the univariate model
* We have a single time series
* We want a model that can output n steps into the future
* This generalizes the single-step problem, which is the special case of predicting n = 1 step ahead

In [None]:
series = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

## Univariate to MultiStep
* We want to grab the 1st, 2nd, and 3rd items in the series
* Predict the 4th & 5th item
* Continue to the end of the series

In [81]:
def subseriesMultistep(series, steps=3, output=2):
    """Slice a univariate series into (window, following values) pairs.

    Each sample pairs `steps` consecutive observations with the `output`
    observations that immediately follow them.

    Parameters
    ----------
    series : sequence
        Ordered observations; must support `len` and slicing.
    steps : int, default 3
        Number of consecutive observations in each input window.
    output : int, default 2
        Number of observations after the window used as the target.

    Returns
    -------
    tuple of np.ndarray
        `(X, y)` where row `i` of `X` is `series[i:i + steps]` and row `i`
        of `y` is `series[i + steps:i + steps + output]`.
    """
    total = len(series)
    # Keep a window only while a full `output`-long target still fits after it.
    n_windows = min(total, total - steps - output + 1)
    X, y = [], []
    for start in range(n_windows):
        split = start + steps
        X.append(series[start:split])
        y.append(series[split:split + output])
    return np.array(X), np.array(y)

## Data

In [85]:
steps  = 3
output = 2
X, y = subseriesMultistep(series, steps, output)
for index in range(len(X)):
    print(X[index], y[index])

[10 20 30] [40 50]
[20 30 40] [50 60]
[30 40 50] [60 70]
[40 50 60] [70 80]
[50 60 70] [80 90]
[60 70 80] [ 90 100]


## Model

In [86]:
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=steps))
model.add(Dense(output))
model.compile(optimizer='adam', loss='mse')
history = model.fit(X, y, epochs=2000, verbose=0)

## Predict

In [87]:
X_pred = np.array([80, 90, 100]) # --> [110, 120]
X_pred = X_pred.reshape((1, X.shape[1]))
model.predict(X_pred)

array([[110.62564, 121.8443 ]], dtype=float32)

# Multivariate - One Step Ahead

## Problem
* We have two series of numbers
* We would like to have them added together

In [53]:
series1 = np.array([10, 20, 30, 40,  50,  60,  70,  80,  90,  100])
series2 = np.array([50, 70, 90, 110, 130, 150, 170, 190, 210, 230])
seriesY = np.add(series1, series2)
seriesY

array([ 60,  90, 120, 150, 180, 210, 240, 270, 300, 330])

## Reshape
* Our data is in series
* We need our data in X, Y, Z format (X + Y = Z)

In [54]:
series1 = series1.reshape((series1.shape[0], 1))
series2 = series2.reshape((series2.shape[0], 1))
seriesY = seriesY.reshape((seriesY.shape[0], 1))
data = np.hstack((series1, series2, seriesY))
data

array([[ 10,  50,  60],
       [ 20,  70,  90],
       [ 30,  90, 120],
       [ 40, 110, 150],
       [ 50, 130, 180],
       [ 60, 150, 210],
       [ 70, 170, 240],
       [ 80, 190, 270],
       [ 90, 210, 300],
       [100, 230, 330]])

## Multivariate to Supervised

In [66]:
def subseriesMultivariate(series, steps=3):
    """Slice a multivariate table into (feature window, aligned target) pairs.

    `series` is indexed as a 2-D array: every column except the last is an
    input feature and the last column is the target. Each sample pairs
    `steps` consecutive feature rows with the target value found on the
    window's final row.

    Note: windows stop one start position early, so the last complete window
    in `series` is not emitted.

    Parameters
    ----------
    series : np.ndarray
        2-D array of observations; rows are time steps.
    steps : int, default 3
        Number of consecutive rows in each input window.

    Returns
    -------
    tuple of np.ndarray
        `(X, y)` where `X[i]` is `series[i:i + steps, :-1]` and `y[i]` is
        `series[i + steps - 1, -1]`.
    """
    total = len(series)
    n_windows = min(total, total - steps)
    starts = range(n_windows)
    features = [series[start:start + steps, :-1] for start in starts]
    targets = [series[start + steps - 1, -1] for start in starts]
    return np.array(features), np.array(targets)

## Data

In [67]:
X_mv, y_mv = subseriesMultivariate(data, steps=3)
for index in range(len(X_mv)):
    print(X_mv[index], y_mv[index])

[[10 50]
 [20 70]
 [30 90]] 120
[[ 20  70]
 [ 30  90]
 [ 40 110]] 150
[[ 30  90]
 [ 40 110]
 [ 50 130]] 180
[[ 40 110]
 [ 50 130]
 [ 60 150]] 210
[[ 50 130]
 [ 60 150]
 [ 70 170]] 240
[[ 60 150]
 [ 70 170]
 [ 80 190]] 270
[[ 70 170]
 [ 80 190]
 [ 90 210]] 300


## Flatten

In [72]:
print(X_mv.shape)
# Each sample's (steps, features) window is flattened into a single row so it
# can feed a Dense layer.
n_input = X_mv.shape[1] * X_mv.shape[2]
# Take the sample count from X_mv itself: `X` belongs to an earlier section
# and may hold a different number of rows, which would make the reshape fail.
data = X_mv.reshape((X_mv.shape[0], n_input))
print(data.shape)
data

(7, 3, 2)
(7, 6)


array([[ 10,  50,  20,  70,  30,  90],
       [ 20,  70,  30,  90,  40, 110],
       [ 30,  90,  40, 110,  50, 130],
       [ 40, 110,  50, 130,  60, 150],
       [ 50, 130,  60, 150,  70, 170],
       [ 60, 150,  70, 170,  80, 190],
       [ 70, 170,  80, 190,  90, 210]])

## Model

In [74]:
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=n_input))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
history = model.fit(data, y_mv, epochs=2000, verbose=0)

## Predict

In [77]:
X_pred = np.array([[80, 190], [90, 210], [100, 230]])
X_pred = X_pred.reshape((1, n_input))
y_hat = model.predict(X_pred, verbose=0)
print(y_hat)

[[332.39563]]


# Multivariate - Multiple Step

## Problem

In [89]:
series1 = np.array([10, 20, 30, 40,  50,  60,  70,  80,  90,  100])
series2 = np.array([50, 70, 90, 110, 130, 150, 170, 190, 210, 230])
seriesY = np.add(series1, series2)
seriesY

array([ 60,  90, 120, 150, 180, 210, 240, 270, 300, 330])

## Reshape

In [90]:
series1 = series1.reshape((series1.shape[0], 1))
series2 = series2.reshape((series2.shape[0], 1))
seriesY = seriesY.reshape((seriesY.shape[0], 1))
data = np.hstack((series1, series2, seriesY))
data

array([[ 10,  50,  60],
       [ 20,  70,  90],
       [ 30,  90, 120],
       [ 40, 110, 150],
       [ 50, 130, 180],
       [ 60, 150, 210],
       [ 70, 170, 240],
       [ 80, 190, 270],
       [ 90, 210, 300],
       [100, 230, 330]])

## Multivariate to MultiStep

In [95]:
def subseriesMultivariate(series, steps=3, output=2):
    """Slice a multivariate table into (feature window, target run) pairs.

    `series` is indexed as a 2-D array: every column except the last is an
    input feature and the last column is the target. Each sample pairs
    `steps` consecutive feature rows with `output` consecutive target values,
    starting at the target on the window's final row.

    Parameters
    ----------
    series : np.ndarray
        2-D array of observations; rows are time steps.
    steps : int, default 3
        Number of consecutive rows in each input window.
    output : int, default 2
        Number of consecutive target values per sample.

    Returns
    -------
    tuple of np.ndarray
        `(X, y)` where `X[i]` is `series[i:i + steps, :-1]` and `y[i]` is
        `series[i + steps - 1:i + steps + output - 1, -1]`.
    """
    total = len(series)
    n_windows = min(total, total - steps - output + 1)
    X, y = [], []
    for start in range(n_windows):
        stop = start + steps
        X.append(series[start:stop, :-1])
        # The target run begins on the window's last row (stop - 1).
        y.append(series[stop - 1:stop - 1 + output, -1])
    return np.array(X), np.array(y)

## Data

In [97]:
steps  = 3
output = 2
X_mv, y_mv = subseriesMultivariate(data, steps=steps, output=output)
for index in range(len(X_mv)):
    print(X_mv[index], y_mv[index])

[[10 50]
 [20 70]
 [30 90]] [120 150]
[[ 20  70]
 [ 30  90]
 [ 40 110]] [150 180]
[[ 30  90]
 [ 40 110]
 [ 50 130]] [180 210]
[[ 40 110]
 [ 50 130]
 [ 60 150]] [210 240]
[[ 50 130]
 [ 60 150]
 [ 70 170]] [240 270]
[[ 60 150]
 [ 70 170]
 [ 80 190]] [270 300]


## Flatten

In [98]:
print(X_mv.shape)
# Each sample's (steps, features) window is flattened into a single row so it
# can feed a Dense layer.
n_input = X_mv.shape[1] * X_mv.shape[2]
# Take the sample count from X_mv itself rather than from `X`, which belongs
# to an earlier section and only matches here by coincidence.
data = X_mv.reshape((X_mv.shape[0], n_input))
print(data.shape)
data

(6, 3, 2)
(6, 6)


array([[ 10,  50,  20,  70,  30,  90],
       [ 20,  70,  30,  90,  40, 110],
       [ 30,  90,  40, 110,  50, 130],
       [ 40, 110,  50, 130,  60, 150],
       [ 50, 130,  60, 150,  70, 170],
       [ 60, 150,  70, 170,  80, 190]])

## Model

In [100]:
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=n_input))
# One output unit per forecast step; tied to `output` so the layer width
# always matches the width of the targets built in the Data cell.
model.add(Dense(output))
model.compile(optimizer='adam', loss='mse')
history = model.fit(data, y_mv, epochs=2000, verbose=0)

## Predict

In [101]:
X_pred = np.array([[80, 190], [90, 210], [100, 230]])
X_pred = X_pred.reshape((1, n_input))
y_hat = model.predict(X_pred, verbose=0)
print(y_hat)

[[333.90652 366.3522 ]]


# Recurrent Neural Networks

In [75]:
Image(url="https://miro.medium.com/max/750/1*T_ECcHZWpjn0Ki4_4BEzow.gif")

## Unrolled

In [78]:
Image(url="https://www.researchgate.net/profile/Lei_Tai/publication/311805526/figure/fig3/AS:667790805565446@1536225143793/Recurrent-Neural-Network-Structure-The-left-is-the-typical-RNN-structure-The-right-part.png")

## Vanilla RNN
* RNNs
    * Train relatively quickly
    * Suffer from short term memory
* Challenges
    * http://proceedings.mlr.press/v28/pascanu13.pdf
    * Vanishing gradients --> Zero
    * Exploding gradients --> Infinity (NaN)

In [89]:
Image(url="https://decker.com/wp-content/uploads/2017/09/telephone-game-kids-whispering.jpg")

In [86]:
small_error = 0.01
large_error = 100

print(f'Vanishing {small_error ** 5}')
print(f'Exploding {large_error ** 5}')

Vanishing 1.0000000000000002e-10
Exploding 10000000000


# LSTM
* https://www.researchgate.net/publication/13853244_Long_Short-term_Memory
* Long short term memory
* How do we avoid vanishing gradients?
    * Short term memory
    * Long term memory
* Key takeaways:
    * Long term memory (Ct) can pass through
    * LSTM is computationally expensive
* Not the only approach
    * LSTM Long Short Term Memory
    * GRU Gated Recurrent Unit (Faster)

In [88]:
Image(url="https://chunml.github.io/images/projects/creating-text-generator-using-recurrent-neural-network/LSTM.png")

## Data

In [100]:
series = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

## Reshape
* Our data is in [samples, features] shape
* We need data in [samples, timesteps, features] shape

### Input
* Input [samples, features]
* Input data has 7 samples
* Input data has 3 features

In [101]:
steps = 3
X, y = subseries(series, steps=steps)
print(X.shape, y.shape)

(7, 3) (7,)


### Output
* Reshaped [samples, timesteps, features]
* Reshaped data has 7 samples (rows in X)
* Reshaped data has 3 timesteps (our input data's features)
* Reshaped data has 1 feature (one value of the timestep at a time)

In [102]:
features = 1
X = X.reshape(X.shape[0], steps, features)
print(X.shape, y.shape)

(7, 3, 1) (7,)


## Model
* LSTM
* Input_shape

In [103]:
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(steps, features)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
history = model.fit(X, y, epochs=500, verbose=0)

W0304 20:32:44.914070 140735617508224 deprecation.py:323] From /Users/wilsons/anaconda3/envs/pyjup/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [104]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 50)                10400     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 51        
Total params: 10,451
Trainable params: 10,451
Non-trainable params: 0
_________________________________________________________________


## Predict

In [114]:
X_pred = np.array([80, 90, 100])
X_pred = X_pred.reshape((1, steps, features))
y_hat = model.predict(X_pred, verbose=0)
y_hat

array([[109.55472]], dtype=float32)

# Convolutions

## Data

In [26]:
series = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

## Sliding
* Imagine we're sliding a window across the sequence
* We look at just the values in the window (size)
* Then slide it to the right n positions (stride)

## Kernel / Filter
* We take the values in the window and multiply by weights
* In Conv1D layers, we specify how many filters / kernels / feature maps we want
* When you hear filter / kernel / feature map think weights that we learn from data
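* Filters are normally initialized randomly and then learned, but for illustrative purposes I set them by hand
    * Identity (weights of 1: each window value is left unchanged before summing)
    * Doubler (weights of 2: each window value is doubled before summing)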
* After we apply the kernel we sum the weighted values

In [108]:
def convolution1D(series, filter1D, verbose=True):
    """Slide a 1-D filter across a series and sum each weighted window.

    For every position where the filter fits entirely inside the series, the
    window is multiplied element-wise by the filter and the products are
    summed. The filter is applied as given (it is not reversed).

    Parameters
    ----------
    series : array-like
        1-D sequence of numbers to filter.
    filter1D : array-like
        1-D sequence of weights; its length is the window size.
    verbose : bool, default True
        When True, print the window, filter, products and sum for each step.

    Returns
    -------
    list
        One summed value per window position, in order
        (`len(series) - len(filter1D) + 1` values).
    """
    # Coerce both operands to arrays so plain Python lists work too;
    # `list * list` would otherwise raise a TypeError.
    series = np.asarray(series)
    filter1D = np.asarray(filter1D)

    values = []
    size = len(filter1D)
    for index in range(len(series) - size + 1):
        window   = series[index : index + size]
        filtered = window * filter1D
        summed   = filtered.sum()
        if verbose:
            print(f'Window {window}  * Filter {filter1D}  = {filtered}  --> Sum {summed}')
        values.append(summed)
    return values

In [110]:
# Identity
c = convolution1D(series, np.array([1,1,1]))
print(f'Convolution: {c}')

Window [10, 20, 30]  * Filter [1 1 1]  = [10 20 30]  --> Sum 60
Window [20, 30, 40]  * Filter [1 1 1]  = [20 30 40]  --> Sum 90
Window [30, 40, 50]  * Filter [1 1 1]  = [30 40 50]  --> Sum 120
Window [40, 50, 60]  * Filter [1 1 1]  = [40 50 60]  --> Sum 150
Window [50, 60, 70]  * Filter [1 1 1]  = [50 60 70]  --> Sum 180
Window [60, 70, 80]  * Filter [1 1 1]  = [60 70 80]  --> Sum 210
Window [70, 80, 90]  * Filter [1 1 1]  = [70 80 90]  --> Sum 240
Window [80, 90, 100]  * Filter [1 1 1]  = [ 80  90 100]  --> Sum 270
Convolution: [60, 90, 120, 150, 180, 210, 240, 270]


In [69]:
Image(url= "https://cdn-images-1.medium.com/max/1600/1*tfESmBDSXnJzBMFxPvqZzg.gif") 

In [49]:
# Doubler
c = convolution1D(series, np.array([2,2,2]))
print(f'Convolution: {c}')

Window [10 20 30]  * Filter [2, 2, 2]  = [20 40 60]  --> Sum 120
Window [20 30 40]  * Filter [2, 2, 2]  = [40 60 80]  --> Sum 180
Window [30 40 50]  * Filter [2, 2, 2]  = [ 60  80 100]  --> Sum 240
Window [40 50 60]  * Filter [2, 2, 2]  = [ 80 100 120]  --> Sum 300
Window [50 60 70]  * Filter [2, 2, 2]  = [100 120 140]  --> Sum 360
Window [60 70 80]  * Filter [2, 2, 2]  = [120 140 160]  --> Sum 420
Window [70 80 90]  * Filter [2, 2, 2]  = [140 160 180]  --> Sum 480
Window [ 80  90 100]  * Filter [2, 2, 2]  = [160 180 200]  --> Sum 540
Convolution: [120, 180, 240, 300, 360, 420, 480, 540]


In [112]:
# What about?
[0, 0, 1]

[0, 0, 1]

In [111]:
c = convolution1D(series, np.array([0, 0, 1]))

Window [10, 20, 30]  * Filter [0 0 1]  = [ 0  0 30]  --> Sum 30
Window [20, 30, 40]  * Filter [0 0 1]  = [ 0  0 40]  --> Sum 40
Window [30, 40, 50]  * Filter [0 0 1]  = [ 0  0 50]  --> Sum 50
Window [40, 50, 60]  * Filter [0 0 1]  = [ 0  0 60]  --> Sum 60
Window [50, 60, 70]  * Filter [0 0 1]  = [ 0  0 70]  --> Sum 70
Window [60, 70, 80]  * Filter [0 0 1]  = [ 0  0 80]  --> Sum 80
Window [70, 80, 90]  * Filter [0 0 1]  = [ 0  0 90]  --> Sum 90
Window [80, 90, 100]  * Filter [0 0 1]  = [  0   0 100]  --> Sum 100


## Scipy

In [50]:
weights = [1, 1, 1]
start = 1
end   = len(weights) - start

# Use the `convolve1d` name imported in the setup cell: the bare `scipy`
# package name is never imported, so `scipy.ndimage...` raises a NameError
# on a fresh kernel.
# The trailing `end` values are sliced off so that only the 8 window sums
# shown in the cell output remain.
convolve1d(series, weights=weights, origin=start, mode='constant')[: -end]

array([ 60,  90, 120, 150, 180, 210, 240, 270])

## Max Pooling 1D
* Max pooling down samples the prior convolutions
* Helps with location invariance
* This is often applied in computer vision, but it is also used for time series
* To complete max pooling, we slide over the convolved values with a window (size)
* We take the max value in that window.
* Then we slide the window over to the right (stride)

In [65]:
def maxpool1D(series, size=2, stride=1):
    """Slide a window across a sequence and keep the maximum of each window.

    Parameters
    ----------
    series : sequence
        Values to pool; must support `len` and slicing.
    size : int, default 2
        Number of consecutive values in each pooling window.
    stride : int, default 1
        How many positions the window moves between steps. The default of 1
        reproduces fully overlapping windows; `stride=size` gives
        non-overlapping windows.

    Returns
    -------
    list
        The maximum of each window, in order. Only windows that fit entirely
        inside `series` are used.
    """
    last_start = len(series) - size
    return [
        max(series[start:start + size])
        for start in range(0, last_start + 1, stride)
    ]

In [67]:
# Identity
c = convolution1D(series, [1,1,1])
print(f'Convolution: {c}')

m = maxpool1D(c)
print(f'MaxPool:     {m}')

Window [10 20 30]  * Filter [1, 1, 1]  = [10 20 30]  --> Sum 60
Window [20 30 40]  * Filter [1, 1, 1]  = [20 30 40]  --> Sum 90
Window [30 40 50]  * Filter [1, 1, 1]  = [30 40 50]  --> Sum 120
Window [40 50 60]  * Filter [1, 1, 1]  = [40 50 60]  --> Sum 150
Window [50 60 70]  * Filter [1, 1, 1]  = [50 60 70]  --> Sum 180
Window [60 70 80]  * Filter [1, 1, 1]  = [60 70 80]  --> Sum 210
Window [70 80 90]  * Filter [1, 1, 1]  = [70 80 90]  --> Sum 240
Window [ 80  90 100]  * Filter [1, 1, 1]  = [ 80  90 100]  --> Sum 270
Convolution: [60, 90, 120, 150, 180, 210, 240, 270]
MaxPool:     [90, 120, 150, 180, 210, 240, 270]


## Recap Convolution
* Convolution
    * Filter slides over data
        * You decide on stride
        * You decide on padding
        * Back propagation will update your filter's weight
    * Element-wise multiplication and addition
    * Useful for position / location invariance
    * Weight sharing (fewer parameters)
* Max Pooling
    * Down samples data
    * Helpful for position invariance
    * Other approaches (average pooling)
    * Common in deep learning for computer vision

# Conv1D
* Use a Conv1D layer in a neural network

## Data

In [113]:
series = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

## Reshape

In [114]:
steps = 3
X, y = subseries(series, steps=steps)
print(X.shape, y.shape)

(7, 3) (7,)


In [115]:
features = 1
X = X.reshape(X.shape[0], steps, features)
print(X.shape, y.shape)

(7, 3, 1) (7,)


## Model

In [116]:
model = Sequential()
model.add(Conv1D(64, 2, activation='relu', input_shape=(steps, features)))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
history = model.fit(X, y, epochs=1000, verbose=0)

W0304 20:54:33.206201 140735617508224 deprecation_wrapper.py:119] From /Users/wilsons/anaconda3/envs/pyjup/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.



In [117]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 2, 64)             192       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                3250      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 51        
Total params: 3,493
Trainable params: 3,493
Non-trainable params: 0
_________________________________________________________________


## Predict

In [118]:
X_pred = np.array([80, 90, 100])
X_pred = X_pred.reshape((1, steps, features))
y_hat = model.predict(X_pred)
y_hat

array([[111.74727]], dtype=float32)

# Summary
* Many ways to use neural networks for sequences (time series)
    * Focus on building sequences (numpy)
    * Nets give you flexibility in composition
* Feed Forward Network
* Recurrent (LSTM)
* 1D Convolutional Network