In [None]:
from keras.layers import Input, Dense, Layer, LSTM, TimeDistributed
from keras.models import Model
import numpy as np
import importlib

# import keras-layer-zoo from parent directory
import sys
sys.path.append("./..")
from kulc import attention, layer_normalization

# Demos

This notebook contains some very rudimentary examples of how to use (some of) the layers implemented in the *Keras Utility & Layer Collection (kulc)*.

- [Scaled Dot-Product Attention](#sdpattention)
- [Multi-Head Attention](#mhatn)
- [Layer Normalization](#layernorm)
- [Sequencewise Attention](#seqatn)
- [Attention Wrapper](#atnwrapper)

### Scaled Dot-Product Attention <a name="sdpattention"></a>

Implementation as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762). Performs a non-linear transformation on the values `V` by comparing the queries `Q` with the keys `K`. The illustration below is taken from the paper cited above.

In [None]:
# Scaled dot-product attention demo.
# Input: a time series with 16 steps; each step is a 256-dim vector,
# e.g. the output sequence of an LSTM, RNN, etc.
net_input = Input(shape=(16, 256))

# Per-timestep linear projections of the input act as the attention operands.
# queries
net_q = TimeDistributed(Dense(256))(net_input)
# values
net_v = TimeDistributed(Dense(256))(net_input)
# keys
net_k = TimeDistributed(Dense(256))(net_input)

# add one ScaledDotProductAttention layer
# NOTE(review): the operands are passed as (queries, values, keys) -- confirm this
# matches the order the layer unpacks its inputs in. All three projections have the
# same shape here, so a mismatch would not raise an error.
net = attention.ScaledDotProductAttention(name="attention", return_attention=False)([net_q, net_v, net_k])

# Project every time step down to the 128-dim target space.
net_output = TimeDistributed(Dense(128))(net)

model = Model(inputs=net_input, outputs=net_output)
model.summary()
# The output layer is linear and the dummy targets are unnormalized random values,
# i.e. this is a regression setup. Use MSE (as the seq2seq demos in this notebook do);
# categorical cross-entropy is not meaningful for such outputs/targets.
model.compile(loss="mse", optimizer="adam")

# dummy data: 64 samples, 16 time steps
x = np.random.rand(64, 16, 256)
y = np.random.rand(64, 16, 128)

model.fit(x, y, batch_size=16, epochs=1)

### Multi-Head Attention <a name="mhatn"></a>
Implementation as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762). This is basically just a bunch of [Scaled Dot-Product Attention](#sdpattention) blocks whose outputs are combined with a linear transformation. The illustration below is taken from the paper cited above.

In [None]:
# Multi-head attention demo.
# Input: a time series with 16 steps; each step is a 256-dim vector,
# e.g. the output sequence of an LSTM, RNN, etc.
net_input = Input(shape=(16, 256))

# Per-timestep linear projections of the input act as the attention operands.
# queries
net_q = TimeDistributed(Dense(256))(net_input)
# values
net_v = TimeDistributed(Dense(256))(net_input)
# keys
net_k = TimeDistributed(Dense(256))(net_input)

# h: the number of parallel attention heads
# NOTE(review): the operands are passed as (queries, values, keys) -- confirm this
# matches the order the layer unpacks its inputs in. All three projections have the
# same shape here, so a mismatch would not raise an error.
net = attention.MultiHeadAttention(h=2, name="attention", return_attention=False)([net_q, net_v, net_k])

# Project every time step down to the 128-dim target space.
net_output = TimeDistributed(Dense(128))(net)

model = Model(inputs=net_input, outputs=net_output)
model.summary()
# The output layer is linear and the dummy targets are unnormalized random values,
# i.e. this is a regression setup. Use MSE (as the seq2seq demos in this notebook do);
# categorical cross-entropy is not meaningful for such outputs/targets.
model.compile(loss="mse", optimizer="adam")

# dummy data: 64 samples, 16 time steps
x = np.random.rand(64, 16, 256)
y = np.random.rand(64, 16, 128)

model.fit(x, y, batch_size=16, epochs=1)

### Layer Normalization <a name="layernorm"></a>

In [None]:
# Layer normalization demo.
# Input: a time series with 16 steps; each step is a 256-dim vector,
# e.g. the output sequence of an LSTM, RNN, etc.
net_input = Input(shape=(16, 256))
# Per-timestep linear projection to 128 dimensions.
net = TimeDistributed(Dense(128))(net_input)

# Normalize the projected activations.
net_output = layer_normalization.LayerNormalization(name="normalization")(net)

model = Model(inputs=net_input, outputs=net_output)
model.summary()
# Normalized activations contain negative values and the dummy targets are
# unnormalized random numbers, so categorical cross-entropy is ill-defined here
# (it can yield non-finite losses). MSE matches this regression-style setup and
# the other demos in this notebook.
model.compile(loss="mse", optimizer="adam")

# dummy data: 64 samples, 16 time steps
x = np.random.rand(64, 16, 256)
y = np.random.rand(64, 16, 128)

model.fit(x, y, batch_size=16, epochs=1)

### Sequencewise Attention <a name="seqatn"></a>

This layer applies various attention transformations on data. It needs a time-series of queries and a time-series of values to calculate the attention and the final linear transformation to obtain the output. This is a faster version of the general attention technique. It is similar to the global attention method described in [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025).

It takes two inputs of the shape (batch_size, T, dim1) and (batch_size, T, dim2),
where the first item is the source data and the second one is the key data.
For each batch element and each time step, this layer calculates a softmax attention
vector between the key data and the source data. Finally, this attention vector is multiplied
with the source data to obtain a weighted output. This means that the key data is used to
interpret the source data in a special way to create an output of the same shape as the source data.

In [None]:
# Sequencewise attention demo.
# Extension of the Seq2Seq example from:
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
latent_dim = 256
n_encoder_tokens = 16
n_decoder_tokens = 16

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, n_encoder_tokens), name="encoder_inputs")
encoder = LSTM(latent_dim, return_sequences=True, return_state=True, name="encoder")
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Unlike the original tutorial, `encoder_outputs` is NOT discarded: the full output
# sequence is consumed by the attention layer below. The final states are used to
# initialize the decoder.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, n_decoder_tokens), name="decoder_inputs")
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder")
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(n_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# The following two lines of code are the only difference to the tutorial's code.
# They add a sequencewise attention layer to the model.

# The layer operates on the decoder output sequence and the encoder output sequence.
# It takes two inputs of the shape (batch_size, T, dim1) and (batch_size, T, dim2),
# where the first item is the source data and the second one is the key data.
# For each batch element and each time step, the layer calculates a softmax attention
# vector between the key data and the source data. Finally, this attention vector is multiplied
# with the source data to obtain a weighted output. This means that the key data is used to
# interpret the source data in a special way to create an output of the same shape as the source data.
decoder_attention = attention.SequenceAttention(similarity="additive", name="attention")
decoder_outputs = decoder_attention([decoder_outputs, encoder_outputs])

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(loss="mse", optimizer="adam")

# dummy data: 64 samples, 20 time steps
x_encoder = np.random.randn(64, 20, n_encoder_tokens)
# The decoder input must match the feature size declared for `decoder_inputs`
# (n_decoder_tokens), not the encoder's.
x_decoder = np.random.randn(64, 20, n_decoder_tokens)
y = np.random.randn(64, 20, n_decoder_tokens)

model.fit(x=[x_encoder, x_decoder], y=y, batch_size=32, epochs=1)

### Attention Wrapper/Input Feeding Attention <a name="atnwrapper"></a>
The idea of the implementation is based on the paper:
    [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025) by Luong et al.

This layer is an attention layer, which can be wrapped around arbitrary RNN layers.
This way, after each time step an attention vector is calculated
based on the current output of the LSTM and the entire input time series.
This attention vector is then used as a weight vector to choose special values
from the input data. This data is then finally concatenated to the next input
time step's data. On this a linear transformation in the same space as the input data's space
is performed before the data is fed into the RNN cell again.

This technique is similar to the *input-feeding* method described in the paper cited above.

In [None]:
# Attention wrapper (input-feeding attention) demo.
# Extension of the Seq2Seq example from:
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
latent_dim = 256
n_encoder_tokens = 16
n_decoder_tokens = 16

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, n_encoder_tokens), name="encoder_inputs")
encoder = LSTM(latent_dim, return_sequences=True, return_state=True, name="encoder")
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, n_decoder_tokens), name="decoder_inputs")
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_pure_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder")

# This line is the only difference compared to the Seq2Seq example from the Keras blog.
# It adds an attention layer, which is wrapped around the decoder LSTM.
# This way, after each time step an attention vector is calculated
# based on the current output of the LSTM and the entire input time series.
# This attention vector is then used as a weight vector to choose special values
# from the input data. This data is then finally concatenated to the next input
# time step's data. On this, a linear transformation in the same space as the input data's space
# is performed before the data is fed into the RNN cell again.
decoder_lstm = attention.AttentionRNNWrapper(decoder_pure_lstm)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(n_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(loss="mse", optimizer="adam")

# dummy data: 64 samples, 20 time steps
x_encoder = np.random.randn(64, 20, n_encoder_tokens)
# The decoder input must match the feature size declared for `decoder_inputs`
# (n_decoder_tokens), not the encoder's.
x_decoder = np.random.randn(64, 20, n_decoder_tokens)
y = np.random.randn(64, 20, n_decoder_tokens)

model.fit(x=[x_encoder, x_decoder], y=y, batch_size=32, epochs=1)