##### ref
* https://github.com/CyberZHG/keras-multi-head

# 1. multi-head

**MultiHead is not MultiHeadAttention**

# 2D data

In [1]:
import numpy as np

# Seed NumPy's global generator so the toy data is the same after every
# "Restart Kernel -> Run All".
np.random.seed(42)

# Features: 10 samples, each a 6 x 18 grid of integer tokens drawn from [0, 10).
x_data2 = np.random.randint(10, size=(10, 6, 18))

# Labels: one class id (0 or 1) per sample, one-hot encoded to shape (10, 2).
# Drawing the two columns independently (the previous approach) produced rows
# such as [1, 1] or [0, 0], which a 2-unit softmax output can never match.
class_ids = np.random.randint(2, size=10)
y_data2 = np.eye(2, dtype=int)[class_ids]

## without multi-head-attention

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation, BatchNormalization, GlobalAveragePooling2D
from keras.layers import Conv2D, MaxPooling2D, Convolution2D, ZeroPadding2D,LSTM
from keras_multi_head import MultiHead

# Baseline: a plain stacked-LSTM classifier, without the MultiHead wrapper.
model = Sequential()
# The first LSTM returns the whole sequence so the second LSTM still gets 3-D input.
model.add(LSTM(4, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', return_sequences=True))
model.add(LSTM(6))
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# NOTE(review): binary_crossentropy on a 2-unit softmax only coincides with
# categorical cross-entropy when the targets are one-hot — confirm y_data2 is.
model.compile(loss='binary_crossentropy', optimizer='adam')

batch_size = 128

# Train for 10 epochs. batch_size used to be defined but never handed to fit(),
# so Keras fell back to its default; it is now passed explicitly.
# If the dataset holds fewer than batch_size samples, each epoch is a single batch.
model.fit(x_data2, y_data2,
          batch_size=batch_size,
          epochs=10)


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f1bf80ac2b0>

In [3]:
# Print the layer-by-layer table (output shapes and parameter counts) of the current `model`.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 6, 4)              368       
_________________________________________________________________
lstm_2 (LSTM)                (None, 6)                 264       
_________________________________________________________________
dense_1 (Dense)              (None, 20)                140       
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 42        
Total params: 814
Trainable params: 814
Non-trainable params: 0
_________________________________________________________________


## with multi-head (`MultiHead` wrapper, not attention)

**Reshape each sample into a 1-D sequence first, then feed it to the `MultiHead` layers.**

In [4]:
# Collapse all trailing axes of every sample into one axis, so each row is a flat
# sequence. Using -1 instead of shape[1] * shape[2] gives the same result on the
# 3-D array and keeps the cell idempotent: re-running it on the already-flattened
# array is a no-op instead of an IndexError on shape[2].
x_data2 = x_data2.reshape(x_data2.shape[0], -1)
# y_data2 is used as-is; the former `y_data2 = y_data2` self-assignment did nothing and was dropped.

In [5]:
import keras
from keras_multi_head import MultiHead


# Embedding -> five parallel LSTM heads (MultiHead) -> Flatten -> softmax classifier.
# The full layer stack is handed to the Sequential constructor in one go.
# NOTE(review): input_dim is taken from the second dimension of x_data2; Keras
# documents input_dim as the vocabulary size — confirm this is intended.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=x_data2.shape[1], output_dim=20, name='Embedding'),
    MultiHead(keras.layers.LSTM(units=32), layer_num=5, name='Multi-LSTMs'),
    keras.layers.Flatten(name='Flatten'),
    keras.layers.Dense(units=2, activation='softmax', name='Dense'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.build()
model.summary()

# Kept as the last expression so the returned History object is the cell output.
model.fit(x_data2, y_data2, epochs=10)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 20)          2160      
_________________________________________________________________
Multi-LSTMs (MultiHead)      (None, 32, 5)             33920     
_________________________________________________________________
Flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
Dense (Dense)                (None, 2)                 322       
Total params: 36,402
Trainable params: 36,402
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f1be4e27550>

## other layers with multi-head

### (1) multi-lstm layers

In [6]:
import keras
from keras_multi_head import MultiHead


# Two single-head MultiHead(LSTM) wrappers stacked back to back, then
# Flatten -> softmax classifier.
# NOTE(review): input_dim is taken from the second dimension of x_data2; Keras
# documents input_dim as the vocabulary size — confirm this is intended.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=x_data2.shape[1], output_dim=20, name='Embedding'),
    MultiHead(keras.layers.LSTM(units=32), layer_num=1, name='Multi-LSTMs'),
    MultiHead(keras.layers.LSTM(units=32), layer_num=1, name='Multi-LSTMs2'),
    keras.layers.Flatten(name='Flatten'),
    keras.layers.Dense(units=2, activation='softmax', name='Dense'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.build()
model.summary()

# Kept as the last expression so the returned History object is the cell output.
model.fit(x_data2, y_data2, epochs=10)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 20)          2160      
_________________________________________________________________
Multi-LSTMs (MultiHead)      (None, 32, 1)             6784      
_________________________________________________________________
Multi-LSTMs2 (MultiHead)     (None, 32, 1)             4352      
_________________________________________________________________
Flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
Dense (Dense)                (None, 2)                 66        
Total params: 13,362
Trainable params: 13,362
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f1bdef84b38>

### (2) lstm+dense layers

In [7]:
import keras
from keras_multi_head import MultiHead


# MultiHead(LSTM) -> Flatten -> MultiHead(Dense) -> Flatten -> softmax classifier.
# NOTE(review): the Dense head below carries the name 'Multi-LSTMs' although it
# wraps a Dense layer; the name is left untouched so the summary stays the same.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=x_data2.shape[1], output_dim=20, name='Embedding'),
    MultiHead(keras.layers.LSTM(units=32), layer_num=1, name='Multi-LSTMs2'),
    keras.layers.Flatten(name='Flatten'),
    MultiHead(keras.layers.Dense(units=32), layer_num=1, name='Multi-LSTMs'),
    keras.layers.Flatten(name='Flatten2'),
    keras.layers.Dense(units=2, activation='softmax', name='Dense'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.build()
model.summary()

# Kept as the last expression so the returned History object is the cell output.
model.fit(x_data2, y_data2, epochs=10)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 20)          2160      
_________________________________________________________________
Multi-LSTMs2 (MultiHead)     (None, 32, 1)             6784      
_________________________________________________________________
Flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
Multi-LSTMs (MultiHead)      (None, 32, 1)             1056      
_________________________________________________________________
Flatten2 (Flatten)           (None, 32)                0         
_________________________________________________________________
Dense (Dense)                (None, 2)                 66        
Total params: 10,066
Trainable params: 10,066
Non-trainable params: 0
__________________________________________________

<keras.callbacks.callbacks.History at 0x7f1bde0a78d0>

### (3) lstm+cnn+dense layers

In [8]:
import keras
from keras_multi_head import MultiHead


# MultiHead(LSTM) -> Flatten -> MultiHead(Dense) -> MultiHead(Conv1D) -> Flatten
# -> softmax classifier.
# NOTE(review): the Dense head below carries the name 'Multi-LSTMs' although it
# wraps a Dense layer; the name is left untouched so the summary stays the same.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=x_data2.shape[1], output_dim=20, name='Embedding'),
    MultiHead(keras.layers.LSTM(units=32), layer_num=1, name='Multi-LSTMs2'),
    keras.layers.Flatten(name='Flatten'),
    MultiHead(keras.layers.Dense(units=32), layer_num=1, name='Multi-LSTMs'),
    MultiHead(keras.layers.Conv1D(filters=32, kernel_size=3, padding='same'), layer_num=1, name='Multi-CNN'),
    keras.layers.Flatten(name='Flatten2'),
    keras.layers.Dense(units=2, activation='softmax', name='Dense'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.build()
model.summary()

# Kept as the last expression so the returned History object is the cell output.
model.fit(x_data2, y_data2, epochs=10)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 20)          2160      
_________________________________________________________________
Multi-LSTMs2 (MultiHead)     (None, 32, 1)             6784      
_________________________________________________________________
Flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
Multi-LSTMs (MultiHead)      (None, 32, 1)             1056      
_________________________________________________________________
Multi-CNN (MultiHead)        (None, 32, 32, 1)         128       
_________________________________________________________________
Flatten2 (Flatten)           (None, 1024)              0         
_________________________________________________________________
Dense (Dense)                (None, 2)                

<keras.callbacks.callbacks.History at 0x7f1bdd499a20>

In [34]:
from keras.layers import Input, Deconvolution2D

# Builds the LSTM + Dense + Conv1D MultiHead stack and prints its summary only;
# this cell does not train the model.
# NOTE(review): the Dense head below carries the name 'Multi-LSTMs' although it
# wraps a Dense layer; the name is left untouched so the summary stays the same.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=x_data2.shape[1], output_dim=20, name='Embedding'),
    MultiHead(keras.layers.LSTM(units=32), layer_num=1, name='Multi-LSTMs2'),
    keras.layers.Flatten(name='Flatten'),
    MultiHead(keras.layers.Dense(units=32), layer_num=1, name='Multi-LSTMs'),
    MultiHead(keras.layers.Conv1D(filters=32, kernel_size=3, padding='same'), layer_num=1, name='Multi-CNN'),
    keras.layers.Flatten(name='Flatten2'),
    keras.layers.Dense(units=2, activation='softmax', name='Dense'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.build()
model.summary()


Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 20)          2160      
_________________________________________________________________
Multi-LSTMs2 (MultiHead)     (None, 32, 1)             6784      
_________________________________________________________________
Flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
Multi-LSTMs (MultiHead)      (None, 32, 1)             1056      
_________________________________________________________________
Multi-CNN (MultiHead)        (None, 32, 32, 1)         128       
_________________________________________________________________
Flatten2 (Flatten)           (None, 1024)              0         
_________________________________________________________________
Dense (Dense)                (None, 2)               

In [26]:
import keras
from keras_multi_head import MultiHeadAttention

# Smallest functional-API example: one MultiHeadAttention layer with three heads
# applied directly to an input of shape (2, 3) per sample.
input_layer = keras.layers.Input(shape=(2, 3), name='Input')
att_layer = MultiHeadAttention(head_num=3, name='Multi-Head')(input_layer)

model = keras.models.Model(inputs=input_layer, outputs=att_layer)
# No metrics are tracked (empty dict); only the MSE loss is configured.
model.compile(optimizer='adam', loss='mse', metrics={})
model.summary()


Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           (None, 2, 3)              0         
_________________________________________________________________
Multi-Head (MultiHeadAttenti (None, 2, 3)              48        
Total params: 48
Trainable params: 48
Non-trainable params: 0
_________________________________________________________________


# 2. MultiHeadAttention

## 2.1 LSTM+MultiHeadAttention

In [74]:
import keras
from keras_multi_head import MultiHeadAttention


# Embedding -> MultiHeadAttention (20 heads) -> LSTM -> softmax classifier.
# No Flatten layer sits between the LSTM and the classifier: the LSTM already
# emits a 2-D (batch, 32) tensor, as the summary shows.
# NOTE(review): input_dim is taken from the second dimension of x_data2; Keras
# documents input_dim as the vocabulary size — confirm this is intended.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=x_data2.shape[1], output_dim=20, name='Embedding'),
    MultiHeadAttention(head_num=20),
    keras.layers.LSTM(units=32),
    keras.layers.Dense(units=2, activation='softmax', name='Dense'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.build()
model.summary()

# Kept as the last expression so the returned History object is the cell output.
model.fit(x_data2, y_data2, epochs=10)

Model: "sequential_65"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 20)          2160      
_________________________________________________________________
multi_head_attention_28 (Mul (None, None, 20)          1680      
_________________________________________________________________
lstm_50 (LSTM)               (None, 32)                6784      
_________________________________________________________________
Dense (Dense)                (None, 2)                 66        
Total params: 10,690
Trainable params: 10,690
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f1b9977eb38>

## 2.2 DCNN+MultiHeadAttention+Complex

In [79]:
from keras.layers import Input, Deconvolution2D

# Combined stack: Embedding -> MultiHeadAttention -> MultiHead(LSTM) -> Flatten
# -> MultiHead(Dense) -> MultiHead(Conv1D) -> four transposed 2-D convolutions
# -> Flatten -> softmax classifier.
# The Multi-CNN head outputs a 4-D tensor (see the summary), which is what the
# Deconvolution2D layers consume.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=500, output_dim=26, name='Embedding'),
    MultiHeadAttention(head_num=26),
    MultiHead(keras.layers.LSTM(units=32), layer_num=1, name='Multi-LSTM', reg_factor=0.1),
    keras.layers.Flatten(name='Flatten'),
    MultiHead(keras.layers.Dense(units=32), layer_num=1, name='Multi-Dense', reg_factor=0.1),
    MultiHead(keras.layers.Conv1D(filters=32, kernel_size=3, padding='same'), layer_num=1, name='Multi-CNN'),
    Deconvolution2D(6, kernel_size=(2, 1), activation='relu'),
    Deconvolution2D(8, kernel_size=(3, 2), activation='relu'),
    Deconvolution2D(4, kernel_size=(2, 2), activation='relu'),
    Deconvolution2D(9, kernel_size=(1, 2), activation='relu'),
    keras.layers.Flatten(name='Flatten2'),
    keras.layers.Dense(units=2, activation='softmax', name='Dense'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.build()
model.summary()

# Kept as the last expression so the returned History object is the cell output.
model.fit(x_data2, y_data2, epochs=10)

Model: "sequential_69"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 26)          13000     
_________________________________________________________________
multi_head_attention_32 (Mul (None, None, 26)          2808      
_________________________________________________________________
Multi-LSTM (MultiHead)       (None, 32, 1)             7552      
_________________________________________________________________
Flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
Multi-Dense (MultiHead)      (None, 32, 1)             1056      
_________________________________________________________________
Multi-CNN (MultiHead)        (None, 32, 32, 1)         128       
_________________________________________________________________
conv2d_transpose_85 (Conv2DT (None, 33, 32, 6)       

<keras.callbacks.callbacks.History at 0x7f1b95847b70>