In [90]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, sequence_len=None, embedding_dim=None, mode='sum', **kwargs):
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim
        self.mode = mode
        super(PositionalEncoding, self).__init__(**kwargs)

    def call(self, x):
        if (self.embedding_dim == None) or (self.mode == 'sum'):
            self.embedding_dim = int(x.shape[-1])
        
        position_embedding = np.array([
            [pos / np.power(10000, 2. * i / self.embedding_dim) for i in range(self.embedding_dim)]
            for pos in range(self.sequence_len)])

        position_embedding[:, 0::2] = np.sin(position_embedding[:, 0::2])  # dim 2i
        position_embedding[:, 1::2] = np.cos(position_embedding[:, 1::2])  # dim 2i+1
        
        position_embedding = tf.cast(position_embedding, dtype=tf.float32)
        
        if self.mode == 'sum':
            return position_embedding + x
        
        elif self.mode == 'concat':
            position_embedding = tf.reshape(
                tf.tile(position_embedding, (int(x.shape[0]), 1)), 
                (-1, self.sequence_len, self.embedding_dim)
                )

            return tf.concat([position_embedding, x], 2)
        
    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2]+self.embedding_dim)


In [91]:
# PositionalEncoding 测试

position_embedding_layer = PositionalEncoding(50, 64, 'sum')
position_embedding_layer_output = position_embedding_layer(tf.random.uniform((10, 50, 64)))

print(position_embedding_layer_output)


tf.Tensor(
[[[ 0.09794044  1.5221956   0.05093884 ...  1.2205979   0.6255225
    1.9426765 ]
  [ 1.2793872   1.707086    1.4439819  ...  1.9408028   0.14384009
    1.64903   ]
  [ 1.4325817   0.57568574  1.7065923  ...  1.8288414   0.07602505
    1.646029  ]
  ...
  [ 0.7831629   0.20508784  1.539695   ...  1.2742875   0.90087354
    1.5430839 ]
  [-0.11898029  0.08894062  1.2983346  ...  1.5916389   0.9796848
    1.623994  ]
  [-0.38765883  0.93811846  1.0189443  ...  1.5155734   0.62063557
    1.49345   ]]

 [[ 0.75529814  1.7238648   0.30029643 ...  1.4552534   0.05495417
    1.0411552 ]
  [ 1.2798775   1.2864672   0.93499756 ...  1.2959702   0.0053605
    1.846127  ]
  [ 1.7781134   0.8382853   1.0850062  ...  1.5704403   0.604845
    1.371224  ]
  ...
  [ 0.32015985 -0.54433995  1.4011699  ...  1.1479545   0.7128234
    1.8522563 ]
  [-0.6383225   0.08011305  1.6697282  ...  1.1402829   0.5857736
    1.4564639 ]
  [-0.34154665  0.9058945   0.6852306  ...  1.6051351   0.24876395
  

In [92]:
def padding_mask(seq):
    
    # 获取为 0的padding项
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # 扩充维度用于attention矩阵
    return seq[:, np.newaxis, np.newaxis, :] # (batch_size, 1, 1, seq_len)

In [93]:
# padding mask 测试

padding_mask_list = padding_mask([[0,1,2,3], 
              [3,4,5,0]])

print(padding_mask_list)


tf.Tensor(
[[[[1. 0. 0. 0.]]]


 [[[0. 0. 0. 1.]]]], shape=(2, 1, 1, 4), dtype=float32)


In [94]:
def scaled_dot_product_attention(q, k, v, mask):
    
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dim_k = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dim_k)
    
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)

    return output

In [95]:
# dot_product_attention 测试

test_k = tf.constant([[[10,0,0,1,1,1],
                      [0,10,0,2,1,1],
                      [0,0,10,3,1,1],
                      [0,0,10,4,1,1]],
                      
                      [[1,0,0,0,1,1],
                      [0,2,0,1,1,1],
                      [0,3,10,1,1,1],
                      [1,0,4,1,0,1]]], dtype=tf.float32)  # (2, 4, 6)

test_v = tf.constant([[[10,0,0,1,1,1],
                      [0,10,0,2,1,1],
                      [0,0,10,3,1,1],
                      [0,0,10,4,1,1]],
                      
                      [[1,0,0,0,1,1],
                      [0,2,0,1,1,1],
                      [0,3,10,1,1,1],
                      [1,0,4,1,0,1]]], dtype=tf.float32)  # (2, 4, 6)

test_q = tf.constant([[[0,10,0,1,1,1]],
                      [[0,8,2,5,3,1]]], dtype=tf.float32)


mask = None
test_out = scaled_dot_product_attention(test_q, test_k, test_v, mask)

#mask = padding_mask(tf.constant([[1, 2, 0, 0], [3, 0, 1, 1]]))
#test_out = scaled_dot_product_attention(test_k, test_k, test_v, mask)

print("\n")
print("test_out: ")
print(test_out)



test_out: 
tf.Tensor(
[[[1.2379461e-17 1.0000000e+01 7.0140370e-17 2.0000000e+00 1.0000000e+00
   1.0000000e+00]]

 [[1.2375409e-07 2.9999888e+00 9.9998913e+00 1.0000000e+00 9.9999988e-01
   1.0000000e+00]]], shape=(2, 1, 6), dtype=float32)


In [96]:
# 多头注意力函数 测试

batch_size = 2
head_num = 2
depth = 3
d_model = 6


def split_heads(x, batch_size):
    # 分头, 将 "头个数" 的维度放到 seq_len 前面
    
    x = tf.reshape(x, (batch_size, -1, head_num, depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])


querys = split_heads(test_q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
keys = split_heads(test_k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
values = split_heads(test_v, batch_size) # (batch_size, num_heads, seq_len_v, depth)
print("\n")
print("querys: ")
print(querys)
print("\n")

mask = padding_mask(tf.constant([[1, 2, 0, 0], [3, 0, 1, 1]]))
multi_head_test_out = scaled_dot_product_attention(querys, keys, values, mask) # (batch_size, num_heads, seq_len_q, depth)
print("\n")
print("multi_head_test_out: ")
print(multi_head_test_out)


# 把多头维度后移
scaled_attention = tf.transpose(multi_head_test_out, [0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)
print("\n")
print("scaled_attention: ")
print(scaled_attention)

# 合并多头
concat_attention = tf.reshape(scaled_attention, (batch_size, -1, d_model))

print("\n")
print("concat_attention: ")
print(concat_attention)



querys: 
tf.Tensor(
[[[[ 0. 10.  0.]]

  [[ 1.  1.  1.]]]


 [[[ 0.  8.  2.]]

  [[ 5.  3.  1.]]]], shape=(2, 2, 1, 3), dtype=float32)




multi_head_test_out: 
tf.Tensor(
[[[[8.4332741e-25 1.0000000e+01 0.0000000e+00]]

  [[1.6404574e+00 1.0000000e+00 1.0000000e+00]]]


 [[[9.4977648e-10 3.0000000e+00 1.0000000e+01]]

  [[9.5476758e-01 8.5647416e-01 1.0000000e+00]]]], shape=(2, 2, 1, 3), dtype=float32)


scaled_attention: 
tf.Tensor(
[[[[8.4332741e-25 1.0000000e+01 0.0000000e+00]
   [1.6404574e+00 1.0000000e+00 1.0000000e+00]]]


 [[[9.4977648e-10 3.0000000e+00 1.0000000e+01]
   [9.5476758e-01 8.5647416e-01 1.0000000e+00]]]], shape=(2, 1, 2, 3), dtype=float32)


concat_attention: 
tf.Tensor(
[[[8.4332741e-25 1.0000000e+01 0.0000000e+00 1.6404574e+00 1.0000000e+00
   1.0000000e+00]]

 [[9.4977648e-10 3.0000000e+00 1.0000000e+01 9.5476758e-01 8.5647416e-01
   1.0000000e+00]]], shape=(2, 1, 6), dtype=float32)


In [97]:
# 构造 multi head attention 层

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # d_model 必须可以正确分为各个头
        assert d_model % num_heads == 0
        
        # 分头后的维度
        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        # 分头, 将头个数的维度 放到 seq_len 前面
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        q, k, v, mask = inputs
        batch_size = tf.shape(q)[0]

        # 分头前的前向网络，获取q、k、v语义
        q = self.wq(q) # (batch_size, seq_len, d_model)
        k = self.wk(k)
        v = self.wv(v)

        # 分头
        q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth)
        
        # 通过缩放点积注意力层
        scaled_attention = scaled_dot_product_attention(q, k, v, mask) # (batch_size, num_heads, seq_len_q, depth)
        
        # “多头维度” 后移
        scaled_attention = tf.transpose(scaled_attention, [0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)

        # 合并 “多头维度”
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        # 全连接层
        output = self.dense(concat_attention)
        
        return output

In [98]:
# 测试 MultiHeadAttention

temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
input_data = tf.random.uniform((5, 60, 512))
mask = None

output = temp_mha([input_data, input_data, input_data, mask])
print(output)


tf.Tensor(
[[[ 0.10577798 -0.25881276  0.41238633 ... -1.0608275  -0.49233449
   -0.71212834]
  [ 0.10273671 -0.25958157  0.4206068  ... -1.0604633  -0.48400968
   -0.7194559 ]
  [ 0.11328715 -0.2694013   0.41302037 ... -1.0594761  -0.4867704
   -0.71812385]
  ...
  [ 0.1060189  -0.25882116  0.4104219  ... -1.0648378  -0.4930231
   -0.7188878 ]
  [ 0.10444206 -0.26108062  0.4155717  ... -1.0717115  -0.4878379
   -0.7211157 ]
  [ 0.10722721 -0.26775098  0.41619438 ... -1.0661967  -0.4871063
   -0.71592176]]

 [[ 0.18043017 -0.22418204  0.5168557  ... -1.0312057  -0.4068474
   -0.6909212 ]
  [ 0.17852384 -0.22538428  0.519729   ... -1.0182251  -0.40997308
   -0.68516874]
  [ 0.17620301 -0.22790715  0.5144684  ... -1.0203692  -0.41394687
   -0.68758833]
  ...
  [ 0.17873383 -0.22363052  0.5182944  ... -1.0251366  -0.41794103
   -0.69461   ]
  [ 0.17429072 -0.22895774  0.51766104 ... -1.0286089  -0.41128057
   -0.6883719 ]
  [ 0.18109483 -0.22909716  0.51299834 ... -1.0268948  -0.41131327


In [99]:
def point_wise_feed_forward_network(d_model, middle_units):
    
    return tf.keras.Sequential([
        tf.keras.layers.Dense(middle_units, activation='relu'),
        tf.keras.layers.Dense(d_model)])


In [100]:
"""
# point_wise_feed_forward_network 测试

sample_fnn = point_wise_feed_forward_network(512, 2048)
input_data = tf.random.uniform((64, 50, 512))

point_wise_feed_forward_network = sample_fnn(input_data)
print(point_wise_feed_forward_network.shape)

"""

'\n# point_wise_feed_forward_network 测试\n\nsample_fnn = point_wise_feed_forward_network(512, 2048)\ninput_data = tf.random.uniform((64, 50, 512))\n\npoint_wise_feed_forward_network = sample_fnn(input_data)\nprint(point_wise_feed_forward_network.shape)\n\n'

In [101]:
class LayerNormalization(tf.keras.layers.Layer):
    def __init__(self, epsilon=1e-6, **kwargs):
        self.eps = epsilon
        super(LayerNormalization, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
                                     initializer=tf.ones_initializer(), trainable=True)
        self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
                                    initializer=tf.zeros_initializer(), trainable=True)
        super(LayerNormalization, self).build(input_shape)
        
    def call(self, x):
        mean = tf.keras.backend.mean(x, axis=-1, keepdims=True)
        std = tf.keras.backend.std(x, axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta
    
    def compute_output_shape(self, input_shape):
        return input_shape

In [102]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, middle_units, epsilon=1e-6, dropout_rate=0.1):
        super(EncoderLayer, self).__init__()
        
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, middle_units)
        
        self.layernorm1 = LayerNormalization()
        self.layernorm2 = LayerNormalization()
        
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
        
    def call(self, inputs, mask, training):
        # 多头注意力网络
        att_output = self.mha([inputs, inputs, inputs, mask])
        att_output = self.dropout1(att_output, training=training)
        out1 = self.layernorm1(inputs + att_output)  # (batch_size, input_seq_len, d_model)
        
        # 前向网络
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)   # (batch_size, input_seq_len, d_model)
        
        return out2

In [103]:
# EncoderLayer 测试

sample_encoder_layer = EncoderLayer(512, 8, 2048)
input_data = tf.random.uniform((64, 43, 512))

sample_encoder_layer_output = sample_encoder_layer(input_data, None, False)
print(sample_encoder_layer_output.shape)

(64, 43, 512)


In [104]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, n_layers, d_model, num_heads, middle_units,
                max_seq_len, epsilon=1e-6, dropout_rate=0.1):
        super(Encoder, self).__init__()

        self.n_layers = n_layers
        self.d_model = d_model
        self.pos_embedding = PositionalEncoding(sequence_len=max_seq_len, embedding_dim=d_model)

        self.encode_layer = [EncoderLayer(d_model=d_model, num_heads=num_heads, 
                                          middle_units=middle_units, 
                                          epsilon=epsilon, dropout_rate=dropout_rate)
                            for _ in range(n_layers)]
        
    def call(self, inputs, mask, training):
        emb = inputs
        emb = self.pos_embedding(emb)
        
        for i in range(self.n_layers):
            emb = self.encode_layer[i](emb, mask, training)

        return emb

In [105]:
# Encoder 测试

sample_encoder = Encoder(2, 512, 8, 1024, 60)
sample_encoder_output = sample_encoder(tf.random.uniform((15, 60, 512)),
                                      None, False)

print(sample_encoder_output.shape)


(15, 60, 512)


In [80]:
# 文本分类实验

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import *
import pandas as pd



# 1. 数据信息
max_features = 20000
maxlen = 64
batch_size = 32


print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="/notebook/Keras-Recommendation/scripts/multi_head_attention/imdb.npz", \
                                                      num_words=max_features)
y_train, y_test = pd.get_dummies(y_train), pd.get_dummies(y_test)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')


x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
25000 train sequences
25000 test sequences
x_train shape: (25000, 64)
x_test shape: (25000, 64)


In [89]:
# 2. 构造模型，及训练模型

inputs = Input(shape=(64,), dtype='int32')
embeddings = Embedding(max_features, 128)(inputs)

print("\n"*2)
print("embeddings:")
print(embeddings)

mask_inputs = padding_mask(inputs)

out_seq = Encoder(2, 128, 4, 256, maxlen)(embeddings, mask_inputs, False)

print("\n"*2)
print("out_seq:")
print(out_seq)

out_seq = GlobalAveragePooling1D()(out_seq)

print("\n"*2)
print("out_seq:")
print(out_seq)

out_seq = Dropout(0.3)(out_seq)
outputs = Dense(64, activation='relu')(out_seq)

out_seq = Dropout(0.3)(out_seq)
outputs = Dense(16, activation='relu')(out_seq)

out_seq = Dropout(0.3)(out_seq)
outputs = Dense(2, activation='softmax')(out_seq)

model = Model(inputs=inputs, outputs=outputs)
print(model.summary())


opt = Adam(lr=0.0002, decay=0.00001)
loss = 'categorical_crossentropy'
model.compile(loss=loss,
             optimizer=opt,
             metrics=['accuracy'])


print('Train...')
history = model.fit(x_train, y_train,
         batch_size=batch_size,
         epochs=10,
         validation_data=(x_test, y_test))





embeddings:
Tensor("embedding_7/Identity:0", shape=(None, 64, 128), dtype=float32)



out_seq:
Tensor("encoder_10/Identity:0", shape=(None, 64, 128), dtype=float32)



out_seq:
Tensor("global_average_pooling1d_6/Identity:0", shape=(None, 128), dtype=float32)
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 64)]         0                                            
__________________________________________________________________________________________________
tf_op_layer_Equal_4 (TensorFlow [(None, 64)]         0           input_8[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_Cast_6 (TensorFlowO [(None, 64)]         0           tf_op_layer_Equal_4[0][0]        
____________________________