In [4]:
import numpy as np
# Sanity check: draw a 4x3 array of samples from the standard normal distribution.
np.random.normal(loc=0, scale=1, size=(4, 3))

array([[ 0.22378181,  0.04090982, -1.17047502],
       [ 1.608533  , -0.3095742 , -0.27756728],
       [-0.00699097, -0.62198901, -0.72033624],
       [ 0.45114196, -0.14505998,  0.35111736]])

In [None]:
def position_encoding(position, d_model):
    """Build the sinusoidal position-encoding table.

    Entry ``[0, pos, j]`` is ``sin(pos / 10000 ** (2 * (j // 2) / d_model))``
    for even ``j`` and the cosine of the same angle for odd ``j``. The table is
    deterministic, so every call with the same arguments returns the same
    values (the encoder and the decoder therefore share one encoding).

    Args:
        position: number of positions (rows) to generate.
        d_model: number of features per position (columns).

    Returns:
        A ``tf.float32`` tensor of shape ``(1, position, d_model)``; the leading
        axis of size 1 lets it broadcast over the batch dimension.
    """
    positions = np.arange(position, dtype=np.float64)[:, np.newaxis]  # (position, 1)
    dims = np.arange(d_model)[np.newaxis, :]                          # (1, d_model)

    # Columns 2i and 2i + 1 share one frequency, hence the integer division.
    angle_rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / float(d_model))
    x = positions * angle_rates                                       # (position, d_model)

    x[:, 0::2] = np.sin(x[:, 0::2])
    x[:, 1::2] = np.cos(x[:, 1::2])

    # Index the table itself to add the batch axis before casting.
    return tf.cast(x[np.newaxis, ...], dtype=tf.float32)

In [None]:
# Model hyperparameters: feature width of every layer, and the number of
# positions for which a position encoding is precomputed.
d_model, max_position_encoding = 512, 1000

# Translation direction: French (source) -> English (target).
# Each vocabulary is 50000 ids plus 2 extra ids (presumably the start/end
# markers that the prediction loop reads as target_vocab_size - 2 and - 1).
input_vocab_size = 50000 + 2   # FR
target_vocab_size = 50000 + 2  # EN

# Encoder

# 1. Embedding: source token ids -> d_model-wide vectors.
# NOTE: `input` shadows the Python builtin; the name is kept because the model
# construction, training and prediction code below refer to it.
input = tf.keras.layers.Input(shape=(None, ))
x = tf.keras.layers.Embedding(input_vocab_size, d_model)(input)

# 2. Position encoding: add the first seq_len rows of the precomputed table.
pos = position_encoding(max_position_encoding, d_model)
x = tf.keras.layers.Add()([x, pos[:, :tf.shape(x)[1], :]])

# 3. Self-attention followed by a residual connection and layer normalization.
query = tf.keras.layers.Dense(d_model)(x)  # dense = Q matrix
value = tf.keras.layers.Dense(d_model)(x)
key = tf.keras.layers.Dense(d_model)(x)
# Difference from encoder-decoder attention: query, value and key all come from
# the same tensor. Keras expects the inputs in [query, value, key] order.
attention = tf.keras.layers.Attention()([query, value, key])
x = tf.keras.layers.Add()([x, attention])
x = tf.keras.layers.LayerNormalization()(x)

# 4. FFN (position-wise feed-forward network) with residual + layer normalization.
dense = tf.keras.layers.Dense(d_model, activation='relu')(x)
dense = tf.keras.layers.Dense(d_model)(dense)
x = tf.keras.layers.Add()([x, dense])
encoder = tf.keras.layers.LayerNormalization()(x)

# Decoder

# 5. Embedding: target token ids -> d_model-wide vectors.
target = tf.keras.layers.Input(shape=(None, ))
x = tf.keras.layers.Embedding(target_vocab_size, d_model)(target)

# 6. Position encoding: add the first seq_len rows of the precomputed table.
pos = position_encoding(max_position_encoding, d_model)
x = tf.keras.layers.Add()([x, pos[:, :tf.shape(x)[1], :]])

# 7. Self-attention (query, value and key all come from the decoder's own x).
query = tf.keras.layers.Dense(d_model)(x)  # dense = Q matrix
value = tf.keras.layers.Dense(d_model)(x)
key = tf.keras.layers.Dense(d_model)(x)
# Causal mask: position i may only attend to positions <= i. Without it the
# teacher-forced decoder can read the very tokens it is asked to predict.
# `use_causal_mask` is the call argument of current tf.keras; older releases
# (before TF 2.10) take `causal=True` in the layer constructor instead.
attention = tf.keras.layers.Attention()([query, value, key], use_causal_mask=True)
x = tf.keras.layers.Add()([x, attention])
x = tf.keras.layers.LayerNormalization()(x)

# 8. Encoder-decoder attention.
query = tf.keras.layers.Dense(d_model)(x)  # dense = Q matrix
value = tf.keras.layers.Dense(d_model)(encoder)
key = tf.keras.layers.Dense(d_model)(encoder)
# Difference from self-attention: value and key come from the encoder output,
# only the query comes from the decoder.
attention = tf.keras.layers.Attention()([query, value, key])
x = tf.keras.layers.Add()([x, attention])
x = tf.keras.layers.LayerNormalization()(x)

# 9. FFN (position-wise feed-forward network) with residual + layer normalization.
dense = tf.keras.layers.Dense(d_model, activation='relu')(x)
dense = tf.keras.layers.Dense(d_model)(dense)
x = tf.keras.layers.Add()([x, dense])
decoder = tf.keras.layers.LayerNormalization()(x)

# 10. Final dense: project every decoder position to logits over the target vocabulary.
x = tf.keras.layers.Dense(target_vocab_size)(decoder)
base_model = tf.keras.Model(inputs=[input, target], outputs=x)

# 11. Optimizer, loss, metrics.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True because the final Dense layer has no softmax.
# NOTE(review): reduction='none' yields an un-reduced per-token loss; confirm
# this is what is wanted when the loss is handed to compile()/fit().
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
metrics = []

# 12. Model compile.
base_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# 13. Training with teacher forcing: the decoder is fed the target without its
# last token and is trained to predict the target shifted left by one.
# NOTE(review): `input` and `target` are the Input placeholders created above,
# not token-id arrays; fit() presumably needs real data here - confirm.
base_model.fit(
    x=[input, target[:, :-1]],
    y=target[:, 1:],
)

# 14. Predictions: greedy decoding, one token per step, at most 40 steps.
# The sequence starts from id target_vocab_size - 2 and decoding stops once
# id target_vocab_size - 1 is produced.
# NOTE(review): `input` is still the Input placeholder created above; it has to
# be replaced by the encoded source sentence, shape (1, source_len) - confirm.
result = [target_vocab_size - 2]
for _ in range(40):
    # The model takes (batch, seq_len) inputs, so add a batch axis of size 1.
    decoder_input = np.asarray(result)[np.newaxis, :]
    predict_result = base_model.predict([input, decoder_input])
    # Logits of the last position of the last batch item -> most likely next id.
    result.append(int(np.argmax(predict_result[-1, -1])))
    if result[-1] == target_vocab_size - 1:
        break

In [None]:
# Sketch of the same model assembled from reusable layers.
# Both inputs are variable-length sequences (shape=(None, )).
input = tf.keras.layers.Input(shape=(None, ))
target = tf.keras.layers.Input(shape=(None, ))

# The constructor arguments are left as `...` placeholders.
# NOTE(review): no `Decoder` class is defined in the visible part of this
# notebook - confirm where it comes from.
encoder = Encoder(...)
decoder = Decoder(...)

# The decoder is called with [target, encoder_output].
x = encoder(input)
x = decoder([target, x])
# Final projection to target_vocab_size units per position.
x = tf.keras.layers.Dense(target_vocab_size)(x)



In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head dot-product attention assembled from Keras layers.

    Query, value and key are each projected by their own Dense layer, split
    into ``num_heads`` heads of size ``d_model // num_heads``, attended per head
    with ``tf.keras.layers.Attention`` and merged back into ``d_model`` features.

    Args:
        d_model: feature size of the projections and of the output.
        num_heads: number of attention heads; must divide ``d_model`` evenly.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, d_model=512, num_heads=8, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)

        assert d_model % num_heads == 0, "the number of heads must be divided by model emb size!"
        depth = d_model // num_heads

        # (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
        self.w_query = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
        self.split_reshape_query = tf.keras.layers.Reshape((-1, num_heads, depth))  # (batch_size, seq_len, num_heads, depth)
        self.split_permute_query = tf.keras.layers.Permute((2, 1, 3))  # (batch_size, num_heads, seq_len, depth)

        # (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
        self.w_value = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
        self.split_reshape_value = tf.keras.layers.Reshape((-1, num_heads, depth))  # (batch_size, seq_len, num_heads, depth)
        self.split_permute_value = tf.keras.layers.Permute((2, 1, 3))  # (batch_size, num_heads, seq_len, depth)

        # (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
        self.w_key = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
        self.split_reshape_key = tf.keras.layers.Reshape((-1, num_heads, depth))  # (batch_size, seq_len, num_heads, depth)
        self.split_permute_key = tf.keras.layers.Permute((2, 1, 3))  # (batch_size, num_heads, seq_len, depth)

        self.attention = tf.keras.layers.Attention()
        # Inverse of the split: (batch_size, num_heads, seq_len, depth) -> (batch_size, seq_len, d_model)
        self.join_permute = tf.keras.layers.Permute((2, 1, 3))
        self.join_reshape = tf.keras.layers.Reshape((-1, d_model))

    def call(self, inputs, mask=None):
        """Apply multi-head attention.

        Args:
            inputs: ``[query, value, key]`` (the Keras ``Attention`` order), each
                of shape (batch_size, seq_len, d_model).
            mask: optional padding mask(s). Either one boolean tensor of shape
                (batch_size, value_seq_len) marking the value/key positions that
                may be attended to, or a list ``[query_mask, value_mask]`` as
                accepted by ``tf.keras.layers.Attention``. ``None`` disables masking.

        Returns:
            Tensor of shape (batch_size, query_seq_len, d_model).
        """
        q, v, k = inputs

        query = self.split_permute_query(self.split_reshape_query(self.w_query(q)))
        value = self.split_permute_value(self.split_reshape_value(self.w_value(v)))
        key = self.split_permute_key(self.split_reshape_key(self.w_key(k)))

        attention = self.attention([query, value, key], mask=self._per_head_masks(mask))
        attention = self.join_reshape(self.join_permute(attention))

        return attention

    @staticmethod
    def _per_head_masks(mask):
        """Turn padding mask(s) into the ``[query_mask, value_mask]`` list for Keras Attention."""
        if mask is None:
            return None

        if isinstance(mask, (list, tuple)):
            pair = list(mask[:2])
            pair += [None] * (2 - len(pair))
            query_mask, value_mask = pair
        else:
            # A single mask is taken as the mask over the attended (value/key) positions.
            query_mask, value_mask = None, mask

        if query_mask is None and value_mask is None:
            return None

        # Insert a size-1 head axis so that a (batch_size, seq_len) mask
        # broadcasts across the heads of the 4-D query/value/key tensors.
        return [None if m is None else m[:, tf.newaxis, :] for m in (query_mask, value_mask)]

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Encoder layer: token embedding followed by multi-head self-attention.

    The vocabulary size and feature width are read from the notebook-level
    ``input_vocab_size`` and ``d_model``. Token id 0 is treated as padding
    (``mask_zero=True``) and is masked out in the attention step.

    Args:
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model, mask_zero=True)
        # Store an instance, not the class, so the layer owns its weights.
        self.multi_head_attention = MultiHeadAttention(d_model=d_model)

    def call(self, inputs):
        """Encode a batch of token ids.

        Args:
            inputs: integer tensor of token ids, shape (batch_size, seq_len).

        Returns:
            The self-attention output for the embedded tokens.
        """
        x = self.embedding(inputs)

        # Boolean padding mask derived from the ids (False where the id is 0).
        embedding_mask = self.embedding.compute_mask(inputs)
        # Self-attention: the same tensor serves as query, value and key.
        x = self.multi_head_attention([x, x, x], mask=embedding_mask)

        return x
