In [1]:
!pip install BPEmb
import math
import numpy as np
import tensorflow as tf
from bpemb import BPEmb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting BPEmb
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, BPEmb
Successfully installed BPEmb-0.3.4 sentencepiece-0.1.97



#Attention(Q, K,V) = softmax(QK¹/√dk)V


In [2]:
def scaled_dot_product_attention (query, key, value, mask=None):
  key_dim = tf.cast(tf.shape(key) [-1], tf.float32)
  scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)
  if mask is not None:
    scaled_scores = tf.where (mask==0, -np.inf, scaled_scores)
  softmax = tf.keras.layers.Softmax()
  weights = softmax(scaled_scores)
  return tf.matmul(weights, value), weights

####TESTING "scaled_dot_product_attention" using random queries,keys,values matrices

In [3]:
seq_len = 3
embed_dim = 4
queries = np.random.rand(seq_len, embed_dim)
keys = np.random.rand(seq_len, embed_dim)
values = np.random.rand(seq_len, embed_dim)
print("Queries: \n", queries)

Queries: 
 [[0.40459511 0.5565327  0.60170997 0.2437919 ]
 [0.92323341 0.01134934 0.81965197 0.2727533 ]
 [0.44399669 0.52053036 0.32596123 0.16678462]]


In [4]:
output, attn_weights = scaled_dot_product_attention(queries, keys, values)
print("Output\n", output, "\n")
print("Weights\n", attn_weights)

Output
 tf.Tensor(
[[0.6590466  0.71541196 0.22662729 0.6481204 ]
 [0.6560365  0.71541536 0.22508144 0.65121156]
 [0.6669714  0.7139002  0.22793788 0.6444146 ]], shape=(3, 4), dtype=float32) 

Weights
 tf.Tensor(
[[0.32888445 0.37981954 0.291296  ]
 [0.32391748 0.3857926  0.29028997]
 [0.3362787  0.36108956 0.3026317 ]], shape=(3, 3), dtype=float32)


####GENERATE QUERY,VALUE, KEYS MATRICES FOR MULTI HEAD ATTENTION USING RANDOM WEIGHTS

In [5]:
batch_size = 1
seq_len = 3
embed_dim = 12
num_heads = 3
head_dim = embed_dim // num_heads
print (f"Dimension of each head: {head_dim}")
x = np.random.rand(batch_size, seq_len, embed_dim).round(1)
print("Input shape: ", x.shape, "\n")
print("Input:\n", x)

Dimension of each head: 4
Input shape:  (1, 3, 12) 

Input:
 [[[0.7 0.1 0.3 0.5 0.8 0.9 0.3 0.1 0.9 0.8 0.1 0.4]
  [0.2 0.8 0.5 0.7 0.1 0.7 0.6 0.1 0.9 0.2 0.9 0.4]
  [0.  0.  0.4 0.7 0.6 0.8 0.7 0.1 0.2 0.1 1.  1. ]]]


In [6]:
# The query weights for each head.
wq0 = np. random.rand(embed_dim, head_dim).round(1)
wq1 = np.random.rand(embed_dim, head_dim).round(1)
wq2 = np.random.rand(embed_dim, head_dim).round(1)
# The key weights for each head.
wk0 = np.random.rand(embed_dim, head_dim).round(1)
wk1 = np.random.rand(embed_dim, head_dim).round(1)
wk2 = np.random.rand(embed_dim, head_dim).round(1)
# The value weights for each head.
wv0 = np.random.rand(embed_dim, head_dim).round(1)
wv1 = np.random.rand(embed_dim, head_dim).round(1)
wv2 = np.random.rand(embed_dim, head_dim).round(1)

In [7]:
print("The three sets of query weights (one for each head): ")
print("wq0: \n", wq0)
print("wq1: \n", wq1)
print("wq2: \n", wq1) 

The three sets of query weights (one for each head): 
wq0: 
 [[0.3 0.9 0.6 0.6]
 [0.9 0.4 0.9 0.9]
 [0.3 0.3 0.5 0.5]
 [0.3 0.8 0.1 0.7]
 [0.8 0.2 0.3 0.7]
 [0.9 0.6 0.5 0.6]
 [0.2 0.3 0.2 0.5]
 [0.3 0.7 0.3 0.2]
 [0.5 0.6 0.4 0.5]
 [0.1 0.1 0.4 0.3]
 [0.8 0.3 0.3 0.8]
 [0.9 0.8 0.4 0.4]]
wq1: 
 [[0.  0.4 0.6 0.5]
 [0.5 1.  0.6 0.2]
 [0.7 0.4 0.6 1. ]
 [0.8 0.2 0.5 0.7]
 [0.9 0.8 0.5 0.7]
 [0.4 0.6 0.6 0.7]
 [0.9 0.9 0.2 0.5]
 [0.1 0.2 0.  0.4]
 [0.3 0.4 0.3 0.9]
 [0.2 0.3 0.4 0.9]
 [0.  0.8 0.9 0.5]
 [0.3 0.  0.8 0. ]]
wq2: 
 [[0.  0.4 0.6 0.5]
 [0.5 1.  0.6 0.2]
 [0.7 0.4 0.6 1. ]
 [0.8 0.2 0.5 0.7]
 [0.9 0.8 0.5 0.7]
 [0.4 0.6 0.6 0.7]
 [0.9 0.9 0.2 0.5]
 [0.1 0.2 0.  0.4]
 [0.3 0.4 0.3 0.9]
 [0.2 0.3 0.4 0.9]
 [0.  0.8 0.9 0.5]
 [0.3 0.  0.8 0. ]]


In [8]:
# Geneated queries, keys, and values for the first head.
q0 = np.dot(x, wq0)
k0 = np.dot(x, wk0)
v0 = np.dot(x, wv0)
# Geneated queries, keys, and values for the second head.
q1 = np.dot(x, wq1)
k1 = np.dot(x, wk1)
v1 = np.dot(x, wv1)
# Geneated queries, keys, and values for the third head.
q2 = np.dot(x, wq2)
k2= np.dot(x, wk2)
v2 = np.dot(x, wv2)

In [9]:
print("Q, K, and V for first head: \n")
print(f" q0 {q0.shape}: \n", q0, "\n")
print(f"k0 {k0.shape}: \n", k0, "\n")
print(f"v0 {v0.shape}: \n", v0)

Q, K, and V for first head: 

 q0 (1, 3, 4): 
 [[[3.05 2.99 2.36 3.21]
  [3.55 3.05 2.56 3.78]
  [3.51 2.79 1.84 3.29]]] 

k0 (1, 3, 4): 
 [[[3.26 3.48 2.15 4.2 ]
  [3.47 2.7  2.39 4.1 ]
  [3.39 2.39 1.88 4.03]]] 

v0 (1, 3, 4): 
 [[[2.56 3.71 2.46 2.16]
  [2.45 3.17 3.01 2.53]
  [2.52 3.13 3.48 2.6 ]]]


Now that we have our Q, K, V vectors, we can just pass them to our self-attention operation. Here we're calculating the output and attention weights for the first head.

In [10]:
out0, attn_weights0 = scaled_dot_product_attention(q0, k0, v0)
out1, attn_weights1 = scaled_dot_product_attention(q1, k1, v1)
out2, attn_weights2 = scaled_dot_product_attention(q2, k2, v2)

print("Output from first attention head: ", out0, "\n")
print("Output from second attention head: ", out1, "\n")
print("Output from third attention head: ", out2,)
print("Attention weights from first head: ", attn_weights0, "\n")
print("Attention weights from second head: ", attn_weights1, "\n")
print("Attention weights from third head: ", attn_weights2)

Output from first attention head:  tf.Tensor(
[[[2.5237308 3.5005746 2.7084117 2.3069787]
  [2.523203  3.5003133 2.7060704 2.306899 ]
  [2.522348  3.4881163 2.727444  2.3161442]]], shape=(1, 3, 4), dtype=float32) 

Output from second attention head:  tf.Tensor(
[[[1.387991  2.8214273 3.0571883 3.2637637]
  [1.367864  2.868446  3.0218394 3.2517724]
  [1.3553139 2.888186  3.0052195 3.241209 ]]], shape=(1, 3, 4), dtype=float32) 

Output from third attention head:  tf.Tensor(
[[[2.4486806 2.6604676 2.7093885 2.7420452]
  [2.4256036 2.601138  2.6989818 2.7314444]
  [2.4373336 2.6289148 2.7032614 2.7362404]]], shape=(1, 3, 4), dtype=float32)
Attention weights from first head:  tf.Tensor(
[[[0.6182333  0.29997995 0.08178674]
  [0.61730003 0.30698657 0.0757134 ]
  [0.5962577  0.30717665 0.09656565]]], shape=(1, 3, 3), dtype=float32) 

Attention weights from second head:  tf.Tensor(
[[[0.33176637 0.6026759  0.06555772]
  [0.28280646 0.6534152  0.06377833]
  [0.25085124 0.66698104 0.08216766]]],

In [11]:
combined_out_a = np.concatenate((out0, out1, out2), axis=-1)
print(f"Combined output from all heads {combined_out_a.shape}:")
print(combined_out_a)


# The final step would be to run combined_out_a through a linear/dense layer for further processing.

Combined output from all heads (1, 3, 12):
[[[2.5237308 3.5005746 2.7084117 2.3069787 1.387991  2.8214273 3.0571883
   3.2637637 2.4486806 2.6604676 2.7093885 2.7420452]
  [2.523203  3.5003133 2.7060704 2.306899  1.367864  2.868446  3.0218394
   3.2517724 2.4256036 2.601138  2.6989818 2.7314444]
  [2.522348  3.4881163 2.727444  2.3161442 1.3553139 2.888186  3.0052195
   3.241209  2.4373336 2.6289148 2.7032614 2.7362404]]]


In [12]:
print("Query weights for first head: \n", wq0, "\n")
print("Query weights for second head: \n", wq1, "\n")
print("Query weights for third head: \n", wq2)

Query weights for first head: 
 [[0.3 0.9 0.6 0.6]
 [0.9 0.4 0.9 0.9]
 [0.3 0.3 0.5 0.5]
 [0.3 0.8 0.1 0.7]
 [0.8 0.2 0.3 0.7]
 [0.9 0.6 0.5 0.6]
 [0.2 0.3 0.2 0.5]
 [0.3 0.7 0.3 0.2]
 [0.5 0.6 0.4 0.5]
 [0.1 0.1 0.4 0.3]
 [0.8 0.3 0.3 0.8]
 [0.9 0.8 0.4 0.4]] 

Query weights for second head: 
 [[0.  0.4 0.6 0.5]
 [0.5 1.  0.6 0.2]
 [0.7 0.4 0.6 1. ]
 [0.8 0.2 0.5 0.7]
 [0.9 0.8 0.5 0.7]
 [0.4 0.6 0.6 0.7]
 [0.9 0.9 0.2 0.5]
 [0.1 0.2 0.  0.4]
 [0.3 0.4 0.3 0.9]
 [0.2 0.3 0.4 0.9]
 [0.  0.8 0.9 0.5]
 [0.3 0.  0.8 0. ]] 

Query weights for third head: 
 [[0.9 0.7 0.2 0.7]
 [0.8 0.5 0.  0.2]
 [1.  0.2 0.6 0.8]
 [0.8 0.1 0.7 0.4]
 [0.3 0.8 1.  0.3]
 [0.3 0.4 0.6 0.9]
 [0.1 0.7 0.  0.7]
 [0.9 0.6 0.6 0.2]
 [0.4 0.4 0.1 0.5]
 [0.3 0.2 0.9 0.2]
 [0.9 0.9 0.1 0.2]
 [0.6 1.  0.3 0.2]]


Let's now get the same thing done using a single query weight matrix, single key weight matrix, and single value weight matrix.

Suppose instead of declaring three separate query weight matrices, we had declared one. i.e. a single  d x d  matrix. We're concatenating our per-head query weights here instead of declaring a new set of weights so that we get the same results.

In [13]:
wq = np.concatenate((wq0, wq1, wq2), axis=1)
wk = np.concatenate((wk0, wk1, wk2), axis=1)
wv = np.concatenate((wv0, wv1, wv2), axis=1)

print(f"Single key weight matrix {wk.shape}:\n", wk, "\n")
print(f"Single value weight matrix {wv.shape}:\n", wv)
print(f"Single query weight matrix {wq.shape}: \n", wq)

Single key weight matrix (12, 12):
 [[0.  0.4 0.7 0.5 0.8 0.8 0.3 0.2 0.4 0.7 0.3 0.9]
 [0.5 0.5 1.  0.9 0.6 0.4 0.8 0.5 0.8 0.2 1.  0.1]
 [0.9 0.2 0.  0.9 0.4 0.7 0.5 0.5 0.3 0.2 0.6 0.7]
 [0.1 0.4 0.  0.2 0.6 0.8 0.3 0.6 0.1 0.1 0.9 0.5]
 [0.8 0.9 0.5 0.9 0.4 0.6 0.1 0.6 0.4 0.4 0.  0.9]
 [0.8 0.4 0.6 0.9 0.1 0.5 0.3 0.6 0.9 0.2 0.4 0.7]
 [0.6 0.4 0.6 0.9 0.4 0.8 0.9 0.4 0.6 0.7 0.7 0.9]
 [0.2 0.4 0.3 0.6 0.6 0.2 0.7 0.4 0.9 0.1 0.7 0.7]
 [0.9 0.8 0.2 0.5 0.5 0.7 0.2 0.7 0.7 0.4 0.8 0.7]
 [0.2 0.9 0.1 0.8 0.8 0.1 0.8 0.9 0.6 0.5 0.8 0.4]
 [0.4 0.1 0.3 0.4 0.8 0.1 0.8 0.1 0.8 0.9 0.9 0.8]
 [0.8 0.5 0.3 1.  0.5 0.3 1.  0.4 0.4 0.4 0.5 0.7]] 

Single value weight matrix (12, 12):
 [[0.5 0.3 0.4 0.1 0.2 0.5 0.4 0.7 0.  0.1 0.1 0.1]
 [0.3 0.2 0.1 0.4 0.5 1.  0.1 0.8 0.8 0.4 0.6 0.3]
 [0.9 0.6 0.3 0.3 0.2 0.4 0.5 0.8 0.7 0.9 0.8 0.5]
 [0.3 0.7 0.4 0.3 0.  0.7 1.  0.4 0.9 0.  0.2 0.9]
 [0.5 1.  0.4 0.5 0.8 0.7 0.8 1.  0.4 0.1 0.5 0. ]
 [0.  0.6 0.4 0.2 0.4 0.2 1.  0.7 0.3 0.1 1.  1. ]
 [0.8

In [14]:
q_s = np.dot(x, wq)
k_s = np.dot(x, wk)
v_s = np.dot(x, wv)

In [15]:
print(f"Query vectors using a single weight matrix {q_s.shape}:\n", q_s)

Query vectors using a single weight matrix (1, 3, 12):
 [[[3.05 2.99 2.36 3.21 2.57 2.75 2.91 3.98 2.97 2.93 3.01 2.94]
  [3.55 3.05 2.56 3.78 2.66 3.42 3.32 3.59 3.74 3.16 1.89 2.83]
  [3.51 2.79 1.84 3.29 2.72 2.82 3.31 3.03 3.15 3.5  2.38 2.53]]]


In [16]:
print(q0, "\n")
print(q1, "\n")
print(q2)

# we can see how combined and seperated query values are identical 

[[[3.05 2.99 2.36 3.21]
  [3.55 3.05 2.56 3.78]
  [3.51 2.79 1.84 3.29]]] 

[[[2.57 2.75 2.91 3.98]
  [2.66 3.42 3.32 3.59]
  [2.72 2.82 3.31 3.03]]] 

[[[2.97 2.93 3.01 2.94]
  [3.74 3.16 1.89 2.83]
  [3.15 3.5  2.38 2.53]]]


We can split our combined queries into  d x d/h  heads using reshape and transpose

In [17]:
# Note: we can achieve the same thing by passing -1 instead of seq_len.
q_s_reshaped = tf.reshape(q_s, (batch_size, seq_len, num_heads, head_dim))
print(f"Combined queries: {q_s.shape}\n", q_s, "\n")
print(f"Reshaped into separate heads: {q_s_reshaped.shape}\n", q_s_reshaped)

Combined queries: (1, 3, 12)
 [[[3.05 2.99 2.36 3.21 2.57 2.75 2.91 3.98 2.97 2.93 3.01 2.94]
  [3.55 3.05 2.56 3.78 2.66 3.42 3.32 3.59 3.74 3.16 1.89 2.83]
  [3.51 2.79 1.84 3.29 2.72 2.82 3.31 3.03 3.15 3.5  2.38 2.53]]] 

Reshaped into separate heads: (1, 3, 3, 4)
 tf.Tensor(
[[[[3.05 2.99 2.36 3.21]
   [2.57 2.75 2.91 3.98]
   [2.97 2.93 3.01 2.94]]

  [[3.55 3.05 2.56 3.78]
   [2.66 3.42 3.32 3.59]
   [3.74 3.16 1.89 2.83]]

  [[3.51 2.79 1.84 3.29]
   [2.72 2.82 3.31 3.03]
   [3.15 3.5  2.38 2.53]]]], shape=(1, 3, 3, 4), dtype=float64)


In [18]:
q_s_transposed = tf.transpose(q_s_reshaped, perm=[0, 2, 1, 3]).numpy()
print(f"Queries transposed into \"separate\" heads {q_s_transposed.shape}:\n", 
      q_s_transposed)

Queries transposed into "separate" heads (1, 3, 3, 4):
 [[[[3.05 2.99 2.36 3.21]
   [3.55 3.05 2.56 3.78]
   [3.51 2.79 1.84 3.29]]

  [[2.57 2.75 2.91 3.98]
   [2.66 3.42 3.32 3.59]
   [2.72 2.82 3.31 3.03]]

  [[2.97 2.93 3.01 2.94]
   [3.74 3.16 1.89 2.83]
   [3.15 3.5  2.38 2.53]]]]


In [19]:
print("The separate per-head query matrices from before: ")
print(q0, "\n")
print(q1, "\n")
print(q2)

The separate per-head query matrices from before: 
[[[3.05 2.99 2.36 3.21]
  [3.55 3.05 2.56 3.78]
  [3.51 2.79 1.84 3.29]]] 

[[[2.57 2.75 2.91 3.98]
  [2.66 3.42 3.32 3.59]
  [2.72 2.82 3.31 3.03]]] 

[[[2.97 2.93 3.01 2.94]
  [3.74 3.16 1.89 2.83]
  [3.15 3.5  2.38 2.53]]]


In [20]:
k_s_transposed = tf.transpose(tf.reshape(k_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()
v_s_transposed = tf.transpose(tf.reshape(v_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()

print(f"Keys for all heads in a single matrix {k_s.shape}: \n", k_s_transposed, "\n")
print(f"Values for all heads in a single matrix {v_s.shape}: \n", v_s_transposed)

Keys for all heads in a single matrix (1, 3, 12): 
 [[[[3.26 3.48 2.15 4.2 ]
   [3.47 2.7  2.39 4.1 ]
   [3.39 2.39 1.88 4.03]]

  [[3.   3.24 2.58 3.34]
   [3.2  3.16 3.45 2.93]
   [2.72 2.73 3.33 2.51]]

  [[3.25 2.35 3.23 4.1 ]
   [3.69 2.51 4.45 3.86]
   [3.06 2.48 3.39 4.11]]]] 

Values for all heads in a single matrix (1, 3, 12): 
 [[[[2.56 3.71 2.46 2.16]
   [2.45 3.17 3.01 2.53]
   [2.52 3.13 3.48 2.6 ]]

  [[1.66 2.23 3.51 3.44]
   [1.25 3.17 2.8  3.19]
   [1.28 2.61 3.13 3.05]]

  [[1.98 1.79 2.64 2.61]
   [2.5  2.81 2.74 2.77]
   [2.38 2.04 2.49 2.6 ]]]]


Set up this way, we can now calculate the outputs from all attention heads with a single call to our self-attention operation.

In [21]:
all_heads_output, all_attn_weights = scaled_dot_product_attention(q_s_transposed, 
                                                                  k_s_transposed, 
                                                                  v_s_transposed)
print("Self attention output:\n", all_heads_output)

Self attention output:
 tf.Tensor(
[[[[2.5237305 3.500574  2.7084112 2.3069787]
   [2.523203  3.500313  2.70607   2.3068988]
   [2.522348  3.4881163 2.727444  2.3161442]]

  [[1.387991  2.8214273 3.0571883 3.2637637]
   [1.3678638 2.8684459 3.0218391 3.2517724]
   [1.3553139 2.888186  3.0052195 3.241209 ]]

  [[2.4486809 2.6604676 2.7093887 2.7420454]
   [2.4256034 2.601138  2.6989818 2.7314444]
   [2.4373336 2.6289148 2.7032614 2.7362404]]]], shape=(1, 3, 3, 4), dtype=float32)


In [22]:
print("Per head outputs from using separate sets of weights per head:")
print(out0, "\n")
print(out1, "\n")
print(out2)

#both seperate and combined results are again identical

Per head outputs from using separate sets of weights per head:
tf.Tensor(
[[[2.5237308 3.5005746 2.7084117 2.3069787]
  [2.523203  3.5003133 2.7060704 2.306899 ]
  [2.522348  3.4881163 2.727444  2.3161442]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[1.387991  2.8214273 3.0571883 3.2637637]
  [1.367864  2.868446  3.0218394 3.2517724]
  [1.3553139 2.888186  3.0052195 3.241209 ]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[2.4486806 2.6604676 2.7093885 2.7420452]
  [2.4256036 2.601138  2.6989818 2.7314444]
  [2.4373336 2.6289148 2.7032614 2.7362404]]], shape=(1, 3, 4), dtype=float32)


In [23]:
combined_out_b = tf.reshape(tf.transpose(all_heads_output, perm=[0, 2, 1, 3]), 
                            shape=(batch_size, seq_len, embed_dim))
print("Final output from using single query, key, value matrices:\n", 
      combined_out_b, "\n")
print("Final output from using separate query, key, value matrices per head:\n", 
      combined_out_a)

Final output from using single query, key, value matrices:
 tf.Tensor(
[[[2.5237305 3.500574  2.7084112 2.3069787 1.387991  2.8214273 3.0571883
   3.2637637 2.4486809 2.6604676 2.7093887 2.7420454]
  [2.523203  3.500313  2.70607   2.3068988 1.3678638 2.8684459 3.0218391
   3.2517724 2.4256034 2.601138  2.6989818 2.7314444]
  [2.522348  3.4881163 2.727444  2.3161442 1.3553139 2.888186  3.0052195
   3.241209  2.4373336 2.6289148 2.7032614 2.7362404]]], shape=(1, 3, 12), dtype=float32) 

Final output from using separate query, key, value matrices per head:
 [[[2.5237308 3.5005746 2.7084117 2.3069787 1.387991  2.8214273 3.0571883
   3.2637637 2.4486806 2.6604676 2.7093885 2.7420452]
  [2.523203  3.5003133 2.7060704 2.306899  1.367864  2.868446  3.0218394
   3.2517724 2.4256036 2.601138  2.6989818 2.7314444]
  [2.522348  3.4881163 2.727444  2.3161442 1.3553139 2.888186  3.0052195
   3.241209  2.4373336 2.6289148 2.7032614 2.7362404]]]


##We can encapsulate everything we just covered in a single class.

In [24]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadSelfAttention, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads

    self.d_head = self.d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(self.d_model)
    self.wk = tf.keras.layers.Dense(self.d_model)
    self.wv = tf.keras.layers.Dense(self.d_model)

    # Linear layer to generate the final output.
    self.dense = tf.keras.layers.Dense(self.d_model)
  
  def split_heads(self, x):
    batch_size = x.shape[0]

    split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_head))
    return tf.transpose(split_inputs, perm=[0, 2, 1, 3])
  
  def merge_heads(self, x):
    batch_size = x.shape[0]

    merged_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
    return tf.reshape(merged_inputs, (batch_size, -1, self.d_model))

  def call(self, q, k, v, mask):
    qs = self.wq(q)
    ks = self.wk(k)
    vs = self.wv(v)

    qs = self.split_heads(qs)
    ks = self.split_heads(ks)
    vs = self.split_heads(vs)

    output, attn_weights = scaled_dot_product_attention(qs, ks, vs, mask)
    output = self.merge_heads(output)

    return self.dense(output), attn_weights


In [25]:
mhsa = MultiHeadSelfAttention(12, 3)

output, attn_weights = mhsa(x, x, x, None)
print(f"MHSA output{output.shape}:")
print(output)

MHSA output(1, 3, 12):
tf.Tensor(
[[[ 0.2294916   0.12367499  0.5002226  -0.05087875  0.44209313
    0.04899372  0.13054031  0.12503248  0.14715973  0.45378336
   -0.3372569  -0.30783698]
  [ 0.24200596  0.12704034  0.5397398  -0.06036338  0.5002323
    0.04581532  0.17330357  0.15491767  0.19630358  0.4839844
   -0.3624456  -0.31827798]
  [ 0.24138258  0.12508693  0.53865886 -0.06811577  0.45645258
    0.02451682  0.1569365   0.14821433  0.182532    0.4636302
   -0.3310364  -0.31074744]]], shape=(1, 3, 12), dtype=float32)


##ENCODER

In [26]:
def feed_forward_network(d_model, hidden_dim):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(hidden_dim, activation='relu'),
      tf.keras.layers.Dense(d_model)
  ])

In [27]:
class EncoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(EncoderBlock, self).__init__()

    self.mhsa = MultiHeadSelfAttention(d_model, num_heads)
    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
  
  def call(self, x, training, mask):
    mhsa_output, attn_weights = self.mhsa(x, x, x, mask)
    mhsa_output = self.dropout1(mhsa_output, training=training)
    mhsa_output = self.layernorm1(x + mhsa_output)

    ffn_output = self.ffn(mhsa_output)
    ffn_output = self.dropout2(ffn_output, training=training)
    output = self.layernorm2(mhsa_output + ffn_output)

    return output, attn_weights


In [28]:
encoder_block = EncoderBlock(12, 3, 48)

block_output,  _ = encoder_block(x, True, None)
print(f"Output from single encoder block {block_output.shape}:")
print(block_output)

Output from single encoder block (1, 3, 12):
tf.Tensor(
[[[-2.1197195e-01 -1.3177364e-01  1.5815003e-01  1.0531749e+00
   -1.3040812e+00  4.0824369e-02 -3.9061621e-01 -2.8542405e-01
    1.0904245e+00  2.2544982e+00 -1.1763124e+00 -1.0968924e+00]
  [-6.3340265e-01 -6.7885280e-02  1.6601526e+00  1.2038925e+00
   -1.7723087e+00 -2.6691768e-01 -7.9801881e-01  7.8692302e-02
    9.3915153e-01  9.6578437e-01 -8.1238325e-04 -1.3083278e+00]
  [-1.1207460e+00 -5.5309528e-01  2.0188301e+00  1.3985721e+00
   -1.2443501e+00 -8.0472708e-01 -3.1493428e-01 -2.1804489e-02
    6.3748491e-01  7.4244398e-01  2.9259631e-01 -1.0302701e+00]]], shape=(1, 3, 12), dtype=float32)


## Word and Positional Embeddings

In [29]:
# Load the English tokenizer.
bpemb_en = BPEmb(lang="en")
bpemb_vocab_size, bpemb_embed_size = bpemb_en.vectors.shape
print("Vocabulary size:", bpemb_vocab_size)
print("Embedding size:", bpemb_embed_size)

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 1060653.81B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:00<00:00, 4791288.21B/s]


Vocabulary size: 10000
Embedding size: 100


In [30]:
# Embedding for the word "car".
bpemb_en.vectors[bpemb_en.words.index('car')]

array([-0.305548, -0.325598, -0.134716, -0.078735, -0.660545,  0.076211,
       -0.735487,  0.124533, -0.294402,  0.459688,  0.030137,  0.174041,
       -0.224223,  0.486189, -0.504649, -0.459699,  0.315747,  0.477885,
        0.091398,  0.427867,  0.016524, -0.076833, -0.899727,  0.493158,
       -0.022309, -0.422785, -0.154148,  0.204981,  0.379834,  0.070588,
        0.196073, -0.368222,  0.473406,  0.007409,  0.004303, -0.007823,
       -0.19103 , -0.202509,  0.109878, -0.224521, -0.35741 , -0.611633,
        0.329958, -0.212956, -0.497499, -0.393839, -0.130101, -0.216903,
       -0.105595, -0.076007, -0.483942, -0.139704, -0.161647,  0.136985,
        0.415363, -0.360143,  0.038601, -0.078804, -0.030421,  0.324129,
        0.223378, -0.523636, -0.048317, -0.032248, -0.117367,  0.470519,
        0.225816, -0.222065, -0.225007, -0.165904, -0.334389, -0.20157 ,
        0.572352, -0.268794,  0.301929, -0.005563,  0.387491,  0.261031,
       -0.11613 ,  0.074982, -0.008433,  0.259987, 

We don't need the embeddings since we're going to use our own embedding layer. What we're interested in are the subword tokens and their respective ids. The ids will be used as indexes into our embedding layer.<br>

These are the subword tokens for our example sentence from the slides. **BPEmb** places underscores in front of any tokens which are whole words or intended to begin words.<br>

Remember that subword tokenizers are trained using count frequencies over a corpus. So these subword tokens are specific to **BPEmb**. Another subword tokenizer may output something different. This is why it's important that when we use a pretrained model, we make sure to use the pretrained model's tokenizer. We'll see this when we use pretrained transformers later in this module.

In [31]:
sample_sentence = "Where can I find a pizzeria?"
tokens = bpemb_en.encode(sample_sentence)
print(tokens)

['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?']


In [32]:
token_seq = np.array(bpemb_en.encode_ids("Where can I find a pizzeria?"))
print(token_seq)

[ 571  280  386 1934    4   24  248 4339  177 9967]


Now that we have a way to tokenize and vectorize sentences, we can declare and use an embedding layer with the same vocabulary size as **BPEmb** and a desired embedding size.

In [33]:
token_embed = tf.keras.layers.Embedding(bpemb_vocab_size, embed_dim)
token_embeddings = token_embed(token_seq)

# The untrained embeddings for our sample sentence.
print("Embeddings for: ", sample_sentence)
print(token_embeddings)

Embeddings for:  Where can I find a pizzeria?
tf.Tensor(
[[-0.01302861  0.03313113 -0.04513763 -0.00997768 -0.02491695 -0.01981989
  -0.03860885 -0.0486939  -0.03882251 -0.02593344  0.04286231  0.00894827]
 [-0.01603675  0.01845826  0.0008552   0.02768201 -0.02777867 -0.00649695
   0.00868416 -0.03917243 -0.01836915  0.02388283  0.0146642   0.01673187]
 [-0.04333204 -0.01979154  0.0175184  -0.044195   -0.02448606 -0.01857346
  -0.00193138  0.01074655  0.02854523 -0.03246266  0.01003687  0.03762888]
 [ 0.04620275  0.02691752 -0.02910745 -0.02630258 -0.01229552 -0.00610255
  -0.01044874  0.01956037  0.04262752 -0.04155936 -0.01917005 -0.01994878]
 [-0.03539585  0.00268234 -0.01060759 -0.01913463 -0.02754086  0.01332739
  -0.04862207 -0.01495548  0.02292304  0.02027467 -0.00249114 -0.01782548]
 [ 0.02176336  0.04785791 -0.03956974  0.00849892  0.00147782 -0.01380372
  -0.02697941 -0.0199378   0.0121421  -0.00711564 -0.02658532 -0.02139275]
 [-0.04082402  0.01951199 -0.02894517  0.0055403 

In [34]:
max_seq_len = 256
pos_embed = tf.keras.layers.Embedding(max_seq_len, embed_dim)

# Generate ids for each position of the token sequence.
pos_idx = tf.range(len(token_seq))
print(pos_idx)

tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)


In [35]:
# These are our positon embeddings.
position_embeddings = pos_embed(pos_idx)
print("Position embeddings for the input sequence\n", position_embeddings)

Position embeddings for the input sequence
 tf.Tensor(
[[-0.00267386  0.04497888 -0.01898999  0.02883989 -0.00654338 -0.03075428
   0.01430224 -0.03397721  0.01428051 -0.03040561  0.00714212  0.02189949]
 [ 0.00583997  0.00416914  0.01025517 -0.01858835  0.01152635  0.0453545
   0.00875487 -0.04249355  0.02623383 -0.00830251  0.04689712 -0.04844732]
 [-0.0395951  -0.00850092 -0.04495001  0.04774108 -0.01864243  0.03377538
  -0.01878288  0.04108479 -0.03097264  0.04437483 -0.02057176  0.04371068]
 [ 0.00818767  0.0161379   0.00114429 -0.03537606  0.04595914  0.02060746
  -0.03434259  0.03872677 -0.01169735  0.0408063  -0.0116725   0.01446274]
 [ 0.02735185  0.01735767 -0.034096   -0.01159757 -0.02599738  0.03450202
  -0.03427508  0.02789124  0.04310719 -0.0003692  -0.0216522   0.02385452]
 [-0.04487137  0.0297712   0.00572839 -0.04311861  0.03395119 -0.03924471
  -0.02079718  0.04100347  0.02254671 -0.02317665  0.01228994  0.04094117]
 [ 0.03096701 -0.0092387  -0.00644468  0.04171463  0

In [36]:
input = token_embeddings + position_embeddings
print("Input to the initial encoder block:\n", input)

Input to the initial encoder block:
 tf.Tensor(
[[-0.01570247  0.07811001 -0.06412762  0.01886221 -0.03146032 -0.05057417
  -0.02430661 -0.08267111 -0.024542   -0.05633905  0.05000442  0.03084775]
 [-0.01019678  0.0226274   0.01111037  0.00909366 -0.01625233  0.03885754
   0.01743903 -0.08166598  0.00786468  0.01558032  0.06156132 -0.03171545]
 [-0.08292715 -0.02829247 -0.0274316   0.00354609 -0.04312849  0.01520192
  -0.02071426  0.05183134 -0.0024274   0.01191217 -0.01053488  0.08133955]
 [ 0.05439042  0.04305542 -0.02796316 -0.06167864  0.03366362  0.01450491
  -0.04479133  0.05828714  0.03093017 -0.00075306 -0.03084254 -0.00548605]
 [-0.008044    0.02004001 -0.04470359 -0.0307322  -0.05353824  0.04782941
  -0.08289715  0.01293576  0.06603023  0.01990546 -0.02414334  0.00602904]
 [-0.02310801  0.07762911 -0.03384135 -0.03461969  0.035429   -0.05304843
  -0.04777659  0.02106567  0.03468881 -0.03029229 -0.01429537  0.01954842]
 [-0.00985701  0.01027329 -0.03538986  0.04725493  0.04821

In [37]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, src_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(src_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    # The original Attention Is All You Need paper applied dropout to the
    # input before feeding it to the first encoder block.
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    # Create encoder blocks.
    self.blocks = [EncoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) 
    for _ in range(num_blocks)]
  
  def call(self, input, training, mask):
    token_embeds = self.token_embed(input)

    # Generate position indices for a batch of input sequences.
    num_pos = input.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, input.shape)
    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    # Run input through successive encoder blocks.
    for block in self.blocks:
      x, weights = block(x, training, mask)

    return x, weights

In [38]:
# Batch of 3 sequences, each of length 10 (10 is also the 
# maximum sequence length in this case).
seqs = np.random.randint(0, 10000, size=(3, 10))
print(seqs.shape)
print(seqs)

(3, 10)
[[7886 2242 8537 8926 8527 7208 1972 3828 9959 8853]
 [1099 9474 9773 6282 3514 4379 2932 9802 8522 7832]
 [4356 5064 7960 6436 5539 9761 8952 2611 4558 8429]]


In [39]:
pos_ids = np.resize(np.arange(seqs.shape[1]), seqs.shape[0] * seqs.shape[1])
print(pos_ids)

[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [40]:
pos_ids = np.reshape(pos_ids, (3, 10))
print(pos_ids.shape)
print(pos_ids)

(3, 10)
[[0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]]


In [41]:
pos_embed(pos_ids)

<tf.Tensor: shape=(3, 10, 12), dtype=float32, numpy=
array([[[-0.00267386,  0.04497888, -0.01898999,  0.02883989,
         -0.00654338, -0.03075428,  0.01430224, -0.03397721,
          0.01428051, -0.03040561,  0.00714212,  0.02189949],
        [ 0.00583997,  0.00416914,  0.01025517, -0.01858835,
          0.01152635,  0.0453545 ,  0.00875487, -0.04249355,
          0.02623383, -0.00830251,  0.04689712, -0.04844732],
        [-0.0395951 , -0.00850092, -0.04495001,  0.04774108,
         -0.01864243,  0.03377538, -0.01878288,  0.04108479,
         -0.03097264,  0.04437483, -0.02057176,  0.04371068],
        [ 0.00818767,  0.0161379 ,  0.00114429, -0.03537606,
          0.04595914,  0.02060746, -0.03434259,  0.03872677,
         -0.01169735,  0.0408063 , -0.0116725 ,  0.01446274],
        [ 0.02735185,  0.01735767, -0.034096  , -0.01159757,
         -0.02599738,  0.03450202, -0.03427508,  0.02789124,
          0.04310719, -0.0003692 , -0.0216522 ,  0.02385452],
        [-0.04487137,  0.02

Lets test our Encoder

In [42]:
input_batch = [
    "Where can I find a pizzeria?",
    "Mass hysteria over listeria.",
    "I ain't no circle back girl."
]

bpemb_en.encode(input_batch)

[['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?'],
 ['▁mass', '▁hy', 'ster', 'ia', '▁over', '▁l', 'ister', 'ia', '.'],
 ['▁i', '▁a', 'in', "'", 't', '▁no', '▁circle', '▁back', '▁girl', '.']]

In [43]:
input_seqs = bpemb_en.encode_ids(input_batch)
print("Vectorized inputs:")
input_seqs

Vectorized inputs:


[[571, 280, 386, 1934, 4, 24, 248, 4339, 177, 9967],
 [1535, 1354, 1238, 177, 380, 43, 871, 177, 9935],
 [386, 4, 6, 9937, 9915, 467, 5410, 810, 3692, 9935]]

In [44]:
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("Input to the encoder:")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Input to the encoder:
(3, 10)
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]]


In [45]:
enc_mask = tf.cast(tf.math.not_equal(padded_input_seqs, 0), tf.float32)
print("Input:")
print(padded_input_seqs, '\n')
print("Encoder mask:")
print(enc_mask)

Input:
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]] 

Encoder mask:
tf.Tensor(
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]], shape=(3, 10), dtype=float32)


In [46]:
enc_mask = enc_mask[:, tf.newaxis, tf.newaxis, :]
enc_mask

<tf.Tensor: shape=(3, 1, 1, 10), dtype=float32, numpy=
array([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]], dtype=float32)>

In [47]:
num_encoder_blocks = 6

# d_model is the embedding dimension used throughout.
d_model = 12

num_heads = 3

# Feed-forward network hidden dimension width.
ffn_hidden_dim = 48

src_vocab_size = bpemb_vocab_size
max_input_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(
    num_encoder_blocks,
    d_model,
    num_heads,
    ffn_hidden_dim,
    src_vocab_size,
    max_input_seq_len)

In [48]:
encoder_output, attn_weights = encoder(padded_input_seqs, training=True, 
                                       mask=enc_mask)
print(f"Encoder output {encoder_output.shape}:")
print(encoder_output)

Encoder output (3, 10, 12):
tf.Tensor(
[[[ 5.15816629e-01  7.61540473e-01 -5.77319562e-01 -1.17828798e+00
   -7.44615853e-01  1.65449226e+00 -9.72729862e-01  2.48307199e-03
   -3.40688735e-01  1.36467981e+00  1.00068784e+00 -1.48605800e+00]
  [ 3.57278019e-01  5.87776482e-01 -7.65292645e-01 -1.69315350e+00
   -9.02665973e-01  1.44819975e+00 -8.42213154e-01  1.35431662e-01
    3.02950680e-01  1.25711429e+00  1.22967255e+00 -1.11509860e+00]
  [-9.24525186e-02  3.44464451e-01 -8.33958745e-01 -1.31060040e+00
   -3.82650867e-02  9.61784005e-01 -7.66187847e-01  4.31145787e-01
   -1.68807566e-01  1.72790456e+00  1.40664506e+00 -1.66167176e+00]
  [ 5.31727314e-01  7.21970201e-01 -4.14329678e-01 -1.24973392e+00
   -1.39198756e+00  9.73598361e-01 -1.07928050e+00  3.52839112e-01
    1.39878631e-01  1.25907505e+00  1.43182182e+00 -1.27557898e+00]
  [ 3.78314406e-01  5.09653270e-01 -4.96241033e-01 -1.19794273e+00
   -9.56186414e-01  2.11426353e+00 -9.38754022e-01  4.33031589e-01
   -3.46625894e-01 

##DECODER
1. a **Multi-Head Cross-Attention** layer which uses the encoder's outputs as the keys and values.

2. an extra skip/residual connection along with an extra layer normalization step.

<div>
<img src="https://drive.google.com/uc?export=view&id=1WVT4SX49bnta4uscOTF4xrsxFI4PbPER" width="500"/>
</div>

In [49]:
class DecoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(DecoderBlock, self).__init__()

    self.mhsa1 = MultiHeadSelfAttention(d_model, num_heads)
    self.mhsa2 = MultiHeadSelfAttention(d_model, num_heads)

    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
    self.layernorm3 = tf.keras.layers.LayerNormalization()
    
  # Note the decoder block takes two masks. One for the first MHSA(Multi-Head Skip-Attention), another
  # for the second MHSA.
  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    mhsa_output1, attn_weights = self.mhsa1(target, target, target, decoder_mask)
    mhsa_output1 = self.dropout1(mhsa_output1, training=training)
    mhsa_output1 = self.layernorm1(mhsa_output1 + target)

    mhsa_output2, attn_weights = self.mhsa2(mhsa_output1, encoder_output, 
                                            encoder_output, 
                                            memory_mask)
    mhsa_output2 = self.dropout2(mhsa_output2, training=training)
    mhsa_output2 = self.layernorm2(mhsa_output2 + mhsa_output1)

    ffn_output = self.ffn(mhsa_output2)
    ffn_output = self.dropout3(ffn_output, training=training)
    output = self.layernorm3(ffn_output + mhsa_output2)

    return output, attn_weights


The decoder is almost the same as the encoder except it takes the encoder's output as part of its input, and it takes two masks: the decoder mask and memory mask.

In [50]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(target_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.blocks = [DecoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) for _ in range(num_blocks)]

  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    token_embeds = self.token_embed(target)

    # Generate position indices.
    num_pos = target.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, target.shape)

    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    for block in self.blocks:
      x, weights = block(encoder_output, x, training, decoder_mask, memory_mask)

    return x, weights

In [54]:
# Dummy values
target_input_seqs = [
    [1, 652, 723, 123, 62],
    [1, 25,  98, 129, 248, 215, 359, 249],
    [1, 2369, 1259, 125, 486],
]

As we did with the encoder input sequences, we need to pad out this batch so that all sequences within it are the same length.

In [55]:
padded_target_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(target_input_seqs, padding="post")
print("Padded target inputs to the decoder:")
print(padded_target_input_seqs.shape)
print(padded_target_input_seqs)

Padded target inputs to the decoder:
(3, 8)
[[   1  652  723  123   62    0    0    0]
 [   1   25   98  129  248  215  359  249]
 [   1 2369 1259  125  486    0    0    0]]


We can create the padding mask the same way we did for the encoder.

In [56]:
dec_padding_mask = tf.cast(tf.math.not_equal(padded_target_input_seqs, 0), tf.float32)
dec_padding_mask = dec_padding_mask[:, tf.newaxis, tf.newaxis, :]
print(dec_padding_mask)

tf.Tensor(
[[[[1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 1, 8), dtype=float32)


As we covered in the slides, the look-ahead mask is a diagonal where the lower half are 1s and the upper half are zeros. This is easy to create using the *band_part* method:<br>
https://www.tensorflow.org/api_docs/python/tf/linalg/band_part

In [57]:
target_input_seq_len = padded_target_input_seqs.shape[1]
look_ahead_mask = tf.linalg.band_part(tf.ones((target_input_seq_len, 
                                               target_input_seq_len)), -1, 0)
print(look_ahead_mask)

tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]], shape=(8, 8), dtype=float32)


To create the decoder mask, we just need to combine the padding and look-ahead masks. Note how the columns of the resulting decoder mask are all zero for padding positions.

In [58]:
dec_mask = tf.minimum(dec_padding_mask, look_ahead_mask)
print("The decoder mask:")
print(dec_mask)

The decoder mask:
tf.Tensor(
[[[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 1. 0. 0.]
   [1. 1. 1. 1. 1. 1. 1. 0.]
   [1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 8, 8), dtype=float32)


We can now declare a decoder and pass it everything it needs. In our case, the *memory* mask is the same as the *encoder* mask.

In [59]:
decoder = Decoder(6, 12, 3, 48, 10000, 8)
decoder_output, _ = decoder(encoder_output, padded_target_input_seqs, 
                            True, dec_mask, enc_mask)
print(f"Decoder output {decoder_output.shape}:")
print(decoder_output)

Decoder output (3, 8, 12):
tf.Tensor(
[[[-1.3903046e-01 -1.1472348e+00 -9.4525301e-01 -9.9262261e-01
    9.6591866e-01  7.7667969e-01  7.3278648e-01 -1.5465268e+00
    1.8527468e+00  8.8189763e-01 -2.6752219e-01 -1.7183934e-01]
  [-1.3411474e-01 -9.7858495e-01 -8.6579299e-01 -1.0426285e+00
    9.2134255e-01  1.2728901e+00  3.7006342e-01 -1.4032898e+00
    2.0083435e+00  4.4674009e-01 -6.0845405e-01  1.3485389e-02]
  [ 6.5341175e-01 -8.9217401e-01 -6.6149551e-01 -9.4876671e-01
    6.9195396e-01  1.4535443e+00 -8.3237670e-02 -1.5115743e+00
    1.8314381e+00  6.5290809e-01 -4.1486269e-01 -7.7114522e-01]
  [-1.1089486e-01 -1.0670661e+00 -9.7008365e-01 -6.3454688e-01
    7.4080920e-01  1.3388698e+00  6.3307554e-01 -1.3387581e+00
    1.8285021e+00  8.3866543e-01 -3.8097706e-01 -8.7759578e-01]
  [ 4.8765421e-01 -1.3967913e+00 -1.1272376e+00 -6.4497453e-01
    6.0428280e-01  1.7554371e+00 -3.9304355e-01 -4.1379380e-01
    1.6058282e+00  8.9691055e-01 -5.9938884e-01 -7.7488327e-01]
  [ 5.280253

#TRANSFORMER

In [60]:
class Transformer(tf.keras.Model):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
               target_vocab_size, max_input_len, max_target_len, dropout_rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_blocks, d_model, num_heads, hidden_dim, source_vocab_size, 
                           max_input_len, dropout_rate)
    
    self.decoder = Decoder(num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                           max_target_len, dropout_rate)
    
    # The final dense layer to generate logits from the decoder output.
    self.output_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, input_seqs, target_input_seqs, training, encoder_mask,
           decoder_mask, memory_mask):
    encoder_output, encoder_attn_weights = self.encoder(input_seqs, 
                                                        training, encoder_mask)

    decoder_output, decoder_attn_weights = self.decoder(encoder_output, 
                                                        target_input_seqs, training,
                                                        decoder_mask, memory_mask)

    return self.output_layer(decoder_output), encoder_attn_weights, decoder_attn_weights


In [61]:
transformer = Transformer(
    num_blocks = 6,
    d_model = 12,
    num_heads = 3,
    hidden_dim = 48,
    source_vocab_size = bpemb_vocab_size,
    target_vocab_size = 7000, # made-up target vocab size.
    max_input_len = padded_input_seqs.shape[1],
    max_target_len = padded_target_input_seqs.shape[1])

transformer_output, _, _ = transformer(padded_input_seqs, 
                                       padded_target_input_seqs, True, 
                                       enc_mask, dec_mask, memory_mask=enc_mask)
print(f"Transformer output {transformer_output.shape}:")
print(transformer_output) # If training, we would use this output to calculate losses.

Transformer output (3, 8, 7000):
tf.Tensor(
[[[-0.01586961 -0.01493736  0.10750537 ...  0.00607951  0.08848661
    0.015226  ]
  [-0.04400273 -0.03207124  0.13835101 ... -0.0017617   0.07536489
    0.02428343]
  [-0.03911787 -0.03066409  0.12442512 ...  0.00308029  0.0952151
    0.01540167]
  ...
  [-0.0881561   0.03407094  0.08948196 ... -0.01739982  0.05045627
    0.02708782]
  [-0.0687721   0.03003042  0.08036731 ...  0.00281868  0.08661909
    0.02208049]
  [-0.04451494 -0.00923184  0.10660914 ...  0.01369279  0.09513992
    0.01580442]]

 [[ 0.01396652 -0.04162994  0.10592167 ...  0.05138298 -0.00354845
   -0.06019162]
  [-0.02629372 -0.02878474  0.12608218 ...  0.00743495  0.07350878
    0.02871001]
  [-0.05186695 -0.03338452  0.08886429 ...  0.00248026  0.08310422
   -0.00086674]
  ...
  [-0.04867133  0.00328152  0.11373516 ...  0.03528256  0.05657118
   -0.01047443]
  [-0.06869268 -0.0065574   0.13306202 ...  0.01718214  0.06655375
   -0.00289538]
  [-0.00519407 -0.02498811  0.