In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
import math

In [2]:
n_seq = 7
x = tf.constant(np.random.normal(size=(1,n_seq,512)), dtype='float32')
Wq = tf.Variable(np.random.normal(size=(512,512)), dtype='float32')
Wk = tf.Variable(np.random.normal(size=(512,512)), dtype='float32')
Wv = tf.Variable(np.random.normal(size=(512,512)), dtype='float32')

print('x.shape={}'.format(x.shape))
print('Wq.shape={}'.format(Wq.shape))
print('Wk.shape={}'.format(Wk.shape))
print('Wv.shape={}'.format(Wv.shape))

x.shape=(1, 7, 512)
Wq.shape=(512, 512)
Wk.shape=(512, 512)
Wv.shape=(512, 512)


In [3]:
"i kicked the ball and it disappeared"
class SelfAttentionLayer(layers.Layer):
    
    def __init__(self, d):
        super(SelfAttentionLayer, self).__init__()
        self.d = d
    
    def build(self, input_shape):
        self.Wq = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )        
        self.Wk = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
        self.Wv = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
    
    def call(self, x):
        q = tf.matmul(x,self.Wq)
        k = tf.matmul(x,self.Wk)
        v = tf.matmul(x,self.Wv)

        p = tf.nn.softmax(tf.matmul(q, k, transpose_b=True)/math.sqrt(self.d))
        p = tf.squeeze(p)

        h = tf.matmul(p, v)
        return h,p

layer = SelfAttentionLayer(512)
h, p = layer(x)
print(h.shape)

(1, 7, 512)


In [4]:
multi_attn_head = [SelfAttentionLayer(64) for i in range(8)]
outputs = [head(x)[0] for head in multi_attn_head]

outputs = tf.concat(outputs, axis=-1)
print(outputs.shape)

(1, 7, 512)


In [5]:
class FCLayer(layers.Layer):
    
    def __init__(self, d1, d2):
        super(FCLayer, self).__init__()
        self.d1 = d1
        self.d2 = d2
    
    def build(self, input_shape):
        self.W1 = self.add_weight(
            shape=(input_shape[-1], self.d1), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
        self.b1 = self.add_weight(
            shape=(self.d1,), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )        
        self.W2 = self.add_weight(
            shape=(input_shape[-1], self.d2), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
        self.b2 = self.add_weight(
            shape=(self.d2,), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )  
    
    def call(self, x):
        ff1 = tf.nn.relu(tf.matmul(x,self.W1)+self.b1)
        ff2 = tf.matmul(x,self.W2)+self.b2
        return ff2
    
ff = FCLayer(2048, 512)(h)
print(ff.shape)

(1, 7, 512)


In [6]:
"i kicked the ball and it disappeared"
class SelfAttentionLayer(layers.Layer):
    
    def __init__(self, d):
        super(SelfAttentionLayer, self).__init__()
        self.d = d
    
    def build(self, input_shape):
        self.Wq = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )        
        self.Wk = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
        self.Wv = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
    
    def call(self, x, mask=None):
        q = tf.matmul(x,self.Wq)
        k = tf.matmul(x,self.Wk)
        v = tf.matmul(x,self.Wv)

        p = tf.matmul(q, k, transpose_b=True)/math.sqrt(self.d)
        p = tf.squeeze(p)
        if mask is None:
            p = tf.nn.softmax(p)
        else:
            p += mask * -1e9
            p = tf.nn.softmax(p)
                

        h = tf.matmul(p, v)
        return h,p

layer = SelfAttentionLayer(512)
mask = 1 - tf.linalg.band_part(tf.ones((7, 7)), -1, 0)
h, p = layer(x, mask)
print(h.shape)

(1, 7, 512)


In [7]:
mask = 1 - tf.linalg.band_part(tf.ones((7, 7)), -1, 0)
print(mask)

tf.Tensor(
[[0. 1. 1. 1. 1. 1. 1.]
 [0. 0. 1. 1. 1. 1. 1.]
 [0. 0. 0. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0.]], shape=(7, 7), dtype=float32)


In [8]:
np.set_printoptions(precision=3)
print(p.numpy())

[[1.    0.    0.    0.    0.    0.    0.   ]
 [0.888 0.112 0.    0.    0.    0.    0.   ]
 [0.051 0.028 0.921 0.    0.    0.    0.   ]
 [0.261 0.556 0.142 0.042 0.    0.    0.   ]
 [0.592 0.191 0.068 0.068 0.081 0.    0.   ]
 [0.141 0.275 0.106 0.17  0.183 0.126 0.   ]
 [0.482 0.11  0.117 0.073 0.063 0.134 0.022]]


In [9]:
a = tf.constant([1.,2.])
b = tf.constant([2.,1.])
print(tf.stack([a,b])[0,:])
print(tf.reduce_mean(tf.stack([a,b]), axis=0))

tf.Tensor([1. 2.], shape=(2,), dtype=float32)
tf.Tensor([1.5 1.5], shape=(2,), dtype=float32)


In [11]:
multi_attn_head = [SelfAttentionLayer(512) for i in range(8)]
outputs = [head(x)[0] for head in multi_attn_head]
outputs = tf.math.add_n(outputs)
print(outputs.shape)

(1, 7, 512)
