<a href="https://colab.research.google.com/github/vifirsanova/100-days-of-code/blob/main/day11/dot_product_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import sys
import numpy as np
import scipy.special
import textwrap

wrapper = textwrap.TextWrapper(width=70)
np.set_printoptions(threshold=sys.maxsize)


def to_tensor(tensor):
    """Convert an array-like (e.g. nested Python lists) into a NumPy array.

    Args:
        tensor: any object accepted by ``np.array``.

    Returns:
        numpy.ndarray: array built from ``tensor``.
    """
    as_array = np.array(tensor)
    return as_array


def shape_tensor(tensor, name):
    """Print a labelled shape line for ``tensor`` followed by its contents.

    Each of the two pieces is followed by a blank line.

    Args:
        tensor: object exposing a ``.shape`` attribute (e.g. numpy.ndarray).
        name (str): label printed in front of the shape.
    """
    # One print call; the separator supplies the newline that the original
    # second print() call contributed between the two pieces.
    print(f'{name} shape: {tensor.shape}\n', f'{tensor}\n', sep='\n')

# Shape constraints the toy inputs below are expected to satisfy.
print(
    'Q, K, V arrays must have the same embedding dimensions (number of columns)',
    'M array must have the same shape as np.dot(Q, K.T)',
    '',
    sep='\n',
)

# Toy inputs: two positions, embedding dimension four.
q = to_tensor([[1, 0, 0, 1], [0, 1, 0, 0]])
k = to_tensor([[1, 2, 3, 1], [4, 5, 6, 5]])
v = to_tensor([[0, 1, 0, 0], [1, 0, 1, 1]])
# Additive-form mask (entries are 0 or -1e9), same shape as np.dot(q, k.T).
m = to_tensor([[0, 0], [-1e9, 0]])

# Show every tensor, then the raw (unscaled) query-key dot products.
shape_tensor(q, 'query')
shape_tensor(k, 'key')
shape_tensor(v, 'value')
shape_tensor(m, 'mask')
shape_tensor(np.dot(q, k.T), 'dot')

Q, K, V arrays must have the same embedding dimensions (number of columns)
M array must have the same shape as np.dot(Q, K.T)

query shape: (2, 4)

[[1 0 0 1]
 [0 1 0 0]]

key shape: (2, 4)

[[1 2 3 1]
 [4 5 6 5]]

value shape: (2, 4)

[[0 1 0 0]
 [1 0 1 1]]

mask shape: (2, 2)

[[ 0.e+00  0.e+00]
 [-1.e+09  0.e+00]]

dot shape: (2, 2)

[[2 9]
 [2 5]]



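$\textrm{softmax} \left(\frac{Q K^T}{\sqrt{d}} + M \right) V$

Here $d$ is the embedding dimension shared by $Q$, $K$ and $V$, and $M$ is an additive mask: $0$ where attention is allowed and a large negative value (e.g. $-10^9$) where it is blocked, so blocked positions receive near-zero weight after the softmax. The implementation below takes the mask as a boolean array (`True` = keep) and substitutes $-10^9$ for the blocked scores, which has the same effect.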

In [31]:
def dot_product_self_attention(q, k, v, m, scale=True):
    """Compute (optionally masked) scaled dot-product attention.

    Evaluates ``softmax(q @ k^T / sqrt(d)) @ v``. Before the softmax, every
    score whose mask entry is falsy is replaced by -1e9 so that it receives
    (numerically) zero attention weight.

    Args:
        q (numpy.ndarray): query representations with shape (..., L_q, d)
        k (numpy.ndarray): key representations with shape (..., L_k, d)
        v (numpy.ndarray): value representations with shape (..., L_k, d)
        m (numpy.ndarray or None): keep-mask broadcastable to
            (..., L_q, L_k); truthy entries are kept, falsy entries are
            suppressed. This is NOT an additive 0 / -1e9 mask: any non-zero
            entry counts as "keep". Pass None to attend to every position.
        scale (bool): divide the q.k^T scores by sqrt(d) when True

    Returns:
        numpy.ndarray: attention output with shape (..., L_q, d), broadcast
        against any leading axes of the mask.

    Raises:
        AssertionError: if q, k and v do not share the same last dimension.
    """

    assert q.shape[-1] == k.shape[-1] == v.shape[-1], "Embedding dim's of q, k, v have different shapes"

    # Depth of the query embedding, used to scale down the dot product
    depth = q.shape[-1] if scale else 1

    # Scaled query-key dot product (transpose only the last two axes of k)
    dots = np.matmul(q, np.swapaxes(k, -1, -2)) / np.sqrt(depth)

    # Masking: keep scores where the mask is truthy, push the rest to -1e9
    if m is not None:
        dots = np.where(m, dots, np.full_like(dots, -1e9))

    # Softmax over the key axis, computed via logsumexp for numerical stability
    dots = np.exp(dots - scipy.special.logsumexp(dots, axis=-1, keepdims=True))

    # Attention weights * value
    self_attention = np.matmul(dots, v)

    return self_attention


def masked_dot_product_self_attention(q, k, v, scale=True):
    """Causal (lower-triangular masked) dot-product self-attention.

    Builds a boolean lower-triangular mask sized from the query length, so
    position i may only attend to positions j <= i, and delegates to
    ``dot_product_self_attention``.

    Args:
        q (numpy.ndarray): query representations with shape (..., L, d)
        k (numpy.ndarray): key representations; the square mask assumes the
            same sequence length L as q
        v (numpy.ndarray): value representations, same sequence length as k
        scale (bool): forwarded to ``dot_product_self_attention``

    Returns:
        numpy.ndarray: masked dot product self attention tensor; it carries
        a leading axis of size 1 contributed by the mask's shape.
    """

    # Sequence length = penultimate q dim
    m_size = q.shape[-2]

    # Boolean lower-triangular matrix with shape (1, m_size, m_size):
    # True on and below the main diagonal, False above it
    m = np.tril(np.ones((1, m_size, m_size), dtype=np.bool_), k=0)

    return dot_product_self_attention(q, k, v, m, scale=scale)

# Run the masked self-attention on the toy q, k, v tensors from the first cell;
# the bare expression makes the resulting array the cell's displayed output.
masked_dot_product_self_attention(q, k, v)

array([[[0.        , 1.        , 0.        , 0.        ],
        [0.81757448, 0.18242552, 0.81757448, 0.81757448]]])

<img src="https://sun9-24.userapi.com/impg/W8AgaZFgXYAlb-XrEs-WeorsQrxtObPz7sj_gw/0T6AULfoIU0.jpg?size=682x522&quality=96&sign=c1ee9debbc950b20937439b674c09e5b&type=album" width="500"/>
