### Attention

In [2]:
import numpy as np

In [4]:
# Toy dimensions: number of tokens in the sequence, and length of each token vector
# (d_k is also the value whose square root scales the attention scores below).
seq_length, d_k = 3, 4

In [6]:
# One random vector per token, drawn uniformly from [0, 1).
# NOTE(review): no seed is set in the visible cells, so a re-run will not reproduce the printed values.
inputs = np.random.uniform(low=0.0, high=1.0, size=(seq_length, d_k))
inputs

array([[0.12659904, 0.48719869, 0.33291713, 0.04049794],
       [0.53279274, 0.55435979, 0.8208603 , 0.57607201],
       [0.90858765, 0.90543261, 0.60495958, 0.81817818]])

In [7]:
# Give each of the three token vectors (rows of `inputs`) its own name.
a_input, b_input, c_input = inputs[0], inputs[1], inputs[2]

In [8]:
def softmax(X):
    """Numerically stable softmax over the last axis of ``X``.

    Parameters
    ----------
    X : array_like
        A 1-D vector of scores, or an N-D array whose last axis holds the
        scores to normalise (for a 2-D attention score matrix: one row per
        query, one column per key).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X``; every slice along the last axis
        is non-negative and sums to 1.

    Notes
    -----
    The previous implementation normalised over axis 0 and then transposed.
    That matches a row-wise softmax only for 1-D input or a symmetric score
    matrix; for any other matrix it returned the softmax of ``X.T``.
    """
    X = np.asarray(X)
    if X.size == 0:
        # Nothing to normalise; avoid calling max() on an empty array.
        return X
    # Subtracting the per-row maximum does not change the result mathematically
    # but keeps np.exp from overflowing on large scores.
    z = np.exp(X - X.max(axis=-1, keepdims=True))
    return z / z.sum(axis=-1, keepdims=True)

In [19]:
# Attention weights of token c over tokens a and b:
# dot-product scores scaled by sqrt(d_k), then normalised with softmax.
# (The vectors are 1-D, so no transpose is needed for the dot product.)
c_alpha = softmax(
    [(c_input @ key) / np.sqrt(d_k) for key in (a_input, b_input)]
)

In [23]:
# Display the two attention weights of c over a and b.
c_alpha

array([0.35855838, 0.64144162])

In [11]:
# Attention output for c: the a and b vectors summed with their attention weights.
c_attn = sum(weight * vec for weight, vec in zip(c_alpha, (a_input, b_input)))
c_attn

array([0.38714858, 0.53027862, 0.64590419, 0.38403744])

In [12]:
# Stack the a and b vectors (first two rows of `inputs`) into one matrix.
ab = inputs[0:2, :]

In [20]:
# Same computation for c in matrix form: the inner product scores c against
# both rows of `ab` at once, the outer product mixes those rows by the weights.
np.matmul(softmax(np.matmul(c_input, ab.T) / np.sqrt(d_k)), ab)

array([0.38714858, 0.53027862, 0.64590419, 0.38403744])

In [25]:
# With self-attention allowed, every token attends to all tokens, itself included:
# row i of the result is the attention output for token i.
c_attn_self = np.matmul(
    softmax(np.matmul(inputs, inputs.T) / np.sqrt(d_k)),
    inputs,
)
c_attn_self

array([[0.55010996, 0.66293166, 0.5975196 , 0.50620306],
       [0.60325788, 0.68878584, 0.62194722, 0.56131293],
       [0.63736133, 0.71029641, 0.6262569 , 0.59236005]])

In [27]:
# Show c's output when it attends only to a and b next to c's row (index 2) of the self-attention result.
c_attn, c_attn_self[2]

(array([0.38714858, 0.53027862, 0.64590419, 0.38403744]),
 array([0.63736133, 0.71029641, 0.6262569 , 0.59236005]))

In [28]:
# Show the a/b-only output with c's own vector added, next to c's self-attention row,
# to see whether the simple sum reproduces the self-attention result.
c_attn + c_input, c_attn_self[2]

(array([1.29573623, 1.43571122, 1.25086376, 1.20221562]),
 array([0.63736133, 0.71029641, 0.6262569 , 0.59236005]))

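Self-attention and non-self-attention do not give the same result: when `c` also attends to itself, its own vector becomes part of the softmax-weighted sum, and simply adding `c_input` to `c_attn` afterwards does not reproduce `c_attn_self[2]`.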

### A view from PyTorch

In [29]:
from transformers import AutoModel

# Load the pretrained 'bert-base-cased' checkpoint through the transformers AutoModel factory.
model = AutoModel.from_pretrained('bert-base-cased')

# Bare expression: displays the module tree, including the query/key/value Linear layers of each attention block.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  