## 3.3

### 3.3.1

In [42]:
import torch
from torch import nn

In [43]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your (x^1)
    [0.55, 0.87, 0.66], # journey (x^2)
    [0.57, 0.85, 0.64], # starts (x^3)
    [0.22, 0.58, 0.33], # with (x^4)
    [0.77, 0.25, 0.10], # one (x^5)
    [0.05, 0.80, 0.55]] # step (x^6)
)

In [44]:
# calculating attention scores $omega$

query = inputs[1]
attn_scores2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores2[i] = torch.dot(x_i, query)
print(attn_scores2)

#normalizing attentions scores
# In practice, it’s more common and advisable to use the softmax function for normalization.
attn_weights2_tmp = attn_scores2 / attn_scores2.sum()
print(attn_weights2_tmp, f"\n{attn_weights2_tmp.sum()}")

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) 
1.0000001192092896


In [45]:
def softmax_naive(x: torch.Tensor):
    return torch.exp(x)/torch.exp(x).sum(dim=0)

attn_weights2_naive = softmax_naive(attn_scores2)
print(attn_weights2_naive, f"\n{attn_weights2_naive.sum()}")

# since torch's implementation is more stable and is optimized for performance
attn_weights2 = torch.softmax(attn_scores2, dim=0)
print(attn_weights2, f"\n{attn_weights2.sum()}")

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581]) 
1.0
tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581]) 
1.0


In [46]:
# calculating context vector z^(2)
query = inputs[1]
context_vect2 = torch.zeros(query.shape[0])
for i, x_i in enumerate(inputs):
    context_vect2 += attn_weights2[i] * x_i
print(context_vect2)

tensor([0.4419, 0.6515, 0.5683])


### 3.3.2

In [47]:
alpha = torch.empty(inputs.shape[0], inputs.shape[0])
context_vects = torch.zeros(inputs.shape[0], inputs.shape[1])

for i, q_i in enumerate(inputs):
    for j, x_i in enumerate(inputs):
        alpha[i][j] = torch.dot(q_i, x_i)
alpha[i] = torch.softmax(alpha[i], dim=-1)
# By setting dim=-1, we are instructing the softmax function to apply the normalization along the last dimension of the attn_scores tensor

context_vects = alpha @ inputs # torch.matmul()
print(alpha)
print(context_vects)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
tensor([[1.9802, 2.6760, 2.6721],
        [2.8579, 4.2330, 3.7270],
        [2.8335, 4.1718, 3.6734],
        [1.5501, 2.4086, 2.0693],
        [1.5969, 1.8945, 1.6743],
        [0.4177, 0.6503, 0.5645]])


## 3.4

### 3.4.1

In [48]:
x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 2

torch.manual_seed(123)
W_q = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False) # req grad is false for illustration, in actual implementation, this must be true
W_k = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_v = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

q_2 = x_2 @ W_q 
k_2 = x_2 @ W_k
v_2 = x_2 @ W_v

print(q_2)

tensor([0.4306, 1.4551])


In [49]:
Q = inputs @ W_q # the matrix mul is done on the last dim, the 6 elements here will be broadcast. since for an ele, size is 1x3  and the W_q size is 3x2
K = inputs @ W_k
V = inputs @ W_q

print(inputs.shape, W_q.shape)
print(Q)

torch.Size([6, 3]) torch.Size([3, 2])
tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])


In [50]:
# computing omega_22
k_2 = K[1]
attn_score_22 = q_2.dot(k_2)
print(attn_score_22)

tensor(1.8524)


In [51]:
# computing omega_2i ie[0, T]
attn_scores_2 = q_2 @ K.T
print(q_2)
print(K.T)
print(attn_scores_2)

tensor([0.4306, 1.4551])
tensor([[0.3669, 0.4433, 0.4361, 0.2408, 0.1827, 0.3275],
        [0.7646, 1.1419, 1.1156, 0.6706, 0.3292, 0.9642]])
tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [52]:
d_k = K.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2/d_k**0.5, dim=-1)
print(attn_weights_2)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])


In [53]:
# calculating the context vector z_2
context_vect_2 = attn_weights_2 @ V
print(context_vect_2)

tensor([0.3313, 1.1652])


### 3.4.2

$$\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

In [54]:
class SelfAttentionV1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_q = nn.Parameter(torch.rand(d_in, d_out))
        self.W_k = nn.Parameter(torch.rand(d_in, d_out))
        self.W_v = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_k
        queries = x @ self.W_q
        values = x @ self.W_v
        attn_scores = queries @ keys.T #omega
        attn_weights = torch.softmax(attn_scores/keys.shape[-1]**0.5, dim=-1)
        context_vect = attn_weights @ values
        return context_vect

In [55]:
torch.manual_seed(123)
saV1 = SelfAttentionV1(d_in, d_out)
print(saV1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


We can improve the SelfAttention_v1 implementation further by utilizing PyTorch’s nn.Linear layers, which effectively perform matrix multiplication when the bias units are disabled. Additionally, a significant advantage of using nn.Linear instead of manually implementing nn.Parameter(torch.rand(...)) is that nn.Linear has an optimized weight initialization scheme, contributing to more stable and
effective model training.

In [56]:
class SelfAttentionV2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
    
    def forward(self, x):
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)
        omega = Q @ K.T
        alpha = torch.softmax(omega/K.shape[-1]**0.5, dim=-1)
        Z = alpha @ V
        return Z
    

In [57]:
torch.manual_seed(789)
saV2 = SelfAttentionV2(d_in, d_out)
print(saV2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


## 3.5

### 3.5.1

In [58]:
queries = saV2.W_q(inputs)
keys = saV2.W_k(inputs)
attn_scores = (queries @ keys.T)
attn_weights = torch.softmax(attn_scores / keys.shape[-1]**.5, dim=-1)
print(attn_weights)

tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.1661, 0.1564],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.1585],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


In [59]:
context_len = attn_weights.shape[0]
mask_simple = torch.tril(torch.ones(context_len, context_len)) # .triangular lower = tril. returns lower triangular matrix of a given matrix
print(mask_simple)

masked_simple = attn_weights*mask_simple
print(masked_simple)

#renormalization: We divide each element in each row by the sum in each row.
row_sums = masked_simple.sum(dim=-1, keepdim=True)
masked_simple_norm = masked_simple/row_sums
print(masked_simple_norm)


tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])
tensor([[0.1921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2041, 0.1659, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2036, 0.1659, 0.1662, 0.0000, 0.0000, 0.0000],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.0000, 0.0000],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<MulBackward0>)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<DivBackward0>)


In [60]:
# -inf masking method

mask = torch.triu(torch.ones(context_len, context_len), diagonal=1) # diagonal=1 shifts the diagonal by 1 upwards
print(mask)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf) # Fills elements of self tensor with value where mask is True.
print(masked)

attn_weights = torch.softmax(masked/ keys.shape[-1]**0.5, dim=-1)
print(attn_weights)


tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])
tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],
        [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],
        [0.2642, 0.1024, 0.1036, 0.0186,   -inf,   -inf],
        [0.2183, 0.0874, 0.0882, 0.0177, 0.0786,   -inf],
        [0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
       grad_fn=<MaskedFillBackward0>)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


### 3.5.2

In [61]:
torch.manual_seed(123)
dropout = nn.Dropout(0.5)
print(dropout(attn_weights))

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.8966, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6206, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4921, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4350, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3327, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


### 3.5.3

In [62]:
# simulating a batch by stacking the same inputs

batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape) # three-dimensional tensor consisting of two input texts with six tokens each, where each token is a three-dimensional embedding vector

torch.Size([2, 6, 3])


In [63]:
class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out, context_len, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(p=dropout)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_len, context_len), diagonal=1)
        )
    
    def forward(self, x: torch.Tensor):
        b, num_tokens, d_in = x.shape
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        attn_scores = Q @ K.transpose(1,2) # transpose is done in the inner dims keeping the batches same
        attn_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf) # [:num_tokens, :num_tokens] us done since we might not use the full context_len defined during initialization. Therefore, we need to adjust the mask to match the current sequence length dynamically.
        attn_weights = torch.softmax(attn_scores / K.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vect = attn_weights @ V
        return context_vect
    

In [64]:
torch.manual_seed(123)
context_len = batch.shape[1]
ca = CausalAttention(d_in, d_out, context_len, 0.0)
context_vects = ca(batch)
print(context_vects)
print(context_vects.shape)

tensor([[[-0.4519,  0.2216],
         [-0.5874,  0.0058],
         [-0.6300, -0.0632],
         [-0.5675, -0.0843],
         [-0.5526, -0.0981],
         [-0.5299, -0.1081]],

        [[-0.4519,  0.2216],
         [-0.5874,  0.0058],
         [-0.6300, -0.0632],
         [-0.5675, -0.0843],
         [-0.5526, -0.0981],
         [-0.5299, -0.1081]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([2, 6, 2])


## 3.6

### 3.6.1

In [65]:
class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self, d_in, d_out, context_len, dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.heads = nn.ModuleList([CausalAttention(d_in, d_out, context_len, dropout, qkv_bias) for _ in range(num_heads)])
    
    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)

In [66]:
torch.manual_seed(123)
context_len = batch.shape[1]
d_in, d_out = 3, 2

mha = MultiHeadAttentionWrapper(d_in, d_out, context_len, 0.0, 2)
context_vects = mha(batch)
print(context_vects)
print(context_vects.shape)

# The first dimension of the resulting context_vecs tensor is 2 since we have two input
# texts (the input texts are duplicated, which is why the context vectors are exactly the
# same for those). The second dimension refers to the 6 tokens in each input. The third
# dimension refers to the four-dimensional embedding of each token.


tensor([[[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]],

        [[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]]], grad_fn=<CatBackward0>)
torch.Size([2, 6, 4])


### 3.6.2

In [67]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_len, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # reduces the projection dim to match the desired output dim
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out) # Uses a Linear layer to combine all the outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_len, context_len), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        Q = Q.view(b, num_tokens, self.num_heads, self.head_dim) # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        K = K.view(b, num_tokens, self.num_heads, self.head_dim)
        V = V.view(b, num_tokens, self.num_heads, self.head_dim)

        Q = Q.transpose(1,2) # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        K = K.transpose(1,2)
        V = V.transpose(1,2)

        attn_scores = Q @ K.transpose(2,3)
        attn_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vect = (attn_weights @ V).transpose(1,2) # (b, num_tokens, num_heads, head_dim)
        context_vect = context_vect.contiguous().view(b, num_tokens, self.d_out) # combines heads
        context_vect = self.out_proj(context_vect)
        return context_vect


In [68]:
torch.manual_seed(123)
b, context_len, d_in = batch.shape
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_len, 0.0, 2)
context_vects = mha(batch)
print(context_vects)
print(context_vects.shape)

tensor([[[0.3190, 0.4858],
         [0.2941, 0.3906],
         [0.2854, 0.3600],
         [0.2692, 0.3879],
         [0.2636, 0.3935],
         [0.2574, 0.4033]],

        [[0.3190, 0.4858],
         [0.2941, 0.3906],
         [0.2854, 0.3600],
         [0.2692, 0.3879],
         [0.2636, 0.3935],
         [0.2574, 0.4033]]], grad_fn=<ViewBackward0>)
torch.Size([2, 6, 2])


In [69]:
# for GPT-2 => d_in = d_out = 768, num_heads = 12, context_len = 1024
gpt_context_len = 1024
gpt_d_in = 768
gpt_d_out = gpt_d_in
gpt_num_heads = 12

gpt_batch = torch.randn(2, gpt_context_len, gpt_d_in)
print(gpt_batch.shape)
gpt_mha = MultiHeadAttention(gpt_d_in, gpt_d_out, gpt_context_len, 0.0, gpt_num_heads)
gpt_context_vects = gpt_mha(gpt_batch)
print(gpt_context_vects)
print(gpt_context_vects.shape)

torch.Size([2, 1024, 768])
tensor([[[-0.6856,  0.3426, -0.5692,  ...,  0.1581,  0.2084, -0.4349],
         [-0.4580,  0.6576, -0.2165,  ...,  0.1919, -0.1643, -0.0923],
         [-0.5824,  0.0255, -0.0739,  ...,  0.3619,  0.1883, -0.2682],
         ...,
         [-0.0078,  0.0308,  0.0352,  ..., -0.0772,  0.0167, -0.0341],
         [-0.0734,  0.1377,  0.0321,  ..., -0.1071, -0.0999,  0.0101],
         [-0.0128,  0.0345, -0.0458,  ...,  0.0121, -0.0684, -0.0504]],

        [[ 0.1067, -0.4346,  0.6253,  ..., -0.1796,  0.3786,  0.2089],
         [-0.1972, -0.3109,  0.4191,  ...,  0.1684,  0.4277,  0.3420],
         [ 0.0047,  0.0440,  0.3937,  ...,  0.0093,  0.3385,  0.2311],
         ...,
         [ 0.0530,  0.0515,  0.0188,  ...,  0.0227,  0.0717, -0.0827],
         [-0.0172,  0.0535, -0.0050,  ..., -0.0138, -0.0248, -0.0157],
         [ 0.0120,  0.0782,  0.0148,  ...,  0.0172,  0.0447, -0.0717]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 1024, 768])
