### Attention mechanism

 * Queries with shape `[B, T, C]`
 * Keys with shape `[B, T, C]`
 * Values with shape `[B, T, C]`

 * Computations:
   * Attention scores $QK^\top$ ==> shape: `[B, T, T]`
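   * Scale the attention scores by $1/\sqrt{d_k}$
   * Softmax normalization over the last dimension ==> shape: `[B, T, T]`
   * Multiply the attention weights with V ==> shape: `[B, T, C]`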

$$\text{Attention}(Q, K, V)=\text{Softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V$$

### Version 1: using ModuleList for multi-head attention

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values by three
    independent linear layers, and the output is
    ``softmax(Q K^T / sqrt(d_k)) V`` where ``d_k`` is the size of the last
    dimension of the keys (``d_out``).

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the projected queries, keys and
            values, and therefore of the output.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.q_proj = nn.Linear(d_in, d_out)
        self.k_proj = nn.Linear(d_in, d_out)
        self.v_proj = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Apply self-attention over the second-to-last dimension of ``x``.

        Args:
            x: Tensor of shape ``[..., T, d_in]``. The usual batched case is
                ``[B, T, d_in]``; an unbatched ``[T, d_in]`` input or extra
                leading dimensions are accepted as well.

        Returns:
            A tuple ``(out, attn_weights)`` where ``out`` has shape
            ``[..., T, d_out]`` and ``attn_weights`` has shape
            ``[..., T, T]`` with every row summing to 1.
        """
        # Apply linear transformations to get queries, keys, and values
        q = self.q_proj(x)  # [..., T, d_out]
        k = self.k_proj(x)  # [..., T, d_out]
        v = self.v_proj(x)  # [..., T, d_out]

        # Calculate attention scores. matmul with negative axis indices works
        # for any number of leading dimensions, whereas bmm only accepts 3-D.
        scores = torch.matmul(q, k.transpose(-2, -1))  # [..., T, T]
        scores = scores / (k.size(-1) ** 0.5)          # Scaling by sqrt(d_k)

        # Apply softmax over the key axis to get attention weights
        attn_weights = F.softmax(scores, dim=-1)       # [..., T, T]

        # Multiply attention weights with values
        out = torch.matmul(attn_weights, v)            # [..., T, d_out]

        return out, attn_weights

# testing: push one random batch through a single attention head
x = torch.randn(2, 5, 10)             # [B=2, T=5, C=10]
attention = SelfAttention_v1(10, 10)  # d_in = d_out = 10
print(attention)

# forward returns both the attended output and the attention weights
out, attn_weights = attention(x)
print(out.size())  # torch.Size([2, 5, 10])

SelfAttention_v1(
  (q_proj): Linear(in_features=10, out_features=10, bias=True)
  (k_proj): Linear(in_features=10, out_features=10, bias=True)
  (v_proj): Linear(in_features=10, out_features=10, bias=True)
)
torch.Size([2, 5, 10])


In [16]:
# A wrapper class for multihead attention
class MultiHeadAttention_v1(nn.Module):
    """Multi-head attention assembled from independent single-head modules.

    Each of the ``num_heads`` heads is a ``SelfAttention_v1`` that maps the
    full ``embed_dim`` input to ``embed_dim // num_heads`` features. The head
    outputs are concatenated along the last dimension and passed through a
    final ``embed_dim -> embed_dim`` linear layer.

    Args:
        embed_dim: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"

        head_dim = embed_dim // num_heads

        # Build the heads one by one, then register them as submodules.
        per_head_modules = []
        for _ in range(num_heads):
            per_head_modules.append(SelfAttention_v1(d_in=embed_dim, d_out=head_dim))
        self.heads = nn.ModuleList(per_head_modules)

        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Run every head on ``x``, concatenate the results and project them.

        Only the first element returned by each head (its output) is used;
        the per-head attention weights are discarded.
        """
        head_outputs = []
        for head in self.heads:
            head_outputs.append(head(x)[0])

        # Concatenating on the last axis restores the embed_dim width.
        context_vec = torch.cat(head_outputs, dim=-1)
        return self.out_proj(context_vec)
    
# testing: build an 8-head attention module and check the output shape
batch_size = 2
seq_len = 5
embed_dim = 32
num_heads = 8   # each head gets embed_dim // num_heads = 4 output features

mha = MultiHeadAttention_v1(embed_dim, num_heads)
print(mha)

# Random input of shape [batch_size, seq_len, embed_dim]
x = torch.randn(batch_size, seq_len, embed_dim)
print("\nInput:", x.size())
out = mha(x)
print("Output:", out.size())

MultiHeadAttention_v1(
  (heads): ModuleList(
    (0-7): 8 x SelfAttention_v1(
      (q_proj): Linear(in_features=32, out_features=4, bias=True)
      (k_proj): Linear(in_features=32, out_features=4, bias=True)
      (v_proj): Linear(in_features=32, out_features=4, bias=True)
    )
  )
  (out_proj): Linear(in_features=32, out_features=32, bias=True)
)

Input: torch.Size([2, 5, 32])
Output: torch.Size([2, 5, 32])


## Tokenization

In [16]:
%%capture

import sys

!{sys.executable} -m pip install tiktoken

In [17]:
import tiktoken

class Tokenizer:
    """Thin wrapper around a tiktoken encoding.

    Args:
        encoding_name: Name handed to ``tiktoken.get_encoding``.
            Defaults to ``'gpt2'``.
    """

    def __init__(self, encoding_name='gpt2'):
        # Look the encoding up once and keep it for later encode/decode calls.
        self.encoding = tiktoken.get_encoding(encoding_name)

    def encode(self, text):
        """Return whatever the encoding's ``encode`` produces for ``text``."""
        return self.encoding.encode(text)

    def decode(self, token_ids):
        """Return whatever the encoding's ``decode`` produces for ``token_ids``."""
        return self.encoding.decode(token_ids)

# testing: round-trip a sample string through the tokenizer
tokenizer = Tokenizer()  # default encoding_name='gpt2'

# Sample text
text = "Hello, world! This is a test of tokenization."

# Encoding (tokenization): text -> token IDs
token_ids = tokenizer.encode(text)
print(f"Token IDs: {token_ids}")

# Decoding (detokenization): token IDs -> text
decoded_text = tokenizer.decode(token_ids)
print(f"Decoded Text: {decoded_text}")


Token IDs: [15496, 11, 995, 0, 770, 318, 257, 1332, 286, 11241, 1634, 13]
Decoded Text: Hello, world! This is a test of tokenization.


## Embeddings

In [20]:
# Sizes: the vocabulary size is read from the tokenizer's encoding,
# the embedding width and maximum sequence length are chosen here.
vocab_size = tokenizer.encoding.n_vocab
embed_dim = 128
max_seq_len = 512

# One lookup table indexed by token ID, one indexed by position.
token_embedding = nn.Embedding(vocab_size, embed_dim)
position_embedding = nn.Embedding(max_seq_len, embed_dim)

# testing
text = "Hello, world! This is a test of tokenization."
token_ids = tokenizer.encode(text)

# Wrap the ID list in an outer list so the tensor carries a batch axis of size 1
token_tensor = torch.tensor([token_ids])
# Positions 0..T-1, also with a leading batch axis
position_ids = torch.arange(token_tensor.shape[1])[None, :]

# Look up both embeddings and add them element-wise
token_embeds = token_embedding(token_tensor)
position_embeds = position_embedding(position_ids)
combined_embeds = token_embeds + position_embeds

print(token_embeds.size())
print(position_embeds.size())
print(combined_embeds.size())

torch.Size([1, 12, 128])
torch.Size([1, 12, 128])
torch.Size([1, 12, 128])


## Feed-Forward Network (FFN)

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForwardNetwork(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        embed_dim: Size of the last dimension of the input and of the output.
        ff_dim: Size of the hidden layer between the two linear layers.
        dropout: Probability passed to ``nn.Dropout``. Defaults to 0.1.
    """

    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()

        self.fc1 = nn.Linear(embed_dim, ff_dim)   # embed_dim -> ff_dim
        self.fc2 = nn.Linear(ff_dim, embed_dim)   # ff_dim -> embed_dim
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape ``[..., embed_dim]`` to the same shape."""
        # Hidden activations have ff_dim features; dropout is applied to
        # them before projecting back down.
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)

# testing: the output should have the same shape as the input
# embed_dim is the module-level value set in an earlier cell
ffn = FeedForwardNetwork(embed_dim=embed_dim, ff_dim=256)
x = torch.randn(2, 5, embed_dim)
out = ffn(x)
print(out.size())

torch.Size([2, 5, 128])
