### Attention mechanism

 * Queries with shape `[B, T, C]`
 * Keys with shape `[B, T, C]`
 * Values with shape `[B, T, C]`

 * Computations:
   * Attention scores $QK^\top$ ==> shape: `[B, T, T]`
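   * Scale the attention scores by $1/\sqrt{d_k}$
   * Softmax normalization over the last (key) dimension
   * Multiply the attention weights with the value tensor $V$ ==> shape: `[B, T, C]`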

$$\text{Attention}(Q, K, V)=\text{Softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V$$

### Version 1: using ModuleList for multi-head attention

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected by three separate linear layers into queries,
    keys and values of width ``d_out``. The scaled query/key products are
    softmax-normalised and used to take a weighted sum of the values.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # One learned projection (with bias) per role.
        self.q_proj = nn.Linear(d_in, d_out)
        self.k_proj = nn.Linear(d_in, d_out)
        self.v_proj = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Attend over ``x`` of shape ``[B, T, d_in]``.

        Returns:
            A tuple ``(out, attn_weights)`` with shapes ``[B, T, d_out]``
            and ``[B, T, T]`` respectively.
        """
        queries = self.q_proj(x)  # [B, T, d_out]
        keys = self.k_proj(x)     # [B, T, d_out]
        values = self.v_proj(x)   # [B, T, d_out]

        # Similarity of every query with every key, scaled by sqrt(d_k).
        d_k = keys.size(-1)
        logits = torch.bmm(queries, keys.transpose(1, 2))  # [B, T, T]
        logits = logits / np.sqrt(d_k)

        # Normalise over the key axis so every row sums to one.
        attn_weights = F.softmax(logits, dim=-1)

        # Weighted sum of the value vectors.
        context = torch.bmm(attn_weights, values)  # [B, T, d_out]
        return context, attn_weights

# testing
# Smoke test for SelfAttention_v1: batch of 2 sequences, 5 tokens each,
# 10 features per token, projected to d_out = 10.
x = torch.randn(2, 5, 10)
attention = SelfAttention_v1(10, 10)
print(attention)

# The module returns both the attended output and the attention weights.
out, attn_weights = attention(x)
print("\nAttention Weights:", attn_weights.size())  # torch.Size([2, 5, 5])
print("Output:", out.size())  # torch.Size([2, 5, 10])

SelfAttention_v1(
  (q_proj): Linear(in_features=10, out_features=10, bias=True)
  (k_proj): Linear(in_features=10, out_features=10, bias=True)
  (v_proj): Linear(in_features=10, out_features=10, bias=True)
)

Attention Weights: torch.Size([2, 5, 5])
Output: torch.Size([2, 5, 10])


In [2]:
# A wrapper class for multihead attention
class MultiHeadAttention_v1(nn.Module):
    """Multi-head attention built from independent single-head modules.

    Each head is a ``SelfAttention_v1`` that maps the full ``embed_dim``
    input to ``embed_dim // num_heads`` features. The head outputs are
    concatenated along the feature axis and mixed by a final linear layer.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, (
            "Embedding dimension must be divisible by number of heads"
        )

        head_dim = embed_dim // num_heads
        single_heads = [
            SelfAttention_v1(d_in=embed_dim, d_out=head_dim)
            for _ in range(num_heads)
        ]
        self.heads = nn.ModuleList(single_heads)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Run every head on ``x`` and project the concatenated result."""
        per_head = []
        for head in self.heads:
            # Each head returns (context, attn_weights); the weights are dropped.
            context, _ = head(x)
            per_head.append(context)

        # Concatenating num_heads chunks of head_dim restores embed_dim.
        context_vec = torch.cat(per_head, dim=-1)
        return self.out_proj(context_vec)
    
# testing
# Smoke test for MultiHeadAttention_v1. These module-level names stay
# bound for later cells (the FFN test below reads `embed_dim`).
batch_size = 2
seq_len = 5
embed_dim = 32
num_heads = 8

# 8 heads of width 32 // 8 = 4 each.
mha = MultiHeadAttention_v1(embed_dim, num_heads)
print(mha)

# The wrapper returns only the projected output (no attention weights).
x = torch.randn(batch_size, seq_len, embed_dim)
print("\nInput:", x.size())
out = mha(x)
print("Output:", out.size())

MultiHeadAttention_v1(
  (heads): ModuleList(
    (0-7): 8 x SelfAttention_v1(
      (q_proj): Linear(in_features=32, out_features=4, bias=True)
      (k_proj): Linear(in_features=32, out_features=4, bias=True)
      (v_proj): Linear(in_features=32, out_features=4, bias=True)
    )
  )
  (out_proj): Linear(in_features=32, out_features=32, bias=True)
)

Input: torch.Size([2, 5, 32])
Output: torch.Size([2, 5, 32])


### Version 2: All-in-one

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class MultiheadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with fused projections.

    Queries, keys and values are each produced by one ``embed_dim`` ->
    ``embed_dim`` linear layer and then split into ``num_heads`` chunks of
    ``embed_dim // num_heads`` features, so all heads are computed with
    batched matrix multiplications.

    Args:
        embed_dim: Width ``C`` of the input and output features.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        causal: Keyword-only. When true, position ``t`` may only attend to
            positions ``<= t`` (needed for autoregressive decoding). The
            default ``False`` keeps the unmasked, bidirectional behaviour.

    Raises:
        AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads, *, causal=False):
        super().__init__()
        assert embed_dim % num_heads == 0, (
            "Embedding dimension must be divisible by number of heads"
        )
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.causal = causal
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def _split_heads(self, t):
        """Reshape ``[B, T, C]`` into ``[B, nh, T, C // nh]``."""
        B, T, _ = t.size()
        return t.view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape ``[B, T, C]``.

        Returns:
            A tuple ``(out, attn_weights)`` with shapes ``[B, T, C]`` and
            ``[B, nh, T, T]``.
        """
        # Project, then split the feature axis into heads.
        q = self.q_proj(x)
        B, T, C = q.size()
        q = self._split_heads(q)
        k = self._split_heads(self.k_proj(x))
        v = self._split_heads(self.v_proj(x))
        # q, k, v: [B, nh, T, C // nh]

        # Scaled dot-product scores. A plain Python float is used for the
        # scale so no NumPy scalar enters the forward pass.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        # scores: [B, nh, T, T]

        if self.causal:
            # Strict upper triangle marks key positions that lie in the
            # future of each query; -inf gives them zero softmax weight.
            future = torch.triu(
                torch.ones(T, T, device=scores.device), diagonal=1
            ).bool()
            scores = scores.masked_fill(future, float("-inf"))

        attn_weights = F.softmax(scores, dim=-1)
        # attn_weights: [B, nh, T, T]

        out = torch.matmul(attn_weights, v)
        # out: [B, nh, T, C // nh]

        # Merge the heads back into the feature axis and mix them.
        out = out.permute(0, 2, 1, 3).contiguous().view(B, T, C)
        out = self.out_proj(out)
        # out: [B, T, C]

        return out, attn_weights

# testing
# Smoke test for MultiheadSelfAttention: 2 sequences of 5 tokens with
# 32 features, split across 8 heads.
x = torch.randn(2, 5, 32)
msa = MultiheadSelfAttention(32, 8)
print(msa)

# One [T, T] weight matrix per head; the output keeps the input shape.
out, attn_weights = msa(x)
print("\nAttention Weights:", attn_weights.size())  # torch.Size([2, 8, 5, 5])
print("Output:", out.size())  # torch.Size([2, 5, 32])

MultiheadSelfAttention(
  (q_proj): Linear(in_features=32, out_features=32, bias=True)
  (k_proj): Linear(in_features=32, out_features=32, bias=True)
  (v_proj): Linear(in_features=32, out_features=32, bias=True)
  (out_proj): Linear(in_features=32, out_features=32, bias=True)
)

Attention Weights: torch.Size([2, 8, 5, 5])
Output: torch.Size([2, 5, 32])


## Feed-Forward Network (FFN)

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForwardNetwork(nn.Module):
    """Position-wise MLP: Linear -> GELU (tanh approximation) -> Linear."""

    def __init__(self, embed_dim, ff_dim):
        super().__init__()

        # Expand to the hidden width, then project back to the model width.
        self.fc1 = nn.Linear(embed_dim, ff_dim)
        self.fc2 = nn.Linear(ff_dim, embed_dim)
        self.activ = nn.GELU(approximate='tanh')

    def forward(self, x):
        """Map ``[B, T, embed_dim]`` to ``[B, T, embed_dim]``."""
        hidden = self.fc1(x)         # [B, T, ff_dim]
        hidden = self.activ(hidden)  # [B, T, ff_dim]
        return self.fc2(hidden)      # [B, T, embed_dim]

# testing
# Smoke test for FeedForwardNetwork. `embed_dim` is the module-level value
# bound by an earlier cell (32 according to the printed layer sizes).
ffn = FeedForwardNetwork(embed_dim=embed_dim, ff_dim=256)
print(ffn)
x = torch.randn(2, 5, embed_dim)
out = ffn(x)
# The output shape matches the input shape.
print("\nOutput:", out.size())

FeedForwardNetwork(
  (fc1): Linear(in_features=32, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=32, bias=True)
  (activ): GELU(approximate='tanh')
)

Output: torch.Size([2, 5, 32])


## Transformer block

In [5]:
class TransformerBlock(nn.Module):
    """Pre-norm Transformer block.

    Two sub-layers (self-attention, then feed-forward) are each applied as
    ``x + dropout(sublayer(layer_norm(x)))``.

    Args:
        config: Object exposing ``embed_dim``, ``num_heads``, ``ff_dim`` and
            ``resid_dropout`` as attributes. If it also exposes
            ``layer_norm_epsilon`` that value is used as the LayerNorm
            ``eps``; otherwise 1e-5 is used.
    """

    def __init__(self, config):
        super().__init__()
        # Honour the configured epsilon instead of silently ignoring it.
        # The fallback keeps configs without this attribute working.
        ln_eps = getattr(config, "layer_norm_epsilon", 1e-5)

        # NOTE(review): config.attn_dropout is not consumed here, and the
        # attention module is built without any masking arguments.
        self.attention = MultiheadSelfAttention(
            config.embed_dim, config.num_heads
        )
        self.norm1 = nn.LayerNorm(config.embed_dim, eps=ln_eps)

        self.ffn = FeedForwardNetwork(
            config.embed_dim, config.ff_dim
        )
        self.norm2 = nn.LayerNorm(config.embed_dim, eps=ln_eps)
        # One dropout module shared by both residual branches.
        self.resid_dropout = nn.Dropout(config.resid_dropout)

    def forward(self, x):
        """Transform ``x`` of shape ``[B, T, C]``; the shape is preserved."""
        # Self-attention branch. The attention module returns
        # (output, attn_weights); only the output is used.
        attn_out = self.attention(self.norm1(x))[0]
        x = x + self.resid_dropout(attn_out)

        # Feed-forward branch.
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.resid_dropout(ffn_out)

        return x
    
# testing
# Minimal stand-in configuration for the smoke test. The class object itself
# (not an instance) is passed below, so these are read as class attributes.
# A later cell rebinds the name `Config` to a pydantic model.
class Config:
    embed_dim = 32
    num_heads = 8
    ff_dim = 256
    resid_dropout = 0.1

# Build one block from the stand-in config and show its layer structure.
transformer_block = TransformerBlock(Config)
print(transformer_block)

TransformerBlock(
  (attention): MultiheadSelfAttention(
    (q_proj): Linear(in_features=32, out_features=32, bias=True)
    (k_proj): Linear(in_features=32, out_features=32, bias=True)
    (v_proj): Linear(in_features=32, out_features=32, bias=True)
    (out_proj): Linear(in_features=32, out_features=32, bias=True)
  )
  (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  (ffn): FeedForwardNetwork(
    (fc1): Linear(in_features=32, out_features=256, bias=True)
    (fc2): Linear(in_features=256, out_features=32, bias=True)
    (activ): GELU(approximate='tanh')
  )
  (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)


## Tokenization

In [6]:
%%capture

import sys

!{sys.executable} -m pip install tiktoken

In [7]:
import tiktoken

class Tokenizer:
    """Thin wrapper around a tiktoken encoding."""

    def __init__(self, encoding_name='gpt2'):
        # Look the encoding up by name once and keep it on the instance.
        self.encoding = tiktoken.get_encoding(encoding_name)

    def encode(self, text):
        """Return the token IDs the encoding produces for ``text``."""
        return self.encoding.encode(text)

    def decode(self, token_ids):
        """Return the text the encoding reconstructs from ``token_ids``."""
        return self.encoding.decode(token_ids)

# testing
# Round-trip check. `tokenizer` stays bound at module level and is
# reused by the embeddings cell.
tokenizer = Tokenizer()

# Sample text
text = "Hello, world! This is a test of tokenization."

# Encoding (tokenization)
token_ids = tokenizer.encode(text)
print(f"Token IDs: {token_ids}")

# Decoding (detokenization): should reproduce the sample text
decoded_text = tokenizer.decode(token_ids)
print(f"Decoded Text: {decoded_text}")


Token IDs: [15496, 11, 995, 0, 770, 318, 257, 1332, 286, 11241, 1634, 13]
Decoded Text: Hello, world! This is a test of tokenization.


## Embeddings

In [8]:
# Vocabulary size is taken from the tokenizer's encoding.
vocab_size = tokenizer.encoding.n_vocab
# NOTE: this rebinds the module-level `embed_dim` set by an earlier cell.
embed_dim = 128
max_seq_len = 512

# One learned vector per token ID and one per position index.
token_embedding = nn.Embedding(vocab_size, embed_dim)
position_embedding = nn.Embedding(max_seq_len, embed_dim)

# testing
text = "Hello, world! This is a test of tokenization."
token_ids = tokenizer.encode(text)

# Convert token IDs to tensor (unsqueeze adds a leading batch axis of 1)
token_tensor = torch.tensor(token_ids).unsqueeze(0)
# Generate position IDs 0..T-1 with the same leading batch axis
position_ids = torch.arange(token_tensor.size(1)).unsqueeze(0)

token_embeds = token_embedding(token_tensor)
print(token_embeds.size())

position_embeds = position_embedding(position_ids)
print(position_embeds.size())

# Both embeddings have the same shape, so they are summed element-wise.
combined_embeds = token_embeds + position_embeds
print(combined_embeds.size())

torch.Size([1, 12, 128])
torch.Size([1, 12, 128])
torch.Size([1, 12, 128])


## Full GPT-like architecture

In [9]:
import yaml

# Load the YAML configuration file
# safe_load returns plain Python containers and builds no arbitrary objects.
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Display the raw `model` section. In the recorded output
# `layer_norm_epsilon` comes back as the string '1e-5', not a float --
# presumably because the YAML loader does not treat `1e-5` as a float;
# verify against config.yaml.
config["model"]

{'vocab_size': 50257,
 'embed_dim': 768,
 'max_seq_len': 1024,
 'num_layers': 12,
 'ff_dim': 3072,
 'num_heads': 12,
 'attn_dropout': 0.1,
 'resid_dropout': 0.1,
 'embed_dropout': 0.1,
 'initializer_range': 0.02,
 'layer_norm_epsilon': '1e-5',
 'output_attentions': False,
 'output_hidden_states': False}

In [10]:
%%capture

import sys
!{sys.executable} -m pip install pydantic

In [11]:
from pydantic import BaseModel
import yaml

class ModelConfig(BaseModel):
    # Schema for the `model` section of config.yaml. Values are validated
    # against the annotated types when the model is constructed.
    vocab_size: int  # rows of GPT.token_embedding and width of the output logits
    embed_dim: int  # feature width used by the embeddings and every block
    max_seq_len: int  # rows of GPT.position_embedding
    num_layers: int  # number of TransformerBlock instances stacked in GPT
    ff_dim: int  # hidden width of FeedForwardNetwork
    num_heads: int  # number of attention heads; must divide embed_dim
    attn_dropout: float  # not referenced by the model code in this section
    resid_dropout: float  # dropout probability on TransformerBlock's residual branches
    embed_dropout: float  # not referenced by the model code in this section
    initializer_range: float  # not referenced by the model code in this section
    # The raw YAML dump shows this value as the string '1e-5'; the printed
    # model shows 1e-05, so it is coerced to float here.
    layer_norm_epsilon: float  # presumably the LayerNorm eps -- confirm where it is consumed
    output_attentions: bool  # not referenced by the model code in this section
    output_hidden_states: bool  # not referenced by the model code in this section

class TrainingConfig(BaseModel):
    # Schema for the `training` section of config.yaml. None of these fields
    # is read by code in this section; presumably a training loop elsewhere
    # consumes them -- TODO confirm.
    batch_size: int
    learning_rate: float
    weight_decay: float
    max_steps: int
    warmup_steps: int
    optimizer: str  # optimizer identifier; accepted values are not visible here
    beta1: float  # presumably Adam-style beta1 -- confirm against the optimizer setup
    beta2: float  # presumably Adam-style beta2 -- confirm against the optimizer setup
    epsilon: float  # presumably the optimizer's epsilon -- confirm

class Config(BaseModel):
    # Top-level schema for config.yaml: one validated sub-model per section.
    # NOTE: this rebinds the name `Config`, replacing the plain test class
    # of the same name defined in an earlier cell.
    model: ModelConfig
    training: TrainingConfig

# Load the YAML configuration file and validate it against the schema
with open("config.yaml", "r") as file:
    raw_config = yaml.safe_load(file)

# pydantic v2 deprecates `parse_obj` in favour of `model_validate`. Prefer
# the new name when it exists and fall back to `parse_obj` on pydantic v1.
validate_config = getattr(Config, "model_validate", None) or Config.parse_obj
config = validate_config(raw_config)

print(config.model)

vocab_size=50257 embed_dim=768 max_seq_len=1024 num_layers=12 ff_dim=3072 num_heads=12 attn_dropout=0.1 resid_dropout=0.1 embed_dropout=0.1 initializer_range=0.02 layer_norm_epsilon=1e-05 output_attentions=False output_hidden_states=False


In [12]:
class GPT(nn.Module):
    """GPT-style stack: embeddings, Transformer blocks, vocabulary projection.

    Token and learned position embeddings are summed, passed through
    ``num_layers`` ``TransformerBlock`` modules, and projected to
    per-token vocabulary logits.

    Args:
        config: Object exposing ``vocab_size``, ``embed_dim``,
            ``max_seq_len`` and ``num_layers``, plus whatever
            ``TransformerBlock`` reads from it.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embedding = nn.Embedding(
            config.vocab_size, config.embed_dim
        )
        self.position_embedding = nn.Embedding(
            config.max_seq_len, config.embed_dim
        )
        self.layers = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.num_layers)
        ])
        self.fc = nn.Linear(config.embed_dim, config.vocab_size)

    def forward(self, x):
        """Compute logits for token IDs ``x`` of shape ``[B, T]``.

        Returns:
            Tensor of shape ``[B, T, vocab_size]``.

        Raises:
            IndexError: If ``T`` exceeds the number of learned positions.
        """
        _, T = x.size()

        # Fail early with a clear message. Otherwise the embedding lookup
        # raises an opaque index error (a device-side assert on CUDA).
        # IndexError matches what the CPU lookup raised before.
        max_len = self.position_embedding.num_embeddings
        if T > max_len:
            raise IndexError(
                f"sequence length {T} exceeds max_seq_len {max_len}"
            )

        # Create the position ids directly on the input's device. The
        # resulting [T, C] position embeddings broadcast over the batch axis.
        positions = torch.arange(T, device=x.device)
        x = self.token_embedding(x) + self.position_embedding(positions)

        for layer in self.layers:
            x = layer(x)

        return self.fc(x)
    
# testing
# Build the full model from the validated `model` section of the config.
gpt2 = GPT(config.model)

# Bare expression: the notebook displays the module's repr as the cell output.
gpt2

GPT(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(1024, 768)
  (layers): ModuleList(
    (0-11): 12 x TransformerBlock(
      (attention): MultiheadSelfAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ffn): FeedForwardNetwork(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (activ): GELU(approximate='tanh')
      )
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (resid_dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (fc): Linear(in_features=768, out_features=50257, bias=True)
)

In [13]:
# Forward-pass smoke test: 2 sequences of 5 random token IDs drawn from
# [0, vocab_size).
x = torch.randint(0, config.model.vocab_size, (2, 5))
out = gpt2(x)
# One logit per vocabulary entry for every input position.
print("output:", out.size())

output: torch.Size([2, 5, 50257])
