# GPT-2 LLM Architecture

It is time to code the GPT-2 LLM architecture in `ttnn`. This means putting together everything we have built so far.

We start by defining the configuration of the GPT-2 124M model:

In [10]:
# Hyperparameters of the GPT-2 124M model, passed as `cfg` to the model classes.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # Vocabulary size (token-embedding rows / logit width)
    context_length=1024,   # Context length (positional-embedding rows)
    emb_dim=768,           # Embedding dimension
    n_heads=12,            # Number of attention heads
    n_layers=12,           # Number of layers (transformer blocks)
    drop_rate=0.1,         # Dropout rate
    qkv_bias=False,        # Query-Key-Value bias
)

# Torch Implementation

We will be bringing back a lot of code we have written in the previous notebooks, but will now tweak and adjust things as necessary.

In [11]:
import torch
from torch import nn

In [12]:
class DummyGPTModel(nn.Module):
  """Skeleton GPT model wired up with placeholder transformer blocks and norm.

  Token and positional embeddings are summed, passed through dropout, the
  stack of `DummyTransformerBlock`s and `DummyLayerNorm`, and finally projected
  to vocabulary-sized logits by a bias-free linear head.

  Args:
    cfg: Mapping providing "vocab_size", "context_length", "emb_dim",
      "drop_rate" and "n_layers".
  """

  def __init__(self, cfg):
    super().__init__()

    # Sub-modules are created in a fixed order so that seeded weight
    # initialization stays reproducible.
    self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
    self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
    self.drop_emb = nn.Dropout(cfg["drop_rate"])

    blocks = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    self.trf_blocks = nn.Sequential(*blocks)

    self.final_norm = DummyLayerNorm(cfg["emb_dim"])
    self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

  def forward(self, in_idx):
    """Map a 2-D tensor of token ids (batch, seq_len) to logits.

    Returns a tensor whose last dimension has size `vocab_size`.
    """
    # Unpacking also enforces that `in_idx` is exactly two-dimensional.
    _, seq_len = in_idx.shape

    tok = self.tok_emb(in_idx)
    # One embedding per position 0..seq_len-1, broadcast over the batch below.
    pos = self.pos_emb(torch.arange(seq_len, device=in_idx.device))

    hidden = self.drop_emb(tok + pos)
    hidden = self.final_norm(self.trf_blocks(hidden))

    return self.out_head(hidden)

In [13]:
class DummyTransformerBlock(nn.Module):
  """Placeholder transformer block: holds no parameters and returns its input."""

  def __init__(self, cfg):
    # `cfg` is accepted but not read by this placeholder.
    super().__init__()

  def forward(self, x):
    """Return `x` unchanged."""
    return x

class DummyLayerNorm(nn.Module):
  """Placeholder layer norm: holds no parameters and returns its input."""

  def __init__(self, normalized_shape, eps=1e-5):
    # Both arguments are accepted but not stored or used by this placeholder.
    super().__init__()

  def forward(self, x):
    """Return `x` unchanged."""
    return x

    

In [14]:
import tiktoken

In [15]:
# Encode two prompts with the GPT-2 tokenizer and stack them into one batch.
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# One row of token ids per prompt, stacked along a new leading dimension.
batch = torch.stack(
  [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
  dim=0,
)

print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [16]:
# Seed the RNG before building the model so the random weight initialization
# (and therefore the logits shown below) is reproducible.
torch.manual_seed(123)

# The model is not switched to eval mode here, so its dropout layer is active
# during this forward pass.
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)


In [17]:
# Display the shape of the logits together with the tensor itself.
logits.shape, logits

(torch.Size([2, 4, 50257]),
 tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
          [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
          [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
          [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],
 
         [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
          [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
          [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
          [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
        grad_fn=<UnsafeViewBackward0>))

## Layer Normalization

In [18]:
# Toy example: 2 rows of 5 random features passed through Linear(5 -> 6) + ReLU.
# Seeding first fixes both the random input and the layer's initial weights.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [19]:
# Per-row statistics over the last dimension; keepdim=True keeps that dimension
# with size 1 so the results broadcast against `out` later.
mean = out.mean(dim=-1, keepdim=True)
# No `unbiased` argument is given, so torch's default variance estimator is used.
variance = out.var(dim=-1, keepdim=True)

mean, variance

(tensor([[0.1324],
         [0.2170]], grad_fn=<MeanBackward1>),
 tensor([[0.0231],
         [0.0398]], grad_fn=<VarBackward0>))

In [20]:
# Standardize each row: subtract its mean and divide by its standard deviation.
out_norm = (out - mean) / torch.sqrt(variance)

# Recompute the statistics on the normalized rows. This rebinds `mean` and
# `variance` from the previous cell.
mean = out_norm.mean(dim=-1, keepdim=True)
variance = out_norm.var(dim=-1, keepdim=True)

out_norm, mean, variance

(tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
         [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
        grad_fn=<DivBackward0>),
 tensor([[9.9341e-09],
         [1.9868e-08]], grad_fn=<MeanBackward1>),
 tensor([[1.0000],
         [1.0000]], grad_fn=<VarBackward0>))

In [21]:
# Disable scientific notation so the near-zero means print as 0.0000.
# This is a global print setting and stays in effect for later cells.
torch.set_printoptions(sci_mode=False)
print("Mean", mean)
print("Variance", variance)

Mean tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


Finally, now an implementation!

In [22]:
class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with learnable scale and shift.

  Args:
    emb_dim: Size of the last dimension of the inputs; `scale` and `shift`
      hold one value per feature.
    eps: Constant added to the variance before taking the square root, which
      guards against division by zero. Defaults to 1e-5 (the value that was
      previously hard-coded), mirroring the signature of `DummyLayerNorm`.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()

    self.eps = eps

    # Ones/zeros initialization: a fresh layer only standardizes its input.
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

    # NOTE(review): these prints run on every construction. They are kept so the
    # notebook's recorded output is unchanged; consider removing them before
    # this class is instantiated many times inside a full model.
    print(self.scale)
    print(self.shift)

  def forward(self, x):
    """Normalize `x` along its last dimension, then apply scale and shift.

    Returns a tensor of the same shape as `x`.
    """
    mean = x.mean(dim=-1, keepdim=True)
    # unbiased=False: divide by N rather than N-1 when estimating the variance.
    var = x.var(dim=-1, keepdim=True, unbiased=False)

    norm_x = (x - mean) / torch.sqrt(var + self.eps)

    return self.scale * norm_x + self.shift

In [23]:
# Apply the LayerNorm module to the toy batch and check its per-row statistics.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)

# unbiased=False matches the variance estimator used inside LayerNorm.forward.
mean = out_ln.mean(dim=-1, keepdim=True)
variance = out_ln.var(dim=-1, unbiased=False, keepdim=True)

print("Mean", mean)
print("Variance", variance)

Parameter containing:
tensor([1., 1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)
Mean tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## GELU Activation


In [24]:
class GELU(nn.Module):
  """GELU activation computed with the tanh approximation.

  forward(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
  """

  def __init__(self):
    super().__init__()

  def forward(self, x):
    """Apply the activation element-wise; the output has the shape of `x`."""
    # sqrt(2 / pi), built as a 0-dim tensor on every call.
    coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
    cubic = x + 0.044715 * torch.pow(x, 3)
    gate = 1 + torch.tanh(coeff * cubic)
    return 0.5 * x * gate

## Feed Forward

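The feed-forward module is a linear layer that expands the embedding dimension by a factor of 4, followed by a GELU activation and a second linear layer that projects back to the embedding dimension.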

In [25]:
class FeedForward(nn.Module):
  """Feed-forward sub-network: Linear -> GELU -> Linear.

  The first linear layer expands the last dimension from `emb_dim` to
  `4 * emb_dim`; the second projects it back to `emb_dim`.

  Args:
    cfg: Mapping providing "emb_dim".
  """

  def __init__(self, cfg):
    super().__init__()

    self.layer = nn.Sequential(
      nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
      GELU(),
      nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"])
    )

  def forward(self, x):
    """Run `x` through the stack; the last dimension stays `emb_dim`."""
    # Must use the attribute assigned in __init__ (`layer`); the previous
    # `self.layers` did not exist and raised AttributeError on every call.
    return self.layer(x)

## TransformerBlock