## 4.1 Coding an LLM architecture

In [1]:
# Hyperparameter sets for the four GPT-2 sizes used in this notebook. Every set
# has the same keys, in the same order, so any of them can be unpacked straight
# into the model constructors below. Only `emb_dim`, `n_heads` and `n_layers`
# differ between sizes.
GPT_CONFIG_124M = dict(  # GPT-2 Small
    vocab_size=50257,  # rows of the token-embedding table / width of the logits
    context_length=1024,  # rows of the positional-embedding table
    emb_dim=768,  # width of every token vector
    n_heads=12,  # attention heads per transformer block
    n_layers=12,  # number of stacked transformer blocks
    drop_rate=0.1,  # dropout probability
    qkv_bias=False,  # whether the query/key/value projections carry a bias
)

GPT_CONFIG_350M = dict(  # GPT-2 Medium
    vocab_size=50257,
    context_length=1024,
    emb_dim=1024,
    n_heads=16,
    n_layers=24,
    drop_rate=0.1,
    qkv_bias=False,
)

GPT_CONFIG_774M = dict(  # GPT-2 Large
    vocab_size=50257,
    context_length=1024,
    emb_dim=1280,
    n_heads=20,
    n_layers=36,
    drop_rate=0.1,
    qkv_bias=False,
)

GPT_CONFIG_1558M = dict(  # GPT-2 XL
    vocab_size=50257,
    context_length=1024,
    emb_dim=1600,
    n_heads=25,
    n_layers=48,
    drop_rate=0.1,
    qkv_bias=False,
)

In [2]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):
    """Skeleton of the GPT architecture with placeholder blocks and norm.

    The token/positional embeddings, embedding dropout and output head are
    real layers; the transformer blocks and the final normalization are the
    ``Dummy*`` placeholders, which return their input unchanged.

    Args:
        vocab_size: number of token ids; size of the embedding table and of
            the last logits dimension.
        context_length: number of positions in the positional-embedding table.
        emb_dim: width of the token and positional embeddings.
        n_heads: passed through to the placeholder blocks.
        n_layers: number of placeholder blocks to stack.
        drop_rate: dropout probability applied to the summed embeddings.
        qkv_bias: passed through to the placeholder blocks.
        **kwargs: any further configuration entries, also passed through.
    """

    def __init__(
        self,
        vocab_size,
        context_length,
        emb_dim,
        n_heads,
        n_layers,
        drop_rate,
        qkv_bias=False,
        **kwargs,
    ):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(context_length, emb_dim)
        self.drop_emb = nn.Dropout(drop_rate)

        # Each placeholder block is built from the complete configuration.
        block_config = dict(
            vocab_size=vocab_size,
            context_length=context_length,
            emb_dim=emb_dim,
            n_heads=n_heads,
            n_layers=n_layers,
            drop_rate=drop_rate,
            qkv_bias=qkv_bias,
            **kwargs,
        )
        placeholder_blocks = [
            DummyTransformerBlock(**block_config) for _ in range(n_layers)
        ]
        self.trf_blocks = nn.Sequential(*placeholder_blocks)

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to logits with a trailing vocab axis."""
        _, seq_len = in_idx.shape
        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        x = self.trf_blocks(self.drop_emb(x))
        return self.out_head(self.final_norm(x))


class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that hands its input back untouched.

    It accepts the full set of model hyperparameters (plus any extras) so it
    can be constructed from the same configuration as the model, but it stores
    none of them and owns no parameters.
    """

    def __init__(
        self,
        vocab_size,
        context_length,
        emb_dim,
        n_heads,
        n_layers,
        drop_rate,
        qkv_bias=False,
        **kwargs,
    ):
        # Arguments are accepted for signature compatibility only.
        super().__init__()

    def forward(self, x):
        """Return ``x`` itself (the same object, not a copy)."""
        return x


class DummyLayerNorm(nn.Module):
    """Placeholder for layer normalization; performs no normalization at all.

    ``normalized_shape`` and ``eps`` are accepted but neither stored nor used.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        # Nothing to set up: this stand-in has no parameters and no state.
        super().__init__()

    def forward(self, x):
        """Return ``x`` itself (the same object, not a copy)."""
        return x


# Build the placeholder model from the GPT-2 Small configuration. As the last
# expression of the cell, its module tree is what the cell displays.
DummyGPTModel(**GPT_CONFIG_124M)

DummyGPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): DummyTransformerBlock()
    (1): DummyTransformerBlock()
    (2): DummyTransformerBlock()
    (3): DummyTransformerBlock()
    (4): DummyTransformerBlock()
    (5): DummyTransformerBlock()
    (6): DummyTransformerBlock()
    (7): DummyTransformerBlock()
    (8): DummyTransformerBlock()
    (9): DummyTransformerBlock()
    (10): DummyTransformerBlock()
    (11): DummyTransformerBlock()
  )
  (final_norm): DummyLayerNorm()
  (out_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [5]:
import tiktoken


# GPT-2 byte-pair-encoding tokenizer.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text to a 1-D tensor of token ids and stack them into one
# (num_texts, num_tokens) batch. torch.stack requires equal lengths; the
# printed result shows that both texts encode to four tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [6]:
# Seed before constructing the model so the randomly initialized embedding and
# output-head weights (and therefore the printed logits) are reproducible.
torch.manual_seed(42)
model = DummyGPTModel(**GPT_CONFIG_124M)
# One logits vector of vocabulary size per input token; see the printed shape.
logits = model(batch)
print(f"{logits} ({logits.shape=})")

tensor([[[ 0.7739,  0.0181, -0.0797,  ...,  0.3098,  0.8177, -0.6049],
         [-0.8063,  0.8920, -1.0962,  ..., -0.4378,  1.1056,  0.1939],
         [-0.8459, -1.0176,  0.4964,  ...,  0.4581, -0.3293,  0.2320],
         [ 0.4098, -0.3144, -1.0831,  ...,  0.7491,  0.7018,  0.4715]],

        [[ 0.2911,  0.1596, -0.2137,  ...,  0.5173,  0.7380, -0.7045],
         [-0.4064,  0.6045, -0.4485,  ..., -0.5616,  0.4590, -0.1384],
         [-0.6108,  0.7148,  1.2499,  ..., -0.7925, -0.5328,  0.4794],
         [ 0.9423,  0.1867, -0.5557,  ...,  0.4156,  0.1756,  1.9882]]],
       grad_fn=<UnsafeViewBackward0>) (logits.shape=torch.Size([2, 4, 50257]))


## 4.2 Normalizing activations with layer normalization

In [7]:
# Seed first: both the random input and the Linear layer's initial weights
# draw from the global RNG, in this order.
torch.manual_seed(42)
batch_example = torch.randn(2, 5)  # 2 examples with 5 features each
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())  # 5 inputs -> 6 non-negative outputs
out = layer(batch_example)
print(out)
# Per-row statistics over the last (feature) dimension; keepdim=True keeps the
# results as column vectors that broadcast against `out`.
mean = out.mean(dim=-1, keepdim=True)
# No `unbiased` argument is given, so PyTorch's default sample variance
# (division by n - 1) is used here; the printed values are consistent with that.
var = out.var(dim=-1, keepdim=True)
print(f"{mean=}\n{var=}")

tensor([[0.0000, 0.1842, 0.0052, 0.7233, 0.0000, 0.5298],
        [0.0000, 0.0000, 0.0000, 0.2237, 0.0000, 0.7727]],
       grad_fn=<ReluBackward0>)
mean=tensor([[0.2404],
        [0.1661]], grad_fn=<MeanBackward1>)
var=tensor([[0.0982],
        [0.0963]], grad_fn=<VarBackward0>)


The first value in the `mean` tensor is the mean of the first row, and the second value is the mean of the second row. The same holds for the variance.

`keepdim=True` means the output tensor retains the same number of dimensions as the input tensor, even though the operation reduces the tensor along the dimension specified via `dim`.
`dim` specifies the dimension along which the calculation of the statistic should be performed.

In [8]:
# Standardize each row using the per-row statistics computed in the previous
# cell: subtract the mean, divide by the standard deviation.
out_norm = (out - mean) / torch.sqrt(var)
# Keep the statistics of the normalized output under their own names. Writing
# them back into `mean`/`var` would make this cell non-idempotent: a second run
# would "normalize" with mean ~0 and variance 1 and return `out` unchanged.
norm_mean = out_norm.mean(dim=-1, keepdim=True)
norm_var = out_norm.var(dim=-1, keepdim=True)
print("Normalized layer outputs:\n", out_norm)
print(f"Mean:\n{norm_mean}")
print(f"Variance:\n{norm_var}")

Normalized layer outputs:
 tensor([[-0.7672, -0.1794, -0.7506,  1.5410, -0.7672,  0.9234],
        [-0.5351, -0.5351, -0.5351,  0.1857, -0.5351,  1.9546]],
       grad_fn=<DivBackward0>)
Mean:
tensor([[0.0000e+00],
        [2.2352e-08]], grad_fn=<MeanBackward1>)
Variance:
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [9]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Each vector along the last axis is standardized to zero mean and unit
    (biased) variance, then multiplied element-wise by ``scale`` and offset by
    ``shift``.

    Args:
        emb_dim: size of the last dimension of the inputs; also the length of
            the ``scale`` and ``shift`` parameters.
        eps: small constant added to the variance before the square root to
            prevent division by zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialized to the identity transform: scale of ones, shift of zeros.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return the normalized, scaled and shifted tensor; same shape as ``x``."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n rather than n - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [10]:
# Apply the LayerNorm module to the 2 x 5 random example from above.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
# Verify the result: each row should have mean ~0 and variance ~1. The variance
# is measured with unbiased=False to match the estimator used inside LayerNorm.
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[-1.1921e-08],
        [ 3.2037e-08]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## 4.3 Implementing a feed forward network with GELU activations

In [11]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``

    The module has no parameters and no state.
    """

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of ``x``."""
        sqrt_2_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = sqrt_2_over_pi * (x + 0.044715 * x.pow(3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [12]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand, GELU, project back.

    The hidden layer is four times as wide as the embedding, and the output
    has the same width as the input.

    Args:
        emb_dim: size of the last dimension of the inputs and outputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        hidden_dim = 4 * emb_dim
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the three layers in order."""
        return self.layers(x)

## 4.4 Adding shortcut connections

## 4.5 Connecting attention and linear layers in a transformer block


In [13]:
# From chapter 3
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with dropout on the attention weights.

    Queries, keys and values are each produced by one ``d_in -> d_out`` linear
    projection and then split into ``num_heads`` heads of ``d_out // num_heads``
    features. A strictly upper-triangular mask stops every position from
    attending to later positions.

    Args:
        d_in: size of the last dimension of the input.
        d_out: size of the last dimension of the output; must be divisible by
            ``num_heads``.
        context_length: side length of the pre-computed causal mask.
        drop_rate: dropout probability applied to the attention weights.
        num_heads: number of attention heads.
        qkv_bias: whether the query/key/value projections have a bias term.
    """

    def __init__(
        self,
        d_in: int,
        d_out: int,
        context_length: int,
        drop_rate: float,
        num_heads: int,
        qkv_bias=False,
    ):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Per-head width, so the concatenated heads add up to d_out again.
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated head outputs
        self.dropout = nn.Dropout(drop_rate)

        # Ones above the diagonal mark the "future" positions to hide. Stored
        # as a buffer so it follows the module across devices.
        causal_mask = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", causal_mask)

    def _split_heads(self, projected, b, num_tokens):
        """(b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Compute context vectors for ``x`` of shape (b, num_tokens, d_in).

        Returns a tensor of shape (b, num_tokens, d_out).
        """
        b, num_tokens, _ = x.shape

        keys = self._split_heads(self.W_key(x), b, num_tokens)
        queries = self._split_heads(self.W_query(x), b, num_tokens)
        values = self._split_heads(self.W_value(x), b, num_tokens)

        # Dot product of every query with every key, separately per head.
        attn_scores = queries @ keys.transpose(2, 3)

        # Crop the mask to the current sequence length and hide future tokens;
        # -inf scores turn into zero weights after the softmax.
        future_positions = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(future_positions, -torch.inf)

        # Scale by sqrt(head_dim) before normalizing over the key axis.
        attn_weights = torch.softmax(attn_scores / self.head_dim ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        # Merge the heads back into a single d_out-wide vector per token.
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(context_vec)


In [14]:
class TransformerBlock(nn.Module):
    """Transformer block made of two pre-normalized sub-layers with shortcuts.

    ``self.layers`` holds, in order:
      0. LayerNorm -> MultiHeadAttention -> Dropout
      1. LayerNorm -> FeedForward -> Dropout
    The output of each sub-layer is added to that sub-layer's own input.

    Args:
        emb_dim: width of the token vectors (input and output).
        context_length: maximum sequence length, used for the attention mask.
        num_heads: number of attention heads.
        drop_rate: dropout probability, used inside attention and after both
            sub-layers.
        qkv_bias: whether the attention query/key/value projections use a bias.
    """

    def __init__(self, emb_dim, context_length, num_heads, drop_rate, qkv_bias):
        super().__init__()
        attention_sublayer = nn.Sequential(
            LayerNorm(emb_dim),
            MultiHeadAttention(
                d_in=emb_dim,
                d_out=emb_dim,
                context_length=context_length,
                drop_rate=drop_rate,
                num_heads=num_heads,
                qkv_bias=qkv_bias,
            ),
            nn.Dropout(drop_rate),
        )
        feed_forward_sublayer = nn.Sequential(
            LayerNorm(emb_dim),
            FeedForward(emb_dim),
            nn.Dropout(drop_rate),
        )
        self.layers = nn.ModuleList([attention_sublayer, feed_forward_sublayer])

    def forward(self, x):
        """Apply both sub-layers in order, each followed by its shortcut addition."""
        for sublayer in self.layers:
            shortcut = x
            x = sublayer(shortcut) + shortcut
        return x

In [15]:
# Seed before creating the input and the block so both are reproducible.
torch.manual_seed(123)
x = torch.rand(2, 4, 768)  # last dimension equals GPT_CONFIG_124M["emb_dim"]
# Note the key mapping: the config's "n_heads" feeds the `num_heads` parameter.
block = TransformerBlock(
    context_length=GPT_CONFIG_124M["context_length"],
    drop_rate=GPT_CONFIG_124M["drop_rate"],
    emb_dim=GPT_CONFIG_124M["emb_dim"],
    num_heads=GPT_CONFIG_124M["n_heads"],
    qkv_bias=GPT_CONFIG_124M["qkv_bias"]
)
output = block(x)

# The block preserves the input shape, as the two printed shapes show.
print(f"Input shape: {x.shape=}")
print(f"Output shape: {output.shape=}")

Input shape: x.shape=torch.Size([2, 4, 768])
Output shape: output.shape=torch.Size([2, 4, 768])


## 4.6 Coding the GPT model


In [16]:
class GPTModel(nn.Module):
    """GPT-style model: embeddings, a stack of transformer blocks, output head.

    Args:
        vocab_size: number of token ids; size of the token-embedding table and
            of the last logits dimension.
        context_length: number of positions in the positional-embedding table
            and maximum sequence length of the attention mask.
        emb_dim: width of the token vectors throughout the model.
        n_heads: attention heads per transformer block.
        n_layers: number of transformer blocks.
        drop_rate: dropout probability (embeddings and inside each block).
        qkv_bias: whether the attention query/key/value projections use a bias.
        **kwargs: additional configuration entries. They are accepted so a
            config dict with extra keys can be unpacked into the constructor,
            and are otherwise ignored.
    """

    def __init__(
        self,
        vocab_size,
        context_length,
        emb_dim,
        n_heads,
        n_layers,
        drop_rate,
        qkv_bias=False,
        **kwargs,
    ):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(context_length, emb_dim)
        self.drop_emb = nn.Dropout(drop_rate)
        # `kwargs` is deliberately not forwarded: TransformerBlock takes a fixed
        # set of arguments, all supplied here, so passing any extra key on
        # would raise a TypeError.
        self.trf_blocks = nn.Sequential(
            *[
                TransformerBlock(
                    context_length=context_length,
                    emb_dim=emb_dim,
                    num_heads=n_heads,
                    drop_rate=drop_rate,
                    qkv_bias=qkv_bias,
                )
                for _ in range(n_layers)
            ]
        )
        self.final_norm = LayerNorm(emb_dim)
        # Separate projection to vocabulary logits (not tied to tok_emb).
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to logits with a trailing vocab axis.

        The sequence length must not exceed ``context_length``; longer inputs
        index past the end of the positional-embedding table.
        """
        _, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        # Position ids 0..seq_len-1, created on the same device as the input.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [23]:
# Seed before construction so the initial weights, and with them the printed
# logits, are reproducible.
torch.manual_seed(123)

cfg = GPT_CONFIG_124M

# `cfg` holds exactly the constructor's seven arguments, so it is unpacked
# directly instead of being spelled out key by key.
model = GPTModel(**cfg)

out = model(batch)
print(f"Input batch:\n{batch}")
print()
print(f"Output shape: {out.shape=}")
print(out)

Input batch:
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: out.shape=torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)


In [24]:
# Count every trainable and non-trainable parameter tensor element in the model.
# The result (163,009,536) exceeds the "124M" in the config name because
# `out_head` is a separate Linear layer rather than sharing weights with
# `tok_emb`: the 50257 * 768 = 38,597,376 vocabulary weights are stored twice.
# Subtracting one copy leaves 124,412,160.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


In [25]:
# Exercise 4.1: Calculate the number of parameters that are contained in the
# feed forward modules and those that are contained in the multi-head
# attention modules.
# model.modules() walks every sub-module recursively, so filtering by type
# visits each FeedForward / MultiHeadAttention instance exactly once.
ff_params = sum(
    param.numel()
    for submodule in model.modules()
    if isinstance(submodule, FeedForward)
    for param in submodule.parameters()
)
attn_params = sum(
    param.numel()
    for submodule in model.modules()
    if isinstance(submodule, MultiHeadAttention)
    for param in submodule.parameters()
)

print(f"Parameters in feed forward layers: {ff_params:,}")
print(f"Parameters in attention layers: {attn_params:,}")
print(f"Percentage of total parameters:")
print(f"Feed forward: {ff_params/total_params*100:.1f}%")
print(f"Attention: {attn_params/total_params*100:.1f}%")

Parameters in feed forward layers: 56,669,184
Parameters in attention layers: 28,320,768
Percentage of total parameters:
Feed forward: 34.8%
Attention: 17.4%


In [26]:
# Memory needed for the weights alone, assuming float32 (4 bytes per parameter).
total_size_bytes = 4 * total_params
# Convert bytes to megabytes, counting 1024 * 1024 bytes per MB.
total_size_mb = total_size_bytes / 1024**2
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


## 4.7 Generating text

In [27]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: callable that maps a (batch, n_tokens) tensor of token indices
            to logits indexed as [batch, position, vocab].
        idx: (batch, n_tokens) tensor of indices in the current context.
        max_new_tokens: number of tokens to append to every sequence.
        context_size: maximum number of trailing tokens fed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens): the original
        indices followed by the generated ones.
    """
    for _ in range(max_new_tokens):
        # Crop the context if it exceeds the supported size: only the last
        # `context_size` tokens are passed to the model.
        idx_cond = idx[:, -context_size:]

        # Inference only, so no gradients are tracked. Keep just the logits
        # of the last time step: (batch, vocab_size).
        with torch.no_grad():
            last_step_logits = model(idx_cond)[:, -1, :]

        probas = torch.softmax(last_step_logits, dim=-1)  # (batch, vocab_size)
        idx_next = probas.argmax(dim=-1, keepdim=True)  # most likely token, (batch, 1)
        # Append the chosen index to the running sequence: (batch, n_tokens + 1).
        idx = torch.cat([idx, idx_next], dim=1)

    return idx

In [28]:
# Turn the prompt into token ids and shape them as a batch of one sequence,
# which is the (batch, n_tokens) layout generate_text_simple indexes into.
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print(f"{encoded=}")
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # adds batch dimension
print(f"{encoded_tensor.shape=}")

encoded=[15496, 11, 314, 716]
encoded_tensor.shape=torch.Size([1, 4])


In [29]:
# Switch to evaluation mode so the model's Dropout layers are inactive during
# generation.
model.eval()
out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=cfg["context_length"]
)
print(f"{out=}")
# len() of a 2-D tensor is the size of its first dimension, i.e. the batch
# size (1 here), not the number of tokens.
print(f"{len(out)=}")

out=tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
len(out)=1


In [30]:
# Drop the batch dimension and convert the token ids back to text. The model
# is only constructed, never trained, in the cells shown, so the continuation
# after the prompt comes from randomly initialized weights.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue
