In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [14]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [15]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

# LayerNorm

In [16]:
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
batch_example

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])

In [17]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [18]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)

mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)

print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[-2.9802e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [19]:
print(out_ln)
print(nn.GELU()(out_ln))

tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)
tensor([[ 0.3924,  0.9170, -0.0110,  0.1606, -0.0579],
        [ 0.7435, -0.1161, -0.1620,  0.9844,  0.1810]],
       grad_fn=<GeluBackward0>)


# Feedforward

In [20]:
class FeedForward(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(embed_dim, 4*embed_dim),
            nn.GELU(),
            nn.Linear(4*embed_dim, embed_dim),
        )
    def forward(self, x):
        return self.layers(x)

In [21]:
ffn = FeedForward(7)
x = torch.rand(2,3,7)
ffn(x)

tensor([[[ 0.2938,  0.2750, -0.2915, -0.1774,  0.0675, -0.3712,  0.1027],
         [ 0.6098,  0.4351, -0.4272, -0.1102,  0.2877, -0.4949,  0.1296],
         [ 0.5349,  0.4603, -0.4299, -0.1643,  0.2524, -0.4933,  0.1858]],

        [[ 0.3170,  0.3319, -0.2710, -0.1806,  0.0505, -0.3808,  0.1284],
         [ 0.3270,  0.2830, -0.2897, -0.2049,  0.0593, -0.3527,  0.1450],
         [ 0.3536,  0.3837, -0.3053, -0.1027,  0.1574, -0.3907,  0.1756]]],
       grad_fn=<ViewBackward0>)

# Transformer Block

In [22]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout=-1, num_heads=2):
        super().__init__()

        assert (d_out % num_heads == 0)
        
        self.num_heads = num_heads
        self.d_out = d_out
        self.head_dim = d_out // num_heads

        self.W_query = torch.nn.Parameter(torch.rand(d_in, d_out, device=device))
        self.W_key = torch.nn.Parameter(torch.rand(d_in, d_out, device=device))
        self.W_value = torch.nn.Parameter(torch.rand(d_in, d_out, device=device))
        causal_mask = torch.triu(torch.ones(context_length, context_length, device=device), diagonal=1)
        self.register_buffer("causal_mask", causal_mask, persistent=False)

        self.out_proj = nn.Linear(d_out, d_out, device=device)
        
        self.dropout = dropout
        if dropout > 0.0:
            self.dropout_layer = torch.nn.Dropout(dropout)

    def forward(self, x):

        assert len(x.shape) == 3, print(f"x should be 3 dimensional but found it as {len(x.shape)}")
        b, num_tokens, d_in = x.shape

        Q = x @ self.W_query
        K = x @ self.W_key
        V = x @ self.W_value

        Q = Q.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1,2) # b, num_heads, tokens, embedding
        K = K.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1,2)
        V = V.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1,2)
        
        QKt = Q @ K.transpose(-2,-1)

        # Apply causal mask        
        QKt.masked_fill_(
            self.causal_mask.bool()[:num_tokens, :num_tokens], # num_tokens makes it work for cases where batch has less tokens than context length
            -torch.inf
        )
        
        QKt = F.softmax(QKt / self.head_dim**0.5, dim=-1)
        if self.dropout > 0.0:
            QKt = self.dropout_layer(QKt)
        
        QKtV = (QKt @ V).transpose(1,2) # batch, num_tokens, num_heads, head_dim

        QKtV = QKtV.contiguous().view(b, num_tokens, self.d_out)
        QKtV = self.out_proj(QKtV)
        return QKtV

In [25]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.attention = MultiHeadAttention(
            d_in=cfg["emb_dim"], 
            d_out=cfg["emb_dim"], 
            context_length=cfg["context_length"], 
            dropout=cfg["drop_rate"], 
            num_heads=cfg["n_heads"]
        )
        self.ff = FeedForward(cfg["emb_dim"])
        self.norm1 = nn.LayerNorm(cfg["emb_dim"])
        self.norm2 = nn.LayerNorm(cfg["emb_dim"])
        self.dropout = nn.Dropout(cfg["drop_rate"])
        
    def forward(self, x):
        residual = x
        x = self.norm1(x)
        x = self.attention(x)
        x = self.dropout(x)
        x = x + residual

        residual = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + residual

        return x
        

In [26]:
torch.manual_seed(123)
x = torch.rand(2, 7, 768).to(device)

block = TransformerBlock(GPT_CONFIG_124M).to(device)
output = block(x)

In [27]:
output.shape

torch.Size([2, 7, 768])

# GPT Model

In [28]:
torch.arange(2)

tensor([0, 1])

In [38]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        self.final_norm = nn.LayerNorm(cfg["emb_dim"])

        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

        self.out_head.weight = self.tok_emb.weight  # enable weight sharing between the input and output token matrices

        self.register_buffer("pos_ids", torch.arange(cfg["context_length"]))

    def forward(self, x):
        batch_size, seq_len = x.shape
        tok_embeds = self.tok_emb(x)
        pos_embeds = self.pos_emb(self.pos_ids[:seq_len])

        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
        

In [39]:
torch.manual_seed(123)
import tiktoken 

tokenizer = tiktoken.get_encoding("gpt2")
batch = []

txt1 = "AI is machine learning."
txt2 = "Machine learning is AI."

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch = torch.stack(batch, dim=0).to(device)  # the sequence length needs to be the same for this to work

print(batch)

tensor([[20185,   318,  4572,  4673,    13],
        [37573,  4673,   318,  9552,    13]], device='cuda:0')


In [40]:
gpt_model = GPTModel(GPT_CONFIG_124M).to(device)

In [41]:
def count_params(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total params: {total:,}")
    print(f"Trainable params: {trainable:,}")

def model_bytes(model, include_grads=True):
    param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())
    grad_bytes = 0
    if include_grads:
        grad_bytes = sum(p.grad.nelement() * p.grad.element_size()
                         for p in model.parameters() if p.grad is not None)
    return param_bytes + buffer_bytes + grad_bytes

def print_model_memory(model):
    bytes_total = model_bytes(model, include_grads=True)
    print(f"Model+grads+buffers: {bytes_total/1024**2:.2f} MiB")


print(count_params(gpt_model))
print(print_model_memory(gpt_model))

Total params: 124,412,160
Trainable params: 124,412,160
None
Model+grads+buffers: 522.60 MiB
None


In [42]:
gpt_model(batch)

tensor([[[ 1.1135e+00, -3.9195e+00,  2.5692e+01,  ...,  7.1352e+01,
          -7.9276e+01,  2.4213e+01],
         [ 3.9648e+01,  5.2325e+00,  3.6704e+00,  ...,  4.2743e+01,
          -4.4488e+01,  1.0543e+00],
         [ 3.8809e+01,  4.6302e+01,  6.6042e+00,  ..., -1.4842e+01,
          -6.7325e+01, -4.6987e+00],
         [-5.1707e+00,  1.0667e+01, -8.6553e+00,  ...,  1.7779e+01,
          -5.1281e+01, -2.1124e+01],
         [ 1.8735e+01,  5.9936e+01,  2.6030e+01,  ..., -6.5854e+00,
          -8.4782e+01,  3.9035e-01]],

        [[-4.0170e+01,  5.7704e+01,  2.9817e+01,  ...,  4.7025e+01,
           4.8286e+00,  2.2369e+01],
         [-5.1062e+01,  2.0255e+01,  3.9927e+01,  ...,  1.4520e+01,
          -1.4429e+01, -4.6117e-03],
         [-2.0727e+01,  1.1689e+01,  4.4933e+00,  ...,  5.2980e+01,
          -3.2617e+01,  2.7573e+00],
         [-9.5684e+00,  2.4166e+01,  2.7835e+01,  ...,  2.8095e-01,
          -1.8059e+01,  9.4250e+00],
         [-2.3676e+01,  2.4165e+01,  3.0748e+01,  ...