# Part 2: Optimizing the Model

So far, we've initialized the GPT-2 model as described in the paper. Now we want to optimize the model so that training is faster and, potentially, performance is better. We'll start with the code we had at the end of the last part and build on it.

In [16]:
# imports
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import math

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

device

'cpu'

In [17]:
@dataclass
class GPTConfig:
    """Size hyperparameters read by the model classes when they build their layers."""

    block_size: int = 1024   # rows of the position-embedding table; longest sequence forward() accepts
    vocab_size: int = 50257  # rows of the token-embedding table and width of the output logits
    n_layer: int = 12        # how many transformer blocks are stacked
    n_head: int = 12         # attention heads per block (must divide n_embd)
    n_embd: int = 768        # embedding / residual-stream width

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention where a position may only attend to itself and earlier positions."""

    def __init__(self, config:GPTConfig):
        super().__init__()
        embd, heads, ctx = config.n_embd, config.n_head, config.block_size
        assert embd % heads == 0

        # A single fused projection produces queries, keys and values at once.
        self.c_attn = nn.Linear(embd, 3 * embd)
        self.c_proj = nn.Linear(embd, embd)
        # Marker attribute: the weight init looks for it to scale this layer down by 1 / sqrt(N).
        self.c_proj.NANOGPT_SCALE_INIT = 1

        self.n_head = heads
        self.n_embd = embd

        # Lower-triangular ones matrix, stored as a buffer and used as the causal mask.
        lower_tri = torch.tril(torch.ones(ctx, ctx))
        self.register_buffer("bias", lower_tri.view(1, 1, ctx, ctx))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of size (B, T, C) and return a tensor of the same size."""
        B, T, C = x.size()
        head_dim = C // self.n_head

        def split_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (split_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # Scaled dot-product scores; entries above the diagonal are set to -inf
        # so the softmax gives them zero weight.
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        blocked = self.bias[:, :, :T, :T] == 0
        weights = F.softmax(scores.masked_fill(blocked, float('-inf')), dim=-1)

        # Weighted sum of values, then put the heads back side by side: (B, T, C).
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Position-wise feed-forward net: expand to 4x the embedding width, apply GELU, project back."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width

        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, width)
        # Marker attribute: the weight init looks for it to scale this layer down by 1/sqrt(N).
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Return ``c_proj(gelu(c_fc(x)))``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

class Block(nn.Module):
    """One transformer block: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embd

        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Run both residual sub-layers on ``x`` and return the result."""
        # Normalisation is applied to the branch input only; the skip path carries x untouched.
        after_attn = x + self.attn(self.ln_1(x))
        after_mlp = after_attn + self.mlp(self.ln_2(after_attn))
        return after_mlp

class GPT(nn.Module):
    """Token + position embeddings, a stack of ``Block`` layers, a final LayerNorm and a tied output head."""

    def __init__(self, config:GPTConfig):
        super().__init__()
        self.config = config

        token_table = nn.Embedding(config.vocab_size, config.n_embd)
        position_table = nn.Embedding(config.block_size, config.n_embd)
        blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.transformer = nn.ModuleDict({
            'wte': token_table,
            'wpe': position_table,
            'h': blocks,
            'ln_final': nn.LayerNorm(config.n_embd),
        })

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying: the token embedding and the output projection share one parameter.
        self.transformer.wte.weight = self.lm_head.weight

        # nn.Module.apply visits every submodule and hands it to _init_weights.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one submodule in place; only ``nn.Linear`` and ``nn.Embedding`` are touched."""
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=0.02)
            return

        if not isinstance(module, nn.Linear):
            # Everything else (e.g. LayerNorm) keeps the PyTorch default initialisation.
            return

        if hasattr(module, 'NANOGPT_SCALE_INIT'):
            # Flagged projections get their std scaled down by 1/sqrt(2 * n_layer).
            std = 0.02 * (self.config.n_layer * 2) ** -0.5
        else:
            std = 0.02
        torch.nn.init.normal_(module.weight, mean=0, std=std)

        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for token ids ``idx`` of size (B, T).

        ``loss`` is the cross-entropy of the logits against ``targets`` when
        ``targets`` is given, otherwise ``None``.
        """
        B, T = idx.size()

        assert T <= self.config.block_size

        positions = torch.arange(0, T, dtype=torch.long, device=idx.device)
        # Position embeddings are (T, n_embd) and broadcast over the batch dimension.
        x = self.transformer.wpe(positions) + self.transformer.wte(idx)

        for layer in self.transformer.h:
            x = layer(x)

        logits = self.lm_head(self.transformer.ln_final(x))

        if targets is None:
            return logits, None

        # Flatten batch and time so every position is one classification example.
        flat_logits = logits.view(-1, logits.size(-1))
        loss = F.cross_entropy(flat_logits, targets.view(-1))
        return logits, loss

In [18]:
# Dataloader
class DataLoaderLite:
    """Sequential batch reader over the GPT-2 tokenisation of ``input.txt``.

    The whole file is tokenised once in ``__init__``; ``next_batch`` then walks
    through the token stream in strides of ``B * T`` and wraps to the start
    when a full batch no longer fits.
    """

    def __init__(self, B, T):
        """Load and tokenise ``input.txt``.

        Args:
            B: number of sequences per batch (must be positive).
            T: number of tokens per sequence (must be positive).

        Raises:
            ValueError: if ``B`` or ``T`` is not positive, or if the file holds
                fewer than ``B * T + 1`` tokens (not enough for one batch).
        """
        # Fail early with a clear message instead of a ZeroDivisionError in the
        # epoch print below or a shape error in next_batch().
        if B <= 0 or T <= 0:
            raise ValueError(f"B and T must be positive, got B={B}, T={T}")

        self.B = B
        self.T = T

        # Explicit encoding so decoding does not depend on the platform default
        # (assumes input.txt is UTF-8 text).
        with open('input.txt', 'r', encoding='utf-8') as f:
            text = f.read()

        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)

        # One batch consumes B*T inputs plus one extra token for the shifted targets.
        if len(self.tokens) < B * T + 1:
            raise ValueError(
                f"input.txt has {len(self.tokens)} tokens, need at least {B * T + 1} for B={B}, T={T}"
            )

        print(f"1 Epoch = {len(self.tokens) // (B * T)} Batches") # In one epoch, we're going to see these many batches, and then start again

        self.current_position = 0

    def next_batch(self):
        """Return the next ``(x, y)`` pair, each of size (B, T); ``y`` is ``x`` shifted by one token."""
        B, T = self.B, self.T

        # B*T + 1 tokens: the extra one supplies the target for the last input position.
        buf = self.tokens[self.current_position: self.current_position + (B*T+1)]
        x = (buf[:-1]).view(B, T)
        y = (buf[1:]).view(B, T)

        self.current_position += B * T

        # Start again if the next full batch would run past the end of the dataset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0

        return x, y

In [21]:
# training loop
# Seed the CPU generator (and the CUDA one when present) so runs are repeatable.
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

model = GPT(GPTConfig())
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
train_loader = DataLoaderLite(B=4, T=32)

# Run a fixed number of optimization steps (one batch per step, not full epochs)
for step in range(50):
    x, y = (t.to(device) for t in train_loader.next_batch())
    optimizer.zero_grad()  # clear gradients left over from the previous step
    logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
print(f"After 50 steps, Loss is: {loss.item()}")

1 Epoch = 2640 Batches
After 50 steps, Loss is: 6.799213886260986


When optimizing, you should always start with:

1. What hardware do you have?
2. What does it offer?
3. Are you fully utilizing it?

In PyTorch, by default, all tensors are of dtype float32. That is, every element of every tensor occupies 32 bits of memory, including parameters, activations, etc. Empirically, a 32-bit float representation is more than deep learning needs: you can lower the precision and still get good enough models. This can speed things up quite a bit, because there is much less memory to move around, and memory bandwidth is the bottleneck for GPU workloads. That is, most of the time the tensor cores just sit idle while data is being moved around to feed them. If you're getting 60% hardware utilization, you're doing quite well.

For training, you still want floats (i.e. not integers), but precision can be lower. However, during inference time, you can use integer precision and still get decent results.

Inspect your hardware and GPU. How many TFLOPS does it offer theoretically? 1 TFLOPS = 1 trillion floating-point operations per second. If you lower the precision, the available FLOPS increase quite a bit.

### Tensor Cores

What are tensor cores? A tensor core is essentially a simple GPU instruction that performs a $4 \times 4$ matrix multiplication. That is, when you pass a big matrix multiplication to the GPU, it is broken down into these $4 \times 4$ units, and the small matmuls are done in parallel. And deep learning is mostly matrix multiplication!

For reference, look at the white paper on the GPU architecture that you are using.