### Rebuild GPT-2
- "Large" refers to the very large number of parameters: 124M for the GPT-2 configuration used here.

In [9]:
# Hyperparameters for the 124M-parameter GPT-2 configuration used by the cells below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # rows of the token embedding table / width of the output logits
    "context_length": 1024,  # rows of the positional embedding table
    "emb_dim": 768,          # size of each embedding vector
    "n_heads": 12,           # number of attention heads
    "n_layers": 12,          # number of stacked blocks
    "drop_rate": 0.1,        # dropout probability
    "qkv_bias": False,       # query-key-value bias flag
}

In [16]:
import torch
import torch.nn as nn

class DummyGPT2(nn.Module):
    """Skeleton of the GPT-2 model built from placeholder blocks and a placeholder norm.

    Data flow: token embedding + positional embedding -> dropout ->
    ``n_layers`` blocks -> final norm -> linear projection to vocabulary logits.

    Args:
        cfg: Mapping that provides ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        # Lookup tables for token identities and for absolute positions.
        self.token_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop = nn.Dropout(cfg["drop_rate"])

        stacked = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.blocks = nn.Sequential(*stacked)

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, token_ids):
        """Map a 2-D tensor of token ids to logits over the vocabulary.

        Args:
            token_ids: Integer tensor with exactly two dimensions
                (batch, sequence length).

        Returns:
            Tensor of logits with one ``vocab_size``-wide vector per input token.
        """
        # The two-value unpack doubles as a check that the input is 2-D.
        _, seq_len = token_ids.shape
        positions = torch.arange(seq_len, device=token_ids.device)

        # Positional embeddings broadcast over the batch dimension.
        hidden = self.token_emb(token_ids) + self.pos_emb(positions)
        hidden = self.drop(hidden)
        hidden = self.blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that registers no layers and passes input through."""

    def __init__(self, cfg):
        # ``cfg`` is accepted but not read; nothing is built from it.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged (the very same object)."""
        return x

class DummyLayerNorm(nn.Module):
    """Placeholder for layer normalisation that performs no normalisation at all."""

    def __init__(self, cfg):
        # The argument is never read. NOTE: despite its name, DummyGPT2 passes
        # the embedding dimension (an int) here rather than the config dict.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged (the very same object)."""
        return x

In [26]:
import torch
import tiktoken

# Encode two prompts with the GPT-2 tokenizer and stack them into one batch tensor.
tokenizer = tiktoken.get_encoding("gpt2")

texts = [
    "Once upon a time there",
    "were four little Rabbits",
]
# torch.stack requires rows of equal length; both prompts encode to 5 tokens.
batch = torch.stack([torch.tensor(tokenizer.encode(text)) for text in texts], dim=0)
print(batch)


tensor([[ 7454,  2402,   257,   640,   612],
        [22474,  1440,  1310, 22502,   896]])


In [27]:
# Seed before construction so the randomly initialised weights (and the
# dropout mask applied in the forward pass) are repeatable.
torch.manual_seed(123)
model = DummyGPT2(cfg=GPT_CONFIG_124M)

# One vocabulary-sized logit vector per input token.
logits = model(batch)
print("Output shape:", logits.size())
print(logits)

Output shape: torch.Size([2, 5, 50257])
tensor([[[-0.1453, -0.5939,  0.3767,  ...,  0.4361,  0.3913,  1.1740],
         [ 0.2646,  0.5527, -1.0897,  ...,  0.3165,  0.7068,  1.9168],
         [-0.2009, -0.7217,  0.7162,  ...,  0.6297,  0.6221, -0.1177],
         [ 0.1959,  0.4116,  1.1859,  ...,  2.2309,  0.2540,  0.7609],
         [-0.4772, -0.7713,  0.6711,  ...,  0.9593, -1.1426, -1.0256]],

        [[-0.7387,  0.2473, -2.2699,  ..., -0.9243, -1.1297,  0.1037],
         [-0.5791,  1.0997, -0.4741,  ..., -0.7711,  0.9321,  1.0572],
         [ 0.7911,  1.0512,  0.4935,  ...,  0.8441, -0.2399, -0.5090],
         [ 1.1721,  0.9144, -0.7984,  ...,  1.6035,  0.5685,  1.0169],
         [-1.0692, -1.7418,  0.1271,  ...,  0.1854, -0.5162, -0.7783]]],
       grad_fn=<UnsafeViewBackward0>)


### Layer Norm

In [118]:
# Seed, then draw a toy batch: 2 samples with 5 features each.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)

# Linear(5 -> 6) followed by ReLU; no normalisation yet.
layer = nn.Sequential(
    nn.Linear(5, 6),
    nn.ReLU(),
)
out = layer(batch_example)
print(out)

# Per-sample statistics across the feature axis (biased variance: divide by N).
mean = torch.mean(out, dim=-1, keepdim=True)
var = torch.var(out, dim=-1, unbiased=False, keepdim=True)
print("mean:\n", mean)
print("var:\n", var)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
var:
 tensor([[0.0192],
        [0.0332]], grad_fn=<VarBackward0>)


In [128]:
# Same toy batch as before: seed, then 2 samples with 5 features each.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)

# This time LayerNorm over the 6 output features is appended to the stack.
layer = nn.Sequential(
    nn.Linear(5, 6),
    nn.ReLU(),
    nn.LayerNorm(6),
)
out = layer(batch_example)
print(out)

# After normalisation each row should have mean ~0 and (biased) variance ~1.
mean = torch.mean(out, dim=-1, keepdim=True)
var = torch.var(out, dim=-1, unbiased=False, keepdim=True)
print("mean:\n", mean)
print("var:\n", var)

tensor([[ 0.6745,  1.5470, -0.9549,  0.6431, -0.9549, -0.9549],
        [-0.0207,  0.1228, -1.1913,  1.6619,  0.6186, -1.1913]],
       grad_fn=<NativeLayerNormBackward0>)
mean:
 tensor([[-4.9671e-08],
        [-1.9868e-08]], grad_fn=<MeanBackward1>)
var:
 tensor([[0.9995],
        [0.9997]], grad_fn=<VarBackward0>)


### GELU: a smoother alternative to ReLU

In [130]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand ``emb_dim`` fourfold, apply GELU, project back.

    Args:
        cfg: Mapping that provides ``emb_dim``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        self.layers = nn.Sequential(
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
        )

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension must equal ``emb_dim``."""
        return self.layers(x)

In [142]:
# Print the module tree of a feed-forward block built from the 124M config.
print(
    "model structure: \n",
    FeedForward(GPT_CONFIG_124M),
)

model structure: 
 FeedForward(
  (layers): Sequential(
    (0): Linear(in_features=768, out_features=3072, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=3072, out_features=768, bias=True)
  )
)


### Shortcut connections


In [149]:
import torch
import torch.nn as nn

class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of Linear+GELU layers with optional shortcut (residual) connections.

    Args:
        layer_sizes: Sequence of widths; each adjacent pair defines one
            ``Linear(in, out)`` followed by ``GELU``.
        use_shortcut: When true, a layer's input is added to its output
            whenever both have the same last-dimension size.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut

        # Pair every width with its successor: (s0, s1), (s1, s2), ...
        stages = []
        for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]):
            stages.append(nn.Sequential(nn.Linear(in_dim, out_dim), nn.GELU()))
        self.layers = nn.ModuleList(stages)

    def forward(self, x):
        """Run ``x`` through every layer, adding shortcuts where enabled and shape-compatible."""
        for layer in self.layers:
            layer_out = layer(x)
            # A shortcut is only possible when the layer keeps the feature width.
            add_shortcut = self.use_shortcut and x.shape[-1] == layer_out.shape[-1]
            x = x + layer_out if add_shortcut else layer_out
        return x

def print_gradients(model, x):
    """Run one forward/backward pass and print each weight's mean absolute gradient.

    The loss is the MSE between the model output and an all-zero target of the
    same shape. Only parameters whose name contains ``'weight'`` and that
    received a gradient are reported.

    Args:
        model: Module to inspect. Its parameters' ``.grad`` fields are reset
            and then populated by this call.
        x: Input tensor passed to ``model``.
    """
    # backward() accumulates into .grad, so clear stale gradients first;
    # otherwise a repeated call on the same model would report summed values.
    model.zero_grad()

    output = model(x)
    target = torch.zeros_like(output)
    loss = nn.MSELoss()(output, target)
    loss.backward()

    for name, param in model.named_parameters():
        if param.grad is not None and 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [210]:
# Five 3 -> 3 layers followed by a final 3 -> 1 layer.
layer_sizes = [3] * 5 + [1]

# Draw the input from its own seeded generator so it no longer depends on
# whatever state the global RNG was left in by earlier cells. Using a separate
# generator also leaves the global RNG untouched, so the seeded model
# initialisation below is unaffected.
x = torch.randn(1, 3, generator=torch.Generator().manual_seed(123))

torch.manual_seed(123)
model = ExampleDeepNeuralNetwork(
    layer_sizes, use_shortcut=False
)
print_gradients(model, x)

layers.0.0.weight has gradient mean of 0.00026235237601213157
layers.1.0.weight has gradient mean of 8.274786523543298e-05
layers.2.0.weight has gradient mean of 0.0007381783216260374
layers.3.0.weight has gradient mean of 0.0012937318533658981
layers.4.0.weight has gradient mean of 0.004788137506693602


In [213]:
# Five 3 -> 3 layers followed by a final 3 -> 1 layer.
layer_sizes = [3] * 5 + [1]

# Draw the input from its own seeded generator so it no longer depends on
# whatever state the global RNG was left in by earlier cells. Using a separate
# generator also leaves the global RNG untouched, so the seeded model
# initialisation below is unaffected.
x = torch.randn(1, 3, generator=torch.Generator().manual_seed(123))

torch.manual_seed(123)
model = ExampleDeepNeuralNetwork(
    layer_sizes, use_shortcut=True
)
print_gradients(model, x)

layers.0.0.weight has gradient mean of 0.16460204124450684
layers.1.0.weight has gradient mean of 0.17733019590377808
layers.2.0.weight has gradient mean of 0.1964351385831833
layers.3.0.weight has gradient mean of 0.10792218148708344
layers.4.0.weight has gradient mean of 0.6378514766693115


### Transformer