# Implementing a GPT model from scratch
## Coding a LLM architecture

In [10]:
GPT_CONFIG_124M = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_layers': 12,
    'n_heads': 12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

In [3]:
import torch
import torch.nn as nn
from urllib3.poolmanager import key_fn_by_scheme

In [7]:
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        # placeholder for the transformer block
        self.trf_block = nn.Sequential(
            *[DummyTranformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )
        # use a placeholder for layer norm
        self.final_norm =  DummyLayerNorm(cfg['emb_dim'])
        self.out_head  = nn.Linear(
            cfg['emb_dim'] , cfg['vocab_size'], bias=False
        )

    def forward(self,x):
        batch_size, seq_len = x.shape
        tok_embeds = self.tok_emb(x)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=x.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_block(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

class DummyTranformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # a simple place holder

    def forward(self,x):
        return x

class DummyLayerNorm(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # a simple place holder

    def forward(self,x):
        return x

In [9]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

batch = []
txt1 = 'Every effort moves your'
txt2 = 'Every day holds a'

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch,dim=0)
print(batch)


tensor([[6109, 3626, 6100,  534],
        [6109, 1110, 6622,  257]])


In [13]:
da

torch.Size([2, 4])

In [12]:
torch.manual_seed(123)

model = DummyGPTModel(cfg=GPT_CONFIG_124M)
logits = model(batch)
print(f"Output shape: {logits.shape}")
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [-0.2218,  0.2200, -0.5722,  ...,  3.4660,  0.7727,  0.5104]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


# 4.2 Normalizing activations with layer normalization

Layer normalization is a techique that normalizes the activities of a neuron across a batch of inputs.

the layer will have mean 0 and variance 1.


In [14]:
torch.manual_seed(123)

batch_example = torch.rand(2, 5)
print(batch_example)

tensor([[0.2961, 0.5166, 0.2517, 0.6886, 0.0740],
        [0.8665, 0.1366, 0.1025, 0.1841, 0.7264]])


In [16]:
layer = nn.Sequential(nn.Linear(5,6), nn.ReLU())
out = layer(batch_example)
print(out)

tensor([[0.4993, 0.0000, 0.0000, 0.0094, 0.2147, 0.0000],
        [0.3390, 0.0000, 0.0000, 0.5273, 0.3894, 0.2145]],
       grad_fn=<ReluBackward0>)


In [23]:
mean = out.mean(dim=-1, keepdim=True)
print(mean)

tensor([[0.1206],
        [0.2450]], grad_fn=<MeanBackward1>)


In [24]:
var = out.var(dim=-1, keepdim=True)
print(var)

tensor([[0.0417],
        [0.0461]], grad_fn=<VarBackward0>)


In [26]:
torch.set_printoptions(sci_mode=False)

In [31]:
normed = (out - mean)/torch.sqrt(var)
print(normed.mean(dim=-1, keepdim=True))
# print(normed.var(dim=-1, keepdim=True))


tensor([[     0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)


In [45]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)   # unbiased true is population stats and False is sample stats
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift





In [46]:
3*5+5

20

In [47]:
ln = LayerNorm(6)
outputs_normed = ln(out)
print(outputs_normed)

tensor([[ 2.0325, -0.6471, -0.6471, -0.5965,  0.5052, -0.6471],
        [ 0.4795, -1.2504, -1.2504,  1.4404,  0.7368, -0.1559]],
       grad_fn=<AddBackward0>)


In [48]:
# print(outputs_normed.mean(dim=-1, keepdim=True))
print(outputs_normed.var(dim=-1, keepdim=True))

tensor([[1.1997],
        [1.1997]], grad_fn=<VarBackward0>)
