In [3]:
import torch
import torch.nn as nn # All neural network components, nn.Linear, nn.Conv2d, BatchNorm, Loss functions, Activation functions

In [4]:
# Hyperparameters for the GPT model assembled in this notebook, gathered in one dict.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token-embedding table and width of the output head
    context_length=1024,   # max number of input tokens the model can handle via the positional embeddings
    embedding_size=768,    # width of each token / positional embedding vector
    n_heads=12,            # not read by the placeholder model; presumably attention heads per block
    n_layers=12,           # how many transformer blocks get stacked
    drop_rate=0.1,         # probability handed to nn.Dropout
    qkv_bias=False,        # not read by the placeholder model; presumably a bias flag for Q/K/V projections
)

## 1. A placeholder GPT model architecture class

In [5]:
class DummyGPTModel(nn.Module):
    """Skeleton GPT model wiring embeddings, placeholder blocks and an output head.

    The transformer blocks and the final norm are the Dummy* placeholders, so the
    network is effectively: token + positional embedding -> dropout -> linear head.

    Args:
        cfg: mapping that provides "vocab_size", "embedding_size",
            "context_length", "drop_rate" and "n_layers".
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["embedding_size"]

        # Sub-modules are created in this fixed order so that seeded weight
        # initialisation yields the same parameters on every run.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        blocks = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.transformer_blocks = nn.Sequential(*blocks)
        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, input_idx):
        """Map a 2-D tensor of token ids to logits with a trailing vocab_size dimension."""
        # Unpacking into two names also rejects inputs that are not 2-D.
        _, seq_len = input_idx.shape

        token_part = self.tok_emb(input_idx)
        # Position indices 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=input_idx.device)
        # The positional embeddings broadcast over the leading (batch) dimension.
        hidden = token_part + self.pos_emb(positions)

        hidden = self.drop_emb(hidden)
        hidden = self.transformer_blocks(hidden)
        hidden = self.final_norm(hidden)
        # Raw, unnormalised scores; no softmax is applied here.
        return self.out_head(hidden)

class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that hands its input back untouched."""

    def __init__(self, cfg):
        # cfg is accepted so the constructor signature is in place, but it is not read.
        super().__init__()

    def forward(self, x):
        """Identity: return the very same tensor object that was passed in."""
        return x
    
class DummyLayerNorm(nn.Module):
    """Placeholder layer norm: takes the usual constructor arguments but normalises nothing."""

    def __init__(self, normalized_shape, eps=1e-5):
        # Both arguments are accepted for signature purposes only and are not stored.
        super().__init__()

    def forward(self, x):
        """Identity: return the very same tensor object that was passed in."""
        return x

## 2. High-level overview of data flow in a GPT model

In [6]:
import tiktoken

# Two sample prompts that are tokenised and batched below.
tx1 = "Every effort moves you"
tx2 = "Every day holds a"

# GPT-2 encoding from tiktoken turns text into a list of integer token ids.
tokenizer = tiktoken.get_encoding("gpt2")
batch = []

# Show the token ids produced for the first prompt.
print(tokenizer.encode(tx1))

[6109, 3626, 6100, 345]


In [7]:
# Encode both prompts and stack the id tensors along a new leading (batch) dimension.
# The batch is built in a single expression so this cell can be re-run safely:
# appending to a module-level list and then rebinding the same name to a tensor
# makes a second run fail, because a Tensor has no `append`.
# torch.stack requires every prompt to encode to the same number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (tx1, tx2)],
    dim=0,
)
print(batch)
print(batch.shape)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
torch.Size([2, 4])


In [8]:
# Seed before constructing the model so its randomly initialised weights are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)

# Calling the module runs its forward pass on the token-id batch.
logits = model(batch)
print("Output shape:", logits.size())
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


## 3. Layer Normalization

In [9]:
# Example: push a small random batch through Linear -> ReLU to get activations to normalise.
torch.manual_seed(123)  # seed first so both the sample and the layer weights are reproducible
batch_sample = torch.randn((2, 5))
print(batch_sample)

# A linear layer (5 -> 6 features) followed by a ReLU activation function.
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=6),
    nn.ReLU(),
)
out = layer(batch_sample)
print(out)

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [10]:
# Per-row statistics over the last (feature) dimension; keepdim=True keeps a
# trailing size-1 dimension so the results broadcast against `out`.
mean = torch.mean(out, dim=-1, keepdim=True)
var = torch.var(out, dim=-1, keepdim=True)  # default estimator, i.e. no unbiased=False here
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [11]:
# Print tensors in plain decimal notation instead of scientific notation.
torch.set_printoptions(sci_mode=False)

# Normalise each row: subtract its mean, then divide by the square root of its variance.
out_norm = (out - mean) / var.sqrt()

# Re-compute the row statistics of the normalised activations to check the result.
normalized_mean = torch.mean(out_norm, dim=-1, keepdim=True)
# NOTE(review): `nomarlized_var` is misspelled ("normalized"); the name is kept
# because cells outside this view may reference it.
nomarlized_var = torch.var(out_norm, dim=-1, keepdim=True)
print("Mean:\n", normalized_mean)
print("Variance:\n", nomarlized_var)

Mean:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [19]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: size of the last dimension of the inputs; also the length of
            the `scale` and `shift` parameter vectors.
        eps: small constant added to the variance before the square root so the
            division cannot hit zero. Defaults to 1e-5.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))   # trainable scale, initialised to 1
        self.shift = nn.Parameter(torch.zeros(emb_dim))  # trainable shift, initialised to 0

    def forward(self, x):
        """Normalise `x` along its last dimension, then apply scale and shift.

        Args:
            x: tensor whose last dimension has size `emb_dim`.

        Returns:
            Tensor of the same shape as `x`.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N-1 when computing the variance.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return norm_x * self.scale + self.shift

In [13]:
# Display what freshly created parameters look like: a ones vector and a zeros
# vector, both reported with requires_grad=True.
(
    nn.Parameter(torch.ones(5)),
    nn.Parameter(torch.zeros(5)),
)

(Parameter containing:
 tensor([1., 1., 1., 1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0.], requires_grad=True))

In [25]:
# Apply the LayerNorm module to the sample batch (calling the instance runs forward).
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_sample)
print(out_ln)

# Check the result: per-row mean and variance (divide-by-N) over the last dimension.
mean = torch.mean(out_ln, dim=-1, keepdim=True)
var = torch.var(out_ln, dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)
Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## 4. Batch Normalization

In [None]:
import torch.nn.functional as F

class BatchNorm1dCustom(nn.Module):
    """Hand-written batch normalisation over dim 0 of its input.

    In training mode the current batch's mean and variance normalise the input
    and are folded into exponential running averages. In eval mode the stored
    running averages are used instead, so the output does not depend on the
    batch size and a single sample can be passed through.

    Args:
        num_features: length of the per-feature parameter and statistic vectors.
        epsilon: constant added to the variance before the square root.
        momentum: weight given to the current batch when updating the running
            statistics; the previous running value gets (1 - momentum).
    """

    def __init__(self, num_features, epsilon=1e-5, momentum=0.1):
        super().__init__()

        self.num_features = num_features
        self.epsilon = epsilon
        self.momentum = momentum

        # Learnable parameters (gamma and beta).
        self.gamma = nn.Parameter(torch.ones(num_features))  # scale factor
        self.beta = nn.Parameter(torch.zeros(num_features))  # shift factor

        # Running statistics are registered as buffers (not plain attributes) so
        # they are saved in state_dict and follow the module across .to()/.cuda().
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features))

    def forward(self, x):
        """Normalise `x` along dim 0, then scale by gamma and shift by beta.

        Args:
            x: tensor whose dim 0 indexes the samples of the batch and whose
                remaining shape broadcasts against `num_features`.

        Returns:
            Tensor of the same shape as `x`.
        """
        if self.training:
            # Statistics of the current batch (variance divides by N).
            mean = x.mean(dim=0)
            var = x.var(dim=0, unbiased=False)

            # Update the running averages without recording the operations for
            # autograd; otherwise every batch's graph would stay referenced by
            # the running statistics and memory would grow during training.
            with torch.no_grad():
                self.running_mean.copy_(
                    self.momentum * mean + (1 - self.momentum) * self.running_mean
                )
                self.running_var.copy_(
                    self.momentum * var + (1 - self.momentum) * self.running_var
                )
        else:
            # Inference: rely on the statistics accumulated during training and
            # leave them untouched.
            mean = self.running_mean
            var = self.running_var

        x_normalized = (x - mean) / torch.sqrt(var + self.epsilon)

        # Scale and shift.
        return self.gamma * x_normalized + self.beta
    
class SimpleNet(nn.Module):
    """Two-layer network (10 -> 50 -> 10) with custom batch norm after the first layer."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 50)
        self.bn1 = BatchNorm1dCustom(50)  # normalises the 50 outputs of fc1
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Run fc1 -> batch norm -> ReLU -> fc2 and return the result."""
        hidden = self.fc1(x)
        hidden = self.bn1(hidden)
        hidden = F.relu(hidden)
        return self.fc2(hidden)

# Example usage: a random batch of 32 samples with 10 features each.
# NOTE(review): this rebinds `model`, `x` and `output`, which other cells also use.
x = torch.randn((32, 10))
model = SimpleNet()
output = model(x)
print(output.size())

torch.Size([32, 10])


## 5. GELU

In [41]:
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation formula."""

    def forward(self, x):
        """Return 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), element-wise."""
        # sqrt(2/pi) as a 0-dim tensor, rebuilt on every call.
        coeff = torch.sqrt(torch.tensor(2 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic)
        return 0.5 * x * gate

In [42]:
# Gradient of ReLU exactly at zero: autograd reports 0 there.
x = torch.zeros(1, requires_grad=True)
output = x.relu()
print(output)

# `output` holds a single element, so backward() needs no explicit gradient argument.
output.backward()

print(x.grad) # Output: tensor([0.])

tensor([0.], grad_fn=<ReluBackward0>)
tensor([0.])


In [43]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding size, apply GELU, project back.

    Args:
        cfg: mapping that provides "embedding_size".
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["embedding_size"]
        hidden_dim = 4 * emb_dim

        # Built in the order expand -> activation -> project, which is also
        # their index order inside the Sequential container.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Apply the three layers in order; the last dimension of `x` keeps its size."""
        return self.layers(x)

In [44]:
# Shape check: the feed-forward module should return the same shape it receives.
ffn = FeedForward(GPT_CONFIG_124M)
x = torch.rand((2, 3, 768))
out = ffn(x)
print(out.size())

torch.Size([2, 3, 768])
