# Convert GPT model to LLAMA2.
## Reason: LLaMA 2 includes several modern optimizations that make it more efficient and accurate on large-scale language tasks.

## Step 1: Replace LayerNorm with an RMSNorm layer

In [4]:
!pip install nbformat
!pip install blobfile
!pip install huggingface_hub
!pip install matplotlib
!pip install tiktoken
!pip install torch
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp310-cp310-win_amd64.whl.metadata (10 kB)
Downloading sentencepiece-0.2.1-cp310-cp310-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 1.1/1.1 MB 7.2 MB/s  0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.1


In [9]:
# Llama 2 component: RMSNorm

# RMSNorm replaces GPT's LayerNorm. It has no mean subtraction and no bias term, which improves computational efficiency.

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square normalization with a learnable per-feature scale.

    Each vector along the last dimension is multiplied by
    ``1 / sqrt(mean(x**2) + eps)`` and then element-wise by ``weight``.
    Unlike LayerNorm there is no mean subtraction and no bias term.

    Args:
        emb_dim: Size of the last (feature) dimension of the input.
        eps: Small constant added to the mean square before the reciprocal
            square root, so an all-zero vector does not divide by zero.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()  # run nn.Module's own initialization
        self.eps = eps
        self.emb_dim = emb_dim
        # Create the float32 tensor first and wrap it afterwards. Calling
        # `.float()` on the Parameter itself only keeps it a Parameter when no
        # cast is needed; with a non-float32 default dtype the cast returns a
        # plain tensor that nn.Module does not register as a parameter.
        self.weight = nn.Parameter(torch.ones(emb_dim, dtype=torch.float32))

    def forward(self, x):
        """Normalize ``x`` over its last dimension; the result has ``x``'s dtype."""
        # Mean of the squared values across the last dimension.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        # Scale every vector by the reciprocal of its root mean square.
        x_normed = x * torch.rsqrt(mean_sq + self.eps)
        # Apply the learnable scale, then cast back to the input dtype.
        return (x_normed * self.weight).to(dtype=x.dtype)

# --- Sanity check: compare the custom RMSNorm with PyTorch's built-in one ---
torch.manual_seed(123)  # fixed seed so the example batch is reproducible

example_batch = torch.randn(2, 3, 4)

# Both layers normalize over the last dimension of the example batch.
rms_norm = RMSNorm(emb_dim=example_batch.shape[-1])
rmsnorm_pytorch = torch.nn.RMSNorm(example_batch.shape[-1], eps=1e-5)

# Validation step: produces no output when the two implementations agree.
assert torch.allclose(rms_norm(example_batch), rmsnorm_pytorch(example_batch))

## Step 2: Replace GELU with SiLU activation
### Note: Llama uses the SiLU (Sigmoid Linear Unit) activation, whereas GPT uses GELU (Gaussian Error Linear Unit). Both are smooth alternatives to ReLU; SiLU is often cheaper to compute and has become the preferred choice.

In [10]:
class SiLU(nn.Module):
    """Sigmoid Linear Unit activation: ``x * sigmoid(x)``, applied element-wise."""

    def __init__(self):
        # Hook the custom class into PyTorch's module machinery.
        super().__init__()

    def forward(self, x):
        """Return ``x`` gated by its own sigmoid."""
        gate = torch.sigmoid(x)  # squashes every element into the 0..1 range
        return x * gate

# Validation: the custom module must agree with PyTorch's built-in functional SiLU.
# Relies on `example_batch` defined in an earlier cell; passes silently on success.
silu = SiLU()

assert torch.allclose(silu(example_batch), torch.nn.functional.silu(example_batch))

## Update the feed-forward network (FFN)
### Note: Llama uses a more sophisticated gated structure (SwiGLU) instead of the standard two-layer MLP found in GPT.

In [12]:
class FeedForward(nn.Module):
    """SwiGLU feed-forward block: ``fc3(silu(fc1(x)) * fc2(x))``.

    Args:
        cfg: Configuration dict providing "emb_dim", "hidden_dim" and "dtype".
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim, hidden_dim, dtype = cfg["emb_dim"], cfg["hidden_dim"], cfg["dtype"]
        # Three bias-free linear layers instead of GPT's two:
        # fc1 expands the input to the hidden dimension,
        self.fc1 = nn.Linear(emb_dim, hidden_dim, dtype=dtype, bias=False)
        # fc2 is a parallel projection whose output multiplies the activated fc1 branch,
        self.fc2 = nn.Linear(emb_dim, hidden_dim, dtype=dtype, bias=False)
        # fc3 projects the combined result back down to emb_dim.
        self.fc3 = nn.Linear(hidden_dim, emb_dim, dtype=dtype, bias=False)
        self.silu = SiLU()

    def forward(self, x):
        """Apply the gated feed-forward transformation to ``x``."""
        gated = self.silu(self.fc1(x)) * self.fc2(x)
        return self.fc3(gated)

# Validate
# 1. Set up a small dummy configuration in the same style as the real configs
test_cfg = {
    "emb_dim": 4,
    "hidden_dim": 8,
    "dtype": torch.float32
}

# 2. Initialize the FeedForward layer
model_ffn = FeedForward(test_cfg)

# 3. Create dummy input data (batch size=2, seq length=3, emb dim=4)
test_input = torch.randn(2, 3, 4)

# 4. Pass the input through the layer
output = model_ffn(test_input)

# 5. Validation assertions
# The output shape must match the input shape
assert output.shape == test_input.shape
print(f"Shape validation passed: {output.shape}")

# Independent verification of the SwiGLU logic:
# output = fc3( silu(fc1(x)) * fc2(x) )
# PyTorch's functional SiLU is used as the reference here instead of the
# module's own `silu`, so the check does not merely compare the code with itself.
with torch.no_grad():
    x_fc1 = model_ffn.fc1(test_input)
    x_fc2 = model_ffn.fc2(test_input)
    manual_swiglu = torch.nn.functional.silu(x_fc1) * x_fc2
    expected_output = model_ffn.fc3(manual_swiglu)

# Small absolute tolerance: the two SiLU implementations may differ by rounding.
assert torch.allclose(output, expected_output, atol=1e-6)
print("Mathematical logic validation passed!")

Shape validation passed: torch.Size([2, 3, 4])
Mathematical logic validation passed!


## Implement RoPE (rotary positional embedding): a core architectural difference between GPT and Llama 2
### Note: GPT uses absolute positional embeddings. Using RoPE instead gives us:
### 1. Relative position awareness
### 2. Length extrapolation (positions are computed from a formula rather than looked up in a fixed-size learned table)
### 3. Mathematical efficiency

In [13]:
def precompute_rope_params(head_dim, theta_base=10_000, context_length=4096):
    """Precompute the cosine and sine tables used by RoPE.

    Args:
        head_dim: Dimension of one attention head; must be even because RoPE
            rotates values in pairs.
        theta_base: Base of the geometric progression of rotation frequencies.
        context_length: Number of positions (0 .. context_length - 1) to tabulate.

    Returns:
        Tuple ``(cos, sin)``, each of shape ``(context_length, head_dim)``.
    """
    assert head_dim % 2 == 0, "Embedding dimension must be even"

    half_dim = head_dim // 2

    # One inverse frequency (rotation speed) per pair of dimensions.
    exponents = torch.arange(0, head_dim, 2)[:half_dim].float() / head_dim
    inv_freq = 1.0 / (theta_base ** exponents)

    # Position indices 0 .. context_length - 1.
    positions = torch.arange(context_length)

    # Angle for every (position, frequency) pair: shape (context_length, head_dim // 2).
    angles = positions[:, None] * inv_freq[None, :]

    # Duplicate so the first and second half of a head share the same angles:
    # shape (context_length, head_dim).
    angles = torch.cat([angles, angles], dim=1)

    return torch.cos(angles), torch.sin(angles)

def compute_rope(x, cos, sin):
    """Apply rotary position embeddings (RoPE) to ``x``.

    Args:
        x: Tensor of shape ``(batch_size, num_heads, seq_len, head_dim)``.
        cos: Cosine table of shape ``(context_length, head_dim)``, as returned
            by ``precompute_rope_params``.
        sin: Sine table with the same layout as ``cos``.

    Returns:
        Tensor with the same shape and dtype as ``x`` in which position ``p``
        along the ``seq_len`` axis has been rotated by the angles in row ``p``
        of the tables.

    Raises:
        AssertionError: If ``head_dim`` is odd, or if ``seq_len`` is larger
            than the number of rows in the precomputed tables.
    """
    _, _, seq_len, head_dim = x.shape
    assert head_dim % 2 == 0, "Head dimension must be even"
    # Slicing a too-short table would not fail by itself; a one-row table would
    # even broadcast silently and give every position the same angle.
    table_len = min(cos.shape[0], sin.shape[0])
    assert seq_len <= table_len, (
        f"Sequence length {seq_len} exceeds the precomputed RoPE table length {table_len}"
    )

    # Split x into first half and second half of the head dimension.
    x1 = x[..., : head_dim // 2]
    x2 = x[..., head_dim // 2 :]

    # Keep only the rows needed and add broadcast dims: (1, 1, seq_len, head_dim).
    cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0)
    sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)

    # Apply the rotary transformation.
    rotated = torch.cat((-x2, x1), dim=-1)
    x_rotated = (x * cos) + (rotated * sin)

    # The tables may have a different dtype than x; cast back to x's dtype.
    return x_rotated.to(dtype=x.dtype)

In [14]:
# --- Validation / test of the RoPE helpers ---
# Input
batch_size = 2
context_len = 5
num_heads = 4
head_dim = 16

# Instantiate RoPE parameters
cos, sin = precompute_rope_params(head_dim=head_dim, context_length=context_len)

# Dummy query and key tensors
torch.manual_seed(123)
queries = torch.randn(batch_size, num_heads, context_len, head_dim)
keys = torch.randn(batch_size, num_heads, context_len, head_dim)

# Apply rotary position embeddings
queries_rot = compute_rope(queries, cos, sin)
keys_rot = compute_rope(keys, cos, sin)

# Check that the shapes remain the same
print(f"Original shape: {queries.shape}")
print(f"Rotated shape:  {queries_rot.shape}")

# Note: the vector printed here sits at position 0, whose rotation angle is 0
# (cos = 1, sin = 0), so it is expected to look identical before and after RoPE.
# Whether later positions really change is asserted at the end of this cell.
print(f"\nFirst query vector (before RoPE):\n{queries[0, 0, 0, :4]}")
print(f"First query vector (after RoPE):\n{queries_rot[0, 0, 0, :4]}")

# Rotation shouldn't change the length (magnitude) of a vector
print(f"\nOriginal magnitude: {torch.norm(queries[0, 0, 0]):.4f}")
print(f"Rotated magnitude:  {torch.norm(queries_rot[0, 0, 0]):.4f}")

# Assertions (silent on success)
assert queries_rot.shape == queries.shape and keys_rot.shape == keys.shape
# Position 0 is left untouched ...
assert torch.allclose(queries_rot[:, :, 0], queries[:, :, 0])
assert torch.allclose(keys_rot[:, :, 0], keys[:, :, 0])
# ... while every later position is actually rotated.
assert not torch.allclose(queries_rot[:, :, 1:], queries[:, :, 1:])
assert not torch.allclose(keys_rot[:, :, 1:], keys[:, :, 1:])
# The norm of every vector is preserved.
assert torch.allclose(queries_rot.norm(dim=-1), queries.norm(dim=-1), rtol=1e-4)
assert torch.allclose(keys_rot.norm(dim=-1), keys.norm(dim=-1), rtol=1e-4)

Original shape: torch.Size([2, 4, 5, 16])
Rotated shape:  torch.Size([2, 4, 5, 16])

First query vector (before RoPE):
tensor([ 0.3374, -0.1778, -0.3035, -0.5880])
First query vector (after RoPE):
tensor([ 0.3374, -0.1778, -0.3035, -0.5880])

Original magnitude: 3.1241
Rotated magnitude:  3.1241


## Add RoPE to the MultiHeadAttention module
### In a standard GPT model, positional information is added to the token embeddings once, at the very beginning of the network. In Llama 2, positions are injected inside every attention layer.

In [15]:
#####################################
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with rotary position embeddings (RoPE).

    Compared with the GPT version there is no dropout and no QKV bias; all
    linear layers are bias-free and created with the given ``dtype``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible by
            ``num_heads``.
        context_length: Number of positions covered by the causal mask and the
            precomputed RoPE tables.
        num_heads: Number of attention heads.
        dtype: dtype passed to the linear layers (``None`` keeps the default).
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dtype=None):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out                  # output dimension of the attention layer
        self.num_heads = num_heads          # number of attention heads
        self.head_dim = d_out // num_heads  # size of each individual head

        # Projections of the input into query, key and value vectors.
        self.W_query = nn.Linear(d_in, d_out, bias=False, dtype=dtype)
        self.W_key = nn.Linear(d_in, d_out, bias=False, dtype=dtype)
        self.W_value = nn.Linear(d_in, d_out, bias=False, dtype=dtype)

        # Linear layer that combines the head outputs.
        self.out_proj = nn.Linear(d_out, d_out, bias=False, dtype=dtype)

        # Causal mask: ones above the diagonal mark the future positions that a
        # token must not attend to.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal_mask)

        # Rotation tables for every position up to context_length. Stored as
        # buffers so they move between devices together with the module.
        cos, sin = precompute_rope_params(head_dim=self.head_dim, context_length=context_length)
        self.register_buffer("cos", cos)
        self.register_buffer("sin", sin)

    def forward(self, x):
        """Return the attention output for ``x`` of shape (b, num_tokens, d_in)."""
        b, num_tokens, _ = x.shape

        # Project, then split the last dimension into heads and move the head
        # axis forward:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        #                        -> (b, num_heads, num_tokens, head_dim)
        per_head_shape = (b, num_tokens, self.num_heads, self.head_dim)
        keys = self.W_key(x).view(per_head_shape).transpose(1, 2)
        queries = self.W_query(x).view(per_head_shape).transpose(1, 2)
        values = self.W_value(x).view(per_head_shape).transpose(1, 2)

        # Llama 2 positional logic: rotate keys and queries according to their
        # position, using the precomputed cosine and sine tables.
        keys = compute_rope(keys, self.cos, self.sin)
        queries = compute_rope(queries, self.cos, self.sin)

        # Dot product of every query with every key, per head.
        attn_scores = queries @ keys.transpose(2, 3)

        # Truncate the mask to the current number of tokens and block future
        # positions by setting their scores to -inf.
        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scaled softmax: each row of weights sums to 1.
        scale = keys.shape[-1] ** 0.5
        attn_weights = torch.softmax(attn_scores / scale, dim=-1)

        # Weighted sum of the values, back to (b, num_tokens, num_heads, head_dim).
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Merge the heads again: d_out = num_heads * head_dim.
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)

        # Final output projection.
        return self.out_proj(context_vec)


In [17]:
# Example of using MultiHeadAttention
# Settings
# NOTE: these assignments overwrite the notebook-level names batch_size,
# context_len, num_heads and example_batch set by earlier cells.
batch_size = 1
context_len = 100
max_context_len = 4096
embed_dim = 128
num_heads = 4


# Random input of shape (batch_size, context_len, embed_dim)
example_batch = torch.randn((batch_size, context_len, embed_dim))

# The mask and RoPE tables are built for max_context_len positions, while the
# input below is only context_len tokens long.
mha = MultiHeadAttention(
    d_in=embed_dim,
    d_out=embed_dim,
    context_length=max_context_len,
    num_heads=num_heads
)

# Smoke test: run one forward pass (the result is discarded), then show the layers.
mha(example_batch)
print(mha)
# del mha  # delete to free up memory

MultiHeadAttention(
  (W_query): Linear(in_features=128, out_features=128, bias=False)
  (W_key): Linear(in_features=128, out_features=128, bias=False)
  (W_value): Linear(in_features=128, out_features=128, bias=False)
  (out_proj): Linear(in_features=128, out_features=128, bias=False)
)


## Update the TransformerBlock module
### This step glues together the individual Llama 2 components built so far (RMSNorm, MultiHeadAttention, and the FeedForward network).

In [18]:
class TransformerBlock(nn.Module):
    """One Llama 2 transformer block: pre-norm attention and pre-norm feed-forward.

    Differences from the GPT block: RMSNorm replaces LayerNorm, and there is no
    dropout and no QKV bias.

    Args:
        cfg: Configuration dict providing "emb_dim", "context_length",
            "n_heads" and "dtype", plus whatever ``FeedForward`` reads.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dtype=cfg["dtype"],
        )
        self.ff = FeedForward(cfg)

        # RMSNorm takes the place of GPT's LayerNorm.
        self.norm1 = RMSNorm(emb_dim)
        self.norm2 = RMSNorm(emb_dim)

    def forward(self, x):
        """Apply both residual sub-blocks to ``x`` ([batch_size, num_tokens, emb_size])."""
        # Attention sub-block with shortcut connection.
        x = self.att(self.norm1(x)) + x

        # Feed-forward sub-block with shortcut connection.
        x = self.ff(self.norm2(x)) + x

        return x

## Update the model class
### Note: TransformerBlock is the repeated building block within the main model, so the model class mainly stacks it `n_layers` times.

In [22]:
# Formerly GPTModel; renamed after the conversion to the Llama 2 architecture.
class Llama2Model(nn.Module):
    """Llama 2 style decoder: token embedding, transformer blocks, RMSNorm, output head.

    Unlike the GPT model there is no learned positional embedding and no
    dropout in this class.

    Args:
        cfg: Configuration dict providing "vocab_size", "emb_dim", "n_layers"
            and "dtype", plus whatever ``TransformerBlock`` reads.
    """

    def __init__(self, cfg):
        super().__init__()
        # Token embedding: turns input token IDs (integers) into dense vectors.
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"], dtype=cfg["dtype"])

        # n_layers transformer blocks applied one after another; the output of
        # one block is the input of the next.
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # RMSNorm replaces GPT's final LayerNorm.
        self.final_norm = RMSNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False, dtype=cfg["dtype"])

    def forward(self, in_idx):
        """Map token IDs ``in_idx`` to logits over the vocabulary."""
        hidden = self.tok_emb(in_idx)  # Shape [batch_size, num_tokens, emb_size]
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

# Final step: test run with the Llama 2 7B configuration

In [31]:
# State the configs

# GPT-2 configurations. They are not referenced by the code in this cell.
# NOTE(review): they lack the "hidden_dim" and "dtype" keys that FeedForward
# reads, so they cannot be passed to Llama2Model as-is.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

GPT_CONFIG_1558M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 1600,         # Embedding dimension
    "n_heads": 25,           # Number of attention heads
    "n_layers": 48,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

# Llama 2 7B configuration: no "drop_rate" / "qkv_bias" entries, but two new keys.
LLAMA2_CONFIG_7B = {
    "vocab_size": 32000,     # Vocabulary size
    "context_length": 4096,  # Context length
    "emb_dim": 4096,         # Embedding dimension
    "n_heads": 32,           # Number of attention heads
    "n_layers": 32,          # Number of layers
    "hidden_dim": 11008,     # NEW: Size of the intermediate dimension in FeedForward
    "dtype": torch.bfloat16  # NEW: Lower-precision dtype to reduce memory usage
}

# Initialize the model
# This builds the full-size model; the embedding and linear layers are created
# with the bfloat16 dtype from the config.
model = Llama2Model(LLAMA2_CONFIG_7B)

# Count every element of every registered parameter.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 6,738,415,616


In [32]:
def model_memory_size(model, input_dtype=torch.float32):
    """Estimate the memory required to run ``model``, in gigabytes.

    The estimate counts the elements of all parameters, one gradient element
    for every parameter element that requires a gradient, and all buffer
    elements. Every element is assumed to be stored as ``input_dtype``,
    regardless of the dtype the tensors actually have.

    Args:
        model: Module whose parameters and buffers are counted.
        input_dtype: dtype whose element size is used for the estimate.

    Returns:
        Estimated size in GB, where 1 GB = 1024**3 bytes.
    """
    # Number of elements in all parameters.
    n_params = sum(p.numel() for p in model.parameters())
    # Gradients are only stored for parameters that require them.
    n_grads = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # Buffers are non-parameter tensors that still occupy memory.
    n_buffers = sum(b.numel() for b in model.buffers())

    # Size in bytes = (number of elements) * (size of each element in bytes).
    bytes_per_element = torch.empty((), dtype=input_dtype).element_size()
    total_bytes = (n_params + n_grads + n_buffers) * bytes_per_element

    # Convert bytes to gigabytes.
    return total_bytes / (1024 ** 3)

# Report the estimate for the instantiated model under two assumed element sizes.
print(f"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB")
print(f"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB")

float32 (PyTorch default): 52.33 GB
bfloat16: 26.17 GB
