### Load the tokenizer

In [1]:
from minbpe.minbpe import BasicTokenizer

# Create the tokenizer object, then load the model file saved under ./output/tokenizer.
tokenizer = BasicTokenizer()
tokenizer.load(model_file="./output/tokenizer/my_tokenizer.model")


In [2]:
# Display the loaded tokenizer's vocabulary (the cell output below maps integer ids to bytes).
tokenizer.vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [3]:
def get_vocab_size(tokenizer):
    """Return the combined size of the tokenizer's vocabulary and special tokens.

    Args:
        tokenizer: Object exposing sized ``vocab`` and ``special_tokens``
            attributes.

    Returns:
        ``len(tokenizer.vocab) + len(tokenizer.special_tokens)``.
    """
    return sum(len(table) for table in (tokenizer.vocab, tokenizer.special_tokens))

## Creating the model

## Step 1: Word & Position Embedding

In this step, we convert a text into a list of tokens. Each token has an ID from the vocabulary. The shape of the tensor is 1x6 because we have one sentence with 6 tokens.

Next, we use these tokens to find the corresponding embedding vector for each token. The vocab size is 1024, so each token uses its ID to look up the right vector in the token embedding table. We do the same for positional embeddings, which have 256 rows because the block size is 256. This means the model can only handle sequences with up to 256 tokens.

After getting the token and positional embeddings, we add them together. This results in a tensor of size 1x6x768, where 1 is the number of inputs, 6 is the number of tokens, and 768 is the size of the embedding vectors. This output is then sent to the block layer. (The sizes in this walkthrough are illustrative; the configuration cell below sets `n_embd = 384`.)

![Transformer Step 1](https://raw.githubusercontent.com/ImadSaddik/Train_Your_Language_Model_Course/e9e8e01b46e1376406bd1c3a0e1692b64ba660ea/images/transformer_step_1.svg)


## Step 2: Multi-Head Attention

We take the tensor from the previous step and pass it to the multi-head attention layer. This layer has two settings: the head size and the number of heads. These settings split the attention block into smaller parts called heads. All heads process the input at the same time to speed up calculations.

The goal of multi-head attention is to help the model focus on different parts of the input at once. Each head can learn to look at different relationships between words or tokens. Since they work in parallel, the model can understand patterns in the data more effectively.

Each head produces a tensor of size 1x6x128, where 6 is the number of tokens and 128 is the size of each head. With 6 heads, we then concatenate all the outputs into a 1x6x768 tensor (6 × 128 = 768). Finally, this is passed through a linear projection layer, which keeps the last dimension at 768, matching the embedding size.

We can stack multiple multi-head attention blocks to deepen the model's understanding of the input. This allows it to learn more complex patterns and relationships. In the image, we have stacked four layers to enhance its ability to process the data.

![Transformer Step 2](https://raw.githubusercontent.com/ImadSaddik/Train_Your_Language_Model_Course/e9e8e01b46e1376406bd1c3a0e1692b64ba660ea/images/transformer_step_2.svg)



In [4]:
import torch
# Seed PyTorch's global RNG so weight initialisation and sampling are repeatable.
torch.manual_seed(3647)

# --- Model hyperparameters (read as module-level globals by the classes below) ---
block_size = 256  # maximum context length; also the number of position-embedding rows
n_embd = 384      # embedding width
n_head = 6        # attention heads per transformer block
n_layer = 6       # number of stacked transformer blocks
dropout = 0.2     # dropout probability used by every nn.Dropout layer
vocab_size = get_vocab_size(tokenizer)  # vocabulary entries + special tokens

# Use the MPS backend when PyTorch reports it as available; otherwise stay on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

### 1. Head

In [5]:
from typing import Optional, Tuple
import torch
import torch.nn as nn
from torch.nn import functional as F

# Each word gets turned into:

# a query (what it's looking for)

# a key (how it can be found)

# a value (what information it has)

# These are all made with nn.Linear, which means we’re learning how to turn input embeddings into these things.

class Head(nn.Module):
    """A single head of causal self-attention.

    The input is projected into queries, keys and values of width
    ``head_size``; each position then aggregates the values of itself and
    earlier positions, weighted by softmax-normalised query/key similarity.
    """

    def __init__(self,head_size:int):
        super().__init__()
        # Bias-free learned projections from the embedding width to the head width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular matrix of ones, registered as a buffer (not a parameter).
        # Entries equal to zero mark "future" positions that must not be attended to.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (batch, time-step, channels).

        Returns:
            Tensor of shape (batch, time-step, head size).
        """
        _, seq_len, _ = x.shape

        keys = self.key(x)       # (B, T, hs)
        queries = self.query(x)  # (B, T, hs)

        # Query/key dot products, scaled by 1/sqrt(head size) to keep the
        # magnitudes stable: (B, T, hs) @ (B, hs, T) -> (B, T, T)
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Future positions receive -inf so softmax assigns them zero weight.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))  # (B, T, T)

        attention = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)

        # Weighted aggregation of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        values = self.value(x)
        return attention @ values

        
        

### 2. MultiHead Attention

In [6]:
class MultiHeadAttention(nn.Module):
    """Several independent self-attention heads, concatenated and projected.

    Each ``Head`` sees the same input; their outputs are joined along the
    last dimension and mapped back to the embedding width ``n_embd``.
    """

    def __init__(self, num_heads: int, head_size: int) -> None:
        super().__init__()

        # Independent attention heads, built one at a time.
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(head_size))
        self.heads = nn.ModuleList(head_list)

        # Maps the concatenated head outputs back to the embedding width.
        concat_width = head_size * num_heads
        self.projection = nn.Linear(concat_width, n_embd)

        # Dropout applied to the projected output.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run every head on ``x`` and merge the results.

        Each head returns [batch, sequence_len, head_size]; the concatenation
        is [batch, sequence_len, head_size * num_heads], and the projection
        brings the last dimension to ``n_embd``.
        """
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.projection(merged)
        return self.dropout(projected)


### 3. Block

In [7]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, apply ReLU, contract, then dropout."""

    def __init__(self, n_embd: int) -> None:
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),  # expand
            nn.ReLU(),                        # non-linearity
            nn.Linear(hidden_width, n_embd),  # contract back to the embedding width
            nn.Dropout(dropout),              # uses the module-level dropout probability
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass ``x`` through the sequential stack and return the result."""
        out = self.net(x)
        return out



class Block(nn.Module):
    """Transformer block: self-attention, then feed-forward.

    Both sub-layers are applied to a layer-normalised copy of their input and
    their output is added back to that input (residual connection).
    """

    def __init__(self, n_embd: int, n_head: int) -> None:
        """
        Args:
            n_embd: Embedding width.
            n_head: Number of attention heads; each head gets
                ``n_embd // n_head`` channels.
        """
        super().__init__()

        per_head_width = n_embd // n_head
        self.self_attention = MultiHeadAttention(n_head, per_head_width)
        self.feed_forward = FeedForward(n_embd)

        # One LayerNorm ahead of each sub-layer.
        self.layer_norm_1 = nn.LayerNorm(n_embd)
        self.layer_norm_2 = nn.LayerNorm(n_embd)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the attention and feed-forward sub-layers with residual connections."""
        attended = self.self_attention(self.layer_norm_1(x))
        x = x + attended

        transformed = self.feed_forward(self.layer_norm_2(x))
        return x + transformed


### 4. Assembling the language model

In [8]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    All sizes are read from the module-level hyperparameters ``vocab_size``,
    ``n_embd``, ``block_size``, ``n_head`` and ``n_layer``.
    """

    def __init__(self) -> None:
        super().__init__()

        # Lookup table to get the embedding vector for each token in the vocabulary
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

        # Lookup table to get the embedding for each position in the sequence
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # A stack of transformer blocks (attention + feedforward layers)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )

        # Normalize the output from the transformer blocks
        self.final_layer_norm = nn.LayerNorm(n_embd)

        # Final linear layer maps the output embeddings to vocab size logits
        self.final_linear_layer = nn.Linear(n_embd, vocab_size)

        # Apply custom weight initialization to every submodule
        self.apply(self._init_weights)

    def _init_weights(self, module: nn.Module) -> None:
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Forward pass of the model.

        Args:
            input_tokens: Tensor of token indices of shape (batch_size, sequence_length).
                The sequence length must not exceed ``block_size``, because the
                position embedding table only has ``block_size`` rows.
            targets: Optional tensor of target token indices of same shape as input_tokens

        Returns:
            Tuple of (logits, loss).
            When ``targets`` is None, logits has shape
            (batch_size, sequence_length, vocab_size) and loss is None.
            When ``targets`` is given, logits is returned flattened to
            (batch_size * sequence_length, vocab_size) and loss is the
            cross-entropy between those logits and the flattened targets.
        """

        B, T = input_tokens.shape  # Batch size, Sequence length

        # Get token embeddings: shape (B, T, C)
        token_embedding = self.token_embedding_table(input_tokens)

        # Position indices 0..T-1 are created on the same device as the input,
        # so the model does not depend on the notebook-level `device` global.
        positions = torch.arange(T, device=input_tokens.device)
        positional_embedding = self.position_embedding_table(positions)  # (T, C)

        # Add token + position embeddings (broadcast over the batch dimension)
        x = token_embedding + positional_embedding  # (B, T, C)

        # Pass through the stack of transformer blocks
        x = self.blocks(x)  # (B, T, C)

        # Normalize the output before final prediction
        x = self.final_layer_norm(x)  # (B, T, C)

        # Map to vocabulary logits (i.e., unnormalized log-probabilities)
        logits = self.final_linear_layer(x)  # (B, T, vocab_size)

        # If we have target tokens, calculate the loss
        if targets is None:
            loss = None
        else:
            # Flatten logits and targets for cross-entropy
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling yields integer ids, so no autograd history is needed
    def generate(self, input_tokens: torch.Tensor, max_new_tokens: int) -> torch.Tensor:
        """
        Generate new tokens given a context.

        Tokens are sampled one at a time from the softmax distribution over the
        last position's logits. This method does not switch the module to
        evaluation mode; call ``model.eval()`` first if dropout should be disabled.

        Args:
            input_tokens: Starting token indices of shape (batch_size, sequence_length)
            max_new_tokens: Number of new tokens to generate

        Returns:
            Tensor of token indices of shape (batch_size, sequence_length + max_new_tokens)
        """

        # input_tokens is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop input_tokens to the last block_size tokens
            cropped_input = input_tokens[:, -block_size:]
            # get the predictions
            logits, _ = self(cropped_input)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            input_tokens = torch.cat(
                (input_tokens, idx_next), dim=1)  # (B, T+1)
        return input_tokens
        
        
        

### 5. Parameters & dummy input


In [9]:
# Build the model and move its parameters and buffers to the selected device.
model = GPTLanguageModel().to(device)

# Report the total parameter count, expressed in millions.
total_params = sum(p.numel() for p in model.parameters())
print(total_params / 1e6, 'M parameters')

11.53409 M parameters


In [10]:
# Smoke test: push one random sequence of 6 token ids through the model.
batch_size = 1
seq_length = 6

# Random ids in [0, vocab_size), drawn on the CPU and then moved to the target device.
x = torch.randint(0, vocab_size, (batch_size, seq_length)).to(device)

# No targets are supplied, so the returned loss is None.
logits, loss = model(x)
print(logits.shape, loss)

torch.Size([1, 6, 1034]) None


### Display the model summary


In [11]:
def print_model_structure(model: torch.nn.Module, indent: str = '') -> None:
    """
    Print the submodule tree of ``model``, one line per child module.

    Each line shows the child's attribute name, its class name and the number
    of parameters it contains (including its own descendants). Nested children
    are printed directly after their parent with a deeper prefix.

    Args:
        model: Module whose children are listed.
        indent: Prefix placed in front of every top-level line.
    """
    # Depth-first walk using an explicit stack of (child iterator, prefix) pairs.
    pending = [(model.named_children(), indent)]
    while pending:
        siblings, prefix = pending[-1]
        try:
            name, child = next(siblings)
        except StopIteration:
            # This level is exhausted; resume with the parent's remaining children.
            pending.pop()
            continue

        total = 0
        for param in child.parameters():
            total += param.numel()

        print(f"{prefix}├─ {name}: {child.__class__.__name__} ({total:,} parameters)")
        pending.append((child.named_children(), prefix + '│  '))


# Print the full submodule tree of the model built above, with per-module parameter counts.
print_model_structure(model)

├─ token_embedding_table: Embedding (397,056 parameters)
├─ position_embedding_table: Embedding (98,304 parameters)
├─ blocks: Sequential (10,639,872 parameters)
│  ├─ 0: Block (1,773,312 parameters)
│  │  ├─ self_attention: MultiHeadAttention (590,208 parameters)
│  │  │  ├─ heads: ModuleList (442,368 parameters)
│  │  │  │  ├─ 0: Head (73,728 parameters)
│  │  │  │  │  ├─ key: Linear (24,576 parameters)
│  │  │  │  │  ├─ query: Linear (24,576 parameters)
│  │  │  │  │  ├─ value: Linear (24,576 parameters)
│  │  │  │  │  ├─ dropout: Dropout (0 parameters)
│  │  │  │  ├─ 1: Head (73,728 parameters)
│  │  │  │  │  ├─ key: Linear (24,576 parameters)
│  │  │  │  │  ├─ query: Linear (24,576 parameters)
│  │  │  │  │  ├─ value: Linear (24,576 parameters)
│  │  │  │  │  ├─ dropout: Dropout (0 parameters)
│  │  │  │  ├─ 2: Head (73,728 parameters)
│  │  │  │  │  ├─ key: Linear (24,576 parameters)
│  │  │  │  │  ├─ query: Linear (24,576 parameters)
│  │  │  │  │  ├─ value: Linear (24,576 param

In [None]:
# Shell escapes that stage, commit and push the repository from inside the notebook.
# NOTE(review): `git add .` stages every change under the current directory, and this cell
# has side effects on the remote on every run — confirm that is intended before executing.
! git add .
! git commit -m "added transformer layer"
! git push origin main