# *Build a tiny transformer model from scratch*



**Step 1: Setup and Import Necessary Libraries**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math


**Step 2: Define Positional Encoding**

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings.

    The encoding table is precomputed once for ``max_len`` positions and
    added to the input along the sequence axis, so every batch element
    receives the same position-dependent offsets.

    Args:
        d_model: Size of the embedding (last) dimension.
        max_len: Largest sequence length for which encodings are available.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the frequencies to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Shape (1, max_len, d_model): the leading axis broadcasts over the
        # batch. A non-persistent buffer follows the module across devices
        # without adding a key to the state dict.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` with position encodings added.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        # Index by sequence position (axis 1), not by batch element.
        return x + self.pe[:, :seq_len]


**Step 3: Define Multi-Head Self-Attention**

In [None]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention over batch-first inputs.

    Args:
        embedding_dim: Size of the input and output feature dimension.
        num_heads: Number of attention heads; must divide ``embedding_dim``.
    """

    def __init__(self, embedding_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert embedding_dim % num_heads == 0, "embedding_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.query = nn.Linear(embedding_dim, embedding_dim)
        self.key = nn.Linear(embedding_dim, embedding_dim)
        self.value = nn.Linear(embedding_dim, embedding_dim)
        self.fc_out = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, values, keys, query, mask=None):
        """Attend from ``query`` positions to ``keys``/``values`` positions.

        Args:
            values: Tensor of shape (N, value_len, embedding_dim).
            keys: Tensor of shape (N, key_len, embedding_dim).
            query: Tensor of shape (N, query_len, embedding_dim).
            mask: Optional tensor broadcastable to
                (N, num_heads, query_len, key_len); positions where it
                equals 0 are excluded from attention.

        Returns:
            Tensor of shape (N, query_len, embedding_dim).
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Apply the learned projections before splitting into heads; without
        # this step the query/key/value layers would never be trained.
        values = self.value(values)
        keys = self.key(keys)
        queries = self.query(query)

        # Split the embedding into self.num_heads different pieces
        values = values.reshape(N, value_len, self.num_heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.num_heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.num_heads, self.head_dim)

        # energy[n, h, q, k] = dot product of query q and key k for head h
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)

        if mask is not None:
            # A very negative score turns into a zero weight after softmax.
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(energy / (self.head_dim ** 0.5), dim=3)

        # Weighted sum of values per head, then concatenate the heads.
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            N, query_len, self.head_dim * self.num_heads
        )

        out = self.fc_out(out)
        return out


**Step 4: Define Feedforward Neural Network**

In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: expand, ReLU, project back.

    Args:
        embedding_dim: Size of the input and output feature dimension.
        forward_expansion: Multiplier giving the hidden layer width
            (``forward_expansion * embedding_dim``).
    """

    def __init__(self, embedding_dim, forward_expansion):
        super().__init__()
        hidden_dim = forward_expansion * embedding_dim
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embedding_dim)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the output has the same shape as ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)


**Step 5: Define the Transformer Block**

In [None]:
class TransformerBlock(nn.Module):
    """One attention + feed-forward layer with residual connections.

    Each sub-layer output is added to its input, layer-normalized, and then
    passed through dropout.

    Args:
        embedding_dim: Feature dimension of the inputs and outputs.
        num_heads: Number of attention heads.
        forward_expansion: Hidden-width multiplier of the feed-forward part.
        dropout: Dropout probability applied after each normalization.
    """

    def __init__(self, embedding_dim, num_heads, forward_expansion, dropout):
        super().__init__()
        self.attention = MultiHeadSelfAttention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.feed_forward = FeedForward(embedding_dim, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run attention and the feed-forward network over ``query``.

        The residual of the attention sub-layer is taken from ``query``.
        """
        attended = self.attention(value, key, query, mask)

        # Residual -> LayerNorm -> dropout around the attention sub-layer.
        hidden = self.dropout(self.norm1(attended + query))

        # Same pattern around the feed-forward sub-layer.
        transformed = self.feed_forward(hidden)
        return self.dropout(self.norm2(transformed + hidden))


**Step 6: Define the Full Transformer Model**

In [None]:
class Transformer(nn.Module):
    """Stack of causally masked self-attention blocks over the target tokens.

    The model embeds ``trg``, adds positional encodings, runs the shared
    stack of transformer blocks with a lower-triangular mask, and projects
    the result to target-vocabulary logits. There is no cross-attention, so
    the ``src`` argument of :meth:`forward` does not influence the output;
    it is kept so existing callers keep working.

    Args:
        src_vocab_size: Number of source tokens (size of the source embedding).
        trg_vocab_size: Number of target tokens (size of the output layer).
        src_pad_idx: Source padding id, used by :meth:`make_src_mask`.
        trg_pad_idx: Target padding id (stored; not used by the forward pass).
        embedding_dim: Feature dimension of embeddings and blocks.
        num_heads: Number of attention heads per block.
        num_layers: Number of transformer blocks.
        forward_expansion: Hidden-width multiplier of each feed-forward part.
        dropout: Dropout probability.
        max_length: Maximum length passed to the positional encodings.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx,
                 embedding_dim=512, num_heads=8, num_layers=6,
                 forward_expansion=4, dropout=0, max_length=100):
        super(Transformer, self).__init__()

        # Kept so forward() can scale embeddings by sqrt(embedding_dim).
        self.embedding_dim = embedding_dim

        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_dim)
        self.src_position_embedding = PositionalEncoding(embedding_dim, max_length)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_dim)
        self.trg_position_embedding = PositionalEncoding(embedding_dim, max_length)

        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(embedding_dim, num_heads, forward_expansion, dropout)
             for _ in range(num_layers)]
        )

        self.fc_out = nn.Linear(embedding_dim, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, src):
        """Return a (N, 1, 1, src_len) boolean mask that is False at padding."""
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a (N, 1, trg_len, trg_len) lower-triangular causal mask.

        The mask is created on the same device as ``trg`` so it can be
        combined with attention scores computed there.
        """
        N, trg_len = trg.shape
        causal = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        trg_mask = causal.expand(N, 1, trg_len, trg_len)
        return trg_mask

    def forward(self, src, trg):
        """Return logits of shape (N, trg_len, trg_vocab_size).

        Args:
            src: Source token ids. Accepted for interface compatibility only;
                the output depends solely on ``trg``.
            trg: Target token ids of shape (N, trg_len).
        """
        trg_mask = self.make_trg_mask(trg)

        # Scale embeddings by sqrt(embedding_dim), not by the sequence
        # length, so the scale does not change with the input length.
        scale = math.sqrt(self.embedding_dim)
        trg_embedding = self.dropout(self.trg_word_embedding(trg) * scale)
        trg_embedding = self.trg_position_embedding(trg_embedding)

        out = trg_embedding
        for layer in self.transformer_blocks:
            out = layer(out, out, out, trg_mask)

        out = self.fc_out(out)

        return out


**Step 7: Define a Small Example for Training (with Sample Sentences)**

In [None]:
# Sample sentences for training
sentences = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "the bird sang a song",
    "a fish swims in the sea",
    "the sun rises in the east"
]

# Create character-to-ID and ID-to-character mappings. A dedicated padding
# symbol gets its own ID so that padding is never confused with the real
# space character (otherwise the loss would ignore every space).
pad_token = "<pad>"
chars = [pad_token] + sorted(set("".join(sentences)))
char_to_id = {char: idx for idx, char in enumerate(chars)}
id_to_char = {idx: char for idx, char in enumerate(chars)}
pad_idx = char_to_id[pad_token]

# Encode sentences into sequences of IDs
encoded_sentences = [[char_to_id[char] for char in sentence] for sentence in sentences]

# Convert sequences into tensor format, right-padding to a common length
max_len = max(len(sentence) for sentence in encoded_sentences)
padded_sentences = [sentence + [pad_idx] * (max_len - len(sentence)) for sentence in encoded_sentences]
input_data = torch.tensor(padded_sentences, dtype=torch.long)

# Targets are the same sentences shifted by one character (teacher forcing):
# target_data[:, t] == input_data[:, t + 1]. The last position has no
# successor, so it is filled with padding and ignored by the loss.
target_data = torch.tensor([sentence[1:] + [pad_idx] for sentence in padded_sentences], dtype=torch.long)

# Hyperparameters
embedding_dim = 32
num_heads = 2
num_layers = 2
forward_expansion = 4
dropout = 0.1
max_length = max_len
src_vocab_size = len(chars)
trg_vocab_size = len(chars)
src_pad_idx = pad_idx
trg_pad_idx = pad_idx
learning_rate = 0.001

# Model initialization
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx,
                    embedding_dim, num_heads, num_layers,
                    forward_expansion, dropout, max_length)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

# Training loop
num_epochs = 1000
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass: (batch_size, seq_len, vocab_size)
    output = model(input_data, input_data)

    # Flatten output: (batch_size, seq_len, vocab_size) -> (batch_size * seq_len, vocab_size)
    output = output.reshape(-1, output.shape[-1])

    # target_data is already shifted by one position, so it lines up with
    # the output as is; slicing it again would train the model to predict
    # two characters ahead.
    target = target_data.reshape(-1)

    # Ensure that the output and target are aligned correctly
    assert output.size(0) == target.size(0), "Output and target sizes do not match"

    # Calculate the loss
    loss = criterion(output, target)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')




Epoch [10/1000], Loss: 2.5069
Epoch [20/1000], Loss: 2.2377
Epoch [30/1000], Loss: 2.1104
Epoch [40/1000], Loss: 1.9969
Epoch [50/1000], Loss: 1.8672
Epoch [60/1000], Loss: 1.6969
Epoch [70/1000], Loss: 1.7683
Epoch [80/1000], Loss: 1.6004
Epoch [90/1000], Loss: 1.5333
Epoch [100/1000], Loss: 1.5024
Epoch [110/1000], Loss: 1.4961
Epoch [120/1000], Loss: 1.3856
Epoch [130/1000], Loss: 1.3198
Epoch [140/1000], Loss: 1.4195
Epoch [150/1000], Loss: 1.3259
Epoch [160/1000], Loss: 1.3530
Epoch [170/1000], Loss: 1.3315
Epoch [180/1000], Loss: 1.3342
Epoch [190/1000], Loss: 1.3444
Epoch [200/1000], Loss: 1.2921
Epoch [210/1000], Loss: 1.2615
Epoch [220/1000], Loss: 1.3209
Epoch [230/1000], Loss: 1.3713
Epoch [240/1000], Loss: 1.2837
Epoch [250/1000], Loss: 1.1927
Epoch [260/1000], Loss: 1.2934
Epoch [270/1000], Loss: 1.2728
Epoch [280/1000], Loss: 1.2460
Epoch [290/1000], Loss: 1.2093
Epoch [300/1000], Loss: 1.1664
Epoch [310/1000], Loss: 1.1216
Epoch [320/1000], Loss: 1.2054
Epoch [330/1000], ... (remaining output truncated)

**Step 8: Prediction (Inference)**

In [None]:
# Example: Generate text based on the model's predictions
def generate_text(model, start_seq, max_len=20):
    """Greedily extend ``start_seq`` one character at a time.

    At each step the model is run on the text generated so far and the
    highest-scoring next character is appended. Generation stops when the
    text reaches ``max_len`` characters or right after a space is produced.

    Args:
        model: Callable taking ``(src, trg)`` ID tensors and returning logits
            of shape (batch, seq_len, vocab_size); it is switched to eval mode.
        start_seq: Non-empty prompt; every character must be a key of the
            module-level ``char_to_id`` mapping.
        max_len: Maximum total length of the returned text, prompt included.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: If ``start_seq`` is empty.
        KeyError: If ``start_seq`` contains a character missing from
            ``char_to_id``.
    """
    if not start_seq:
        # An empty prompt gives the model a zero-length sequence to index into.
        raise ValueError("start_seq must contain at least one character")

    model.eval()
    with torch.no_grad():
        input_ids = torch.tensor([[char_to_id[char] for char in start_seq]], dtype=torch.long)
        for _ in range(max_len - len(start_seq)):
            output = model(input_ids, input_ids)
            # Greedy decoding: take the most likely token at the last position.
            next_char_id = torch.argmax(output[0, -1, :]).item()
            next_token = torch.tensor([[next_char_id]], dtype=torch.long)
            input_ids = torch.cat([input_ids, next_token], dim=1)
            if id_to_char[next_char_id] == " ":
                break
        return "".join(id_to_char[char_id] for char_id in input_ids[0].tolist())

# Generate text from the prompt "the dog " (note the trailing space).
generated_text = generate_text(model, start_seq="the dog ")
print(f"Generated text: {generated_text}")

Generated text: the dog hsdtecaectec
