# *Build a tiny transformer model from scratch using nn.Transformer*



**Step 1: Setup and Import Necessary Libraries**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math


**Step 2: Define the Tokenizer and Vocabulary**


In [None]:
# Toy vocabulary: a word's position in this list is its integer token id.
vocab = [
    'i', 'am', 'a', 'student', 'teacher', 'you',
    'are', 'learning', 'to', 'build', 'language', 'model',
]
vocab_size = len(vocab)

# Reverse (id -> word) and forward (word -> id) lookup tables.
id_to_word = dict(enumerate(vocab))
word_to_id = {word: idx for idx, word in id_to_word.items()}

# Example tokenizer function
def tokenize(sentence):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is lower-cased and split on whitespace. Words that are not
    keys of ``word_to_id`` are skipped, so the result may be shorter than the
    number of words in the input (and may be empty).
    """
    token_ids = []
    for word in sentence.lower().split():
        if word in word_to_id:
            token_ids.append(word_to_id[word])
    return token_ids

# Example detokenizer function
def detokenize(tokens):
    """Turn an iterable of vocabulary ids back into a space-separated string.

    Each id is looked up in ``id_to_word``; an id that is not a key of that
    table raises ``KeyError``.
    """
    words = []
    for token in tokens:
        words.append(id_to_word[token])
    return ' '.join(words)



In [None]:
# Peek at the lookup tables. Jupyter renders only the value of a cell's last
# expression, so only id_to_word appears in the output below.
word_to_id
id_to_word

{0: 'i',
 1: 'am',
 2: 'a',
 3: 'student',
 4: 'teacher',
 5: 'you',
 6: 'are',
 7: 'learning',
 8: 'to',
 9: 'build',
 10: 'language',
 11: 'model'}

**Step 3: Define Positional Encoding**


In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once: even
    feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with ``freq = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
        max_len: Number of positions to precompute; inputs longer than this
            along dimension 0 cannot be encoded.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the frequencies to match (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Register as a buffer so the table follows .to(device) / .cuda() with
        # the rest of the model. persistent=False keeps it out of state_dict,
        # so checkpoints saved before this change still load.
        self.register_buffer('pe', pe.unsqueeze(1), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        ``x`` is indexed by position along dimension 0 and the table
        broadcasts over dimension 1, i.e. a ``(seq_len, batch, d_model)``
        layout.
        """
        return x + self.pe[:x.size(0), :]


**Step 4: Build the Transformer Model**

In [None]:
class NanoLLM(nn.Module):
    """Tiny encoder-decoder language model built on ``nn.Transformer``.

    Source and target token ids share one embedding table, receive sinusoidal
    positional encodings, pass through ``nn.Transformer`` and are projected
    back to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids.
        embedding_dim: Model width (``d_model``) used throughout.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim)
        # batch_first is left at its default (False), so all tensors are
        # sequence-first: (seq_len, batch, ...).
        self.transformer_blocks = nn.Transformer(
            d_model=embedding_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
        )
        self.fc_out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, src, tgt):
        """Compute next-token logits for every decoder position.

        Args:
            src: Long tensor of token ids, shape ``(src_len, batch)``.
            tgt: Long tensor of token ids, shape ``(tgt_len, batch)``.

        Returns:
            Float tensor of logits, shape ``(tgt_len, batch, vocab_size)``.
        """
        # Scale embeddings by sqrt(d_model). The previous code used
        # src.size(1), which is the batch dimension of the index tensor.
        scale = math.sqrt(self.embedding.embedding_dim)

        # Causal mask: -inf strictly above the diagonal so that decoder
        # position i can only attend to target positions <= i.
        tgt_len = tgt.size(0)
        causal_mask = torch.triu(
            torch.full((tgt_len, tgt_len), float('-inf'), device=tgt.device),
            diagonal=1,
        )

        src = self.positional_encoding(self.embedding(src) * scale)
        tgt = self.positional_encoding(self.embedding(tgt) * scale)
        transformer_output = self.transformer_blocks(src, tgt, tgt_mask=causal_mask)
        return self.fc_out(transformer_output)


**Step 5: Generate Example Data**

In [None]:
def generate_example_data():
    """Build next-word training pairs from a handful of fixed sentences.

    Every sentence is tokenized; the input is all tokens except the last and
    the target is all tokens except the first, so target[i] is the token that
    follows input[i].

    Returns:
        A list of ``(input, target)`` tuples of 1-D ``torch.long`` tensors.
    """
    corpus = (
        'i am a student',
        'you are learning to build',
        'i am a teacher',
        'you are a student',
    )

    def to_pair(sentence):
        # One tokenization per sentence, sliced two ways.
        ids = tokenize(sentence)
        return (
            torch.tensor(ids[:-1], dtype=torch.long),
            torch.tensor(ids[1:], dtype=torch.long),
        )

    return [to_pair(sentence) for sentence in corpus]

training_data = generate_example_data()


In [None]:
# Inspect the (input, target) tensor pairs produced by generate_example_data().
training_data

[(tensor([0, 1, 2]), tensor([1, 2, 3])),
 (tensor([5, 6, 7, 8]), tensor([6, 7, 8, 9])),
 (tensor([0, 1, 2]), tensor([1, 2, 4])),
 (tensor([5, 6, 2]), tensor([6, 2, 3]))]

**Step 6: Train the Model**

In [None]:
# Hyperparameters
embedding_dim = 32
num_heads = 2
num_layers = 2
learning_rate = 0.001
num_epochs = 100

# Fix the RNG so weight initialisation and dropout are repeatable on
# Restart Kernel -> Run All.
torch.manual_seed(0)

# Initialize model, loss function, and optimizer
model = NanoLLM(vocab_size, embedding_dim, num_heads, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for src, tgt in training_data:
        # Add a batch dimension of size 1: (seq_len,) -> (seq_len, 1).
        src = src.unsqueeze(1)
        tgt = tgt.unsqueeze(1)
        optimizer.zero_grad()
        # Feed the *input* tokens to both encoder and decoder and use tgt only
        # as the labels. Passing tgt as the decoder input hands the model the
        # very tokens it is scored on (it can reach a low loss by copying) and
        # does not match inference, where only the input tokens are known.
        # NOTE: unless the model applies causal masks, non-final positions can
        # still attend to later tokens of src; only the final position is a
        # strict next-word prediction.
        output = model(src, src)
        loss = criterion(output.view(-1, vocab_size), tgt.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-example loss every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss/len(training_data):.4f}')




Epoch [10/100], Loss: 0.4042
Epoch [20/100], Loss: 0.2070
Epoch [30/100], Loss: 0.1231
Epoch [40/100], Loss: 0.0844
Epoch [50/100], Loss: 0.0568
Epoch [60/100], Loss: 0.0446
Epoch [70/100], Loss: 0.0346
Epoch [80/100], Loss: 0.0290
Epoch [90/100], Loss: 0.0212
Epoch [100/100], Loss: 0.0184


In [None]:
# Display the module tree of the trained model.
model

NanoLLM(
  (embedding): Embedding(12, 32)
  (positional_encoding): PositionalEncoding()
  (transformer_blocks): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
          )
          (linear1): Linear(in_features=32, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=32, bias=True)
          (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-1

**Step 7: Make Predictions**

In [None]:
def predict_next_word(model, input_seq):
    """Predict the word that follows ``input_seq``.

    The text is tokenized, fed to the model as both source and target with a
    batch size of 1, and the highest-scoring vocabulary entry at the last
    position is returned.

    Args:
        model: Callable as ``model(src, tgt)`` returning logits of shape
            ``(seq_len, batch, vocab_size)``.
        input_seq: Input text; words outside the vocabulary are ignored by
            ``tokenize``.

    Returns:
        The predicted next word as a string.

    Raises:
        ValueError: If ``input_seq`` contains no in-vocabulary words, which
            would otherwise reach the model as a zero-length sequence.

    Note:
        Switches ``model`` to eval mode and leaves it there.
    """
    tokens = tokenize(input_seq)
    if not tokens:
        raise ValueError(f"No in-vocabulary words found in input: {input_seq!r}")

    model.eval()
    with torch.no_grad():
        # (seq_len,) -> (seq_len, 1): sequence-first with a batch of one.
        input_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(1)
        output = model(input_tensor, input_tensor)
        # Logits at the last position of the only batch element.
        predicted_id = torch.argmax(output[-1, 0, :]).item()
    return id_to_word[predicted_id]

# Example usage
input_sentence = "you"
predicted_word = predict_next_word(model, input_sentence)
print(f"Input: '{input_sentence}', Predicted next word: '{predicted_word}'")


Input: 'you', Predicted next word: 'am'
