In [None]:
# Author: Roi Yehoshua
# Date: January 2024
# MIT License 

# Based on the PyTorch implementation from https://nlp.seas.harvard.edu/annotated-transformer/

In [None]:
# Make sure you have the following packages installed: 
!pip install spacy torchtext portalocker --quiet

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import spacy
import os

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [None]:
# Seed PyTorch's global RNG so that weight initialization and sampling are reproducible
torch.manual_seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
# This global is reused by the cells below when moving the model and the data.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # last expression of the cell: shows which device was selected

### Multi-Head Attention

$$
    \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \ldots, \text{head}_h)W^O \\
    \text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \\  
    \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding dimension ``d_model`` is divided evenly between ``num_heads`` heads,
    each of size ``d_k = d_model // num_heads`` (the value size is taken equal to d_k).
    """
    def __init__(self, d_model, num_heads):
        super().__init__()

        # Each head must receive an equally sized slice of the embedding.
        assert d_model % num_heads == 0, 'd_model must be divisible by num_heads'

        self.d_model = d_model           # total embedding size per token
        self.num_heads = num_heads       # number of parallel attention heads
        self.d_k = d_model // num_heads  # embedding size handled by a single head

        # Learned projections that turn the inputs into queries, keys and values
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Output projection applied to the concatenated heads
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        When called from forward(), Q, K and V are [batch_size, num_heads, seq_length, d_k].
        The score matrix has one row per query position and one column per key position.
        Entries where ``mask == 0`` are replaced by -1e9 before the softmax, so they
        receive (numerically) zero attention weight.
        """
        scale = math.sqrt(self.d_k)
        scores = (Q @ K.transpose(-2, -1)) / scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # Normalize over the key axis: every query row becomes a probability distribution
        weights = scores.softmax(dim=-1)

        # Weighted sum of the value vectors
        return weights @ V

    def split_heads(self, x):
        """Reshape [batch_size, seq_length, d_model] into [batch_size, num_heads, seq_length, d_k]."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        # Move the head axis in front of the sequence axis so that head i is x[:, i, :, :]
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of split_heads: [batch_size, num_heads, seq_length, d_k] -> [batch_size, seq_length, d_model]."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() returns a non-contiguous view; view() needs contiguous memory
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend in every head, then merge and project the result.

        Q, K and V are [batch_size, seq_length, d_model]; the returned tensor has the
        shape of Q.
        """
        # 1. Linear projections followed by the split into heads
        q_heads = self.split_heads(self.W_q(Q))
        k_heads = self.split_heads(self.W_k(K))
        v_heads = self.split_heads(self.W_v(V))

        # 2. Scaled dot-product attention, computed for all heads at once
        context = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        # 3. Concatenate the heads and apply the output projection
        return self.W_o(self.combine_heads(context))

### Feed-Forward NN

$$
    \text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2
$$

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward network: FFN(x) = max(0, x W1 + b1) W2 + b2.

    The same two-layer network is applied to the last dimension of the input, with
    dropout applied to the hidden activations.
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # expansion: d_model -> d_ff
        self.linear2 = nn.Linear(d_ff, d_model)  # contraction: d_ff -> d_model
        self.dropout = nn.Dropout(dropout)       # only active in training mode
        self.relu = nn.ReLU()                    # non-linearity max(0, x)

    def forward(self, x):
        """Map x of shape [..., d_model] to a tensor of the same shape."""
        # Expand, apply the non-linearity and regularize the hidden activations
        hidden = self.dropout(self.relu(self.linear1(x)))
        # Project back to the model dimension
        return self.linear2(hidden)
        
        
        

### Positional Encoding

$$
    \text{PE}(pos, 2i) = \sin(pos/10000^{2i/d_{\text{model}}}) \\
    \text{PE}(pos, 2i + 1) = \cos(pos/10000^{2i/d_{\text{model}}})
$$

In [None]:
class PositionalEncoding(nn.Module):    
    """
    Adds fixed sinusoidal positional encodings to a batch of embeddings.

    Even feature indices hold sin(pos / 10000^(2i/d_model)) and odd feature indices hold
    cos(pos / 10000^(2i/d_model)). Both even and odd values of d_model are supported.

    Parameters:
    - d_model (int): Size of the embedding vectors.
    - max_seq_length (int): Number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model, max_seq_length):
        super().__init__()        
        
        # Table of encodings with one row per position: [max_seq_length, d_model]
        pe = torch.zeros(max_seq_length, d_model)
        
        # Column vector of positions 0 .. max_seq_length - 1: [max_seq_length, 1]
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        
        # 1 / 10000^(2i/d_model) for every even feature index 2i, computed in log space.
        # The resulting wavelengths form a geometric progression from 2π towards 10000 * 2π.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        # Argument of the sinusoids for every (position, frequency) pair
        angles = position * div_term
        
        # Even feature indices take the sine
        pe[:, 0::2] = torch.sin(angles)
        
        # Odd feature indices take the cosine. When d_model is odd there is one fewer odd
        # column than even columns, so the last frequency is dropped; for even d_model the
        # slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        
        # Register 'pe' as a buffer: it moves with the module (.to(device)) but is not a
        # trainable parameter. unsqueeze(0) adds a batch dimension for broadcasting.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return x + PE for x of shape [batch_size, seq_length, d_model].

        The table is sliced to the first seq_length positions and broadcast over the batch.
        """
        x = x + self.pe[:, :x.size(1)]
        return x

### Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """A single encoder block: multi-head self-attention followed by a position-wise
       feed-forward network. The output of each sublayer goes through dropout, is added
       to the sublayer input (residual connection) and is then layer-normalized.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run both sublayers on x; `mask` is handed unchanged to the self-attention."""
        # Sublayer 1: self-attention, where queries, keys and values are all x
        attended = self.self_attn(x, x, x, mask)
        x = self.layer_norm1(x + self.dropout(attended))

        # Sublayer 2: position-wise feed-forward network
        transformed = self.feed_forward(x)
        return self.layer_norm2(x + self.dropout(transformed))
        
        
       

### Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
    """A single decoder block: masked multi-head self-attention, cross-attention over the
       encoder output, and a position-wise feed-forward network. The output of each
       sublayer goes through dropout, is added to the sublayer input (residual
       connection) and is then layer-normalized.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the three sublayers on the decoder states x.

        `tgt_mask` is passed to the self-attention and `src_mask` to the cross-attention.
        """
        # Sublayer 1: self-attention over the target sequence (Q = K = V = x)
        attended = self.self_attn(x, x, x, tgt_mask)
        x = self.layer_norm1(x + self.dropout(attended))

        # Sublayer 2: cross-attention. Queries come from the decoder, keys and values
        # from the encoder output, so every target position can look at the source.
        source_context = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.layer_norm2(x + self.dropout(source_context))

        # Sublayer 3: position-wise feed-forward network
        transformed = self.feed_forward(x)
        return self.layer_norm3(x + self.dropout(transformed))


        
        
        

### The Full Model

In [None]:
class Transformer(nn.Module):
    """
    Implements the Transformer model for sequence-to-sequence tasks such as machine translation.
    The Transformer model, as described in "Attention is All You Need" by Vaswani et al., consists of an encoder and
    decoder architecture that uses self-attention mechanisms to process input sequences and generate output sequences.

    Parameters:
    - src_vocab_size (int): Size of the source vocabulary.
    - tgt_vocab_size (int): Size of the target vocabulary.
    - d_model (int): Dimension of the model embeddings and hidden states.
    - N (int): Number of layers in both the encoder and decoder stacks.
    - n_heads (int): Number of attention heads in each multi-head attention mechanism.
    - d_ff (int): Dimension of the feed-forward network within each layer.
    - max_seq_length (int): Maximum length of input sequences, used for positional encoding.
    - dropout (float): Dropout rate applied to embeddings and sub-layers.
    - pad_idx (int): Index of the padding token. The same index is used for both the source and the target masks.

    Attributes:
    - src_embedding (torch.nn.Embedding): Embedding layer for source sequences.
    - tgt_embedding (torch.nn.Embedding): Embedding layer for target sequences.
    - positional_encoding (PositionalEncoding): Adds positional information to embeddings.
    - encoder (torch.nn.ModuleList): Stack of N encoder layers.
    - decoder (torch.nn.ModuleList): Stack of N decoder layers.
    - out (torch.nn.Linear): Linear layer that projects decoder output to target vocabulary size.
    - dropout (torch.nn.Dropout): Dropout layer applied after embedding and positional encoding.
    
    Methods:
    - init_weights: Initializes model parameters using Glorot uniform initialization.
    - create_source_mask: Creates a mask for padding tokens in the source sequence to ignore them in attention computations.
    - create_target_mask: Creates combined padding and future token masks for the target sequence.
    - encode: Processes the source sequence through the encoder stack and returns the memory states and the source mask.
    - decode: Processes the target sequence through the decoder stack using memory states from the encoder and applicable masks.
    - forward: Defines the forward pass of the model using the encode and decode methods.
    """    
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, N, n_heads, d_ff, max_seq_length, dropout, pad_idx):
        super().__init__()

        # Embedding layers for source and target
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Positional encoding (shared by the encoder and the decoder inputs)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Encoder and Decoder stacks
        self.encoder = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(N)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(N)])

        # Output linear layer: decoder states -> scores over the target vocabulary
        self.out = nn.Linear(d_model, tgt_vocab_size)
        
        self.dropout = nn.Dropout(dropout)

        # Initialization
        self.init_weights()
        self.pad_idx = pad_idx

    def init_weights(self):
        """Initialize parameters with Glorot / fan_avg"""
        # Only tensors with more than one dimension (weight matrices, embedding tables)
        # are re-initialized; biases and LayerNorm parameters keep their defaults.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
                
    def create_source_mask(self, src):
        """Create a mask for padding tokens in the source.

        src is [batch_size, src_len]; the returned boolean mask is [batch_size, 1, 1, src_len]
        and is True at non-padding positions.
        """            
        # unsqueeze(1) adds a dimension for the attention heads and unsqueeze(2) one for the
        # query positions, so the mask broadcasts over all heads and all queries and hides
        # the padded key positions.
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, src_len]
        return src_mask    
    
    def create_target_mask(self, tgt):
        """Create masks for both padding tokens and future tokens.

        tgt is [batch_size, tgt_len]; the returned boolean mask is
        [batch_size, 1, tgt_len, tgt_len]. Entry [b, 0, i, j] is True when position i of
        sequence b is not padding and j <= i.
        """   
        # Target padding mask: unsqueeze(1) adds the head dimension, unsqueeze(3) a
        # trailing dimension, so the mask is indexed by the query (row) position.
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(3)  # [batch_size, 1, tgt_len, 1]
                
        # Target no-peek (causal) mask: lower-triangular, position i may look at j <= i.
        # It is created on the device of `tgt` so that the model does not depend on a
        # notebook-level `device` variable and the two masks always live on the same device.
        tgt_len = tgt.size(1)        
        tgt_nopeek_mask = torch.tril(torch.ones(tgt_len, tgt_len, device=tgt.device)).bool()
        
        # Combine masks
        tgt_mask = tgt_pad_mask & tgt_nopeek_mask  # [batch_size, 1, tgt_len, tgt_len]        
        return tgt_mask 
        
    def encode(self, src):
        """Encodes the source sequence using the Transformer encoder stack.

        Returns a tuple (memory, src_mask); the mask is returned so that decode() can
        reuse it for the cross-attention.
        """       
        src_mask = self.create_source_mask(src)
        src = self.dropout(self.positional_encoding(self.src_embedding(src)))
        
        # Pass through each layer in the encoder        
        for layer in self.encoder:
            src = layer(src, src_mask)
        return src, src_mask
        
    def decode(self, tgt, memory, src_mask):
        """Decodes the target sequence using the Transformer decoder stack, given the memory from the encoder.

        Returns unnormalized scores of shape [batch_size, tgt_len, tgt_vocab_size].
        """
        tgt_mask = self.create_target_mask(tgt)
        tgt = self.dropout(self.positional_encoding(self.tgt_embedding(tgt)))
        
        # Pass through each layer in the decoder
        for layer in self.decoder:
            tgt = layer(tgt, memory, src_mask, tgt_mask)

        # Output layer
        output = self.out(tgt)
        return output

    def forward(self, src, tgt):
        """Encode `src`, then decode `tgt` against the resulting memory and return the output scores."""
        # Encode the source sequence
        memory, src_mask = self.encode(src)

        # Decode the target sequence using encoder memory
        output = self.decode(tgt, memory, src_mask)
        return output

In [None]:
# Define the hyperparameters of the model.
# These values are only used for the sanity check on random data in the next cells;
# they are redefined later for the real translation task.
src_vocab_size = 5000  # Size of source vocabulary
tgt_vocab_size = 5000  # Size of target vocabulary
d_model = 512          # Embedding dimension
N = 6                  # Number of encoder and decoder layers
num_heads = 8          # Number of attention heads
d_ff = 2048            # Dimension of feed forward networks
max_seq_length = 100   # Maximum sequence length
dropout = 0.1          # Dropout rate
pad_idx = 0            # Index of the padding token

# Building the model draws the initial weights from the global RNG seeded above
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)

### Testing on Random Data

In [None]:
# Generate random sample data
# Re-seed so that the sampled token ids do not depend on how many random numbers
# were drawn while building the model.
torch.manual_seed(42)

# Token ids are sampled from [1, vocab_size), so the padding index 0 never occurs.
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)).to(device)  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)).to(device)  # (batch_size, seq_length)

#### Inference

In [None]:
# Generate the next token using the first token in the first target tensor
model.eval()  # evaluation mode: disables dropout so the result is deterministic

# Inference only: no gradients are needed, so skip building the autograd graph
with torch.no_grad():
    memory, src_mask = model.encode(src_data[:1, :])
    output = model.decode(tgt_data[:1, :1], memory, src_mask)

# Pick the highest-scoring vocabulary entry for the single decoded position
y = output.view(-1, tgt_vocab_size).argmax(-1)
y

If your code is correct, you should get tensor([990]).

#### Training

In [None]:
# Train the model for 10 epochs.
# Each "epoch" here is a single gradient step on the same random batch.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)
grad_clip = 1
n_epochs = 10

model.train()  # training mode: enables dropout

for epoch in range(n_epochs):
    optimizer.zero_grad()
    
    # Forward pass with teacher forcing: the decoder input drops the last target token
    output = model(src_data, tgt_data[:, :-1])
    
    # tgt_data is of shape [batch_size, tgt_len]
    # output is of shape [batch_size, tgt_len - 1, tgt_vocab_size]
    # Flatten both so that every row is one prediction and every target entry one label
    output = output.contiguous().view(-1, tgt_vocab_size)
    # The labels are the target tokens shifted by one position (the first token is dropped)
    tgt = tgt_data[:, 1:].contiguous().view(-1)
    loss = criterion(output, tgt)        
    
    loss.backward()        
    # Rescale the gradients so that their total norm does not exceed grad_clip
    nn.utils.clip_grad_norm_(model.parameters(), grad_clip)        
    optimizer.step()    
    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')   

You should see the loss decreasing from around 8.6 to 8.1.

### Machine Translation Example

We now consider a real-world example using the Multi30k German-English translation task. This task is much smaller than the WMT task considered in the paper (only 30K sentence pairs compared to 4.5M pairs in the WMT-14 English-German dataset), but it illustrates the whole system. <br>
It is recommended to run this example on Google Colab, or on a machine with a strong GPU.

#### Define Tokenizers 

In [None]:
# Load spacy models for tokenization
def _load_spacy_model(name):
    """Return the spaCy pipeline `name`, downloading it first when it is not installed."""
    try:
        return spacy.load(name)
    except OSError:  # IOError is an alias of OSError in Python 3
        # Download through spaCy's own helper rather than shelling out to `python`:
        # a bare `python` on PATH may belong to a different environment than the one
        # running this kernel, and the exit status of os.system was being ignored.
        from spacy.cli import download
        download(name)
        return spacy.load(name)

spacy_de = _load_spacy_model('de_core_news_sm')
spacy_en = _load_spacy_model('en_core_web_sm')

def tokenize_de(text):
    """Split `text` into a list of token strings using the tokenizer of the German spaCy model."""
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """Split `text` into a list of token strings using the tokenizer of the English spaCy model."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

def yield_tokens(data_iter, tokenizer, language):
    """Lazily yield the tokenized text of every sample in `data_iter`.

    `language` is the index (or key) that selects which element of a sample is passed
    to `tokenizer`.
    """
    yield from (tokenizer(sample[language]) for sample in data_iter)

# Wrap the spaCy-based tokenizer functions with torchtext's get_tokenizer.
# These are the tokenizers used below to build the vocabularies and process the data.
tokenizer_de = get_tokenizer(tokenize_de)
tokenizer_en = get_tokenizer(tokenize_en)

#### Build Vocabularies

In [None]:
# Only the training split is used to build the vocabularies
train_data, _, _ = Multi30k(split=('train', 'valid', 'test'))
# Element 0 of every pair is tokenized with the German tokenizer (source side),
# element 1 with the English tokenizer (target side).
# Both vocabularies receive the same list of special tokens, in the same order.
vocab_src = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_de, 0), 
                                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])
vocab_tgt = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_en, 1), 
                                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])

# Look-ups of tokens that are missing from a vocabulary fall back to the '<unk>' index
vocab_src.set_default_index(vocab_src['<unk>'])
vocab_tgt.set_default_index(vocab_tgt['<unk>'])

#### Create the Transformer

In [None]:
# Define the hyperparameters of the model
# (these replace the toy values that were used for the random-data test above)
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate

# Assume pad_idx is the padding index in the target vocabulary.
# NOTE(review): the model uses this single index for the source mask as well. Both
# vocabularies are built with the same specials list, so the indices presumably
# coincide; verify if the vocabularies are ever built differently.
pad_idx = vocab_tgt['<pad>']

# Initialize the Transformer model (this replaces the model trained on random data)
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

# Move the model to the appropriate device (GPU or CPU)
# `device` is computed again here with the same expression as at the top of the notebook
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Hyperparameters for the training process
batch_size = 128
grad_clip = 1
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#### Data Processing

In [None]:
def data_process(raw_data_iter):
    """Convert raw (source, target) sentence pairs into pairs of token-index tensors.

    Each source sentence is tokenized with `tokenizer_de` and indexed with `vocab_src`;
    each target sentence is tokenized with `tokenizer_en` and indexed with `vocab_tgt`.
    Returns a list of (src_tensor, tgt_tensor) tuples of dtype torch.long.
    """
    def encode(text, tokenizer, vocab):
        # Tokenize the sentence and look every token up in the vocabulary
        return torch.tensor([vocab[token] for token in tokenizer(text)], dtype=torch.long)

    return [
        (encode(raw_src, tokenizer_de, vocab_src), encode(raw_tgt, tokenizer_en, vocab_tgt))
        for raw_src, raw_tgt in raw_data_iter
    ]

# Load the raw splits again and replace the train/validation splits by lists of
# (source tensor, target tensor) pairs. The names train_data / valid_data are rebound here.
train_data, valid_data, test_data = Multi30k(split=('train', 'valid', 'test'))
train_data = data_process(train_data)
valid_data = data_process(valid_data)
# test_data is intentionally left unprocessed:
#test_data = data_process(test_data)   
# The test set of Multi30k is corrupted
# See https://discuss.pytorch.org/t/unicodedecodeerror-when-running-test-iterator/192818/3

In [None]:
def generate_batch(data_batch):
    """Processes a batch of source-target pairs by adding start-of-sequence (BOS) and end-of-sequence (EOS) tokens
    to each sequence and padding all sequences to the same length.
    
    Parameters:
    - data_batch (Iterable[Tuple[Tensor, Tensor]]): A batch of source-target pairs, where each element is a tuple
      containing the source sequence tensor and the target sequence tensor.

    Returns:
    - Tuple[Tensor, Tensor]: The padded source batch and the padded target batch, each with the batch
      dimension first. Shorter sequences are filled with the '<pad>' index of the respective vocabulary.
    """
    # The BOS/EOS marker tensors are identical for every sequence, so build them once per batch
    src_bos, src_eos = torch.tensor([vocab_src['<bos>']]), torch.tensor([vocab_src['<eos>']])
    tgt_bos, tgt_eos = torch.tensor([vocab_tgt['<bos>']]), torch.tensor([vocab_tgt['<eos>']])

    src_batch, tgt_batch = [], []
    
    # Iterate over each source-target pair in the provided batch
    for src_item, tgt_item in data_batch:
        # Prepend the start-of-sequence (BOS) token and append the end-of-sequence (EOS) token to the sequences
        src_batch.append(torch.cat([src_bos, src_item, src_eos], dim=0))
        tgt_batch.append(torch.cat([tgt_bos, tgt_item, tgt_eos], dim=0))
        
    # Pad the sequences in each batch to ensure they all have the same length.
    # 'batch_first=True' indicates that the batch dimension should come first in the resulting tensor.
    src_batch = pad_sequence(src_batch, padding_value=vocab_src['<pad>'], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=vocab_tgt['<pad>'], batch_first=True)
    return src_batch, tgt_batch

# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
# The training data is reshuffled every epoch.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data. It is not shuffled: the validation loss is an
# average of per-batch means, so keeping the batches fixed makes the reported value comparable
# between epochs and between runs.
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=generate_batch)

In [None]:
def train(model, iterator, optimizer, criterion, grad_clip):
    """
    Trains the model for one epoch over the given dataset.
    This function iterates over the provided data iterator, performing the forward and backward passes for each batch.
    It employs teacher forcing by feeding the shifted target sequence (excluding the last token) as input to the decoder.
    
    Parameters:
    - model (torch.nn.Module): The model to be trained. It is called as model(src, decoder_input).
    - iterator (Iterable): A sized iterable that yields (src, tgt) batches of token indices, batch dimension first.
    - optimizer (torch.optim.Optimizer): The optimizer to use for updating the model parameters.
    - criterion (Callable): The loss function used to compute the difference between the model's predictions and the actual targets.
    - grad_clip (float): The maximum norm of the gradients for gradient clipping. 

    Returns:
    - float: The average loss for the epoch, computed as the total loss over all batches divided by the number of batches in the iterator.
    """    
    # Set the model to training mode (this enables dropout).
    model.train()   

    # Send every batch to the device that holds the model instead of relying on a
    # notebook-level `device` variable. A model without parameters leaves batches in place.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None
    
    epoch_loss = 0
    
    for src, tgt in iterator:
        if model_device is not None:
            src, tgt = src.to(model_device), tgt.to(model_device)
        optimizer.zero_grad()
        
        # Forward pass through the model. 
        # The decoder input (tgt[:, :-1]) excludes the last token, implementing teacher forcing.
        output = model(src, tgt[:, :-1])
        
        # tgt is of shape [batch_size, tgt_len]
        # output is of shape [batch_size, tgt_len - 1, vocab_size]
        # Flatten the output so that every row holds the scores of one predicted token. The
        # vocabulary size is read from the output itself, as evaluate() does.
        output = output.contiguous().view(-1, output.shape[-1])
        
        # The target tensor is reshaped to a 1D tensor, excluding the first token (BOS) from each sequence.
        tgt = tgt[:, 1:].contiguous().view(-1)
        
        # Compute loss, perform backpropagation, and update model parameters
        loss = criterion(output, tgt)          
        loss.backward() 
        # Rescale the gradients so that their total norm does not exceed grad_clip
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)  
        optimizer.step()        
        epoch_loss += loss.item()
        
    # Compute average loss per batch for the current epoch
    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """
    Evaluates the model's performance on a given dataset.
    This function is similar to the training loop, but without the backward pass and parameter updates.

    Parameters:
    - model (torch.nn.Module): The model to evaluate. It is called as model(src, decoder_input).
    - iterator (Iterable): A sized iterable that yields (src, tgt) batches of token indices, batch dimension first.
    - criterion (Callable): The loss function.

    Returns:
    - float: The average loss per batch.
    """
    # Evaluation mode disables dropout
    model.eval()

    # Send every batch to the device that holds the model instead of relying on a
    # notebook-level `device` variable. A model without parameters leaves batches in place.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    epoch_loss = 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for src, tgt in iterator:
            if model_device is not None:
                src, tgt = src.to(model_device), tgt.to(model_device)
            # Same teacher-forcing setup as in training: the decoder input drops the last token
            output = model(src, tgt[:, :-1])            
            output_dim = output.shape[-1]            
            output = output.contiguous().view(-1, output_dim)
            # The labels are the target tokens without the first (BOS) token
            tgt = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

#### Training the Model

In [None]:
n_epochs = 20

for epoch in range(n_epochs):
    # One full pass over the training data, followed by a pass over the validation data
    train_loss = train(model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(model, valid_iterator, criterion)

    # Report the average per-batch losses of this epoch
    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

The train loss should decrease from around 5.7 to 2.8 after 20 epochs.

#### Translating a Sample Sentence

In [None]:
def translate_sentence(model, sentence, vocab_src, vocab_tgt, max_length=50):
    """
    Translates a source sentence into the target language with greedy decoding.

    The sentence is tokenized, wrapped in <bos>/<eos>, and encoded once. The target sequence
    starts as [<bos>] and is extended one token at a time with the highest-scoring next token,
    until <eos> is produced or max_length tokens have been generated.

    Parameters:
    - model (torch.nn.Module): The trained Transformer model. It must provide
      encode(src) -> (memory, src_mask) and decode(tgt, memory, src_mask).
    - sentence (str): The source sentence to translate.
    - vocab_src: The source vocabulary, indexed as vocab_src[token] -> index. It must contain the
      special tokens '<bos>' and '<eos>'.
    - vocab_tgt: The target vocabulary, indexed as vocab_tgt[token] -> index. It must also provide
      lookup_token(index) to convert an index back to its token string.
    - max_length (int, optional): The maximum number of tokens to generate. Decoding stops at
      this length if <eos> has not been produced.

    Returns:
    - str: The generated tokens joined by single spaces, without <bos> and without the
      terminating <eos>.
    """
    model.eval()

    # Look the special indices up once instead of on every decoding step
    bos_idx = vocab_tgt['<bos>']
    eos_idx = vocab_tgt['<eos>']

    # Tokenize the source sentence and wrap it in <bos> ... <eos>
    src_tokens = (
        [vocab_src['<bos>']]
        + [vocab_src[token] for token in tokenize_de(sentence)]
        + [vocab_src['<eos>']]
    )
    src_tensor = torch.tensor(src_tokens, dtype=torch.long).unsqueeze(0).to(device)  # add batch dimension

    # The target sequence starts with <bos> only
    tgt_tokens = [bos_idx]

    # Inference only: no gradients are needed for encoding or decoding
    with torch.no_grad():
        # Encode the source sequence once; the result is reused at every decoding step
        memory, src_mask = model.encode(src_tensor)

        for _ in range(max_length):
            tgt_tensor = torch.tensor(tgt_tokens, dtype=torch.long).unsqueeze(0).to(device)

            # NOTE(review): assumes decode returns per-position scores over the target
            # vocabulary, shaped [1, tgt_len, vocab] - confirm against the Transformer class
            output = model.decode(tgt_tensor, memory, src_mask)

            # Greedy choice: best-scoring token at the last position
            next_token = output[:, -1, :].argmax(-1).item()
            tgt_tokens.append(next_token)

            if next_token == eos_idx:
                break

    # Drop <bos>, and drop the final token only when it really is <eos>. When decoding stops
    # because max_length was reached, the last token is part of the translation and is kept.
    generated = tgt_tokens[1:]
    if generated and generated[-1] == eos_idx:
        generated = generated[:-1]

    return ' '.join(vocab_tgt.lookup_token(token) for token in generated)

In [None]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
# Translate the sample with the trained model; model, vocab_src and vocab_tgt are expected to
# come from earlier cells
translated_sentence = translate_sentence(model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

You should get a translation similar to the reference ("A little boy playing outside with a ball.") after 20 epochs of training.

In [None]:
import itertools
import pandas as pd
import matplotlib.pyplot as plt

# Hyperparameter ranges to experiment with: each key maps to the list of values to try.
# The full grid has 3 * 3 * 3 * 3 = 81 combinations.
# Every num_heads value must divide d_model (fixed to 512 in create_model_with_config),
# because MultiHeadAttention asserts d_model % num_heads == 0.
hyperparameter_configs = {
    'num_heads': [4, 8, 16],
    'num_layers': [3, 6, 9],
    'learning_rate': [0.0001, 0.0005, 0.001],
    'batch_size': [32, 64, 128]
}

# Generate all combinations 
def generate_configs(configs, max_experiments=20):
    """
    Builds the list of hyperparameter configurations to try.

    Parameters:
    - configs (dict): Maps each hyperparameter name to the list of values to try.
    - max_experiments (int, optional): Upper bound on the number of configurations returned.

    Returns:
    - list[dict]: One dict per configuration, mapping each hyperparameter name to one value.
      If the full grid has at most max_experiments entries, the whole grid is returned in
      itertools.product order. Otherwise a random subset of max_experiments entries is drawn
      with a fixed seed, so repeated calls return the same subset.
    """
    import random  # kept local: only this function needs it

    keys = list(configs)
    all_combinations = list(itertools.product(*configs.values()))

    # Limit the number of experiments if the grid is too large
    if len(all_combinations) > max_experiments:
        # A private generator seeded with 42 yields the same subset as seeding the global
        # generator would, but leaves the global random state untouched for the caller.
        rng = random.Random(42)
        all_combinations = rng.sample(all_combinations, max_experiments)

    return [dict(zip(keys, combo)) for combo in all_combinations]

# Restrict the search to 3 configurations sampled from the 81-entry grid
experiments = generate_configs(hyperparameter_configs, max_experiments=3)
print(f"Total experiments: {len(experiments)}")

In [None]:
def create_model_with_config(config, src_vocab_size, tgt_vocab_size, pad_idx):
    """
    Builds a Transformer for one experiment and moves it to the active device.

    Only the number of layers ('num_layers') and the number of attention heads ('num_heads')
    are read from config; all other architectural settings are the same for every experiment.
    """
    # Settings shared by all experiments
    fixed_settings = dict(
        d_model=512,
        d_ff=2048,
        max_seq_length=5000,
        dropout=0.1,
    )

    # Settings that change from one experiment to the next
    tuned_settings = dict(
        N=config['num_layers'],
        n_heads=config['num_heads'],
    )

    transformer = Transformer(
        src_vocab_size=src_vocab_size,
        tgt_vocab_size=tgt_vocab_size,
        pad_idx=pad_idx,
        **fixed_settings,
        **tuned_settings
    )
    return transformer.to(device)

def create_optimizer_with_config(model, config):
    """Returns an Adam optimizer over the model's parameters, with the learning rate from config."""
    params = model.parameters()
    learning_rate = config['learning_rate']

    # Adam settings held constant across experiments
    adam_settings = {'betas': (0.9, 0.98), 'eps': 1e-9}

    return optim.Adam(params, lr=learning_rate, **adam_settings)

def create_dataloader_with_config(data, config, collate_fn):
    """Wraps data in a shuffling DataLoader whose batch size is taken from config."""
    loader_settings = {
        'batch_size': config['batch_size'],
        'shuffle': True,
        'collate_fn': collate_fn,
    }
    return DataLoader(data, **loader_settings)

In [None]:
def train_with_metrics(model, train_iterator, valid_iterator, optimizer, criterion, config,
                       num_epochs=3, grad_clip=1.0):
    """
    Trains the model for a number of epochs and records the loss after each one.

    Parameters:
    - model (torch.nn.Module): The model to train; called as model(src, tgt_input).
    - train_iterator: Iterable of (src, tgt) batches used for training.
    - valid_iterator: Iterable of (src, tgt) batches used for validation.
    - optimizer: The optimizer that updates the model's parameters.
    - criterion: The loss function, called as criterion(predictions, labels).
    - config (dict): The hyperparameter configuration; it is only stored in the result.
    - num_epochs (int, optional): Number of epochs to run.
    - grad_clip (float, optional): Maximum gradient norm applied before each optimizer step.

    Returns:
    - dict with keys 'config', 'train_losses' and 'val_losses' (one average per-batch loss per
      epoch), 'best_val_loss' (lowest validation loss seen), 'final_train_loss' and
      'final_val_loss' (losses of the last epoch).
    """
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # Training pass
        model.train()
        epoch_train_loss = 0

        for batch in train_iterator:
            optimizer.zero_grad()
            loss = _teacher_forced_loss(model, criterion, batch)
            loss.backward()

            # Clip the gradient norm before the parameter update
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

            epoch_train_loss += loss.item()

        avg_train_loss = epoch_train_loss / len(train_iterator)

        # Validation pass: no gradients, no parameter updates
        model.eval()
        epoch_val_loss = 0

        with torch.no_grad():
            for batch in valid_iterator:
                epoch_val_loss += _teacher_forced_loss(model, criterion, batch).item()

        avg_val_loss = epoch_val_loss / len(valid_iterator)

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        # Track the best validation loss seen so far
        best_val_loss = min(best_val_loss, avg_val_loss)

        print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    return {
        'config': config,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'best_val_loss': best_val_loss,
        'final_train_loss': train_losses[-1],
        'final_val_loss': val_losses[-1]
    }


def _teacher_forced_loss(model, criterion, batch):
    """Returns the criterion value for one (src, tgt) batch scored with teacher forcing."""
    src, tgt = batch
    src, tgt = src.to(device), tgt.to(device)

    # Decoder input is the target without its last token
    output = model(src, tgt[:, :-1])

    # Flatten using the model's own output width, so the reshape does not depend on a
    # notebook-level vocabulary-size variable matching the model that was passed in
    output = output.contiguous().view(-1, output.size(-1))

    # Labels are the target without its first token
    target = tgt[:, 1:].contiguous().view(-1)

    return criterion(output, target)

In [None]:
def run_experiments(experiments, train_data, valid_data, vocab_src, vocab_tgt, num_epochs=3):
    """
    Trains one fresh model per hyperparameter configuration and collects the results.

    Parameters:
    - experiments (list[dict]): The configurations to run; each provides 'num_heads',
      'num_layers', 'learning_rate' and 'batch_size'.
    - train_data: The training dataset passed to the DataLoader.
    - valid_data: The validation dataset passed to the DataLoader.
    - vocab_src: The source vocabulary; its length is the source vocabulary size.
    - vocab_tgt: The target vocabulary; its length is the target vocabulary size and
      vocab_tgt['<pad>'] is used as the padding index.
    - num_epochs (int, optional): Number of epochs to train each configuration.

    Returns:
    - list[dict]: One result dict (as returned by train_with_metrics) per experiment that
      finished. An experiment that raises is reported on stdout and left out of the list.
    """
    results = []
    pad_idx = vocab_tgt['<pad>']
    # Padding positions do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    for i, config in enumerate(experiments):
        print(f"\n{'='*50}")
        print(f"Experiment {i+1}/{len(experiments)}")
        print(f"Config: {config}")
        print(f"{'='*50}")

        try:
            # Fresh model and optimizer for every configuration
            model = create_model_with_config(config, len(vocab_src), len(vocab_tgt), pad_idx)
            optimizer = create_optimizer_with_config(model, config)

            # Training batches are shuffled
            train_iterator = create_dataloader_with_config(train_data, config, generate_batch)

            # Validation batches are not shuffled: the loss is averaged per batch, so a fixed
            # batch composition keeps the validation loss comparable across epochs and runs
            valid_iterator = DataLoader(
                valid_data,
                batch_size=config['batch_size'],
                shuffle=False,
                collate_fn=generate_batch
            )

            # Train and collect results
            result = train_with_metrics(
                model, train_iterator, valid_iterator,
                optimizer, criterion, config, num_epochs=num_epochs
            )

            results.append(result)

        except Exception as e:
            # Best effort: one failing configuration must not abort the remaining ones
            print(f"Experiment {i+1} failed: {e}")
            continue

    return results

# Run every sampled configuration; experiments that raise are reported on stdout and are
# missing from experiment_results
experiment_results = run_experiments(experiments, train_data, valid_data, vocab_src, vocab_tgt)

In [None]:
def analyze_results(results):
    """
    Flattens the experiment results into a DataFrame with one row per experiment.

    Parameters:
    - results (list[dict]): Result dicts with a 'config' dict (providing 'num_heads',
      'num_layers', 'learning_rate', 'batch_size') and the keys 'best_val_loss',
      'final_val_loss' and 'final_train_loss'.

    Returns:
    - pandas.DataFrame: Columns are the four hyperparameters followed by the three losses.
      The columns are present even when results is empty, so that column-based operations
      such as sorting by 'best_val_loss' still work on an empty frame.
    """
    config_keys = ['num_heads', 'num_layers', 'learning_rate', 'batch_size']
    metric_keys = ['best_val_loss', 'final_val_loss', 'final_train_loss']

    rows = [
        {
            **{key: result['config'][key] for key in config_keys},
            **{key: result[key] for key in metric_keys},
        }
        for result in results
    ]

    # Explicit columns keep the schema stable when there are no rows
    return pd.DataFrame(rows, columns=config_keys + metric_keys)

# Analyze results: one row per finished experiment
results_df = analyze_results(experiment_results)

# Sort by best validation loss (ascending, so the lowest loss comes first)
results_df_sorted = results_df.sort_values('best_val_loss')
print("Top 5 configurations by validation loss:")
# head() shows at most the first 5 rows
print(results_df_sorted.head())

In [None]:
def statistical_analysis(results_df):
    """
    Prints how each hyperparameter correlates with the best validation loss, then prints and
    returns the configuration with the lowest best validation loss.

    Parameters:
    - results_df (pandas.DataFrame): One row per experiment with the columns 'num_heads',
      'num_layers', 'learning_rate', 'batch_size' and 'best_val_loss'.

    Returns:
    - pandas.Series: The row of results_df with the smallest 'best_val_loss'.

    Raises:
    - ValueError: If results_df has no rows.
    """
    if results_df.empty:
        raise ValueError('results_df is empty: there are no experiment results to analyze')

    print("Correlation Analysis:")
    print("="*50)
    correlations = results_df[['num_heads', 'num_layers', 'learning_rate', 'batch_size', 'best_val_loss']].corr()
    print(correlations['best_val_loss'].sort_values())

    print("\nBest hyperparameter values:")
    print("="*50)
    best_config = results_df.loc[results_df['best_val_loss'].idxmin()]
    print(f"Best validation loss: {best_config['best_val_loss']:.4f}")
    print("Configuration:")
    # Selecting a single row of mixed int/float columns upcasts the integers to float,
    # so the integer hyperparameters are cast back for display (8 rather than 8.0)
    print(f"  - Number of heads: {int(best_config['num_heads'])}")
    print(f"  - Number of layers: {int(best_config['num_layers'])}")
    print(f"  - Learning rate: {best_config['learning_rate']}")
    print(f"  - Batch size: {int(best_config['batch_size'])}")

    return best_config

# Print the correlation summary and keep the row with the lowest best validation loss
best_config = statistical_analysis(results_df)