In [None]:
# Author: Roi Yehoshua
# Date: January 2024
# MIT License

# Based on the PyTorch implementation from https://nlp.seas.harvard.edu/annotated-transformer/

In [None]:
# Remove any preinstalled torch, torchvision, torchtext and torchdata packages
# (-y skips the confirmation prompt) before the pinned versions are installed below.
!pip uninstall torch torchvision torchtext torchdata -y

Found existing installation: torch 2.1.0
Uninstalling torch-2.1.0:
  Successfully uninstalled torch-2.1.0
[0mFound existing installation: torchtext 0.16.0
Uninstalling torchtext-0.16.0:
  Successfully uninstalled torchtext-0.16.0
Found existing installation: torchdata 0.7.0
Uninstalling torchdata-0.7.0:
  Successfully uninstalled torchdata-0.7.0


In [None]:
# Print the version of the `python` executable that the shell resolves
!python --version


Python 3.10.12


In [None]:
# Install pinned versions of torch, torchdata, torchtext and portalocker
# (--quiet suppresses pip's progress output).
# NOTE(review): the recorded output reports conflicts with fastai and torchaudio;
# neither package is imported by this notebook.
!pip install torch==2.1.0 torchdata==0.7.0 torchtext==0.16.0 portalocker==2.10.0 --quiet


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.17 requires torchvision>=0.11, which is not installed.
torchaudio 2.4.1+cu121 requires torch==2.4.1, but you have torch 2.1.0 which is incompatible.[0m[31m
[0m

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import spacy
import os

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [None]:
# Seed PyTorch's random number generator so that runs are repeatable
torch.manual_seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

### Multi-Head Attention

$$
\begin{aligned}
    \text{MultiHead}(Q, K, V) &= \text{Concat}(\text{head}_1, \ldots, \text{head}_h)W^O \\
    \text{head}_i &= \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \\
    \text{Attention}(Q, K, V) &= \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
\end{aligned}
$$

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The d_model-dimensional queries, keys and values are linearly projected, split into
    `num_heads` heads of size d_k = d_model // num_heads, attended independently, and then
    merged and projected back to d_model dimensions.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()

        # Every head receives an equal slice of the embedding, so the split must be exact.
        assert d_model % num_heads == 0, 'd_model must be divisible by num_heads'

        self.d_model = d_model           # Total dimension of the model
        self.num_heads = num_heads       # Number of attention heads
        self.d_k = d_model // num_heads  # Dimension of each head (d_v is assumed equal to d_k)

        # Input projections for the queries, keys and values
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Output projection applied to the concatenated heads
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_k)) V, where positions with mask == 0 are suppressed."""
        # Scale by the square root of the key dimension
        scale = math.sqrt(K.size(-1))
        logits = (Q @ K.transpose(-2, -1)) / scale

        # Masked positions get a large negative score, i.e. (almost) zero probability
        if mask is not None:
            logits = logits.masked_fill(mask == 0, -1e9)

        # Normalize over the key axis and take the weighted sum of the values
        weights = logits.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape [batch_size, seq_length, d_model] into [batch_size, num_heads, seq_length, d_k]."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of split_heads: merge the heads back into [batch_size, seq_length, d_model]."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() leaves the tensor non-contiguous, which view() cannot handle directly
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Apply multi-head attention to inputs of shape [batch_size, seq_length, d_model]."""
        # Project the inputs and split each projection into heads
        queries = self.split_heads(self.W_q(Q))  # [batch_size, num_heads, seq_length, d_k]
        keys = self.split_heads(self.W_k(K))     # [batch_size, num_heads, seq_length, d_k]
        values = self.split_heads(self.W_v(V))   # [batch_size, num_heads, seq_length, d_k]

        # Attend within every head independently
        heads = self.scaled_dot_product_attention(queries, keys, values, mask)

        # Concatenate the heads and apply the final linear projection
        return self.W_o(self.combine_heads(heads))


### Feed-Forward NN

$$
    \text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2
$$

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward network: two linear layers with a ReLU and dropout in between."""
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # expands d_model -> d_ff
        self.linear2 = nn.Linear(d_ff, d_model)  # projects d_ff -> d_model
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute linear2(dropout(relu(linear1(x)))) on the last dimension of x."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)



### Positional Encoding

$$
\begin{aligned}
    \text{PE}(pos, 2i) &= \sin(pos/10000^{2i/d_{\text{model}}}) \\
    \text{PE}(pos, 2i + 1) &= \cos(pos/10000^{2i/d_{\text{model}}})
\end{aligned}
$$

In [None]:
class PositionalEncoding(nn.Module):
    """
    Implements the positional encoding module using sinusoidal functions of different frequencies
    for each dimension of the encoding.

    Parameters:
    - d_model (int): Dimension of the embeddings that the encoding is added to (even or odd).
    - max_seq_length (int): Number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model, max_seq_length):
        super().__init__()

        # Positional encoding (PE) matrix with dimensions [max_seq_length, d_model].
        pe = torch.zeros(max_seq_length, d_model)

        # Positions 0 .. max_seq_length - 1 as a column vector of shape [max_seq_length, 1].
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # One frequency per even dimension index: 1 / 10000^(2i / d_model), computed in log space.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Angle for every (position, frequency) pair: shape [max_seq_length, ceil(d_model / 2)].
        angles = position * div_term

        # Even dimensions hold the sine of the angle.
        pe[:, 0::2] = torch.sin(angles)

        # Odd dimensions hold the cosine. When d_model is odd there is one odd column fewer than
        # there are frequencies, so the surplus frequency is dropped to keep the shapes compatible.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Register 'pe' as a buffer: it moves with the module (e.g. .to(device)) but is not a
        # trainable parameter. unsqueeze(0) adds a batch dimension for broadcasting.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the positional encodings of the first x.size(1) positions to x.

        x is expected to have dimensions [batch_size, seq_length, d_model]; the same encoding
        is broadcast across the batch dimension.
        """
        x = x + self.pe[:, :x.size(1)]
        return x

### Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """One encoder layer: a multi-head self-attention sub-layer followed by a feed-forward
       sub-layer. Each sub-layer output passes through dropout, is added to its input
       (residual connection) and is then layer-normalized.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run x through both sub-layers; `mask` is forwarded to the self-attention."""
        # Sub-layer 1: self-attention (queries, keys and values are all x)
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.layer_norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network
        transformed = self.dropout(self.feed_forward(x))
        return self.layer_norm2(x + transformed)






### Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
    """One decoder layer: multi-head self-attention, cross-attention over the encoder output,
       and a feed-forward sub-layer. Each sub-layer output passes through dropout, is added to
       its input (residual connection) and is then layer-normalized.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the target representation x through the three sub-layers.

        `tgt_mask` is passed to the self-attention and `src_mask` to the cross-attention.
        """
        # Sub-layer 1: self-attention over the target sequence
        self_attended = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.layer_norm1(x + self_attended)

        # Sub-layer 2: cross-attention; queries come from the decoder, keys/values from the encoder
        cross_attended = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.layer_norm2(x + cross_attended)

        # Sub-layer 3: position-wise feed-forward network
        transformed = self.dropout(self.feed_forward(x))
        return self.layer_norm3(x + transformed)




### The Full Model

In [None]:
class Transformer(nn.Module):
    """
    Implements the Transformer model for sequence-to-sequence tasks such as machine translation.
    The Transformer model, as described in "Attention is All You Need" by Vaswani et al., consists of an encoder and
    decoder architecture that uses self-attention mechanisms to process input sequences and generate output sequences.

    Parameters:
    - src_vocab_size (int): Size of the source vocabulary.
    - tgt_vocab_size (int): Size of the target vocabulary.
    - d_model (int): Dimension of the model embeddings and hidden states.
    - N (int): Number of layers in both the encoder and decoder stacks.
    - n_heads (int): Number of attention heads in each multi-head attention mechanism.
    - d_ff (int): Dimension of the feed-forward network within each layer.
    - max_seq_length (int): Maximum length of input sequences, used for positional encoding.
    - dropout (float): Dropout rate applied to embeddings and sub-layers.
    - pad_idx (int): Index of the padding token; the same index is used for the source and the target masks.

    Attributes:
    - src_embedding (torch.nn.Embedding): Embedding layer for source sequences.
    - tgt_embedding (torch.nn.Embedding): Embedding layer for target sequences.
    - positional_encoding (PositionalEncoding): Adds positional information to embeddings.
    - encoder (torch.nn.ModuleList): Stack of N encoder layers.
    - decoder (torch.nn.ModuleList): Stack of N decoder layers.
    - out (torch.nn.Linear): Linear layer that projects decoder output to target vocabulary size.
    - dropout (torch.nn.Dropout): Dropout layer applied after embedding and positional encoding.

    Methods:
    - init_weights: Initializes model parameters using Glorot uniform initialization.
    - create_source_mask: Creates a mask for padding tokens in the source sequence to ignore them in attention computations.
    - create_target_mask: Creates combined padding and future token masks for the target sequence.
    - encode: Processes the source sequence through the encoder stack and generates memory states.
    - decode: Processes the target sequence through the decoder stack using memory states from the encoder and applicable masks.
    - forward: Defines the forward pass of the model using the encode and decode methods.
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, N, n_heads, d_ff, max_seq_length, dropout, pad_idx):
        super().__init__()

        # Embedding layers for source and target
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Positional encoding (shared by the source and the target side)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Encoder and decoder stacks. The number of heads comes from the constructor argument
        # `n_heads`, not from a variable that happens to exist in the notebook's global scope.
        self.encoder = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(N)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(N)])

        # Output linear layer producing one logit per target-vocabulary entry
        self.out = nn.Linear(d_model, tgt_vocab_size)

        self.dropout = nn.Dropout(dropout)

        # Initialization
        self.init_weights()
        self.pad_idx = pad_idx

    def init_weights(self):
        """Initialize parameters with Glorot / fan_avg"""
        for p in self.parameters():
            # Only tensors with more than one dimension (weight matrices, embeddings) are re-initialized
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def create_source_mask(self, src):
        """Create a mask for padding tokens in the source.

        Returns a boolean tensor of shape [batch_size, 1, 1, src_len] that is True at non-padding positions.
        """
        # unsqueeze(1) adds a dimension for the attention heads and unsqueeze(2) one for the query
        # positions, so the mask broadcasts over all heads and all queries of the attention scores.
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, src_len]
        return src_mask

    def create_target_mask(self, tgt):
        """Create masks for both padding tokens and future tokens.

        Returns a boolean tensor of shape [batch_size, 1, tgt_len, tgt_len].
        """
        # Target padding mask: unsqueeze(1) adds the head dimension, unsqueeze(3) the key dimension,
        # so the rows (query positions) that hold padding tokens are masked out entirely.
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(3)  # [batch_size, 1, tgt_len, 1]

        # No-peek mask: lower-triangular, so position i can only attend to positions <= i.
        # It is created on the device of `tgt` so that it can always be combined with the padding mask.
        tgt_len = tgt.size(1)
        tgt_nopeek_mask = torch.tril(torch.ones(tgt_len, tgt_len, device=tgt.device)).bool()

        # Combine masks
        tgt_mask = tgt_pad_mask & tgt_nopeek_mask  # [batch_size, 1, tgt_len, tgt_len]
        return tgt_mask

    def encode(self, src):
        """Encodes the source sequence using the Transformer encoder stack.

        Returns the encoder output (memory) together with the source padding mask.
        """
        src_mask = self.create_source_mask(src)
        src = self.dropout(self.positional_encoding(self.src_embedding(src)))

        # Pass through each layer in the encoder
        for layer in self.encoder:
            src = layer(src, src_mask)
        return src, src_mask

    def decode(self, tgt, memory, src_mask):
        """Decodes the target sequence using the Transformer decoder stack, given the memory from the encoder.

        Returns the logits of shape [batch_size, tgt_len, tgt_vocab_size].
        """
        tgt_mask = self.create_target_mask(tgt)
        tgt = self.dropout(self.positional_encoding(self.tgt_embedding(tgt)))

        # Pass through each layer in the decoder
        for layer in self.decoder:
            tgt = layer(tgt, memory, src_mask, tgt_mask)

        # Output layer
        output = self.out(tgt)
        return output

    def forward(self, src, tgt):
        """
        Defines the forward pass of the Transformer model.

        Parameters:
        - src: Source sequence input tensor (batch_size, src_len)
        - tgt: Target sequence input tensor (batch_size, tgt_len)

        Returns:
        - output: Final output tensor (batch_size, tgt_len, tgt_vocab_size)
        """
        # Encode the source sequence
        memory, src_mask = self.encode(src)

        # Decode the target sequence using the encoder memory
        output = self.decode(tgt, memory, src_mask)

        return output


In [None]:
# Hyperparameters of the model that is exercised on random data
src_vocab_size = 5000  # Number of entries in the source vocabulary
tgt_vocab_size = 5000  # Number of entries in the target vocabulary
d_model = 512          # Width of the embeddings and hidden states
N = 6                  # Layers in the encoder stack and in the decoder stack
num_heads = 8          # Attention heads per multi-head attention module
d_ff = 2048            # Hidden width of the feed-forward networks
max_seq_length = 100   # Longest sequence covered by the positional encoding
dropout = 0.1          # Dropout probability
pad_idx = 0            # Index of the padding token

# Build the model and place it on the selected device (GPU or CPU) in a single step
model = Transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=d_model,
    N=N,
    n_heads=num_heads,
    d_ff=d_ff,
    max_seq_length=max_seq_length,
    dropout=dropout,
    pad_idx=pad_idx,
).to(device)

### Testing on Random Data

In [None]:
# Re-seed so that the sampled batch does not depend on how many random numbers were drawn before
torch.manual_seed(42)

# 64 sequences of max_seq_length token ids each. Ids are drawn from [1, vocab_size), so the
# padding index 0 never occurs in the random data.
src_data = torch.randint(low=1, high=src_vocab_size, size=(64, max_seq_length)).to(device)  # (batch_size, seq_length)
tgt_data = torch.randint(low=1, high=tgt_vocab_size, size=(64, max_seq_length)).to(device)  # (batch_size, seq_length)

#### Inference

In [None]:
# Generate the next token using the first token in the first target tensor
model.eval()  # evaluation mode turns dropout off, so the prediction is deterministic

# Inference needs no gradients; no_grad avoids building the autograd graph
with torch.no_grad():
    memory, src_mask = model.encode(src_data[:1, :])
    output = model.decode(tgt_data[:1, :1], memory, src_mask)

# Flatten to [num_positions, tgt_vocab_size] and take the highest-scoring token per position
y = output.view(-1, tgt_vocab_size).argmax(-1)
y

tensor([990], device='cuda:0')

If your code is correct, you should get tensor([990]).

#### Training

In [None]:
# Sanity-check training: 10 optimization steps on the single random batch
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)
grad_clip = 1
n_epochs = 10

model.train()

for epoch in range(n_epochs):
    optimizer.zero_grad()

    # Teacher forcing: the decoder input is the target without its last token
    output = model(src_data, tgt_data[:, :-1])

    # Flatten the logits to [batch_size * (tgt_len - 1), tgt_vocab_size] and the labels
    # (the target without its first token) to a matching 1D tensor
    output = output.reshape(-1, tgt_vocab_size)
    tgt = tgt_data[:, 1:].reshape(-1)
    loss = criterion(output, tgt)

    # Backpropagate, clip the gradient norm and update the parameters
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

Epoch: 1, Loss: 8.605189323425293
Epoch: 2, Loss: 8.501506805419922
Epoch: 3, Loss: 8.371408462524414
Epoch: 4, Loss: 8.296951293945312


You should see the loss decreasing from around 8.6 to 8.1.

### Machine Translation Example

We now consider a real-world example using the Multi30k German-English translation task. This task is much smaller than the WMT task considered in the paper (only 30K sentence pairs compared to 4.5M pairs in the WMT-14 English-German dataset), but it illustrates the whole system. <br>
It is recommended to run this example on Google Colab, or on a machine with a strong GPU.

#### Define Tokenizers

In [119]:
# Load spacy models for tokenization
def _load_spacy_model(name):
    """Load the spaCy pipeline `name`, downloading it first when it is not installed yet."""
    try:
        return spacy.load(name)
    except OSError:
        # The pipeline package is missing. Download it through spaCy's own API so that it is
        # installed for the interpreter running this kernel and a failed download is not ignored.
        spacy.cli.download(name)
        return spacy.load(name)

spacy_de = _load_spacy_model('de_core_news_sm')
spacy_en = _load_spacy_model('en_core_web_sm')

def tokenize_de(text):
    """Split German `text` into a list of token strings with the German spaCy tokenizer."""
    tokens = spacy_de.tokenizer(text)
    return [token.text for token in tokens]

def tokenize_en(text):
    """Split English `text` into a list of token strings with the English spaCy tokenizer."""
    tokens = spacy_en.tokenizer(text)
    return [token.text for token in tokens]

def yield_tokens(data_iter, tokenizer, language):
    """Lazily yield the token list of element `language` of every sample in `data_iter`."""
    yield from (tokenizer(sample[language]) for sample in data_iter)

# Pass the spaCy-based functions through torchtext's get_tokenizer. The resulting objects are
# called on raw strings later in the notebook (e.g. tokenizer_de(raw_src) in data_process).
tokenizer_de = get_tokenizer(tokenize_de)
tokenizer_en = get_tokenizer(tokenize_en)



#### Build Vocabularies

In [120]:
# Only the training split is used to build the vocabularies
train_data, _, _ = Multi30k(split=('train', 'valid', 'test'))
# Source vocabulary: element 0 of every pair, tokenized with the German tokenizer.
# Both vocabularies list the same four special tokens in the same order.
vocab_src = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_de, 0),
                                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Target vocabulary: element 1 of every pair, tokenized with the English tokenizer
vocab_tgt = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_en, 1),
                                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])

# Use the index of '<unk>' as the default index of each vocabulary
vocab_src.set_default_index(vocab_src['<unk>'])
vocab_tgt.set_default_index(vocab_tgt['<unk>'])

#### Create the Transformer

In [121]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate

# The model uses one padding index for both the source and the target masks, so the two
# vocabularies must agree on the index of '<pad>'.
pad_idx = vocab_tgt['<pad>']
assert vocab_src['<pad>'] == pad_idx, 'source and target vocabularies must share the <pad> index'

# Initialize the Transformer model
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

# Move the model to the appropriate device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Hyperparameters for the training process
batch_size = 128
grad_clip = 1
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#### Data Processing

In [122]:
def data_process(raw_data_iter):
    """Convert raw (source, target) sentence pairs into pairs of 1D long tensors of vocabulary indices.

    The source sentence is tokenized with tokenizer_de and looked up in vocab_src; the target
    sentence is tokenized with tokenizer_en and looked up in vocab_tgt.
    """
    def to_indices(text, tokenizer, vocab):
        # Tokenize the sentence and map every token to its vocabulary index
        return torch.tensor([vocab[token] for token in tokenizer(text)], dtype=torch.long)

    return [
        (to_indices(raw_src, tokenizer_de, vocab_src), to_indices(raw_tgt, tokenizer_en, vocab_tgt))
        for raw_src, raw_tgt in raw_data_iter
    ]

# Reload the raw splits, then replace the train and validation iterators with lists of
# (source tensor, target tensor) pairs. Note that the names train_data / valid_data are rebound.
train_data, valid_data, test_data = Multi30k(split=('train', 'valid', 'test'))
train_data = data_process(train_data)
valid_data = data_process(valid_data)
# The test split is deliberately left unprocessed:
#test_data = data_process(test_data)
# The test set of Multi30k is corrupted
# See https://discuss.pytorch.org/t/unicodedecodeerror-when-running-test-iterator/192818/3

In [123]:
def generate_batch(data_batch):
    """Processes a batch of source-target pairs by adding start-of-sequence (BOS) and end-of-sequence (EOS) tokens
    to each sequence and padding all sequences to the same length.

    Parameters:
    - data_batch (Iterable[Tuple[Tensor, Tensor]]): A batch of source-target pairs, where each element is a tuple
      containing the source sequence tensor and the target sequence tensor.

    Returns:
    - Tuple[Tensor, Tensor]: The padded source batch and the padded target batch, each with the batch
      dimension first.
    """
    # The BOS/EOS markers are identical for every sequence, so build them once per batch
    src_bos = torch.tensor([vocab_src['<bos>']])
    src_eos = torch.tensor([vocab_src['<eos>']])
    tgt_bos = torch.tensor([vocab_tgt['<bos>']])
    tgt_eos = torch.tensor([vocab_tgt['<eos>']])

    src_batch, tgt_batch = [], []

    # Wrap every sequence as <bos> ... <eos>
    for src_item, tgt_item in data_batch:
        src_batch.append(torch.cat([src_bos, src_item, src_eos], dim=0))
        tgt_batch.append(torch.cat([tgt_bos, tgt_item, tgt_eos], dim=0))

    # Pad the sequences of each side to a common length.
    # 'batch_first=True' indicates that the batch dimension should come first in the resulting tensor.
    src_batch = pad_sequence(src_batch, padding_value=vocab_src['<pad>'], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=vocab_tgt['<pad>'], batch_first=True)
    return src_batch, tgt_batch

# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data.
# NOTE(review): shuffle=True is also set here; shuffling is not needed to compute a validation
# loss - confirm whether it is intentional.
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [124]:
def train(model, iterator, optimizer, criterion, grad_clip):
    """
    Trains the model for one epoch over the given dataset.
    This function iterates over the provided data iterator, performing the forward and backward passes for each batch.
    It employs teacher forcing by feeding the shifted target sequence (excluding the last token) as input to the decoder.

    Parameters:
    - model (torch.nn.Module): The model to be trained. It is called as model(src, tgt_input).
    - iterator (Iterable): A sized iterable that yields (src, tgt) batches of token indices.
    - optimizer (torch.optim.Optimizer): The optimizer to use for updating the model parameters.
    - criterion (Callable): The loss function used to compute the difference between the model's predictions and the actual targets.
    - grad_clip (float): The maximum norm of the gradients for gradient clipping.

    Returns:
    - float: The average loss for the epoch, computed as the total loss over all batches divided by the number of batches in the iterator.
    """
    # Set the model to training mode (this enables dropout).
    model.train()

    # Batches are moved to the device that holds the model's parameters, so the function does
    # not depend on a global `device` variable.
    first_param = next(model.parameters(), None)
    batch_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0

    for src, tgt in iterator:
        src, tgt = src.to(batch_device), tgt.to(batch_device)
        optimizer.zero_grad()

        # Forward pass through the model.
        # The decoder input (tgt[:, :-1]) excludes the last token, implementing teacher forcing.
        output = model(src, tgt[:, :-1])

        # Flatten the logits to [number of tokens, vocabulary size]. The vocabulary size is read
        # from the output itself rather than from a global variable.
        output = output.contiguous().view(-1, output.size(-1))

        # The labels are the target without its first token (BOS), flattened to a 1D tensor.
        tgt = tgt[:, 1:].contiguous().view(-1)

        # Compute loss, perform backpropagation, clip the gradient norm and update the parameters
        loss = criterion(output, tgt)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        epoch_loss += loss.item()

    # Compute average loss per batch for the current epoch
    return epoch_loss / len(iterator)

In [125]:
def evaluate(model, iterator, criterion):
    """
    Evaluates the model's performance on a given dataset.
    This function is similar to the training loop, but without the backward pass and parameter updates.

    Parameters:
    - model (torch.nn.Module): The model to evaluate. It is called as model(src, tgt_input).
    - iterator (Iterable): A sized iterable that yields (src, tgt) batches of token indices.
    - criterion (Callable): The loss function.

    Returns:
    - float: The total loss over all batches divided by the number of batches in the iterator.
    """
    # Evaluation mode disables dropout
    model.eval()

    # Batches are moved to the device that holds the model's parameters, so the function does
    # not depend on a global `device` variable.
    first_param = next(model.parameters(), None)
    batch_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0

    # No gradients are needed because no parameter update takes place
    with torch.no_grad():
        for src, tgt in iterator:
            src, tgt = src.to(batch_device), tgt.to(batch_device)

            # Same teacher-forcing setup as in training: the decoder input drops the last token
            output = model(src, tgt[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)

            # The labels are the target without its first token (BOS)
            tgt = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

#### Training the Model

In [126]:
# Number of passes over the training data
n_epochs = 20

for epoch in range(n_epochs):
    # One pass over the training batches, followed by a loss computation on the validation batches
    train_loss = train(model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(model, valid_iterator, criterion)

    # Report both average losses with a 1-based epoch number
    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')


Epoch: 1
	Train Loss: 5.696
	Val Loss: 5.012

Epoch: 2
	Train Loss: 4.871
	Val Loss: 4.778

Epoch: 3
	Train Loss: 4.672
	Val Loss: 4.546

Epoch: 4
	Train Loss: 4.378
	Val Loss: 4.201

Epoch: 5
	Train Loss: 4.092
	Val Loss: 4.036

Epoch: 6
	Train Loss: 3.906
	Val Loss: 3.889

Epoch: 7
	Train Loss: 3.760
	Val Loss: 3.771

Epoch: 8
	Train Loss: 3.634
	Val Loss: 3.684

Epoch: 9
	Train Loss: 3.534
	Val Loss: 3.614

Epoch: 10
	Train Loss: 3.451
	Val Loss: 3.562

Epoch: 11
	Train Loss: 3.386
	Val Loss: 3.510

Epoch: 12
	Train Loss: 3.323
	Val Loss: 3.491

Epoch: 13
	Train Loss: 3.259
	Val Loss: 3.416

Epoch: 14
	Train Loss: 3.196
	Val Loss: 3.372

Epoch: 15
	Train Loss: 3.135
	Val Loss: 3.329

Epoch: 16
	Train Loss: 3.076
	Val Loss: 3.291

Epoch: 17
	Train Loss: 3.021
	Val Loss: 3.252

Epoch: 18
	Train Loss: 2.969
	Val Loss: 3.226

Epoch: 19
	Train Loss: 2.919
	Val Loss: 3.187

Epoch: 20
	Train Loss: 2.868
	Val Loss: 3.156


The train loss should decrease from around 5.7 to 2.8 after 20 epochs.

#### Translating a Sample Sentence

In [127]:
def translate_sentence(model, sentence, vocab_src, vocab_tgt, max_length=50, tokenizer=None):
    """
    Translates a given source sentence into the target language using a trained Transformer model.
    The sentence is tokenized and converted to a tensor of source-vocabulary indices, encoded once,
    and then decoded greedily: the highest-scoring token is appended at every step.

    Parameters:
    - model (torch.nn.Module): The trained Transformer model (must provide encode and decode).
    - sentence (str): The source sentence to translate.
    - vocab_src: The source vocabulary; indexing it with a token returns the token's index.
    - vocab_tgt: The target vocabulary; supports token -> index lookup by indexing and
      index -> token lookup through lookup_token.
    - max_length (int, optional): The maximum number of tokens to generate.
    - tokenizer (Callable[[str], Iterable[str]], optional): Splits the sentence into tokens.
      Defaults to whitespace splitting. Passing the tokenizer that was used to build vocab_src
      (tokenizer_de in this notebook) keeps inference consistent with training.

    Returns:
    - str: The translated sentence as a string of space-separated target tokens.
    """

    # Set device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)  # Move the model to the appropriate device

    if tokenizer is None:
        tokenizer = str.split  # whitespace splitting, the original default

    # Convert the sentence to source indices, wrapped in <bos> ... <eos>
    src_tokens = [vocab_src['<bos>']] + [vocab_src[token] for token in tokenizer(sentence)] + [vocab_src['<eos>']]
    src_tensor = torch.tensor(src_tokens).unsqueeze(0).to(device)  # Shape: (1, src_len)

    eos_idx = vocab_tgt['<eos>']

    # Start the target sequence with the <bos> token
    tgt_tokens = [vocab_tgt['<bos>']]

    # Decode in evaluation mode so that dropout cannot perturb the result; the previous mode
    # is restored afterwards, even if decoding raises.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # The source is encoded once and reused for every decoding step
            memory, src_mask = model.encode(src_tensor)

            # Generate the translation token by token
            for _ in range(max_length):
                tgt_tensor = torch.tensor(tgt_tokens).unsqueeze(0).to(device)
                output = model.decode(tgt_tensor, memory, src_mask)  # Shape: (1, tgt_len, tgt_vocab_size)

                # Greedy choice: the highest-scoring token at the last position
                next_token = torch.argmax(output[:, -1, :], dim=-1).item()

                # Stop at <eos>. It is never appended, so no generated token has to be stripped
                # later and nothing is lost when max_length is reached without an <eos>.
                if next_token == eos_idx:
                    break
                tgt_tokens.append(next_token)
    finally:
        model.train(was_training)

    # Convert the generated indices (everything after <bos>) back to tokens
    translated_sentence = ' '.join(vocab_tgt.lookup_token(token) for token in tgt_tokens[1:])

    return translated_sentence


In [128]:
# German source sentence; the English reference is "A little boy playing outside with a ball."
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."
# Translate with the model trained above and show the result
translated_sentence = translate_sentence(model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A young boy is playing with a toy .


You should get a translation similar to the reference after 20 epochs of training.