# Encoder-Decoder Transformer (T5-like Model)

In this notebook, we implement an encoder-decoder transformer model similar to T5. The model is designed for sequence-to-sequence tasks like translation or summarization.

In [None]:
%pip install torch numpy

In [4]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import copy

In [5]:
# Choose the compute device by preference: Apple-silicon GPU ("mps"),
# then an NVIDIA GPU ("cuda"), and finally the CPU as the fallback.
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

In [6]:
# Toy vocabulary: the three special tokens come first, followed by the words
# that appear in the sample sentences. A token's id is its position in the list.
vocab = [
    '[PAD]', '[BOS]', '[EOS]',
    'i', 'like', 'eating', 'apples', 'bananas', 'fruits',
]
vocab_size = len(vocab)

# Lookup tables in both directions (token -> id and id -> token).
word_to_idx = dict(zip(vocab, range(vocab_size)))
idx_to_word = dict(enumerate(vocab))

### Special Tokens
There are some different tokens you might stumble across when dealing with inputs in NLP. 
- The [PAD] token pads sequences that are shorter than the desired fixed length, so that all sequences in a batch have the same length.
- The [BOS] Token indicates the Beginning of the Sentence. 
- The [EOS] Token indicates the End of the Sentence. 
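- The [CLS] token is placed at the start of the input; its final hidden state is used as the sequence representation for sentence-level classification (used by BERT).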
- The [SEP] Token represents Separation of Sentences (used by BERT). 
- The [UNK] token represents out-of-vocabulary (OOV) tokens, i.e. unknown tokens that are not included in the vocabulary.

In [7]:
# Toy paraphrasing corpus: each source sentence names a specific fruit and
# both map to the same generic target sentence.
input_sentences = [
    ['[BOS]', 'i', 'like', 'eating', fruit, '[EOS]']
    for fruit in ('apples', 'bananas')
]
output_sentences = [
    ['[BOS]', 'i', 'like', 'fruits', '[EOS]']
    for _ in range(2)
]

In [8]:
# Prepare data
def prepare_data(sentences):
    """Convert tokenised sentences into a tensor of vocabulary ids.

    Args:
        sentences: iterable of sentences, each a sequence of tokens that are
            keys of the module-level ``word_to_idx`` mapping.

    Returns:
        A tensor built from the nested id lists, one row per sentence.

    Raises:
        KeyError: if a token is not present in ``word_to_idx``.
    """
    id_rows = [[word_to_idx[token] for token in sentence] for sentence in sentences]
    return torch.tensor(id_rows)

In [9]:
# Encode the corpus as id tensors on the selected device.
encoder_inputs = prepare_data(input_sentences).to(device)

# Decoder input: '[BOS]' followed by everything after the sentence's first token.
decoder_inputs = prepare_data(
    [['[BOS]', *sent[1:]] for sent in output_sentences]
).to(device)

# Decoder target: the sentence shifted left by one token, with a trailing
# '[PAD]' appended so it has the same length as the decoder input.
decoder_targets = prepare_data(
    [[*sent[1:], '[PAD]'] for sent in output_sentences]
).to(device)

In [None]:
# Positional Encoding (same as before)
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Even feature indices hold ``sin(pos / 10000^(2i / d))`` and odd indices
    hold the matching ``cos`` term, where ``pos`` is the position in the
    sequence and ``d`` is ``model_dimension``.

    Args:
        model_dimension: size of the last (feature) axis of the embeddings.
        max_len: longest sequence length the precomputed table supports.
    """

    def __init__(self, model_dimension, max_len=512):
        super(PositionalEncoding, self).__init__()
        # Column vector of positions: (max_len, 1).
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        # One inverse frequency per sin/cos pair, computed in log space.
        div_term = torch.exp(
            torch.arange(0, model_dimension, 2, dtype=torch.float32)
            * (-torch.log(torch.tensor(10000.0)) / model_dimension)
        )
        pe = torch.zeros(max_len, model_dimension)
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd model_dimension there is one fewer cosine column than
        # sine columns, so trim the frequencies to fit.
        pe[:, 1::2] = torch.cos(position * div_term[: model_dimension // 2])
        # Stored with a leading axis of size 1 so it broadcasts over the batch.
        # A buffer is not trained but follows the module across devices.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, feature); the sequence length
        must not exceed ``max_len``.
        """
        x = x + self.pe[:, :x.size(1), :].to(x.device)
        return x

In [None]:
# Encoder Layer (same as before)
class EncoderLayer(nn.Module):
    """One transformer encoder layer: self-attention followed by a feed-forward net.

    Each sub-block is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``
    (post-norm residual connections).

    Args:
        model_dimension: feature size of the layer's input and output.
        num_attention_heads: number of attention heads.
        dim_feedforward: hidden size of the position-wise feed-forward network.
        dropout: dropout probability used throughout the layer.
    """

    def __init__(self, model_dimension, num_attention_heads, dim_feedforward, dropout=0.1):
        super(EncoderLayer, self).__init__()
        # Multi-head self-attention
        self.self_attention = nn.MultiheadAttention(
            model_dimension, num_attention_heads, dropout=dropout
        )

        # Feedforward network (linear, dropout, linear)
        self.linear1 = nn.Linear(model_dimension, dim_feedforward)
        self.feedforward_dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, model_dimension)

        # Layer normalization, twice
        self.norm1 = nn.LayerNorm(model_dimension)
        self.norm2 = nn.LayerNorm(model_dimension)

        # Dropout layers, twice
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, input_tensor, src_mask=None):
        """Apply the layer.

        Args:
            input_tensor: tensor laid out as (sequence, batch, model_dimension),
                the default layout of ``nn.MultiheadAttention``.
            src_mask: optional attention mask forwarded as ``attn_mask``.

        Returns:
            A tensor with the same shape as ``input_tensor``.
        """
        # Self-attention, dropout and norm
        attention_output, _ = self.self_attention(
            input_tensor, input_tensor, input_tensor, attn_mask=src_mask
        )
        input_tensor = self.norm1(input_tensor + self.dropout1(attention_output))

        # Feedforward network: linear, relu, dropout, linear
        feedforward_output = self.linear2(
            self.feedforward_dropout(F.relu(self.linear1(input_tensor)))
        )

        # dropout and norm
        input_tensor = self.norm2(input_tensor + self.dropout2(feedforward_output))
        return input_tensor

In [None]:
# Decoder Layer
class DecoderLayer(nn.Module):
    """One transformer decoder layer.

    Three sub-blocks run in order: self-attention over the target,
    attention over the encoder output (``memory``), and a position-wise
    feed-forward network. Each is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        model_dimension: feature size of the layer's input and output.
        num_attention_heads: number of attention heads in both attention blocks.
        dim_feedforward: hidden size of the feed-forward network.
        dropout: dropout probability used throughout the layer.
    """

    def __init__(self, model_dimension, num_attention_heads, dim_feedforward, dropout=0.1):
        super(DecoderLayer, self).__init__()
        # Multi-head self-attention over the target sequence
        self.self_attention = nn.MultiheadAttention(
            model_dimension, num_attention_heads, dropout=dropout
        )
        # Multi-head attention from the target onto the encoder output
        self.cross_attention = nn.MultiheadAttention(
            model_dimension, num_attention_heads, dropout=dropout
        )

        # Feedforward network (linear, dropout, linear)
        self.linear1 = nn.Linear(model_dimension, dim_feedforward)
        self.feedforward_dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, model_dimension)

        # Layer normalization, thrice (one per sub-block)
        self.norm1 = nn.LayerNorm(model_dimension)
        self.norm2 = nn.LayerNorm(model_dimension)
        self.norm3 = nn.LayerNorm(model_dimension)

        # Dropout layers (one per sub-block)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, target, memory, target_mask=None, memory_mask=None):
        """Apply the layer.

        Args:
            target: decoder-side tensor laid out as
                (target_sequence, batch, model_dimension).
            memory: encoder output laid out as
                (source_sequence, batch, model_dimension).
            target_mask: optional ``attn_mask`` for the self-attention
                (e.g. a causal mask).
            memory_mask: optional ``attn_mask`` for the attention over ``memory``.

        Returns:
            A tensor with the same shape as ``target``.
        """
        # Self-attention
        self_attention_output, _ = self.self_attention(
            target, target, target, attn_mask=target_mask
        )
        target = self.norm1(target + self.dropout1(self_attention_output))

        # Multi-head attention with encoder output: queries come from the
        # target, keys and values from the encoder memory.
        cross_attention_output, _ = self.cross_attention(
            target, memory, memory, attn_mask=memory_mask
        )
        target = self.norm2(target + self.dropout2(cross_attention_output))

        # Feedforward network
        feedforward_output = self.linear2(
            self.feedforward_dropout(F.relu(self.linear1(target)))
        )
        target = self.norm3(target + self.dropout3(feedforward_output))
        return target

In [None]:
# Encoder (same as before)
class Encoder(nn.Module):
    """Token embedding, positional encoding and a stack of ``EncoderLayer`` modules.

    Args:
        num_layers: number of encoder layers to stack.
        d_model: embedding / feature size.
        num_attention_heads: number of attention heads in each layer.
        vocab_size: number of rows in the embedding table.
        dim_feedforward: hidden size of each layer's feed-forward network.
        dropout: dropout probability passed to each layer.
    """

    def __init__(self, num_layers, d_model, num_attention_heads, vocab_size, dim_feedforward, dropout=0.1):
        super(Encoder, self).__init__()
        # Embedding and Positional Encoding
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)

        # Create multiple encoder layers; ModuleList registers their parameters.
        self.layers = nn.ModuleList(
            [
                EncoderLayer(d_model, num_attention_heads, dim_feedforward, dropout)
                for _ in range(num_layers)
            ]
        )

    def forward(self, input_sequence, src_mask=None):
        """Encode a batch of token ids.

        Args:
            input_sequence: integer tensor of token ids indexed as (batch, sequence).
            src_mask: optional attention mask handed to every layer.

        Returns:
            The output of the last layer, laid out sequence-first as
            (sequence, batch, d_model) because of the permutation below.
        """
        # Embedding, positional encoding (both batch-first) and permutation
        # to the sequence-first layout.
        embedded = self.positional_encoding(self.embedding(input_sequence))
        input_sequence = embedded.permute(1, 0, 2)

        # Pass through encoder layers
        for layer in self.layers:
            input_sequence = layer(input_sequence, src_mask)
        return input_sequence

In [None]:
# Decoder
class Decoder(nn.Module):
    """Token embedding, positional encoding and a stack of ``DecoderLayer`` modules.

    Args:
        num_layers: number of decoder layers to stack.
        model_dimension: embedding / feature size.
        num_attention_heads: number of attention heads in each layer.
        vocab_size: number of rows in the embedding table.
        dim_feedforward: hidden size of each layer's feed-forward network.
        dropout: dropout probability passed to each layer.
    """

    def __init__(self, num_layers, model_dimension, num_attention_heads, vocab_size, dim_feedforward, dropout=0.1):
        super(Decoder, self).__init__()
        # Embedding and Positional Encoding
        self.embedding = nn.Embedding(vocab_size, model_dimension)
        self.positional_encoding = PositionalEncoding(model_dimension)

        # Create multiple decoder layers; ModuleList registers their parameters.
        self.layers = nn.ModuleList(
            [
                DecoderLayer(model_dimension, num_attention_heads, dim_feedforward, dropout)
                for _ in range(num_layers)
            ]
        )

    def forward(self, target, memory, tgt_mask=None, memory_mask=None):
        """Decode a batch of target token ids against the encoder output.

        Args:
            target: integer tensor of token ids indexed as (batch, target_sequence).
            memory: encoder output handed unchanged to every layer.
            tgt_mask: optional self-attention mask handed to every layer.
            memory_mask: optional encoder-attention mask handed to every layer.

        Returns:
            The output of the last layer, laid out sequence-first as
            (target_sequence, batch, model_dimension) because of the
            permutation below.
        """
        # Embedding, positional encoding (both batch-first) and permutation
        # to the sequence-first layout.
        embedded = self.positional_encoding(self.embedding(target))
        target = embedded.permute(1, 0, 2)

        # Pass through decoder layers
        for layer in self.layers:
            target = layer(target, memory, tgt_mask, memory_mask)
        return target

In [None]:
# Seq2Seq Model
class Seq2SeqModel(nn.Module):
    """Encoder-decoder transformer that maps source token ids to target logits.

    The sub-modules are exposed as ``encoder``, ``decoder`` and
    ``output_layer`` so that step-by-step decoding code can call them
    individually.

    Args:
        num_layers: number of layers in both the encoder and the decoder.
        d_model: embedding / feature size.
        nhead: number of attention heads.
        vocab_size: vocabulary size, shared by both embeddings and the output layer.
        dim_feedforward: hidden size of the feed-forward networks.
        dropout: dropout probability.
    """

    def __init__(self, num_layers, d_model, nhead, vocab_size, dim_feedforward, dropout=0.1):
        super(Seq2SeqModel, self).__init__()
        # Encoder and Decoder, as well as output layer
        self.encoder = Encoder(num_layers, d_model, nhead, vocab_size, dim_feedforward, dropout)
        self.decoder = Decoder(num_layers, d_model, nhead, vocab_size, dim_feedforward, dropout)
        # Projects each decoder state to one score per vocabulary entry.
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Run the full model.

        Args:
            src: source token ids passed to the encoder.
            tgt: target-side token ids passed to the decoder.
            src_mask: optional mask forwarded to the encoder.
            tgt_mask: optional mask forwarded to the decoder's self-attention.

        Returns:
            The output layer applied to the decoder output after swapping its
            first two axes (the decoder is expected to return a sequence-first
            tensor, so the result is batch-first).
        """
        # Pass through encoder, then decoder, then output layer
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(tgt, memory, tgt_mask)
        output = self.output_layer(decoded.permute(1, 0, 2))
        return output

In [16]:
# Model hyperparameters (deliberately small: the corpus has only two sentence pairs).
num_layers, num_attention_heads = 2, 4          # depth of each stack, heads per attention block
model_dimension, dim_feedforward = 64, 128      # embedding width, feed-forward hidden width
dropout = 0.1                                   # dropout probability

In [17]:
# Seed PyTorch's random number generator before any weights are created so
# that initialisation (and the dropout masks drawn during training) is
# repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer
model = Seq2SeqModel(
    num_layers,
    model_dimension,
    num_attention_heads,
    vocab_size,
    dim_feedforward,
    dropout,
).to(device)
# Positions whose target is '[PAD]' do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx['[PAD]'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
# Attention masks for the encoder and the decoder. Both are left unset in
# this simple example.
# NOTE(review): with no target mask, nothing in this cell prevents decoder
# self-attention from looking at later target positions during training;
# confirm whether a causal mask is wanted here.
source_attention_mask = target_mask = None

In [None]:
# Training loop: full-batch gradient descent on the toy corpus.
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    # Forward pass: pass encoder inputs, decoder inputs, and masks to the model.
    # The model is expected to return logits whose last axis has size
    # vocab_size and whose leading axes line up with decoder_targets.
    outputs = model(encoder_inputs, decoder_inputs, source_attention_mask, target_mask)

    # Flatten to (positions, vocab_size) and (positions,) for the loss.
    # reshape (rather than view) also accepts non-contiguous tensors.
    outputs = outputs.reshape(-1, vocab_size)
    targets = decoder_targets.reshape(-1)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    # Report progress every tenth epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 10/100, Loss: 0.3137
Epoch 20/100, Loss: 0.0894
Epoch 30/100, Loss: 0.0484
Epoch 40/100, Loss: 0.0372
Epoch 50/100, Loss: 0.0279
Epoch 60/100, Loss: 0.0227
Epoch 70/100, Loss: 0.0197
Epoch 80/100, Loss: 0.0160
Epoch 90/100, Loss: 0.0147
Epoch 100/100, Loss: 0.0135


In [22]:
# Inference: Generate output sequence
def generate_sequence(model, input_sentence, max_length=10):
    """Greedily decode an output sentence for ``input_sentence``.

    The source is encoded once. The decoder is then re-run on the tokens
    generated so far, and the highest-scoring next token is appended each
    step until '[EOS]' is produced or ``max_length`` tokens were generated.

    Args:
        model: object exposing ``encoder``, ``decoder`` and ``output_layer``
            callables plus ``eval()``.
        input_sentence: list of source tokens; tokens missing from
            ``word_to_idx`` are mapped to the '[PAD]' id.
        max_length: maximum number of tokens to generate after '[BOS]'.

    Returns:
        The generated tokens as a list of strings, starting with '[BOS]'.
    """
    model.eval()  # switch modules such as dropout to inference behaviour
    pad_id = word_to_idx['[PAD]']
    eos_id = word_to_idx['[EOS]']
    input_ids = [word_to_idx.get(word, pad_id) for word in input_sentence]
    source_tensor = torch.tensor([input_ids], device=device)
    target_tokens = [word_to_idx['[BOS]']]

    # Gradients are not needed for inference; disabling autograd avoids
    # building a graph on every decoding step.
    with torch.no_grad():
        memory = model.encoder(source_tensor)
        for _ in range(max_length):
            target = torch.tensor([target_tokens], device=device)
            output = model.decoder(target, memory)
            # Swap the first two axes before projecting to vocabulary scores.
            output = model.output_layer(output.permute(1, 0, 2))
            # Greedy choice: best-scoring token at the last position.
            next_token = output.argmax(-1)[:, -1].item()
            target_tokens.append(next_token)
            if next_token == eos_id:
                break

    return [idx_to_word[idx] for idx in target_tokens]

In [23]:
# Sanity check: run one of the training inputs through the trained model
# and show the source next to the generated sentence.
test_sentence = [
    '[BOS]',
    'i', 'like', 'eating', 'apples',
    '[EOS]',
]
generated_sentence = generate_sequence(model=model, input_sentence=test_sentence)
print("Input Sentence:", test_sentence)
print("Generated Sentence:", generated_sentence)

Input Sentence: ['[BOS]', 'i', 'like', 'eating', 'apples', '[EOS]']
Generated Sentence: ['[BOS]', 'i', 'like', 'fruits', '[EOS]']
