In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, BertModel, BartForConditionalGeneration, T5ForConditionalGeneration

In [None]:
# Define important variables and hyperparameters for later use

# Language pair: text in SRC_LANG is translated into TGT_LANG
SRC_LANG = "en"
TGT_LANG = "fr"

# Data and optimisation settings
MAX_LEN = 50      # token budget per sentence; longer text is truncated by the tokenizer
BATCH_SIZE = 32
EPOCHS = 2
LR = 0.01         # NOTE(review): fairly high for Adam on a Transformer - consider lowering if the loss is unstable

# Model sizes
EMB_DIM = 256     # must be divisible by NUM_HEADS (asserted inside MultiHeadAttention)
HID_DIM = 512     # width of the feed-forward (MLP) layer
NUM_HEADS = 4
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2

# Seed PyTorch's RNG so weight initialisation and DataLoader shuffling are
# repeatable after "Restart Kernel -> Run All"
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the "Opus Books" dataset: A translation dataset with various language pairs available
# Link to interactive dataset viewer: https://huggingface.co/datasets/Helsinki-NLP/opus_books
# The config name is derived from SRC_LANG / TGT_LANG instead of being hard-coded.
# NOTE(review): the Hub configs appear to list each pair in alphabetical order
# (e.g. "en-fr", never "fr-en"), hence the sort - confirm against the dataset card.
lang_pair = "-".join(sorted((SRC_LANG, TGT_LANG)))
dataset = load_dataset("opus_books", lang_pair, split="train[:1%]")  # only the first 1% of the train split

# Loading a pre-trained tokenizer to tokenize the text in the dataset
# Tokenization output example: This is a cat -> 43 105 1236 4
tkr = AutoTokenizer.from_pretrained("t5-small")

# Ensuring padding and EOS (end of sentence) tokens exist
if tkr.eos_token is None:
  # Fall back to the separator token if there is one, else the literal "</s>"
  tkr.eos_token = tkr.sep_token or "</s>"
if tkr.pad_token is None:
  tkr.pad_token = tkr.eos_token

# Source and target share one tokenizer, so both vocabularies have the same size
SRC_VOCAB_SIZE = len(tkr)
TGT_VOCAB_SIZE = len(tkr)

# Preprocessing the data and creating a dataset that returns tokenized text (not strings!) which we can pass to the model
# Whenever you're training a model to do something, chances are you'll have to preprocess your input data yourself
# You can use the following as a reference template for how you can do that

class TranslationDataset(Dataset): # Custom dataset class must inherit from PyTorch's Dataset class!
  """Wraps a translation dataset and yields (source_ids, target_ids) tensor pairs.

  Args:
    dataset: indexable collection whose items look like
      {"translation": {<lang code>: <text>, ...}}.
    tkr: tokenizer; called with padding/truncation arguments and expected to
      return an object exposing `.input_ids`, and to provide `pad_token_id`.
    max_len: number of tokens each sentence is padded/truncated to.
    src_lang: source language key; falls back to the module-level SRC_LANG when None.
    tgt_lang: target language key; falls back to the module-level TGT_LANG when None.

  Each item is a tuple of 1-D id tensors:
    source ids of length `max_len`, and
    target ids of length `max_len + 1`, i.e. a start token followed by the sentence.
  """

  def __init__(self, dataset, tkr, max_len=50, src_lang=None, tgt_lang=None):
    self.dataset = dataset
    self.tkr = tkr
    self.max_len = max_len
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang

  # The __len__ and __getitem__ methods MUST be implemented since PyTorch uses that at the back-end to retrieve samples
  def __len__(self):
    return len(self.dataset)

  def _encode(self, text):
    """Tokenize one string into a 1-D tensor of exactly `max_len` token ids."""
    encoded = self.tkr(
        text,
        padding="max_length",
        truncation=True,
        max_length=self.max_len,
        return_tensors="pt",
    )
    # return_tensors="pt" yields a [1, max_len] tensor; drop the batch dimension
    return encoded.input_ids.squeeze(0)

  # Returns a preprocessed dataitem at a given index
  def __getitem__(self, idx):
    # Read the row once instead of indexing the dataset separately per language
    pair = self.dataset[idx]["translation"]
    src_lang = SRC_LANG if self.src_lang is None else self.src_lang
    tgt_lang = TGT_LANG if self.tgt_lang is None else self.tgt_lang

    src_ids = self._encode(pair[src_lang])
    tgt_ids = self._encode(pair[tgt_lang])

    # Prepend the decoder start token (<pad>, the same token translate_sentence
    # starts generation from). After the training loop's one-step shift the
    # decoder input becomes [<pad>, t1, ...] and the label sequence [t1, t2, ...],
    # so the first real token is learned too and training matches inference.
    start = tgt_ids.new_full((1,), self.tkr.pad_token_id)
    tgt_ids = torch.cat([start, tgt_ids])  # length max_len + 1, same as before

    return src_ids, tgt_ids

# Wrap the raw dataset so every item is a pair of token-id tensors
train_data = TranslationDataset(dataset=dataset, tkr=tkr, max_len=MAX_LEN)

# A DataLoader batches (and here also shuffles) the samples of a PyTorch Dataset.
# Its first argument MUST be a Dataset instance such as the custom one built above.
# Doc: https://docs.pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
train_loader = DataLoader(
    dataset=train_data,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

en-fr/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [None]:
# A sample from the dataset: one raw record, shown to illustrate the structure the
# preprocessing code indexes into (an 'id' plus a 'translation' dict keyed by language code)
dataset[0]

{'id': '0', 'translation': {'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}}

# Implementing a Simple Attention Layer

This layer receives input embeddings (used as queries, keys, and values) and, optionally, an attention mask. It computes attention among those embeddings using the scaled dot-product formula:

$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  Shapes used below:
    Input IDs        -> [batch_size, seq_len]
    Input embeddings -> [batch_size, seq_len, emb_dim]

  Args:
    emb_dim: size of the embedding vectors; must be divisible by `num_heads`.
    num_heads: number of attention heads the embedding is split into.
  """

  def __init__(self, emb_dim, num_heads):
    super().__init__()
    assert emb_dim % num_heads == 0, "Embedding dimension MUST be divisible by number of heads!"

    self.num_heads = num_heads
    self.head_dim = emb_dim // num_heads

    # Linear layers that project input embeddings to queries, keys, and values
    self.q_proj = nn.Linear(emb_dim, emb_dim)
    self.k_proj = nn.Linear(emb_dim, emb_dim)
    self.v_proj = nn.Linear(emb_dim, emb_dim)

    # Final linear layer to project concatenated heads
    self.out_proj = nn.Linear(emb_dim, emb_dim)

  def forward(self, query, key, value, mask=None):
    """Attend from `query` positions over `key`/`value` positions.

    Args:
      query: [B, L_q, emb_dim] tensor.
      key, value: [B, L_k, emb_dim] tensors.
      mask: optional tensor broadcastable to [B, H, L_q, L_k]; positions where
        it equals 0 (or False) are NOT attended to.

    Returns:
      (out, attn): `out` is [B, L_q, emb_dim]; `attn` holds the attention
      weights, [B, H, L_q, L_k]. A query row whose keys are ALL masked gets
      all-zero weights instead of NaN.
    """
    B, L_q, D = query.size()
    L_k = key.size(1)
    H = self.num_heads
    head_dim = self.head_dim

    # First, we project the input embeddings into query, key and value vectors respectively
    Q = self.q_proj(query)  # [B, L_q, emb_dim]
    K = self.k_proj(key)    # [B, L_k, emb_dim]
    V = self.v_proj(value)  # [B, L_k, emb_dim]

    # Then, we split each matrix into H heads
    # In other words, we reshape the last (rightmost) dimension from (emb_dim) to (H, head_dim)
    Q = Q.view(B, L_q, H, head_dim).transpose(1, 2)  # [B, H, L_q, head_dim]
    K = K.view(B, L_k, H, head_dim).transpose(1, 2)  # [B, H, L_k, head_dim]
    V = V.view(B, L_k, H, head_dim).transpose(1, 2)  # [B, H, L_k, head_dim]

    # Compute scaled dot-product attention (the inner part of the softmax in the formula given above)
    scores = (Q @ K.transpose(-2, -1)) / (head_dim ** 0.5)  # [B, H, L_q, L_k]

    # You might want to pass a mask to this layer to denote tokens/embedding vectors that you DO NOT want to attend to
    # 1. In cases of padding, you do not want to attend to padding tokens
    # 2. In decoder SELF-attention, you want each position to mask out future tokens
    dead_rows = None
    if mask is not None:
      blocked = mask == 0
      # Rows in which every key is blocked would turn into softmax(-inf, ..., -inf) = NaN
      # (and NaN gradients). Leave those rows unmasked here and zero their weights below.
      dead_rows = blocked.all(dim=-1, keepdim=True)
      # Everywhere else we fill with -infinity rather than 0: exp(-inf) = 0, so the
      # masked position gets exactly zero weight after the softmax
      # doc: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.masked_fill_.html#torch.Tensor.masked_fill_
      scores = scores.masked_fill(blocked & ~dead_rows, float('-inf'))

    attn = F.softmax(scores, dim=-1)  # [B, H, L_q, L_k]

    if dead_rows is not None:
      # A fully masked query attends to nothing
      attn = attn.masked_fill(dead_rows, 0.0)

    out = attn @ V  # [B, H, L_q, head_dim]

    # Now we concatenate the heads back together by:
    # 1. Transposing the 2nd and 3rd dimensions to make the tensor [B, L_q, H, head_dim]
    # 2. Reshaping the tensor to make it [B, L_q, D] (remember, D = H * head_dim)
    out = out.transpose(1, 2).reshape(B, L_q, D)

    # Final output projection
    out = self.out_proj(out)

    return out, attn


# Creating an Encoder

In [None]:
# Creating the encoder of our transformer model
class Encoder(nn.Module):
  """Single-block Transformer encoder: embedding + positions -> self-attention -> MLP.

  Args:
    vocab_size: number of rows in the token embedding table.
    emb_dim: embedding / model dimension.
    num_heads: number of heads of the self-attention layer.
    hid_dim: hidden width of the feed-forward network.
  """

  def __init__(self, vocab_size, emb_dim, num_heads, hid_dim):
    super().__init__()

    # Embedding layer
    self.emb = nn.Embedding(vocab_size, emb_dim)

    # Self attention block
    # Right now, we are using our custom attention layer but PyTorch also has its own implementation of the same
    # PyTorch attention layer: https://docs.pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
    self.self_attn = MultiHeadAttention(emb_dim, num_heads)

    # Feedforward network (AKA: MLP)
    self.linear1 = nn.Linear(emb_dim, hid_dim)
    self.linear2 = nn.Linear(hid_dim, emb_dim)

    # Activation function (usually this is a non-linear function like ReLU)
    self.activation = nn.ReLU()

    # Layer normalization layers
    self.norm1 = nn.LayerNorm(emb_dim)
    self.norm2 = nn.LayerNorm(emb_dim)

  @staticmethod
  def _positional_encoding(seq_len, emb_dim, device, dtype):
    """Return the fixed sinusoidal position table, shape [seq_len, emb_dim]."""
    pos = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)  # [L, 1]
    even_idx = torch.arange(0, emb_dim, 2, device=device, dtype=torch.float32)    # [ceil(D / 2)]
    angles = pos / torch.pow(10000.0, even_idx / emb_dim)                         # [L, ceil(D / 2)]

    table = torch.zeros(seq_len, emb_dim, device=device, dtype=torch.float32)
    table[:, 0::2] = torch.sin(angles)
    # For an odd emb_dim there is one fewer odd column than even columns
    table[:, 1::2] = torch.cos(angles)[:, : emb_dim // 2]
    return table.to(dtype)

  def forward(self, input_ids):
    """Encode `input_ids` ([batch, seq_len]) into [batch, seq_len, emb_dim] states."""
    # Convert the input IDs into embedding vectors
    x = self.emb(input_ids)

    # Attention by itself ignores token order, so inject position information
    # (parameter-free, works for any sequence length)
    x = x + self._positional_encoding(x.size(1), x.size(2), x.device, x.dtype)

    # Self attend over the input embeddings
    # NOTE(review): no mask is passed, so padding positions are attended to as well
    attn_out, _ = self.self_attn(x, x, x)

    # Residual connection makes sure the input signal doesn't get completely lost
    # Layer normalization is applied to keep activations in a stable range
    x = self.norm1(x + attn_out)

    # Feed-forward Network/MLP, again with residual connection + layer norm
    mlp_out = self.linear2(self.activation(self.linear1(x)))
    x = self.norm2(x + mlp_out)

    return x


# Creating a Decoder

In [None]:
class Decoder(nn.Module):
  """Single-block Transformer decoder: causal self-attention -> cross-attention -> MLP.

  Args:
    vocab_size: number of rows in the token embedding table.
    emb_dim: embedding / model dimension.
    num_heads: number of heads of both attention layers.
    hid_dim: hidden width of the feed-forward network.
  """

  def __init__(self, vocab_size, emb_dim, num_heads, hid_dim):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_dim)
    self.self_attn = MultiHeadAttention(emb_dim, num_heads)
    self.cross_attn = MultiHeadAttention(emb_dim, num_heads)

    # Feed-forward network (MLP)
    self.linear1 = nn.Linear(emb_dim, hid_dim)
    self.linear2 = nn.Linear(hid_dim, emb_dim)

    # One layer norm after each of the three sub-layers
    self.norm1 = nn.LayerNorm(emb_dim)
    self.norm2 = nn.LayerNorm(emb_dim)
    self.norm3 = nn.LayerNorm(emb_dim)

    self.activation = nn.ReLU()

  @staticmethod
  def _positional_encoding(seq_len, emb_dim, device, dtype):
    """Return the fixed sinusoidal position table, shape [seq_len, emb_dim]."""
    pos = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)  # [L, 1]
    even_idx = torch.arange(0, emb_dim, 2, device=device, dtype=torch.float32)    # [ceil(D / 2)]
    angles = pos / torch.pow(10000.0, even_idx / emb_dim)                         # [L, ceil(D / 2)]

    table = torch.zeros(seq_len, emb_dim, device=device, dtype=torch.float32)
    table[:, 0::2] = torch.sin(angles)
    # For an odd emb_dim there is one fewer odd column than even columns
    table[:, 1::2] = torch.cos(angles)[:, : emb_dim // 2]
    return table.to(dtype)

  def forward(self, output_ids, enc_out):
    """Decode one step of teacher-forced / generated tokens.

    Args:
      output_ids: [batch, seq_len] ids of the target tokens produced so far.
      enc_out: encoder states that cross-attention reads from.

    Returns:
      [batch, seq_len, emb_dim] decoder states (not yet projected to the vocabulary).
    """
    _, seq_len = output_ids.shape

    y = self.emb(output_ids)

    # Inject token-order information (parameter-free, works for any sequence length)
    y = y + self._positional_encoding(seq_len, y.size(2), y.device, y.dtype)

    # Mask future tokens because we don't want the model to be able to peek at the "future" tokens
    # The lower-triangular mask lets position i attend only to positions <= i
    attn_mask = torch.tril(torch.ones((1, 1, seq_len, seq_len), device=output_ids.device), diagonal=0).bool()

    # Self-attention: attending over the current hidden state of the decoder itself
    self_attn_out, _ = self.self_attn(y, y, y, mask=attn_mask)
    y = self.norm1(y + self_attn_out)

    # Cross-attention: attending over the current decoder state and the output of the encoder
    # Query: Current decoder state
    # Key, Value: Output of the encoder
    cross_attn_out, _ = self.cross_attn(y, enc_out, enc_out)
    y = self.norm2(y + cross_attn_out)

    # FFN
    mlp_out = self.linear2(self.activation(self.linear1(y)))
    y = self.norm3(y + mlp_out)

    return y


In [None]:
# Model class that controls how the components work with each other
class Seq2Seq(nn.Module):
    """Glue module: encoder -> decoder -> linear projection onto the vocabulary.

    Args:
        encoder: module called as `encoder(x)`.
        decoder: module called as `decoder(y, encoder_output)`.
        emb_dim: feature size of the decoder output.
        vocab_size: number of output classes (one logit per vocabulary entry).
    """

    def __init__(self, encoder, decoder, emb_dim, vocab_size):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        # Maps each decoder state to one logit per vocabulary entry
        self.lm_head = nn.Linear(in_features=emb_dim, out_features=vocab_size)

    def forward(self, x, y):
        """Return vocabulary logits for target ids `y` given source ids `x`."""
        memory = self.encoder(x)
        hidden = self.decoder(y, memory)
        return self.lm_head(hidden)

In [None]:
# Assemble the translation model from its parts and move it onto the selected device
encoder = Encoder(
    vocab_size=SRC_VOCAB_SIZE,
    emb_dim=EMB_DIM,
    num_heads=NUM_HEADS,
    hid_dim=HID_DIM,
)
decoder = Decoder(
    vocab_size=TGT_VOCAB_SIZE,
    emb_dim=EMB_DIM,
    num_heads=NUM_HEADS,
    hid_dim=HID_DIM,
)
model = Seq2Seq(encoder=encoder, decoder=decoder, emb_dim=EMB_DIM, vocab_size=TGT_VOCAB_SIZE)
model = model.to(device)

In [None]:
# Minimal teacher-forced training loop
optimizer = optim.Adam(model.parameters(), lr=LR)
# Label positions holding the padding id contribute nothing to the loss
criterion = nn.CrossEntropyLoss(ignore_index=tkr.pad_token_id)

for epoch in range(EPOCHS):
  model.train()
  total_loss = 0

  for input_ids, output_ids in train_loader:
    input_ids, output_ids = input_ids.to(device), output_ids.to(device)

    # Teacher forcing: the decoder is fed every target token except the last ...
    y_input = output_ids[..., :-1]

    # ... and has to predict the same tokens SHIFTED TO THE LEFT by one step:
    #   full target    [1, 2, 3, 4, 5]
    #   decoder input  [1, 2, 3, 4]
    #   expected       [2, 3, 4, 5]
    #   model output   [2, 10, 4, 6]   (example)
    y_output = output_ids[..., 1:]

    optimizer.zero_grad()
    output = model(input_ids, y_input)

    # CrossEntropyLoss wants [N, vocab] logits and [N] labels, so merge batch and time
    output = output.flatten(0, 1)
    y_output = y_output.flatten()

    loss = criterion(output, y_output)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  # Mean batch loss over the epoch
  print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/2 | Loss: 6.2894
Epoch 2/2 | Loss: 5.5086


In [None]:
def translate_sentence(sentence, model, tkr, max_len=50):
  """Greedily translate `sentence` with a trained encoder/decoder model.

  Args:
    sentence: source-language text.
    model: module exposing `.encoder`, `.decoder` and `.lm_head` (e.g. Seq2Seq).
    tkr: tokenizer providing `pad_token_id`, `eos_token_id` and `decode`.
    max_len: source padding/truncation length and maximum number of generated tokens.

  Returns:
    The decoded translation string (special tokens removed).

  Side effect: the model is left in eval mode.
  """
  model.eval()

  # Run on whatever device the model lives on rather than relying on a global
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  with torch.no_grad():
    # Tokenize the source sentence (padded to max_len, like the training data)
    input_ids = tkr(
        sentence,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_len
    ).input_ids.to(run_device)

    # Encode source
    enc_out = model.encoder(input_ids)

    # Start decoding from the <pad> token
    output_tokens = torch.tensor([[tkr.pad_token_id]], device=run_device)

    for _ in range(max_len):
      # The decoder returns hidden states of size emb_dim; they MUST go through
      # lm_head to become vocabulary logits. Taking argmax over the raw hidden
      # state would only ever yield ids in [0, emb_dim).
      hidden = model.decoder(output_tokens, enc_out)
      logits = model.lm_head(hidden[:, -1, :])                  # [1, vocab]
      next_token = torch.argmax(logits, dim=-1, keepdim=True)   # [1, 1]

      output_tokens = torch.cat([output_tokens, next_token], dim=1)

      # Stop if EOS generated
      if next_token.item() == tkr.eos_token_id:
        break

    # Index the single batch row explicitly so the result is always a list of ids
    translated = tkr.decode(output_tokens[0].tolist(), skip_special_tokens=True)
    return translated

# Quick sanity check: run the trained model on one arbitrary sentence
test_sentence = "Hello. My name is Bob"
translation = translate_sentence(sentence=test_sentence, model=model, tkr=tkr)

print("\nInput:", test_sentence)
print("Predicted Translation:", translation)



Input: Hello. My name is Bob
Predicted Translation: nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn


# Encoder-Decoder Model
Used to perform seq2seq tasks such as machine translation, text summarization & question answering.

Eg: BART, T5

In [None]:
# Load the pre-trained "facebook/bart-base" checkpoint as an encoder-decoder example.
# Ending the cell with the bare name makes the notebook print the model's module tree.
bart = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
bart

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_n

# Encoder-Only Model
Useful for tasks that need bidirectional context, such as text classification, sentiment analysis, and extracting the answer to a question from a passage of text.

Eg: BERT, RoBERTa

In [None]:
# Load the pre-trained "bert-base-uncased" checkpoint as an encoder-only example.
# It is bound to `bert` (like `bart` and `qwen` in the neighbouring cells) rather than
# `model`, so the Seq2Seq `model` trained earlier in this notebook is not overwritten.
bert = BertModel.from_pretrained("bert-base-uncased")
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

# Decoder-Only Model
Used to autoregressively generate text given some input. These days, decoder-only models are quite common since they can also perform the tasks of encoder-decoder models without requiring an encoder.

Eg: GPT, Llama, Qwen, Phi

In [None]:
from transformers import Qwen2ForCausalLM

In [None]:
# Load the pre-trained "Qwen/Qwen2-1.5B-Instruct" checkpoint as a decoder-only example.
# Heads-up: the recorded progress bar below shows a weights download of about 3.09 GB.
# Ending the cell with the bare name makes the notebook print the model's module tree.
qwen = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
qwen

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotar