Duration: 1.5 hours
Goal: Build and train a sequence-to-sequence (seq2seq) encoder-decoder model that can translate simple English sentences into French.

⸻

Step 1 – Dataset (Provided)
	•	Load English–French sentence pairs.
	•	Preprocess: lowercase, tokenize, pad, and build vocabularies.

Deliverable: Show a few tokenized sentence pairs.

⸻

Step 2 – Encoder and Decoder (Skeleton Provided)
	•	Complete the EncoderRNN and DecoderRNN classes.
	•	Verify that they can process batches and produce outputs.

Deliverable: Run dummy data through your models and print output shapes.

⸻

Step 3 – Training Loop
	•	Implement the training step:
		–	Encode the input sequence.
		–	Decode step-by-step to generate the target sequence.
		–	Compute loss and backpropagate.

Deliverable: Train for a few iterations and show the loss decreasing.

⸻

Step 4 – Evaluation
	•	Write an evaluate() function that translates English sentences into French.
	•	Compare model predictions with target translations.

Deliverable: Show at least three input → predicted output → true output examples.

⸻

Reflection Questions
	1.	What was the hardest part of getting the model to train?
	2.	What do you think attention would add to this model?
	3.	How could we extend this to handle longer sentences?
# ============================================================
# 1. Setup & Dataset
# ============================================================

import torch
import torch.nn as nn
import torch.optim as optim
import random
import re
import unicodedata
from torch.utils.data import DataLoader, Dataset

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Download dataset ----
# NOTE: the lines starting with `!` are IPython/Jupyter shell escapes, not
# Python statements; they only work when this file is run as a notebook cell.
# They fetch the fra-eng.zip archive, unpack it, and print the first five
# lines of fra.txt (presumably extracted from the archive - verify).
!wget https://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip
!head -5 fra.txt

# ---- Preprocessing helpers ----
def unicode_to_ascii(s):
    """Strip combining accent marks from *s*.

    The string is decomposed with NFD so that every accented letter becomes
    a base letter followed by its combining marks; characters whose Unicode
    category is 'Mn' (nonspacing mark) are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalize_string(s):
    """Return a lowercase, accent-free, whitespace-tokenizable version of *s*.

    Steps:
      1. lowercase, trim, and remove accents via ``unicode_to_ascii``;
      2. put a space in front of ``.``, ``!`` and ``?`` so that they become
         separate tokens;
      3. collapse every run of characters other than letters and ``.!?``
         into a single space;
      4. trim again.

    The final trim matters: step 3 can leave a leading or trailing space
    (for example when the sentence starts or ends with a digit or a quote),
    and downstream code tokenizes with ``split(" ")``, which would turn that
    stray space into an empty-string token.
    """
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

# ---- Load sentence pairs ----
# Read at most the first 10000 lines of the tab-separated corpus and keep the
# normalized (english, french) columns of each one.
pairs = []
with open("fra.txt", encoding="utf-8") as f:
    # Iterate lazily instead of f.readlines(), which would load the whole
    # corpus into memory just to keep a small prefix of it.
    for line_number, line in enumerate(f):
        if line_number >= 10000:  # limit for speed
            break
        fields = line.strip().split("\t")
        # Only the first two columns are used; any further columns are
        # ignored and lines with fewer than two columns (e.g. blank lines)
        # are skipped instead of raising on tuple unpacking.
        if len(fields) < 2:
            continue
        eng, fra = fields[:2]
        pairs.append((normalize_string(eng), normalize_string(fra)))

print(random.choice(pairs))

# ---- Vocabulary class ----
class Vocab:
    """Bidirectional word <-> integer index mapping for one language.

    Indices 0, 1 and 2 are reserved for the ``<PAD>``, ``<SOS>`` and
    ``<EOS>`` special tokens; real words receive consecutive indices
    starting at 3 in the order they are first seen.
    """

    def __init__(self):
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}
        # Derive the reverse table from the forward one so they cannot drift.
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        # Next free index; also the vocabulary size.
        self.n_words = len(self.word2idx)

    def add_sentence(self, sentence):
        """Register every whitespace-separated word of *sentence*."""
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so no empty-string "word" is ever registered.
        for word in sentence.split():
            self.add_word(word)

    def add_word(self, word):
        """Assign the next free index to *word* unless it is empty or known."""
        if not word or word in self.word2idx:
            return
        self.word2idx[word] = self.n_words
        self.idx2word[self.n_words] = word
        self.n_words += 1

# One vocabulary per language: English on the input side, French on the
# output side, both filled from the normalized sentence pairs.
input_vocab, output_vocab = Vocab(), Vocab()

for eng, fra in pairs:
    output_vocab.add_sentence(fra)
    input_vocab.add_sentence(eng)

print("Input vocab size:", input_vocab.n_words)
print("Output vocab size:", output_vocab.n_words)

# ============================================================
# 2. Dataset & DataLoader
# ============================================================

MAX_LEN = 10

def indexes_from_sentence(vocab, sentence):
    """Convert *sentence* into a list of word indices terminated by EOS.

    The sentence is split on single spaces. Tokens that are missing from
    ``vocab.word2idx`` are dropped silently, and index 2 (EOS) is always
    appended at the end.
    """
    lookup = vocab.word2idx
    token_ids = []
    for token in sentence.split(" "):
        if token in lookup:
            token_ids.append(lookup[token])
    token_ids.append(2)  # EOS
    return token_ids

def pad_seq(seq, max_length=MAX_LEN):
    """Return a new list of exactly *max_length* indices built from *seq*.

    Shorter sequences are right-padded with 0 (the padding index); longer
    ones are truncated. The input is never modified: the previous
    implementation extended the caller's list in place with ``+=``.

    Note that truncation simply cuts the tail, so a trailing end-of-sequence
    marker is lost when *seq* is longer than *max_length*.
    """
    padded = list(seq[:max_length])
    padded.extend([0] * (max_length - len(padded)))
    return padded

class TranslationDataset(Dataset):
    """Dataset of fixed-length (source indices, target indices) pairs.

    Each sentence pair is converted to index lists and padded once, at
    construction time; items are turned into tensors on access.
    """

    def __init__(self, pairs, input_vocab, output_vocab):
        self.data = [
            (
                pad_seq(indexes_from_sentence(input_vocab, source_sentence)),
                pad_seq(indexes_from_sentence(output_vocab, target_sentence)),
            )
            for source_sentence, target_sentence in pairs
        ]

    def __len__(self):
        """Number of sentence pairs held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at *idx* as two tensors (source, target)."""
        source_ids, target_ids = self.data[idx]
        return torch.tensor(source_ids), torch.tensor(target_ids)

# Index and pad every sentence pair, then serve them in shuffled batches of 32.
dataset = TranslationDataset(
    pairs=pairs,
    input_vocab=input_vocab,
    output_vocab=output_vocab,
)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

# ============================================================
# 3. Model Skeleton
# ============================================================

class EncoderRNN(nn.Module):
    """GRU encoder: embeds a batch of token indices and runs them through a GRU.

    Args:
        input_size: size of the source vocabulary (number of embedding rows).
        hidden_size: width of both the embeddings and the GRU state.

    Shape contract (batch-first):
        x:      (batch, seq_len) integer token indices
        hidden: (1, batch, hidden_size) initial state, or None for zeros
        returns (output, hidden) with output (batch, seq_len, hidden_size)
        and hidden (1, batch, hidden_size).
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # Index 0 is the padding token, so its embedding stays at zero and
        # receives no gradient.
        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, x, hidden=None):
        """Encode *x*; see the class docstring for the shape contract."""
        embedded = self.embedding(x)
        # nn.GRU treats hidden=None as an all-zero initial state.
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

class DecoderRNN(nn.Module):
    """GRU decoder that maps previous target tokens to next-token logits.

    Args:
        hidden_size: width of the embeddings and of the GRU state; must match
            the state handed over by the encoder.
        output_size: size of the target vocabulary.

    Shape contract (batch-first):
        x:      (batch,) or (batch, steps) integer token indices
        hidden: (1, batch, hidden_size) recurrent state
        returns (output, hidden) where output holds raw, un-normalized logits
        of shape (batch, output_size) for a single step, or
        (batch, steps, output_size) when several steps are fed at once.
        Raw logits are what nn.CrossEntropyLoss expects.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # Index 0 is the padding token; keep its embedding fixed at zero.
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Run one or more decoding steps; see the class docstring for shapes."""
        if x.dim() == 1:
            # Accept a flat batch of token ids as a single decoding step.
            x = x.unsqueeze(1)
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded, hidden)
        output = self.out(output)
        if output.size(1) == 1:
            # Single step: drop the length-1 time axis -> (batch, output_size).
            output = output.squeeze(1)
        return output, hidden

# ============================================================
# 4. Training Loop (Skeleton)
# ============================================================

def train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one teacher-forced optimization step on a single batch.

    Args:
        input_tensor: (batch, src_len) source token indices.
        target_tensor: (batch, tgt_len) target token indices.
        encoder: module called as ``encoder(input_tensor, None)`` that returns
            ``(outputs, hidden)``.
        decoder: module called as ``decoder(tokens, hidden)`` with tokens of
            shape (batch, 1); must return ``(logits, hidden)`` with logits of
            shape (batch, vocab) or (batch, 1, vocab).
        encoder_optimizer, decoder_optimizer: optimizers for the two modules.
        criterion: loss taking (N, vocab) logits and (N,) class indices.

    Returns:
        The batch loss as a Python float.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    batch_size, target_len = target_tensor.shape

    # Encode the whole source sequence; only the final state feeds the decoder.
    _, hidden = encoder(input_tensor, None)

    # Every target sequence starts from the <SOS> token (index 1).
    decoder_input = torch.full(
        (batch_size, 1), 1, dtype=torch.long, device=input_tensor.device
    )

    step_logits = []
    for t in range(target_len):
        logits, hidden = decoder(decoder_input, hidden)
        step_logits.append(logits.reshape(batch_size, -1))
        # Teacher forcing: feed the ground-truth token as the next input.
        decoder_input = target_tensor[:, t].unsqueeze(1)

    # Score all time steps in one call. Averaging per step instead would
    # divide by zero (NaN) on any step whose targets are all padding when the
    # criterion ignores the padding index.
    all_logits = torch.stack(step_logits, dim=1)
    loss = criterion(
        all_logits.reshape(batch_size * target_len, -1),
        target_tensor.reshape(-1),
    )

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item()

# ============================================================
# 5. Evaluation
# ============================================================

def evaluate(encoder, decoder, sentence):
    """Translate one English sentence with greedy decoding.

    The sentence is normalized, indexed and padded exactly like the training
    data, encoded, and then decoded one token at a time by always taking the
    highest-scoring token. Decoding stops at <EOS> (index 2) or after MAX_LEN
    steps.

    Relies on the module-level ``input_vocab``, ``output_vocab``, ``device``
    and ``MAX_LEN``. Assumes ``encoder(x, None)`` returns ``(outputs, hidden)``
    and ``decoder(tokens, hidden)`` returns ``(logits, hidden)`` for tokens of
    shape (1, 1).

    Returns:
        The predicted words joined by single spaces (special tokens omitted).
    """
    # Remember the current modes so that evaluation does not silently leave
    # the models in eval mode in the middle of a training run.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()

    token_ids = pad_seq(indexes_from_sentence(input_vocab, normalize_string(sentence)))
    input_tensor = torch.tensor([token_ids], dtype=torch.long, device=device)

    translated_words = []
    try:
        with torch.no_grad():
            _, hidden = encoder(input_tensor, None)
            # Decoding starts from the <SOS> token (index 1).
            decoder_input = torch.tensor([[1]], dtype=torch.long, device=device)
            for _ in range(MAX_LEN):
                logits, hidden = decoder(decoder_input, hidden)
                next_idx = int(logits.reshape(-1).argmax())
                if next_idx == 2:  # <EOS>
                    break
                if next_idx > 2:  # skip <PAD>/<SOS> if the model emits them
                    translated_words.append(output_vocab.idx2word[next_idx])
                decoder_input = torch.tensor(
                    [[next_idx]], dtype=torch.long, device=device
                )
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    translated_sentence = " ".join(translated_words)
    return translated_sentence

# ============================================================
# Main
# ============================================================

hidden_size = 256

# Build both networks and move them to the selected device.
encoder = EncoderRNN(input_vocab.n_words, hidden_size)
decoder = DecoderRNN(hidden_size, output_vocab.n_words)
encoder = encoder.to(device)
decoder = decoder.to(device)

# Targets equal to index 0 (padding) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
encoder_optimizer, decoder_optimizer = (
    optim.Adam(module.parameters(), lr=0.01) for module in (encoder, decoder)
)

# Train for a few steps
for epoch in range(3):
    for i, batch in enumerate(loader):
        inp, tgt = batch
        loss = train_step(
            inp.to(device),
            tgt.to(device),
            encoder,
            decoder,
            encoder_optimizer,
            decoder_optimizer,
            criterion,
        )
        # Report the loss on every 100th batch only.
        if i % 100 != 0:
            continue
        print(f"Epoch {epoch}, Step {i}, Loss {loss:.4f}")

# Test evaluation
print(evaluate(encoder, decoder, "i am tired ."))

In [1]:
### Importing utilities
from __future__ import unicode_literals, print_function, division
import pandas as pd
import numpy as np
import time
import math
import pickle

import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib.ticker as ticker
from prettytable import PrettyTable

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import  nltk.translate.bleu_score as bleu
from matplotlib.font_manager import FontProperties

from io import open
import unicodedata
import string
import re
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the whole tab-separated corpus into a list of lines. The context
# manager closes the file handle, which the previous one-liner left open.
with open('/kaggle/input/frenchenglish/fra.txt', encoding='utf-8') as corpus_file:
    data = corpus_file.read().strip().split('\n')
# Keep only the first two tab-separated fields of every line.
pairs = [line.split('\t')[:2] for line in data]
pairs[100:110]

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/frenchenglish/fra.txt'