In [None]:
import os
import tarfile
import urllib.request

# downloading the europarl fr-en dataset
url = "http://www.statmt.org/europarl/v7/fr-en.tgz"
filename = "fr-en.tgz"
extract_dir = "europarl"

# Skip the download when the archive is already on disk so that
# "Restart & Run All" does not fetch the whole corpus again. The file is
# written under a temporary name and renamed only once complete, so an
# interrupted download is never mistaken for a finished archive.
if not os.path.exists(filename):
    partial_filename = filename + ".part"
    urllib.request.urlretrieve(url, partial_filename)
    os.replace(partial_filename, filename)

with tarfile.open(filename, "r:gz") as tar:
    # The "data" extraction filter refuses members that would land outside
    # extract_dir (absolute paths, ".." components, unsafe links). It only
    # exists on Python versions that ship tarfile extraction filters, so fall
    # back to the plain call elsewhere.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(extract_dir, filter="data")
    else:
        tar.extractall(extract_dir)

In [12]:
# locations of the two corpus files inside the extracted "europarl" directory;
# zip_files reads them line by line and pairs the lines up by position
ENGLISH_PATH, FRENCH_PATH = (
    "europarl/europarl-v7.fr-en.en",
    "europarl/europarl-v7.fr-en.fr",
)

def zip_files(english_path, french_path):
    """
    Read two line-aligned text files and pair their lines up by position.

    english_path: path of the UTF-8 English file
    french_path: path of the UTF-8 French file
    returns: list of (french_line, english_line) tuples -- note that the French
             line comes FIRST even though the English path is the first
             argument; lines keep their trailing newline
    raises: AssertionError when the two files differ in number of lines
    """
    def _read_lines(path):
        with open(path, "r", encoding="utf-8") as handle:
            return handle.readlines()

    english_lines = _read_lines(english_path)
    french_lines = _read_lines(french_path)

    assert len(english_lines) == len(french_lines), "different number of lines in files!!"
    return list(zip(french_lines, english_lines))

# raw aligned lines; each element is a (french_line, english_line) tuple
sentence_pairs = zip_files(english_path=ENGLISH_PATH, french_path=FRENCH_PATH)

print("total sentence pairs:", len(sentence_pairs))


total sentence pairs: 2007723


In [13]:
import unicodedata
import re

def clean_text(s: str) -> str:
    """
    Normalise one sentence for tokenisation.

    Steps, in order: NFC unicode normalisation, non-breaking spaces turned into
    plain spaces, every run of whitespace collapsed to a single space, leading
    and trailing whitespace removed, everything lowercased.

    s: raw sentence (may still carry its trailing newline)
    returns: the cleaned sentence; an all-whitespace input becomes ""
    """
    normalized = unicodedata.normalize("NFC", s).replace("\xa0", " ")
    single_spaced = re.sub(r"\s+", " ", normalized)
    return single_spaced.strip().lower()

def clean_data(sentences):
    """
    Clean both sides of every pair and drop pairs with an empty side.

    sentences: iterable of 2-tuples of raw strings
    returns: list of 2-tuples of cleaned strings

    NOTE: the two elements of each tuple come out in the OPPOSITE order from the
    input: (a, b) -> (clean_text(b), clean_text(a)). In this notebook the input
    comes from zip_files, which yields (french, english), so the result here is
    (english, french).
    """
    kept = []
    for first, second in sentences:
        first_clean, second_clean = clean_text(first), clean_text(second)
        # keep the pair only when both sides still have content after cleaning
        if first_clean and second_clean:
            kept.append((second_clean, first_clean))
    return kept

# clean_data flips each tuple, so cleaned_data holds (english, french) pairs
cleaned_data = clean_data(sentences=sentence_pairs)
print("after cleaning total sentence pairs:", len(cleaned_data))


after cleaning total sentence pairs: 2002756


In [14]:
def tokenize(sentence):
    """Split a sentence into tokens on runs of whitespace."""
    return sentence.split()

def tokenize_and_filter(sentence_pairs, max_len=40, min_len=1):
    """
    Tokenise both sides of every pair and keep only pairs whose two sides each
    have between min_len and max_len tokens (inclusive).

    sentence_pairs: iterable of 2-tuples of strings
    max_len: largest allowed token count per side
    min_len: smallest allowed token count per side
    returns: list of 2-tuples of token lists

    NOTE: the two elements of each tuple come out in the OPPOSITE order from the
    input: (a, b) -> (tokens(b), tokens(a)). In this notebook the input is the
    (english, french) output of clean_data, so the result is (french, english).
    """
    def _length_ok(tokens):
        return min_len <= len(tokens) <= max_len

    kept = []
    for first, second in sentence_pairs:
        first_tokens = tokenize(first)
        second_tokens = tokenize(second)
        if _length_ok(first_tokens) and _length_ok(second_tokens):
            kept.append((second_tokens, first_tokens))
    return kept

# tokenize_and_filter flips each tuple again, so tokenized_pairs holds
# (french_tokens, english_tokens) pairs
tokenized_pairs = tokenize_and_filter(sentence_pairs=cleaned_data, max_len=40)
print("after tokenization & length filtering:", len(tokenized_pairs))
print("example:", tokenized_pairs[1])


after tokenization & length filtering: 1654263
example: (['je', 'déclare', 'reprise', 'la', 'session', 'du', 'parlement', 'européen', 'qui', 'avait', 'été', 'interrompue', 'le', 'vendredi', '17', 'décembre', 'dernier', 'et', 'je', 'vous', 'renouvelle', 'tous', 'mes', 'vux', 'en', 'espérant', 'que', 'vous', 'avez', 'passé', 'de', 'bonnes', 'vacances.'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'])


In [None]:
import random

# Shuffle a COPY of the tokenized pairs. Shuffling tokenized_pairs in place
# would make this cell non-idempotent: running it a second time would reshuffle
# an already-shuffled list and silently produce a different train/val/test
# split. With a fresh copy and a fixed seed, every run gives the same split.
random.seed(42)
shuffled_pairs = list(tokenized_pairs)
random.shuffle(shuffled_pairs)

# split into 80% train, 10% validation, 10% test
n_total = len(shuffled_pairs)
n_train = int(0.80 * n_total)
n_validation = int(0.10 * n_total)
# the test split takes whatever is left, so rounding never drops a sample
n_test = n_total - n_train - n_validation

train_pairs = shuffled_pairs[:n_train]
validation_pairs = shuffled_pairs[n_train:n_train + n_validation]
test_pairs = shuffled_pairs[n_train + n_validation:]

# the three splits must partition the data exactly
assert len(train_pairs) + len(validation_pairs) + len(test_pairs) == n_total
assert len(test_pairs) == n_test

print(f"number of train samples: {len(train_pairs)}, number of validation samples: {len(validation_pairs)}, number of test samples: {len(test_pairs)}")

number of train samples: 1323410, number of validation samples: 165426, number of test samples: 165427


In [18]:
from collections import Counter

# reserved tokens; build_vocab always places them at ids 0..3 in this order
SPECIAL_TOKENS = ["<pad>", "<sos>", "<eos>", "<unk>"]

def build_vocab(pairs, lang="source", min_frequency=3):
    """
    Build the token <-> id mappings for one side of a parallel corpus.

    pairs: iterable of (source_tokens, target_tokens)
    lang: "source" to count the first element of each pair, "target" to count
          the second (in this notebook: source = French, target = English)
    min_frequency: minimum number of occurrences for a token to be kept

    returns: (stoi, itos) where itos is SPECIAL_TOKENS followed by the kept
             tokens in first-seen order, and stoi maps token -> index in itos
    raises: ValueError if lang is neither "source" nor "target"
    """
    # fail loudly on a typo such as "src"; previously any unknown value
    # silently selected the target side
    if lang not in ("source", "target"):
        raise ValueError(f'lang must be "source" or "target", got {lang!r}')

    counter = Counter()
    for source_tokens, target_tokens in pairs:
        tokens = source_tokens if lang == "source" else target_tokens
        counter.update(tokens)

    # keep only tokens that appear at least min_frequency times; a corpus token
    # that happens to spell a special token is skipped so it can neither be
    # duplicated in itos nor overwrite the reserved id in stoi
    reserved = set(SPECIAL_TOKENS)
    itos = SPECIAL_TOKENS + [
        w for w, c in counter.items() if c >= min_frequency and w not in reserved
    ]
    stoi = {w: i for i, w in enumerate(itos)}
    return stoi, itos

# both vocabularies are built from the training split only; element 0 of every
# pair is the French side ("source"), element 1 the English side ("target")
source_stoi, source_itos = build_vocab(pairs=train_pairs, lang="source", min_frequency=3)  # French
target_stoi, target_itos = build_vocab(pairs=train_pairs, lang="target", min_frequency=3)  # English

# vocabulary sizes include the four special tokens
print("source vocab size (french):", len(source_itos))
print("target vocab size (english):", len(target_itos))
# the first four entries are the special tokens, the rest follow first-seen order
print("first 20 source tokens:", source_itos[:20])
print("first 20 target tokens:", target_itos[:20])


source vocab size (french): 111728
target vocab size (english): 79441
first 20 source tokens: ['<pad>', '<sos>', '<eos>', '<unk>', 'toutefois,', 'nous', 'sommes', "d'avis,", 'avec', 'un', 'grand', 'nombre', 'de', 'blogueurs,', 'que', 'les', 'atteintes', 'à', 'la', 'vie']
first 20 target tokens: ['<pad>', '<sos>', '<eos>', '<unk>', 'however,', 'we', 'share', 'the', 'view,', 'with', 'many', 'that', 'violations', 'and', 'slander', 'are', 'equally', 'punishable', 'on', 'as']


In [19]:
# ids of the special tokens, looked up in the source vocabulary; build_vocab
# prepends SPECIAL_TOKENS to every vocabulary it builds, so these tokens sit at
# the start of the target vocabulary in the same order
PAD_IDX, SOS_IDX, EOS_IDX, UNK_IDX = (
    source_stoi[special] for special in ("<pad>", "<sos>", "<eos>", "<unk>")
)

def tokens_to_ids(tokens, stoi, add_sos_eos=False):
    """
    Convert a list of tokens to a list of ids using the given vocabulary.

    tokens: list of string tokens
    stoi: token -> id mapping; must contain "<unk>", and also "<sos>" and
          "<eos>" when add_sos_eos is True
    add_sos_eos: wrap the ids in <sos> ... <eos> (used for target sequences)

    returns: list of int ids; tokens missing from stoi map to stoi["<unk>"]
    raises: KeyError if a required special token is missing from stoi
    """
    # take the unknown id from the SAME vocabulary as every other id instead of
    # a module-level constant derived from the source vocabulary, so the result
    # stays correct for any stoi that is passed in
    unk_id = stoi["<unk>"]
    ids = [stoi.get(token, unk_id) for token in tokens]
    if add_sos_eos:
        ids = [stoi["<sos>"], *ids, stoi["<eos>"]]
    return ids

import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class TranslationDataset(Dataset):
    """
    Map-style dataset over tokenized sentence pairs.

    Each item is encoded on access: the source tokens become a 1-D long tensor
    of ids with no boundary markers, the target tokens become a 1-D long tensor
    of ids wrapped in <sos> ... <eos>.
    """

    def __init__(self, token_pairs, source_stoi, target_stoi):
        # token_pairs: sequence of (source_tokens, target_tokens)
        # source_stoi / target_stoi: token -> id mapping for each side
        self.token_pairs, self.source_stoi, self.target_stoi = (
            token_pairs,
            source_stoi,
            target_stoi,
        )

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.token_pairs)

    def __getitem__(self, idx):
        """Return (source_ids, target_ids) tensors for the pair at idx."""
        src_tokens, tgt_tokens = self.token_pairs[idx]  # (FRENCH, ENGLISH)

        encoded = (
            # french source: plain ids, no <sos>/<eos>
            tokens_to_ids(src_tokens, self.source_stoi, add_sos_eos=False),
            # english target: ids wrapped in <sos> ... <eos>
            tokens_to_ids(tgt_tokens, self.target_stoi, add_sos_eos=True),
        )
        src_tensor, tgt_tensor = (
            torch.tensor(ids, dtype=torch.long) for ids in encoded
        )
        return src_tensor, tgt_tensor

def collate_fn(batch):
    """
    Collate a list of (source_sequence, target_sequence) tensor pairs.

    Every sequence is right-padded with PAD_IDX up to the longest sequence on
    its side of the batch.

    returns: (source_padded, target_padded, source_lengths, target_lengths)
             where the padded tensors are batch-first and the length tensors
             hold the unpadded length of each sequence as torch.long
    """
    source_sequences, target_sequences = zip(*batch)

    def _lengths(sequences):
        return torch.tensor(list(map(len, sequences)), dtype=torch.long)

    def _pad(sequences):
        # pad to the max sequence length within this batch
        return pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)

    return (
        _pad(source_sequences),
        _pad(target_sequences),
        _lengths(source_sequences),
        _lengths(target_sequences),
    )


In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64

# one dataset per split; all three are encoded with the vocabularies that were
# built from the training split
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(split_pairs, source_stoi, target_stoi)
    for split_pairs in (train_pairs, validation_pairs, test_pairs)
)

# reshuffled every epoch; collate_fn pads each batch to its own longest sequence
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# pull a single batch to inspect what the loader yields
first_batch = next(iter(train_loader))
source_batch, target_batch, source_lengths, target_lengths = first_batch


src_batch shape: torch.Size([64, 40])
tgt_batch shape: torch.Size([64, 41])
src_lens[:5]: tensor([25, 37, 19, 12, 25])
tgt_lens[:5]: tensor([27, 41, 19, 15, 33])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Pick the best available backend: CUDA GPU first, then Apple-silicon MPS, then
# CPU. Checking only for MPS would silently fall back to CPU on CUDA machines.
# The getattr guard keeps this working on torch builds without an mps backend.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# model hyperparameters
ENCODER_EMBED_DIM = 256
DECODER_EMBED_DIM = 256
HIDDEN_DIM = 512
ENCODER_DROPOUT = 0.2
DECODER_DROPOUT = 0.2

class Encoder(nn.Module):
    """
    Bidirectional single-layer GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the concatenated final forward/backward hidden states down to one
    vector per sequence, meant as the decoder's initial hidden state.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, dropout, pad_idx=None):
        """
        input_dim: size of the source vocabulary (len(source_stoi))
        embed_dim: embedding size
        hidden_dim: GRU hidden size per direction, also the size of the
                    returned decoder initial hidden state
        dropout: dropout probability applied to the embeddings
        pad_idx: id of the padding token; defaults to the module-level PAD_IDX
        """
        super().__init__()
        if pad_idx is None:
            pad_idx = PAD_IDX

        self.embedding = nn.Embedding(input_dim, embed_dim, padding_idx=pad_idx)

        # bidirectional GRU: outputs have size 2 * hidden_dim
        self.gru = nn.GRU(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )

        # to turn [fwd; bwd] into a single decoder hidden state
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None):
        """
        x: [batch_size, src_len] source token ids, right-padded
        lengths: optional unpadded length (>= 1) of every sequence in x, as a
                 1-D tensor or list. When given, the GRU only runs over the real
                 tokens of each sequence, so the final hidden states are not
                 affected by padding and padded positions of enc_outputs are
                 zero. When omitted, the GRU runs over every position,
                 padding included.
        returns:
          enc_outputs: [batch_size, src_len, 2*hidden_dim]
          dec_init_hidden: [1, batch_size, hidden_dim]
        """
        # 1) embed tokens
        embedded = self.dropout(self.embedding(x))   # [B, src_len, embed_dim]

        # 2) run bidirectional GRU
        if lengths is None:
            enc_outputs, hidden = self.gru(embedded)
        else:
            # pack so the recurrence stops at each sequence's true end;
            # lengths must live on the CPU for pack_padded_sequence
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded,
                torch.as_tensor(lengths).cpu(),
                batch_first=True,
                enforce_sorted=False,
            )
            packed_outputs, hidden = self.gru(packed)
            # total_length keeps the time dimension equal to x.size(1)
            enc_outputs, _ = nn.utils.rnn.pad_packed_sequence(
                packed_outputs,
                batch_first=True,
                total_length=x.size(1),
            )
        # enc_outputs: [B, src_len, 2*hidden_dim]
        # hidden: [num_layers * num_directions, B, hidden_dim] = [2, B, hidden_dim]

        # 3) take last forward & backward hidden states
        forward_hidden  = hidden[-2]   # [B, hidden_dim]
        backward_hidden = hidden[-1]   # [B, hidden_dim]

        # 4) concatenate & map to decoder hidden size
        hidden_cat = torch.cat((forward_hidden, backward_hidden), dim=1)  # [B, 2*hidden_dim]
        dec_init_hidden = torch.tanh(self.fc(hidden_cat)).unsqueeze(0)    # [1, B, hidden_dim]

        return enc_outputs, dec_init_hidden
