## 1) Install and imports


In [1]:
!pip install -q sentencepiece tqdm sacrebleu

import os
import math
import random
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import sentencepiece as spm
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


#This brief install block ensures the tokenizer and helper libraries are available.
#sentencepiece is required; sacrebleu is optional for BLEU evaluation but recommended if internet access during environment creation is allowed. 
#If you plan to run on an offline system, install these dependencies beforehand.


##### This brief install block ensures the tokenizer and helper libraries are available. sentencepiece is required; sacrebleu is optional for BLEU evaluation but recommended if internet access during environment creation is allowed. If you plan to run on an offline system, install these dependencies beforehand.

## 2) Environment and file paths


In [2]:
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Input file and the artifacts this notebook writes next to itself.
DATA_CSV = "eng-french.csv"            # input dataset (must be present)
SPM_PREFIX = "bpe_enfr"                # SentencePiece prefix -> <prefix>.model / <prefix>.vocab
SPM_MODEL = SPM_PREFIX + ".model"
SPM_VOCAB = SPM_PREFIX + ".vocab"
CKPT_PATH = "best_local_transformer.pt"
PICKLE_PATH = "best_local_transformer.pkl"

# Seed every random number generator the notebook relies on.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device == "cuda":
    # Also seed every visible CUDA device.
    torch.cuda.manual_seed_all(SEED)



Device: cpu


##### The notebook is reproducible by setting a fixed random seed. GPU availability is auto-detected. Keep DATA_CSV in the same directory as the notebook.

## 3) Hyperparameters and design choices (tuned for a laptop GPU with 4-6 GB VRAM)


In [3]:

# Tokenizer
VOCAB_SIZE = 8000   # requested SentencePiece vocabulary size; a balance between expressiveness and memory

# - Why 8000: good trade-off for general translation tasks on modest datasets; captures common subwords
# - Alternative (advantage/cost): 16000 (better rare-word handling, more parameters and memory), 4000 (smaller memory but more fragmentation of common words)

# Sequence lengths (in subword tokens; the BOS and EOS markers added during encoding count toward the limit)
MAX_SRC_LEN = 50
MAX_TGT_LEN = 50

# - 50 tokens covers most short/medium sentences; larger lengths increase memory and computation quadratically in attention

# Transformer architecture
D_MODEL = 384      # model dimension
NHEAD = 6          # number of attention heads (must divide D_MODEL)
ENC_LAY = 4        # encoder layers
DEC_LAY = 4        # decoder layers
FF_DIM = 1024      # feed forward hidden dimension
DROPOUT = 0.15     # regularization

# - If you have more VRAM (e.g., T4 in Colab), you can scale D_MODEL and FF_DIM up for better accuracy.

# Training
BATCH_SIZE = 64        # per step batch size; actual effective = BATCH_SIZE * GRAD_ACCUM_STEPS
GRAD_ACCUM_STEPS = 3   # effective batch = 192 (64 * 3)
EPOCHS = 45            # upper bound on epochs; early stopping may end training sooner
PATIENCE = 5           # epochs without a validation-accuracy improvement before early stopping
LABEL_SMOOTHING = 0.05 # passed to CrossEntropyLoss
WARMUP_STEPS = 2500    # warmup length of the Noam schedule, counted in scheduler (optimizer) steps
LR_BASE = 1.0          # optimizer's initial lr and the multiplier applied to the Noam scale factor
WEIGHT_DECAY = 1e-4    # AdamW weight decay
MAX_GRAD_NORM = 1.0    # max norm passed to gradient clipping


# - Gradient accumulation improves effective batch size without OOM. Typical accum steps: 2-4 on small GPUs.
# - Label smoothing prevents overconfidence; 0.05 is a conservative value that improves generalization.
print("Hyperparameters set.")


Hyperparameters set.


##### Each hyperparameter includes a concise rationale and alternatives.

## 4) Load dataset and create plain text corpus for SentencePiece


In [4]:

# Stop immediately with a readable message if the dataset has not been placed next to the notebook.
assert Path(DATA_CSV).exists(), f"Dataset file {DATA_CSV} not found."

# Read the parallel corpus and discard any row that has a missing value.
df = pd.read_csv(DATA_CSV).dropna()

# Normalise both sides the same way: force to string, trim surrounding whitespace, lowercase.
eng = df["English words/sentences"].astype(str).str.strip().str.lower()
fra = df["French words/sentences"].astype(str).str.strip().str.lower()

print(f"Total examples: {len(df)}")

# One sentence per line, English first and then French, so a single shared BPE model is learned.
CORPUS_PATH = "spm_corpus.txt"
with open(CORPUS_PATH, "w", encoding="utf-8") as f:
    f.writelines(sentence + "\n" for sentence in eng)
    f.writelines(sentence + "\n" for sentence in fra)

print(f"Corpus for SentencePiece written to {CORPUS_PATH}")


Total examples: 175621
Corpus for SentencePiece written to spm_corpus.txt


##### A shared BPE (single tokenizer for both languages) can improve subword alignment and simplify model embeddings (single joint vocabulary). Alternative: separate tokenizers per language. Advantage: tokenization tailored to each language; cost: larger embedding matrices and extra complexity.

## 5) Train SentencePiece (BPE) tokenizer


In [5]:
# Train the joint BPE model; SentencePiece writes <SPM_PREFIX>.model and <SPM_PREFIX>.vocab.
# The special-token ids are pinned here and are relied on by the encoding helpers.
spm.SentencePieceTrainer.Train(
    input=CORPUS_PATH,
    model_prefix=SPM_PREFIX,
    model_type='bpe',
    vocab_size=VOCAB_SIZE,
    character_coverage=1.0,
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
)

# Load the freshly trained model into the processor used for all encoding/decoding.
sp = spm.SentencePieceProcessor()
sp.load(SPM_MODEL)
print("SentencePiece trained and loaded. Vocab size:", sp.get_piece_size())

# Leave a short note on disk describing the tokenizer choices.
readme_text = (
    "Tokenizer: SentencePiece (BPE)\n"
    + f"vocab_size = {VOCAB_SIZE}\n"
    + "Remarks: joint BPE for source and target languages.\n"
)
with open("TOKENIZER_README.md", "w") as f:
    f.write(readme_text)


SentencePiece trained and loaded. Vocab size: 8000


##### Training the tokenizer inside the notebook ensures reproducibility. vocab_size=8000 is the chosen compromise. character_coverage=1.0 is fine for Latin scripts; for multilingual or special scripts, adjust accordingly.

## 6) Encoding helpers and prepare numeric arrays for model

In [6]:
# Special-token ids used throughout the notebook. They must stay equal to the
# pad_id / bos_id / eos_id values passed to SentencePieceTrainer.Train.
PAD_ID = 0  # padding; ignored by the loss and masked out in attention
BOS_ID = 1  # beginning-of-sequence marker prepended by encode_sentence
EOS_ID = 2  # end-of-sequence marker appended by encode_sentence

def encode_sentence(text, maxlen):
    """Tokenize ``text`` and return a fixed-length list of ``maxlen`` token ids.

    The subword ids are truncated to leave room for the BOS and EOS markers,
    wrapped in BOS ... EOS, right-padded with PAD_ID, and finally clipped to
    ``maxlen`` entries.
    """
    subword_ids = sp.encode(text, out_type=int)
    tokens = [BOS_ID, *subword_ids[:maxlen - 2], EOS_ID]
    # A non-positive repeat count yields an empty list, so no padding is added when already full.
    padding = [PAD_ID] * (maxlen - len(tokens))
    return (tokens + padding)[:maxlen]

from sklearn.model_selection import train_test_split

# Turn every sentence into a fixed-length row of token ids (one int64 matrix per language).
X = np.array([encode_sentence(sentence, MAX_SRC_LEN) for sentence in eng], dtype=np.int64)
Y = np.array([encode_sentence(sentence, MAX_TGT_LEN) for sentence in fra], dtype=np.int64)

# 80/10/10 holdout: carve off 20% first, then halve that part into validation and test.
X_train, X_tmp, Y_train, Y_tmp = train_test_split(
    X, Y, test_size=0.20, random_state=SEED
)
X_valid, X_test, Y_valid, Y_test = train_test_split(
    X_tmp, Y_tmp, test_size=0.50, random_state=SEED
)

print("Dataset shapes:", X_train.shape, X_valid.shape, X_test.shape)


Dataset shapes: (140496, 50) (17562, 50) (17563, 50)


##### The split is a plain random (shuffled, non-stratified) 80/10/10 holdout, made reproducible by the fixed `random_state`. For small datasets, consider k-fold cross-validation as an alternative to obtain more robust estimates. Here we use a single holdout for simplicity and reproducibility.

## 7) Dataset and DataLoaders (teacher forcing: tgt_in and tgt_out)


In [7]:
class NMTDataset(Dataset):
    """Parallel-sentence dataset that yields teacher-forcing triples.

    Both arrays are copied into ``torch.long`` tensors up front. Each item is
    ``(source, decoder_input, decoder_target)``, where the decoder input is the
    target row without its last token and the decoder target is the same row
    without its first token.
    """

    def __init__(self, src_array, tgt_array):
        self.src = torch.tensor(src_array, dtype=torch.long)
        self.tgt = torch.tensor(tgt_array, dtype=torch.long)

    def __len__(self):
        # Number of sentence pairs = number of source rows.
        return self.src.shape[0]

    def __getitem__(self, idx):
        source_row = self.src[idx]
        target_row = self.tgt[idx]
        decoder_input = target_row[:-1]   # positions 0 .. T-2
        decoder_target = target_row[1:]   # positions 1 .. T-1 (shifted by one)
        return source_row, decoder_input, decoder_target

# Training batches are shuffled and the ragged final batch is dropped;
# validation and test keep their order and every example.
train_loader = DataLoader(
    NMTDataset(X_train, Y_train),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
valid_loader = DataLoader(
    NMTDataset(X_valid, Y_valid),
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = DataLoader(
    NMTDataset(X_test, Y_test),
    batch_size=BATCH_SIZE,
    shuffle=False,
)


##### drop_last=True during training ensures consistent batch sizes for gradient accumulation. For evaluation, keep drop_last=False to measure on full data.

## 8) Model definition: positional encoding and seq2seq transformer


In [8]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (as in the Transformer paper) followed by dropout.

    Args:
        d_model: embedding width. Both even and odd widths are supported.
        dropout: dropout probability applied after the positions are added.
        max_len: number of positions that are precomputed (defaults to MAX_SRC_LEN).
            Inputs longer than this are rejected in ``forward``.

    The table is registered as the buffer ``pe`` with shape [1, max_len, d_model].
    Because it is a regular buffer it is part of the state_dict, so a checkpoint
    should be loaded into a model that was built with the same ``max_len``.
    """
    def __init__(self, d_model, dropout=0.1, max_len=MAX_SRC_LEN):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns, so trim
        # the frequency vector; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))  # shape [1, max_len, d_model]

    def forward(self, x):
        """Add positional encodings to ``x`` (indexed as [batch, seq, d_model]) and apply dropout.

        Raises:
            ValueError: if the sequence dimension of ``x`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional-encoding max_len {self.pe.size(1)}."
            )
        return self.dropout(x + self.pe[:, :seq_len])

class TransformerNMT(nn.Module):
    """Sequence-to-sequence Transformer built on ``torch.nn.Transformer`` with ``batch_first=True``.

    Source and target use separate embedding tables over the same joint-BPE
    vocabulary. The output projection shares its weight matrix with the target
    embedding (weight tying).
    """

    def __init__(self, vocab_size, d_model, nhead, enc_layers, dec_layers, dim_feedforward, dropout):
        super().__init__()
        # NOTE: sub-modules are created in a fixed order so seeded initialisation stays reproducible.
        self.src_embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
        self.tgt_embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)

        # Separate positional tables, each sized to the maximum length of its side.
        self.pos_src = PositionalEncoding(d_model, dropout, max_len=MAX_SRC_LEN)
        self.pos_tgt = PositionalEncoding(d_model, dropout, max_len=MAX_TGT_LEN)

        core_config = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=enc_layers,
            num_decoder_layers=dec_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.Transformer(**core_config)

        # Vocabulary projection without bias, then tied to the decoder embedding matrix.
        self.output = nn.Linear(d_model, vocab_size, bias=False)
        self.output.weight = self.tgt_embed.weight

    @staticmethod
    def _look_ahead_mask(size, device):
        """Build a [size, size] boolean causal mask; True marks future positions that are blocked."""
        all_true = torch.ones(size, size, dtype=torch.bool, device=device)
        return torch.triu(all_true, diagonal=1)

    def forward(self, src, tgt_in):
        """Return vocabulary logits [B, T, V] for source ids ``src`` [B, S] and decoder inputs ``tgt_in`` [B, T]."""
        # Boolean padding masks: True where the token is PAD.
        src_is_pad = src == PAD_ID
        tgt_is_pad = tgt_in == PAD_ID
        causal_mask = self._look_ahead_mask(tgt_in.size(1), src.device)

        encoder_input = self.pos_src(self.src_embed(src))
        decoder_input = self.pos_tgt(self.tgt_embed(tgt_in))

        decoded = self.transformer(
            src=encoder_input,
            tgt=decoder_input,
            tgt_mask=causal_mask,
            src_key_padding_mask=src_is_pad,
            tgt_key_padding_mask=tgt_is_pad,
            memory_key_padding_mask=src_is_pad,
        )
        return self.output(decoded)


##### We use boolean tgt_mask (look-ahead) to avoid PyTorch deprecation warnings caused by mismatched mask types. Weight tying reduces parameters and often improves learning.

## 9) Instantiate model and training utilities


In [9]:

# Size the embeddings from the tokenizer that was actually trained rather than the requested VOCAB_SIZE.
VOCAB_SIZE_ACTUAL = sp.get_piece_size()
model = TransformerNMT(
    vocab_size=VOCAB_SIZE_ACTUAL,
    d_model=D_MODEL,
    nhead=NHEAD,
    enc_layers=ENC_LAY,
    dec_layers=DEC_LAY,
    dim_feedforward=FF_DIM,
    dropout=DROPOUT,
)
model = model.to(device)

# Cross-entropy that skips PAD targets and applies label smoothing.
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING, ignore_index=PAD_ID)

# AdamW over all model parameters; the learning rate is driven by the scheduler created below.
optimizer = optim.AdamW(model.parameters(), weight_decay=WEIGHT_DECAY, lr=LR_BASE)

# Noam learning rate schedule (the original Transformer schedule with warmup)
class NoamLR(optim.lr_scheduler._LRScheduler):
    """Noam learning-rate schedule (linear warmup, then inverse-square-root decay).

    At step ``s`` every parameter group gets
    ``base_lr * d_model**-0.5 * min(s**-0.5, s * warmup_steps**-1.5)``,
    where ``base_lr`` is the learning rate that group had when the scheduler was
    created. The optimizer in this notebook is built with ``lr=LR_BASE``, so the
    result is the same as multiplying by ``LR_BASE``, without depending on a global.

    Args:
        optimizer: wrapped optimizer.
        d_model: model dimension used in the scale factor; must be positive.
        warmup_steps: number of scheduler steps of linear warmup; must be positive.
        last_epoch: forwarded to the base scheduler.

    Raises:
        ValueError: if ``d_model`` or ``warmup_steps`` is not positive.
    """
    def __init__(self, optimizer, d_model, warmup_steps, last_epoch=-1):
        if d_model <= 0 or warmup_steps <= 0:
            raise ValueError("d_model and warmup_steps must be positive.")
        # These must be set before super().__init__, which performs an initial step and calls get_lr().
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the learning rate for each parameter group at the current step."""
        step = max(self._step_count, 1)
        scale = (self.d_model ** -0.5) * min(step ** -0.5, step * (self.warmup_steps ** -1.5))
        return [base_lr * scale for base_lr in self.base_lrs]

scheduler = NoamLR(optimizer, d_model=D_MODEL, warmup_steps=WARMUP_STEPS)

# Mixed-precision loss scaling is only used on CUDA; on CPU the scaler stays None.
scaler = None
if device == "cuda":
    scaler = torch.cuda.amp.GradScaler()

print("Model parameters:", sum(param.numel() for param in model.parameters()))


Model parameters: 19559936


##### Noam schedule is well-suited for Transformers. On small datasets, simpler schedulers (ReduceLROnPlateau) also work; Noam helps with initial warmup then decay behavior. AMP reduces memory & speeds up training on CUDA.

## 10) Helpers: masked accuracy and decode helpers


In [10]:

def masked_token_accuracy(logits, targets):
    """Fraction of non-PAD target tokens whose argmax prediction is correct.

    Returns 0.0 when ``targets`` holds no non-PAD token (the denominator is
    clamped to at least 1).
    """
    predicted = logits.argmax(dim=-1)  # [B, T]
    is_real_token = targets.ne(PAD_ID)
    hits = (predicted.eq(targets) & is_real_token).sum().item()
    n_tokens = is_real_token.sum().item()
    return hits / max(n_tokens, 1)

def decode_ids(ids):
    """Turn a sequence of token ids (list or tensor) back into text.

    A leading BOS is dropped, everything from the first EOS onward is cut,
    any remaining PAD/BOS/EOS ids are filtered out, and the rest is decoded
    with the SentencePiece processor.
    """
    tokens = ids.tolist() if isinstance(ids, torch.Tensor) else ids

    starts_with_bos = len(tokens) and tokens[0] == BOS_ID
    if starts_with_bos:
        tokens = tokens[1:]

    if EOS_ID in tokens:
        first_eos = tokens.index(EOS_ID)
        tokens = tokens[:first_eos]

    special_ids = (PAD_ID, BOS_ID, EOS_ID)
    content_ids = [tok for tok in tokens if tok not in special_ids]
    return sp.decode(content_ids)


##### Token-level accuracy is an interpretable proxy; BLEU is recommended for sentence-level quality assessment (we include optional BLEU evaluation below).

## 11) Training loop (gradient accumulation, AMP safe, live metrics)


In [None]:
# Early-stopping state. GRAD_ACCUM_STEPS is already defined with the other hyperparameters,
# so the former self-assignment (a no-op) is not repeated here.
best_val_acc = 0.0  # best validation token accuracy seen so far
wait = 0            # consecutive epochs without a validation improvement

def _apply_accumulated_gradients():
    """Clip the accumulated gradients, step the optimizer and scheduler, then clear the gradients."""
    if scaler is not None:
        # Under AMP the gradients are still multiplied by the loss scale. They must be
        # unscaled before clipping, otherwise MAX_GRAD_NORM is applied to scaled values.
        scaler.unscale_(optimizer)
    nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
    if scaler is not None:
        scaler.step(optimizer)
        scaler.update()
    else:
        optimizer.step()
    scheduler.step()
    optimizer.zero_grad(set_to_none=True)


def run_epoch(dataloader, training=True):
    """Run one pass over ``dataloader`` and return ``(mean_loss, mean_token_accuracy)``.

    Args:
        dataloader: yields ``(src, tgt_in, tgt_out)`` batches.
        training: when True, gradients are accumulated over GRAD_ACCUM_STEPS
            batches before each optimizer/scheduler step; when False, the model is
            put in eval mode and no autograd graph is built.

    Returns:
        Per-batch averages of the loss and of the masked token accuracy
        (both 0.0 if the dataloader yields no batch).
    """
    from contextlib import nullcontext

    model.train(training)
    total_loss = 0.0
    total_acc = 0.0
    steps = 0
    accum = 0  # micro-batches accumulated since the last optimizer step

    if training:
        optimizer.zero_grad(set_to_none=True)

    pbar = tqdm(dataloader)
    for src, tgt_in, tgt_out in pbar:
        src = src.to(device)
        tgt_in = tgt_in.to(device)
        tgt_out = tgt_out.to(device)

        # Autocast only when AMP is active (GPU); otherwise use a no-op context.
        autocast_ctx = torch.cuda.amp.autocast() if scaler is not None else nullcontext()

        # Disable autograd during evaluation so no graph is kept in memory.
        with torch.set_grad_enabled(training), autocast_ctx:
            logits = model(src, tgt_in)                        # [B, T, V]
            loss = criterion(logits.transpose(1, 2), tgt_out)  # CE expects [B, V, T]

        if training:
            # Divide so the accumulated gradient matches the average over the accumulation window.
            micro_loss = loss / GRAD_ACCUM_STEPS
            if scaler is not None:
                scaler.scale(micro_loss).backward()
            else:
                micro_loss.backward()
            accum += 1

            if accum == GRAD_ACCUM_STEPS:
                _apply_accumulated_gradients()
                accum = 0

        acc = masked_token_accuracy(logits, tgt_out)
        total_loss += loss.item()
        total_acc += acc
        steps += 1

        # live display
        pbar.set_postfix(loss=loss.item(), acc=acc)

    # Flush gradients left over when the number of batches is not a multiple of GRAD_ACCUM_STEPS.
    if training and accum > 0:
        _apply_accumulated_gradients()

    denom = max(steps, 1)  # avoid ZeroDivisionError on an empty dataloader
    return total_loss / denom, total_acc / denom

# Train for up to EPOCHS epochs, checkpointing on every validation-accuracy improvement
# and stopping once PATIENCE consecutive epochs bring none.
for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    train_loss, train_acc = run_epoch(train_loader, training=True)
    val_loss, val_acc = run_epoch(valid_loader, training=False)

    print(f"Train Loss {train_loss:.3f} | Train Acc {train_acc:.4f}")
    print(f"Val   Loss {val_loss:.3f} | Val   Acc {val_acc:.4f}")

    improved = val_acc > best_val_acc
    if not improved:
        wait += 1
        if wait >= PATIENCE:
            print("Early stopping triggered.")
            break
        continue

    # New best: reset the patience counter and persist the weights
    # (state_dict is recommended over pickling the full model).
    best_val_acc, wait = val_acc, 0
    torch.save(model.state_dict(), CKPT_PATH)
    print(f"New best model saved (Val Acc = {val_acc:.4f})")


##### This training loop uses gradient accumulation for effective large batches while preventing OOM. We save the state_dict as the canonical checkpoint; that is portable and recommended over full model pickles.

## 12) Load best model and evaluate on test set (token accuracy and BLEU)


In [None]:

# sacrebleu is pip-installed in section 1 but was never imported; corpus_bleu below needs it.
import sacrebleu

# Load checkpoint
state = torch.load(CKPT_PATH, map_location=device, weights_only=True)
model.load_state_dict(state)
model.to(device)
model.eval()

# Token-level accuracy on test set
total_loss = 0.0
total_acc = 0.0
steps = 0
all_hypotheses = []
all_references = []

with torch.no_grad():
    for src, tgt_in, tgt_out in tqdm(test_loader):
        src = src.to(device); tgt_in = tgt_in.to(device); tgt_out = tgt_out.to(device)
        logits = model(src, tgt_in)
        loss = criterion(logits.transpose(1, 2), tgt_out)
        acc = masked_token_accuracy(logits, tgt_out)

        total_loss += loss.item()
        total_acc += acc
        steps += 1

        # Collect text for BLEU from the teacher-forced argmax predictions.
        # NOTE: this is not free-running (autoregressive) decoding; each prediction
        # is conditioned on the reference prefix, so the score is optimistic.
        preds = logits.argmax(dim=-1).cpu()  # [B, T]
        refs = tgt_out.cpu()                 # [B, T]
        for pred_row, ref_row in zip(preds, refs):
            # Positions where the reference is PAD are ignored by the loss, so the
            # model's outputs there are untrained noise; keep only scored positions.
            scored = ref_row != PAD_ID
            # decode_ids additionally cuts at the first EOS and strips special tokens.
            all_hypotheses.append(decode_ids(pred_row[scored]))
            all_references.append(decode_ids(ref_row[scored]))

token_acc = total_acc / max(steps, 1)
avg_loss = total_loss / max(steps, 1)
print(f"\nTest token accuracy: {token_acc:.4f} | Avg loss: {avg_loss:.3f}")

# BLEU (sacrebleu expects references as list of list)
bleu = sacrebleu.corpus_bleu(all_hypotheses, [all_references])
print("BLEU score (sacrebleu):", round(bleu.score, 2))

# - The score above is a corpus-level BLEU; BLEU correlates imperfectly with perceived quality.
# - Use human inspection and examples for final judgement.


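##### We compute both token-level accuracy and corpus-level BLEU to provide a token-wise and a text-level quality measure. Note that both are computed from teacher-forced predictions (the argmax at each position given the reference prefix), not from free-running decoding, so they overestimate real translation quality. A realistic BLEU requires autoregressive greedy decoding; beam search will typically improve BLEU and perceived quality further.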

## 13) Save model state_dict as pickle (portable backup) and ensure tokenizer files are present


In [None]:
# 13) Save model state_dict as pickle (portable backup) and ensure tokenizer files are present

# Copy every tensor to the CPU first: pickling CUDA tensors would produce a file that can
# only be unpickled on a machine with CUDA, which defeats the purpose of a portable backup.
cpu_state = {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()}
with open(PICKLE_PATH, "wb") as f:
    pickle.dump(cpu_state, f)
# SECURITY NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code. The torch.save checkpoint at CKPT_PATH remains the canonical artifact.

# The SentencePiece model and vocab are already created by the training step; ensure they exist
assert Path(SPM_MODEL).exists(), f"{SPM_MODEL} not found."
assert Path(SPM_VOCAB).exists(), f"{SPM_VOCAB} not found."

print(f"Saved: {CKPT_PATH}, {PICKLE_PATH}, {SPM_MODEL}, {SPM_VOCAB}")
