In [7]:
import os
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

# ================================
# 1. Configuration
# ================================
# --- Hyperparameters ---
EMBED_DIM = 256
HIDDEN_DIM = 512
N_EPOCHS = 10  # 100 epochs is long, let's start with 10
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# --- Special Tokens ---
PAD_TOKEN = '<PAD>'
SOS_TOKEN = '<SOS>'
EOS_TOKEN = '<EOS>'

# --- Device Setup ---
# This will automatically use your RTX 3050 if available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def load_local_csv(path):
    """Loads data from a local tab-separated .csv file."""
    data = []
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                parts = line.split("\t")
                if len(parts) == 2:
                    # Match the dict structure of the Colab example
                    data.append({
                        "english word": parts[0],
                        "native word": parts[1]
                    })
        return data
    except FileNotFoundError:
        print(f"Error: File not found at {path}")
        return None
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

def build_vocab(pairs):
    """Builds source and target vocabularies from the data."""
    src_chars = Counter()
    tgt_chars = Counter()
    for p in pairs:
        src_chars.update(list(p["english word"]))
        tgt_chars.update(list(p["native word"]))

    src_vocab = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN] + sorted(src_chars)
    tgt_vocab = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN] + sorted(tgt_chars)

    src_to_ix = {ch: i for i, ch in enumerate(src_vocab)}
    tgt_to_ix = {ch: i for i, ch in enumerate(tgt_vocab)}
    ix_to_tgt = {i: ch for ch, i in tgt_to_ix.items()}

    return src_to_ix, tgt_to_ix, ix_to_tgt

def encode_sequence(text, mapping):
    """Converts a text string to a list of token indices."""
    return [mapping[SOS_TOKEN]] + \
           [mapping[ch] for ch in text if ch in mapping] + \
           [mapping[EOS_TOKEN]]

In [9]:
class TransliterationDataset(Dataset):
    def __init__(self, data_pairs):
        self.data_pairs = data_pairs

    def __len__(self):
        return len(self.data_pairs)

    def __getitem__(self, idx):
        src_seq, tgt_seq = self.data_pairs[idx]
        return torch.tensor(src_seq, dtype=torch.long), \
               torch.tensor(tgt_seq, dtype=torch.long)

def collate_fn_factory(pad_idx_src, pad_idx_tgt):
    """Creates a collate_fn to pad batches."""
    def collate_fn(batch):
        src_batch, tgt_batch = zip(*batch)
        src_batch = pad_sequence(src_batch, padding_value=pad_idx_src, batch_first=True)
        tgt_batch = pad_sequence(tgt_batch, padding_value=pad_idx_tgt, batch_first=True)
        return src_batch, tgt_batch
    return collate_fn

In [10]:
class EncoderRNN(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        # Using GRU as in the simple example
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)

    def forward(self, src):
        # src = [batch_size, src_len]
        embedded = self.embedding(src)
        # embedded = [batch_size, src_len, embed_dim]
        outputs, hidden = self.rnn(embedded)
        # outputs = [batch_size, src_len, hidden_dim]
        # hidden = [1, batch_size, hidden_dim]
        return hidden


class DecoderRNN(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_token, hidden):
        # input_token = [batch_size]
        # hidden = [1, batch_size, hidden_dim]
        embedded = self.embedding(input_token.unsqueeze(1))
        # embedded = [batch_size, 1, embed_dim]
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch_size, 1, hidden_dim]
        # hidden = [1, batch_size, hidden_dim]
        prediction = self.fc_out(output.squeeze(1))
        # prediction = [batch_size, output_dim]
        return prediction, hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        hidden = self.encoder(src)
        input_token = tgt[:, 0]  # <SOS> token

        for t in range(1, tgt_len):
            output, hidden = self.decoder(input_token, hidden)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_token = tgt[:, t] if teacher_force else top1

        return outputs

In [11]:
def transliterate_word(model, word, src_to_ix, tgt_to_ix, ix_to_tgt, device, max_len=30):
    model.eval()

    # Encode input sequence
    input_seq = [src_to_ix[SOS_TOKEN]] + \
                [src_to_ix[ch] for ch in word if ch in src_to_ix] + \
                [src_to_ix[EOS_TOKEN]]
    input_tensor = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        hidden = model.encoder(input_tensor)

    # Start decoding
    input_token = torch.tensor([tgt_to_ix[SOS_TOKEN]], dtype=torch.long).to(device)
    decoded_chars = []

    for _ in range(max_len):
        with torch.no_grad():
            output, hidden = model.decoder(input_token, hidden)

        pred_token = output.argmax(1).item()

        if pred_token == tgt_to_ix[EOS_TOKEN]:
            break

        decoded_chars.append(ix_to_tgt[pred_token])
        input_token = torch.tensor([pred_token], dtype=torch.long).to(device)

    return ''.join(decoded_chars)

In [12]:
# ================================
# 5.5 New Evaluate Function
# ================================
# Add this new function right before your "Main Execution" section
def evaluate(model, dataloader, criterion, device):
    """Runs the model on the validation dataset."""
    model.eval()  # Set model to evaluation mode
    epoch_loss = 0

    with torch.no_grad(): # No gradients needed
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)

            # Forward pass (no teacher forcing)
            output = model(src, tgt, teacher_forcing_ratio=0.0)

            # Reshape for loss
            output_dim = output.shape[-1]
            loss = criterion(
                output[:, 1:].reshape(-1, output_dim),
                tgt[:, 1:].reshape(-1)
            )
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


# ================================
# 6. Main Execution (Rewritten for 'hin' dataset)
# ================================
if __name__ == "__main__":
    print(f"Running on device: {DEVICE}")

    # --- Step 1: Define file paths ---
    # We hardcode the paths to the 'hin' dataset
    # Based on your previous path, assuming 'aksharantar_sampled' is the root
    DATA_ROOT = r"W:\CV\HTIC\Qns-3\hin"
    train_file = os.path.join(DATA_ROOT, "hin_train.txt")
    valid_file = os.path.join(DATA_ROOT, "hin_valid.txt")
    # test_file = os.path.join(DATA_ROOT, "hin_test.csv") # We'll use valid for now

    # --- Step 2: Load Data ---
    # We use the same load_local_csv function from before
    train_data = load_local_csv(train_file)
    valid_data = load_local_csv(valid_file)

    if train_data and valid_data:
        print(f"\nLoaded {len(train_data)} training pairs.")
        print(f"Loaded {len(valid_data)} validation pairs.")

        # --- Step 3: Build Vocabs (from training data ONLY) ---
        src_to_ix, tgt_to_ix, ix_to_tgt = build_vocab(train_data)
        print(f"Source vocab size: {len(src_to_ix)}")
        print(f"Target vocab size: {len(tgt_to_ix)}")

        # --- Step 4: Encode Data ---
        encoded_train_data = [
            (encode_sequence(d["english word"], src_to_ix),
             encode_sequence(d["native word"], tgt_to_ix))
            for d in train_data
        ]
        encoded_valid_data = [
            (encode_sequence(d["english word"], src_to_ix),
             encode_sequence(d["native word"], tgt_to_ix))
            # Filter out words with chars not in training vocab
            for d in valid_data
            if all(c in src_to_ix for c in d["english word"]) and \
               all(c in tgt_to_ix for c in d["native word"])
        ]

        # --- Step 5: Create DataLoaders ---
        pad_src_idx = src_to_ix[PAD_TOKEN]
        pad_tgt_idx = tgt_to_ix[PAD_TOKEN]
        my_collate_fn = collate_fn_factory(pad_src_idx, pad_tgt_idx)

        train_dataset = TransliterationDataset(encoded_train_data)
        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=my_collate_fn)

        valid_dataset = TransliterationDataset(encoded_valid_data)
        valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=my_collate_fn)

        # --- Step 6: Initialize Model ---
        input_dim = len(src_to_ix)
        output_dim = len(tgt_to_ix)

        encoder = EncoderRNN(input_dim, EMBED_DIM, HIDDEN_DIM).to(DEVICE)
        decoder = DecoderRNN(output_dim, EMBED_DIM, HIDDEN_DIM).to(DEVICE)
        model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

        criterion = nn.CrossEntropyLoss(ignore_index=pad_tgt_idx)
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

        # --- Step 7: Training Loop ---
        print(f"\nStarting training for {N_EPOCHS} epochs...")
        for epoch in range(N_EPOCHS):
            model.train() # Set to train mode
            total_loss = 0

            for src, tgt in train_dataloader:
                src, tgt = src.to(DEVICE), tgt.to(DEVICE)

                optimizer.zero_grad()
                output = model(src, tgt, teacher_forcing_ratio=0.5) # Use teacher forcing

                output_dim_loss = output.shape[-1]
                loss = criterion(
                    output[:, 1:].reshape(-1, output_dim_loss),
                    tgt[:, 1:].reshape(-1)
                )

                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # --- Evaluate on validation data ---
            valid_loss = evaluate(model, valid_dataloader, criterion, DEVICE)

            print(f"Epoch [{epoch+1}/{N_EPOCHS}], Train Loss: {total_loss/len(train_dataloader):.4f}, Valid Loss: {valid_loss:.4f}")

        print("...Training complete!")

        # --- Step 8: Interactive Prediction Loop ---
        print("\n--- Interactive Transliteration (Hindi) ---")
        print("Type a Romanized word and press Enter (or 'quit' to stop).")

        while True:
            user_input = input("🔤 > ").strip()
            if user_input.lower() == "quit":
                break
            if not user_input:
                continue

            result = transliterate_word(model, user_input, src_to_ix, tgt_to_ix, ix_to_tgt, DEVICE)
            print(f"   → {result}\n")

    else:
        print("Could not start. Make sure these files exist:")
        print(f"  {train_file}")
        print(f"  {valid_file}")

Running on device: cuda

Loaded 51200 training pairs.
Loaded 4096 validation pairs.
Source vocab size: 29
Target vocab size: 67

Starting training for 10 epochs...
Epoch [1/10], Train Loss: 1.4548, Valid Loss: 1.2972
Epoch [2/10], Train Loss: 0.9357, Valid Loss: 1.2182
Epoch [3/10], Train Loss: 0.8305, Valid Loss: 1.1827
Epoch [4/10], Train Loss: 0.7740, Valid Loss: 1.1782
Epoch [5/10], Train Loss: 0.7242, Valid Loss: 1.1780
Epoch [6/10], Train Loss: 0.6935, Valid Loss: 1.1651
Epoch [7/10], Train Loss: 0.6692, Valid Loss: 1.1898
Epoch [8/10], Train Loss: 0.6442, Valid Loss: 1.2199
Epoch [9/10], Train Loss: 0.6235, Valid Loss: 1.2006
Epoch [10/10], Train Loss: 0.6087, Valid Loss: 1.1955
...Training complete!

--- Interactive Transliteration (Hindi) ---
Type a Romanized word and press Enter (or 'quit' to stop).
🔤 > ghar
   → घर

🔤 > ajanabee
   → अजनबी

🔤 > ajanabee,अजनबी
   → अजनबी



KeyboardInterrupt: Interrupted by user

In [18]:
# --- SAVE THE MODEL ---
# This saves the model's learned weights to a file
MODEL_SAVE_PATH = r"W:\CV\HTIC\Qns-3\hin\hin_model.pth"
torch.save(model.state_dict(), MODEL_SAVE_PATH)
print(f"✅ Model saved to {MODEL_SAVE_PATH}")

# --- Step 8: Run Validation on Test Set ---
print("\nRunning validation on the test set...")
# (The rest of your script...)

# ================================
# NEW: Step 8: Run Validation on Test Set
# ================================
print("\nRunning validation on the test set...")
!pip install editdistance
try:
    import editdistance
except ImportError:
    print("---")
    print("To calculate Character Error Rate (CER), please install 'editdistance'")
    print("Run this in your (torch_gpu) terminal: pip install editdistance")
    print("---")
    editdistance = None

# 1. Load test data
test_file = os.path.join(DATA_ROOT, "hin_test.txt")
test_data = load_local_csv(test_file)

if test_data:
    # Filter test data to only include characters seen during training
    filtered_test_data = []
    for d in test_data:
        if all(c in src_to_ix for c in d["english word"]) and \
            all(c in tgt_to_ix for c in d["native word"]):
            filtered_test_data.append(d)

    print(f"Loaded {len(filtered_test_data)} test pairs (filtered from {len(test_data)}).")

    correct_count = 0
    total_count = len(filtered_test_data)
    total_edit_distance = 0
    total_target_chars = 0

    model.eval() # Set model to evaluation mode

    # Loop through every single word in the test set
    for pair in filtered_test_data:
        source_word = pair["english word"]
        target_word = pair["native word"]

        # Use your existing function to get the prediction
        predicted_word = transliterate_word(
            model, source_word, src_to_ix, tgt_to_ix, ix_to_tgt, DEVICE
        )

        # 1. Check for exact match
        if predicted_word == target_word:
            correct_count += 1

        # 2. Calculate edit distance (if library is installed)
        if editdistance:
            total_edit_distance += editdistance.eval(predicted_word, target_word)
            total_target_chars += len(target_word)

    # --- Print Final Results ---
    accuracy = (correct_count / total_count) * 100
    print(f"\n--- Test Set Results ---")
    print(f"✅ Exact Match Accuracy: {accuracy:.2f}% ({correct_count} / {total_count})")

    if editdistance and total_target_chars > 0:
        cer = (total_edit_distance / total_target_chars) * 100
        print(f"📊 Character Error Rate (CER): {cer:.2f}% (Lower is better)")

else:
    print("Could not find test file to run validation.")

# ================================
# OLD: Step 9: Interactive Prediction Loop (was Step 8)
# ================================
print("\n--- Interactive Transliteration (Hindi) ---")
# (Your existing interactive loop code goes here...)

✅ Model saved to W:\CV\HTIC\Qns-3\hin\hin_model.pth

Running validation on the test set...

Running validation on the test set...
Collecting editdistance
  Downloading editdistance-0.8.1-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Downloading editdistance-0.8.1-cp311-cp311-win_amd64.whl (79 kB)
Installing collected packages: editdistance
Successfully installed editdistance-0.8.1
Loaded 4094 test pairs (filtered from 4096).

--- Test Set Results ---
✅ Exact Match Accuracy: 26.50% (1085 / 4094)
📊 Character Error Rate (CER): 23.67% (Lower is better)

--- Interactive Transliteration (Hindi) ---


In [33]:
# --- 1. Install 'editdistance' for Character Error Rate (CER) ---
# We use -q (quiet) to hide the installation logs
print("Installing 'editdistance' for CER calculation...")
!pip install editdistance -q
import editdistance
import os
print("Installation complete.")

# --- 2. Define the Dynamic Model Analyzer ---
def analyze_and_print_report(model, embed_dim, hidden_dim, v_src, v_tgt):
    """
    Analyzes this specific 1-layer GRU model and prints a detailed parameter report.
    """

    # --- 1. Set model-specific parameters ---
    cell_type = "GRU"
    num_layers = 1
    gates = 3 # A GRU has 3 gates (reset, update, new)

    # Aliases for easier formula reading
    e, h, n = embed_dim, hidden_dim, num_layers

    print("\n" + "=" * 66)
    print("      Dynamic Model Analysis (for 1-Layer GRU)")
    print("=" * 66)
    print("\n--- Model Configuration ---")
    print(f"  - Cell Type: {cell_type} (hardcoded in notebook)")
    print(f"  - Num Layers (n): {n} (hardcoded in notebook)")
    print(f"  - Embedding Dim (e): {e}")
    print(f"  - Hidden Dim (h): {h}")
    print(f"  - Source Vocab (V_src): {v_src}")
    print(f"  - Target Vocab (V_tgt): {v_tgt}")

    print("\n--- 1. Practical Parameter Calculation (from Config) ---")

    # --- Calculate Layer by Layer ---
    p_enc_embed = v_src * e
    p_dec_embed = v_tgt * e
    p_dec_linear = (h * v_tgt) + v_tgt # (weights + bias)

    # GRU Layer 1 (e -> h)
    # Formula: gates * ( (e*h) + (h*h) + (bias_in) + (bias_hidden) )
    rnn_l1_params = gates * (e * h + h * h) + 2 * (gates * h)

    p_enc_rnn = rnn_l1_params
    p_dec_rnn = rnn_l1_params

    p_total_theoretical = p_enc_embed + p_enc_rnn + p_dec_embed + p_dec_rnn + p_dec_linear

    # --- Get Actual Count from the model object ---
    p_total_actual = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"  1. Encoder Embedding (V_src * e):      {v_src:7} * {e} = {p_enc_embed:10,}")
    print(f"  2. Encoder GRU (n=1):                        {p_enc_rnn:10,}")
    print(f"  3. Decoder Embedding (V_tgt * e):      {v_tgt:7} * {e} = {p_dec_embed:10,}")
    print(f"  4. Decoder GRU (n=1):                        {p_dec_rnn:10,}")
    print(f"  5. Decoder Linear (h * V_tgt + V_tgt): ({h} * {v_tgt}) + {v_tgt} = {p_dec_linear:10,}")
    print("  " + "-" * 50)
    print(f"  CALCULATED TOTAL:                        {int(p_total_theoretical):10,}")

    print("\n--- Verification ---")
    print(f"  - Actual Model Parameters:         {p_total_actual:13,}")
    print(f"  - Verification:                  {'✅ MATCH' if p_total_actual == p_total_theoretical else '❌ MISMATCH'}")
    print("=" * 66)

    # --- 2. Print the Assignment's Formulas (for 1-layer GRU) ---
    print("\n\n" + "=" * 66)
    print("  Assignment's Theoretical Formulas (for 1-Layer GRU)")
    print("=" * 66)
    print("\nQ: What is the total number of parameters... (e, h, V, 1-layer)?")
    print("A: V(2e + h + 1) + h(6e + 6h + 12)")

    print("\nQ: What is the total number of computations... (e, h, V, T, 1-layer)?")
    print("A: T * (6eh + 6h^2 + hV)")
    print("=" * 66)


# =============================================================
# SCRIPT EXECUTION STARTS HERE
# (This assumes 'model', 'src_to_ix', etc. exist from
# running the previous cells in your notebook)
# =============================================================

# --- 3. Run Validation on Test Set ---
print("\nRunning validation on the test set...")

# Make sure DATA_ROOT is defined (it's in your training cell)
DATA_ROOT = r"W:\CV\HTIC\Qns-3\hin"
TEST_FILE = os.path.join(DATA_ROOT, "hin_test.txt")

test_data = load_local_csv(TEST_FILE)
if test_data:
    # Filter test data
    filtered_test_data = [
        d for d in test_data
        if all(c in src_to_ix for c in d["english word"]) and \
           all(c in tgt_to_ix for c in d["native word"])
    ]
    print(f"Loaded {len(filtered_test_data)} test pairs (filtered from {len(test_data)}).")

    correct_count, total_edit_distance, total_target_chars = 0, 0, 0
    total_count = len(filtered_test_data)

    for pair in filtered_test_data:
        # Use the 'transliterate_word' function defined in a previous cell
        pred = transliterate_word(model, pair["english word"], src_to_ix, tgt_to_ix, ix_to_tgt, DEVICE)
        if pred == pair["native word"]: correct_count += 1
        total_edit_distance += editdistance.eval(pred, pair["native word"])
        total_target_chars += len(pair["native word"])

    print("\n--- Test Set Results ---")
    print(f"✅ Exact Match Accuracy: {(correct_count / total_count) * 100:.2f}%")
    if total_target_chars > 0:
        print(f"📊 Character Error Rate (CER): {(total_edit_distance / total_target_chars) * 100:.2f}%")
else:
    print("Could not find test file. Skipping final validation.")

# --- 4. Save the Model ---
MODEL_SAVE_PATH = os.path.join(DATA_ROOT, "hin_model.pth")
torch.save(model.state_dict(), MODEL_SAVE_PATH)
print(f"\n✅ Model weights saved to {MODEL_SAVE_PATH}")

# --- 5. Call the Analyzer ---
# We pass the variables that are in your notebook's global state
analyze_and_print_report(
    model=model,
    embed_dim=EMBED_DIM,      # From your config cell
    hidden_dim=HIDDEN_DIM,    # From your config cell
    v_src=len(src_to_ix),     # From your training cell
    v_tgt=len(tgt_to_ix)      # From your training cell
)

# --- 6. Start Interactive Loop ---
print("\n--- Interactive Transliteration (Hindi) ---")
print("Type a Romanized word and press Enter (or 'quit' to stop).")
while True:
    try:
        user_input = input("🔤 > ").strip()
        if user_input.lower() == "quit":
            break
        if not user_input:
            continue
        # Use the 'transliterate_word' function defined in a previous cell
        result = transliterate_word(model, user_input, src_to_ix, tgt_to_ix, ix_to_tgt, DEVICE)
        print(f"   → {result}\n")
    except EOFError:
        # This handles the end of input in some notebook environments
        print("Interactive loop finished.")
        break
    except KeyboardInterrupt:
        print("\nInteractive loop stopped.")
        break


Installing 'editdistance' for CER calculation...
Installation complete.

Running validation on the test set...
Loaded 4094 test pairs (filtered from 4096).

--- Test Set Results ---
✅ Exact Match Accuracy: 26.50%
📊 Character Error Rate (CER): 23.67%

✅ Model weights saved to W:\CV\HTIC\Qns-3\hin\hin_model.pth

      Dynamic Model Analysis (for 1-Layer GRU)

--- Model Configuration ---
  - Cell Type: GRU (hardcoded in notebook)
  - Num Layers (n): 1 (hardcoded in notebook)
  - Embedding Dim (e): 256
  - Hidden Dim (h): 512
  - Source Vocab (V_src): 29
  - Target Vocab (V_tgt): 67

--- 1. Practical Parameter Calculation (from Config) ---
  1. Encoder Embedding (V_src * e):           29 * 256 =      7,424
  2. Encoder GRU (n=1):                         1,182,720
  3. Decoder Embedding (V_tgt * e):           67 * 256 =     17,152
  4. Decoder GRU (n=1):                         1,182,720
  5. Decoder Linear (h * V_tgt + V_tgt): (512 * 67) + 67 =     34,371
  --------------------------------

In [34]:
# ================================
# Calculate Accuracy on Test Set
# ================================
import os
import torch
# Make sure the following are defined from previous cells:
# - model (your trained Seq2Seq model)
# - src_to_ix, tgt_to_ix, ix_to_tgt (your vocab mappings)
# - DEVICE (your torch.device)
# - load_local_csv (your data loading function)
# - transliterate_word (your prediction function)
# - DATA_ROOT (path to your 'hin' folder)

print("Calculating accuracy on the test set...")

# --- 1. Load Test Data ---
TEST_FILE = os.path.join(DATA_ROOT, "hin_test.txt")
test_data = load_local_csv(TEST_FILE)

if test_data:
    # --- 2. Filter Test Data ---
    # Keep only words where all characters are known to the model
    filtered_test_data = [
        d for d in test_data
        if all(c in src_to_ix for c in d["english word"]) and \
           all(c in tgt_to_ix for c in d["native word"])
    ]
    print(f"Loaded {len(filtered_test_data)} test pairs (filtered from {len(test_data)}).")

    # --- 3. Initialize Counters ---
    correct_count = 0
    total_count = len(filtered_test_data)

    # Ensure model is in evaluation mode
    model.eval()

    # --- 4. Iterate and Predict ---
    print("Running predictions...")
    with torch.no_grad(): # Faster inference without gradient calculation
        for i, pair in enumerate(filtered_test_data):
            source_word = pair["english word"]
            target_word = pair["native word"]

            # Get the model's prediction
            predicted_word = transliterate_word(
                model, source_word, src_to_ix, tgt_to_ix, ix_to_tgt, DEVICE
            )

            # --- 5. Compare and Count ---
            if predicted_word == target_word:
                correct_count += 1

            # Optional: Print progress
            if (i + 1) % 500 == 0:
                 print(f"  Processed {i+1}/{total_count} words...")

    # --- 6. Calculate and Print Accuracy ---
    if total_count > 0:
        accuracy = (correct_count / total_count) * 100
        print("\n--- Test Set Accuracy ---")
        print(f"✅ Exact Match Accuracy: {accuracy:.2f}% ({correct_count} correct out of {total_count})")
    else:
        print("No valid test data found after filtering.")

else:
    print(f"Could not find or load test file: {TEST_FILE}")

Calculating accuracy on the test set...
Loaded 4094 test pairs (filtered from 4096).
Running predictions...
  Processed 500/4094 words...
  Processed 1000/4094 words...
  Processed 1500/4094 words...
  Processed 2000/4094 words...
  Processed 2500/4094 words...
  Processed 3000/4094 words...
  Processed 3500/4094 words...
  Processed 4000/4094 words...

--- Test Set Accuracy ---
✅ Exact Match Accuracy: 26.50% (1085 correct out of 4094)
