In [None]:
# Install / upgrade dependencies (Colab)
!pip -q install -U transformers sentencepiece sacrebleu accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

This notebook uses a fine-tuned M2M100 checkpoint (`Swamitucats/M2M100_Sanskrit_English`). M2M100 model documentation:
https://huggingface.co/docs/transformers/en/model_doc/m2m_100

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# =========================================================
# Block 1) Dataset locations
# =========================================================
# The two files are read side by side: line i of the .sa file is paired with
# line i of the .en file. To evaluate another dataset, point these two
# constants at its <dataset>.sa / <dataset>.en files.
EN_PATH = "/content/testset.en"  # English side (used as the reference)
SA_PATH = "/content/testset.sa"  # Sanskrit side (used as the source)

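To evaluate a different dataset, point `SA_PATH` and `EN_PATH` at that dataset's `.sa` and `.en` files; the rest of the pipeline stays the same. Note that the dataset name printed in the evaluation title (Block 5) is a hard-coded label, so update it as well if you want the printed header to match.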

# =========================================================
# Block 2) Read aligned parallel data (skips empty lines)
# =========================================================
def read_parallel(sa_path, en_path, n=None):
    """Load two line-aligned text files and drop pairs with a blank side.

    Both files are read as UTF-8, one segment per line, and every line is
    stripped of surrounding whitespace. Line i of ``sa_path`` is paired with
    line i of ``en_path``; a pair is kept only if both sides are non-empty
    after stripping.

    Args:
        sa_path: Path to the source-side file (Sanskrit in this notebook).
        en_path: Path to the reference-side file (English in this notebook).
        n: Maximum number of non-empty pairs to return. ``None`` returns all
            of them; ``0`` or a negative value returns no pairs.

    Returns:
        A tuple ``(src_texts, ref_texts)`` of two equal-length lists of
        stripped strings, in file order.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    with open(sa_path, "r", encoding="utf-8") as f:
        sa_lines = [ln.strip() for ln in f]
    with open(en_path, "r", encoding="utf-8") as f:
        en_lines = [ln.strip() for ln in f]

    assert len(sa_lines) == len(en_lines), (
        f"Line count mismatch: sa={len(sa_lines)} en={len(en_lines)}"
    )

    # Keep a pair only when both sides still have content after stripping.
    pairs = [(s, e) for s, e in zip(sa_lines, en_lines) if s and e]

    if n is not None:
        # Slice rather than break out of an append loop: checking the limit
        # only after appending meant n=0 still returned one pair.
        pairs = pairs[: max(n, 0)]
        if len(pairs) < n:
            print(f"Warning: only got {len(pairs)} non-empty aligned pairs (requested {n}).")

    src_texts = [s for s, _ in pairs]  # Sanskrit
    ref_texts = [e for _, e in pairs]  # English
    return src_texts, ref_texts


# Load every usable pair (n defaults to None => full set).
# To cap the run, pass a limit instead, e.g. read_parallel(SA_PATH, EN_PATH, n=200).
src_texts, ref_texts = read_parallel(SA_PATH, EN_PATH)
print("Loaded non-empty aligned pairs:", len(src_texts))


# =========================================================
# Block 3) Load model + tokenizer
# =========================================================
# Use the GPU when one is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

m2m_model_name = "Swamitucats/M2M100_Sanskrit_English"
tokenizer = AutoTokenizer.from_pretrained(m2m_model_name)
# .eval() puts the module in evaluation mode and returns the module itself.
# Chaining it into the assignment avoids ending the cell on a bare
# `model.eval()` expression, which made Jupyter print the entire module repr
# as the cell's output.
model = AutoModelForSeq2SeqLM.from_pretrained(m2m_model_name).to(device).eval()

Loaded non-empty aligned pairs: 100
Device: cuda


M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
       

In [None]:
# =========================================================
# Block 4) Inference (keeps your original logic)
# =========================================================
def run_m2m_sanskrit_english_on_test(batch_size=16, max_length=256, max_examples=None):
    """Translate the loaded source sentences in batches and return the outputs.

    Relies on the notebook-level ``src_texts``, ``tokenizer``, ``model`` and
    ``device`` objects rather than taking them as arguments.

    Args:
        batch_size: Number of sentences tokenized and generated per step.
        max_length: Token limit applied both when truncating the inputs and
            when generating the outputs.
        max_examples: Translate only the first ``max_examples`` sentences;
            ``None`` translates all of ``src_texts``.

    Returns:
        A list of decoded strings, one per processed source sentence, in order.

    Raises:
        AssertionError: If the number of decoded outputs does not match the
            number of inputs (per batch or overall).
    """
    # NOTE(review): no source/target language is set here (no tokenizer.src_lang,
    # no forced_bos_token_id); presumably the fine-tuned checkpoint's own
    # defaults cover this — confirm against the model card.
    total = len(src_texts)
    if max_examples is not None:
        total = min(total, max_examples)

    translations = []
    for start in range(0, total, batch_size):
        # Clamp the window so a max_examples cap is honored in the last batch.
        stop = min(start + batch_size, total)
        chunk = src_texts[start:stop]  # Sanskrit (Devanagari)

        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        encoded = encoded.to(device)

        # Gradients are not needed for decoding.
        with torch.no_grad():
            # num_beams=4; set to 1 for faster decoding
            generated = model.generate(**encoded, max_length=max_length, num_beams=4)

        decoded = tokenizer.batch_decode(
            generated,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        assert len(decoded) == len(chunk)
        translations.extend(decoded)

    assert len(translations) == total, f"M2M preds length {len(translations)} != expected {total}"
    return translations



In [None]:
# =========================================================
# Block 3) Load model + tokenizer
# =========================================================
# NOTE(review): this repeats the earlier "Block 3" cell verbatim, so running the
# notebook top to bottom instantiates the tokenizer and model a second time.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

m2m_model_name = "Swamitucats/M2M100_Sanskrit_English"
tokenizer = AutoTokenizer.from_pretrained(m2m_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(m2m_model_name)
model = model.to(device)
model.eval()


# =========================================================
# Block 4) Inference (keeps your original logic)
# =========================================================
def run_m2m_sanskrit_english_on_test(batch_size=16, max_length=256, max_examples=None):
    """Translate the loaded source sentences in batches and return the outputs.

    Relies on the notebook-level ``src_texts``, ``tokenizer``, ``model`` and
    ``device`` objects rather than taking them as arguments.

    Args:
        batch_size: Number of sentences tokenized and generated per step.
        max_length: Token limit applied both when truncating the inputs and
            when generating the outputs.
        max_examples: Translate only the first ``max_examples`` sentences;
            ``None`` translates all of ``src_texts``.

    Returns:
        A list of decoded strings, one per processed source sentence, in order.

    Raises:
        AssertionError: If the number of decoded outputs does not match the
            number of inputs (per batch or overall).
    """
    # NOTE(review): a function with this same name and body is already defined
    # in an earlier cell; this definition rebinds the name when the cell runs.
    # NOTE(review): no source/target language is set here (no tokenizer.src_lang,
    # no forced_bos_token_id); presumably the fine-tuned checkpoint's own
    # defaults cover this — confirm against the model card.
    total = len(src_texts)
    if max_examples is not None:
        total = min(total, max_examples)

    translations = []
    for start in range(0, total, batch_size):
        # Clamp the window so a max_examples cap is honored in the last batch.
        stop = min(start + batch_size, total)
        chunk = src_texts[start:stop]  # Sanskrit (Devanagari)

        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        encoded = encoded.to(device)

        # Gradients are not needed for decoding.
        with torch.no_grad():
            # num_beams=4; set to 1 for faster decoding
            generated = model.generate(**encoded, max_length=max_length, num_beams=4)

        decoded = tokenizer.batch_decode(
            generated,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        assert len(decoded) == len(chunk)
        translations.extend(decoded)

    assert len(translations) == total, f"M2M preds length {len(translations)} != expected {total}"
    return translations


# =========================================================
# Block 5) Translate the set, then score it
# =========================================================
# None => translate every loaded sentence; an int (e.g. 200 / 1000) caps the run.
m2m_max_examples = None
m2m_preds = run_m2m_sanskrit_english_on_test(
    max_examples=m2m_max_examples,
    batch_size=16,
    max_length=256,
)

# Score against exactly as many references as hypotheses were produced, so a
# capped run is still compared pair for pair.
eval_n = len(m2m_preds)

# evaluate_mt is not defined in this cell; it must already exist in the kernel.
# NOTE(review): the dataset name "Itihasa" in the title is a hard-coded label,
# while the saved output below shows "gitasopanam" — it has to be edited by
# hand whenever SA_PATH / EN_PATH point at a different dataset.
_ = evaluate_mt(
    f"Baseline — M2M100_Sanskrit_English (Itihasa {eval_n} lines)",
    m2m_preds,
    ref_texts[:eval_n],
)


Baseline — M2M100_Sanskrit_English (gitasopanam 100 lines)
  #pairs = 100
  BLEU  = 6.79
  chrF2 = 31.31


In [None]:
# Show up to ten source / reference / hypothesis triples for a quick sanity check.
n_show = min(10, len(src_texts), len(m2m_preds))
for idx in range(n_show):
    print("==== sample", idx, "====")
    for tag, texts in (("SRC:", src_texts), ("REF:", ref_texts), ("HYP:", m2m_preds)):
        print(tag, texts[idx])
    print()

==== sample 0 ====
SRC: अहं अतिथिम् स्वागतं करोमि ।
REF: I will welcome the guest.
HYP: I do welcome my guest.

==== sample 1 ====
SRC: गुरुवासरः कदा भविष्यति ?
REF: When will it be Thursday?
HYP: When will he be born as a preceptor?

==== sample 2 ====
SRC: बालिका कन्दुकेन क्रीडितवती ।
REF: Girl played with the ball.
HYP: The maiden was sporting with the mace.

==== sample 3 ====
SRC: भवन्तः कौन्तेयाः ।
REF: You all are sons of Kunti.
HYP: You are the sons of Kunti.

==== sample 4 ====
SRC: चम्वाः नायकः धृष्टद्युम्नः ।
REF: The leader of Chamva is Dhrishtadyumna.
HYP: The Chambas are the heroes and Dhrishtadyumna.

==== sample 5 ====
SRC: रामः वनवासं समाप्य प्रत्यागच्छति ।
REF: Rama returns concluding the exile to forest.
HYP: Rāma having finished his abode in the forest, returns.

==== sample 6 ====
SRC: पूर्वं युयुत्सुः कौरवपक्षीयः आसीत् ।
REF: Earlier Yuyutsu was on Kaurava side.
HYP: In the days of yore Yuyutsu was the son of the Kuru race.

==== sample 7 ====
SRC: माता पुनः प्रक्