In [None]:
# Install / upgrade dependencies (Colab)
!pip -q install -U transformers sentencepiece sacrebleu accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

This notebook uses the IndicTrans2 Indic→English (1B) model:
https://huggingface.co/ai4bharat/indictrans2-indic-en-1B

In [None]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor


# =========================================================
# Block 1) Paths: just replace with your dataset's .sa / .en
# =========================================================
# For the 5 datasets, you only need to change these two paths
# to the corresponding <dataset>.sa and <dataset>.en files.
SA_PATH = "/content/testset.sa"  # source side (Sanskrit); paired line by line with EN_PATH
EN_PATH = "/content/testset.en"  # reference side (English); must have as many lines as SA_PATH


# =========================================================
# Block 2) Read aligned parallel data (skips empty lines)
# =========================================================
def read_parallel(sa_path, en_path, n=None):
    """Read two line-aligned text files and return their non-empty sentence pairs.

    Both files are read as UTF-8 and every line is whitespace-stripped. Line i
    of ``sa_path`` is paired with line i of ``en_path``; a pair is dropped when
    either side is empty after stripping.

    Args:
        sa_path: Path of the source-side file (Sanskrit in this notebook).
        en_path: Path of the reference-side file (English in this notebook).
        n: Maximum number of non-empty pairs to return. ``None`` returns all
            of them; zero or a negative value returns no pairs.

    Returns:
        ``(src_texts, ref_texts)``: two lists of equal length where
        ``src_texts[i]`` is aligned with ``ref_texts[i]``.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    with open(sa_path, "r", encoding="utf-8") as f:
        sa_lines = [ln.strip() for ln in f]
    with open(en_path, "r", encoding="utf-8") as f:
        en_lines = [ln.strip() for ln in f]

    assert len(sa_lines) == len(en_lines), (
        f"Line count mismatch: sa={len(sa_lines)} en={len(en_lines)}"
    )

    # Clamp so that a negative limit behaves like zero instead of leaking a pair.
    limit = None if n is None else max(n, 0)

    src_texts = []  # Sanskrit (san_Deva)
    ref_texts = []  # English (eng_Latn)
    for s, e in zip(sa_lines, en_lines):
        # Test the limit BEFORE appending: checking afterwards returned one
        # pair even when the caller asked for n=0.
        if limit is not None and len(src_texts) >= limit:
            break
        if s and e:
            src_texts.append(s)
            ref_texts.append(e)

    return src_texts, ref_texts


# n=None => full set; if you want only first 200 pairs, set n=200
# The two lists share indices: src_texts[i] is the source of ref_texts[i].
src_texts, ref_texts = read_parallel(sa_path=SA_PATH, en_path=EN_PATH, n=None)
print("Loaded non-empty aligned pairs:", len(src_texts))

Loaded non-empty aligned pairs: 100


In [None]:
# =========================================================
# Block 3) Load IndicTrans2 model (official from_pretrained)
# =========================================================
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor


# Use the GPU when torch can see one; otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

MODEL_NAME = "ai4bharat/indictrans2-indic-en-1B"

# trust_remote_code=True is passed to both loaders below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    # NOTE(review): per the transformers docs, "auto" appears to take the dtype
    # from the checkpoint/config rather than from the hardware - confirm,
    # especially for CPU runs.
    dtype="auto",
    # Optional, if flash_attn is installed and faster decoding is wanted:
    # attn_implementation="flash_attention_2",
)
# Move the weights to the selected device and switch to inference mode.
model = model.to(DEVICE).eval()

ip = IndicProcessor(inference=True)

In [None]:
# =========================================================
# Block 4) Inference (IndicProcessor preprocess + postprocess)
# =========================================================
@torch.no_grad()
def indictrans_translate_sa2en(
    texts,
    batch_size=8,
    max_new_tokens=128,
    num_beams=1,
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
):
    """Translate sentences from Sanskrit (san_Deva) to English (eng_Latn).

    Uses the module-level ``ip`` (IndicProcessor), ``tokenizer``, ``model`` and
    ``DEVICE``. Each batch is preprocessed by ``ip``, tokenized, decoded with
    ``model.generate`` (no sampling), detokenized, and postprocessed by ``ip``.

    Args:
        texts: Sequence of source sentences (must support slicing).
        batch_size: Number of sentences sent to the model per step; >= 1.
        max_new_tokens: Upper bound on generated tokens per sentence.
        num_beams: Beam width; 1 means greedy decoding.
        no_repeat_ngram_size: Forwarded unchanged to ``model.generate``.
        repetition_penalty: Forwarded unchanged to ``model.generate``.

    Returns:
        List of translated strings, one per element of ``texts``, in order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step made range() empty and silently returned [].
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    src_lang, tgt_lang = "san_Deva", "eng_Latn"

    # Generation options do not change between batches, so build them once.
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        do_sample=False,
        # NOTE(review): use_cache=False is kept from the original; presumably a
        # workaround for the checkpoint's remote code - confirm before enabling.
        use_cache=False,
        no_repeat_ngram_size=no_repeat_ngram_size,
        repetition_penalty=repetition_penalty,
    )
    if num_beams > 1:
        # early_stopping is a beam-search option; only pass it when beams are used.
        gen_kwargs["early_stopping"] = True

    outs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        generated = model.generate(**inputs, **gen_kwargs)

        decoded = tokenizer.batch_decode(
            generated,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        outs.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

    return outs


# Decode the whole test set with 4-beam search and report the hypothesis count.
sa2en_preds = indictrans_translate_sa2en(src_texts, batch_size=8, max_new_tokens=128, num_beams=4)
print("Generated:", len(sa2en_preds))

Generated: 100


In [None]:
# =========================================================
# Block 5) Evaluate (clean refs, then evaluate)
# =========================================================
import re

def clean_ref(s: str) -> str:
    """Strip a trailing '*' annotation from a reference and normalize spacing.

    Text matched by the pattern (a '*' followed by the rest of the line up to
    the end of the string) is removed; the remainder is split on whitespace
    and re-joined with single spaces, which also trims both ends.
    """
    trailing_note = re.compile(r"\*.*$")
    without_note = trailing_note.sub("", s)
    tokens = without_note.split()
    return " ".join(tokens)

ref_clean = [clean_ref(x) for x in ref_texts]

# NOTE(review): evaluate_mt is not defined in any visible cell of this notebook,
# so a fresh "Restart & Run All" would stop here with a NameError. The fallback
# below is only installed when no definition exists yet; its scoring settings are
# sacrebleu defaults (plus word_order=2 for chrF++) - confirm they match the
# helper that produced earlier reported numbers.
if "evaluate_mt" not in globals():

    def evaluate_mt(title, hyps, refs):
        """Print and return corpus-level SacreBLEU and chrF++ scores.

        Args:
            title: Heading printed above the scores.
            hyps: List of system outputs.
            refs: List of references, aligned one-to-one with ``hyps``.

        Returns:
            Dict with keys ``"count"``, ``"bleu"`` and ``"chrf++"``.

        Raises:
            ValueError: If ``hyps`` and ``refs`` differ in length.
        """
        import sacrebleu  # installed by the dependency cell at the top

        if len(hyps) != len(refs):
            raise ValueError(
                f"Hypothesis/reference count mismatch: {len(hyps)} vs {len(refs)}"
            )

        bleu = sacrebleu.corpus_bleu(hyps, [refs])
        # word_order=2 turns chrF into chrF++.
        chrf_pp = sacrebleu.corpus_chrf(hyps, [refs], word_order=2)

        print(f"\n===== {title} =====")
        print("Count:", len(hyps))
        print(f"SacreBLEU: {bleu.score:.2f}")
        print(f"chrF++   : {chrf_pp.score:.2f}")
        return {"count": len(hyps), "bleu": bleu.score, "chrf++": chrf_pp.score}


_ = evaluate_mt(
    f"IndicTrans2 (refs cleaned) — san_Deva→eng_Latn (testset {len(sa2en_preds)} lines)",
    sa2en_preds,
    ref_clean[: len(sa2en_preds)],
)


===== IndicTrans2 (refs cleaned) — san_Deva→eng_Latn (testset 100 lines) =====
Count: 100
SacreBLEU: 13.03
chrF++   : 38.06


In [None]:
# =========================================================
# Block 6) Print a few samples
# =========================================================
# Show at most ten examples; REF is the raw (uncleaned) reference text.
n_show = min(10, len(src_texts), len(sa2en_preds))
for idx in range(n_show):
    print("==== sample", idx, "====")
    rows = (
        ("SRC:", src_texts[idx]),
        ("REF:", ref_texts[idx]),
        ("HYP:", sa2en_preds[idx]),
    )
    for tag, text in rows:
        print(tag, text)
    print()

==== sample 0 ====
SRC: अहं अतिथिम् स्वागतं करोमि ।
REF: I will welcome the guest.
HYP: I welcome the guest.

==== sample 1 ====
SRC: गुरुवासरः कदा भविष्यति ?
REF: When will it be Thursday?
HYP: when will it be thursday

==== sample 2 ====
SRC: बालिका कन्दुकेन क्रीडितवती ।
REF: Girl played with the ball.
HYP: The girl plays with a ball.

==== sample 3 ====
SRC: भवन्तः कौन्तेयाः ।
REF: You all are sons of Kunti.
HYP: You're a coward.

==== sample 4 ====
SRC: चम्वाः नायकः धृष्टद्युम्नः ।
REF: The leader of Chamva is Dhrishtadyumna.
HYP: Chamva is the hero of Dhrishtadyumna.

==== sample 5 ====
SRC: रामः वनवासं समाप्य प्रत्यागच्छति ।
REF: Rama returns concluding the exile to forest.
HYP: Rama returns after completing his exile.

==== sample 6 ====
SRC: पूर्वं युयुत्सुः कौरवपक्षीयः आसीत् ।
REF: Earlier Yuyutsu was on Kaurava side.
HYP: In the past, Yuyutsu was pro-Kaurava.

==== sample 7 ====
SRC: माता पुनः प्रक्षालयति ।
REF: Mother again cleans.
HYP: The mother washes again.

==== sample 