<a href="https://colab.research.google.com/github/woshimajintao/AI-Agentic-Translation-for-Sanskrit/blob/main/notebooks/GPT-4o-mini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

# Project root: the current working directory, made absolute. This matches the
# repository root only when the notebook is run from there.
PROJECT_ROOT = Path(".").resolve()

# Data root: the "data" directory directly under the project root.
DATA_ROOT = PROJECT_ROOT / "data"

# Cache directory: "gpt_cache" under the project root (an absolute path, since
# PROJECT_ROOT was resolved). Created here if it does not exist yet.
CACHE_DIR = PROJECT_ROOT / "gpt_cache"
CACHE_DIR.mkdir(exist_ok=True)

In [None]:
# Quietly install/upgrade the third-party packages for this notebook.
# NOTE(review): the recorded output of this cell shows -U pulling pandas 2.3.3
# while google-colab requires pandas==2.2.2 -- consider pinning versions.
!pip -q install -U openai sacrebleu pandas tqdm gitpython "httpx>=0.28.1,<1"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m137.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0m

In [None]:
import os
from pathlib import Path

def find_parallel_files(data_root: Path):
    """Collect every same-directory ``<stem>.en`` / ``<stem>.sa`` file pair.

    Walks ``data_root`` recursively with ``os.walk``. Within each directory,
    files ending in ".en" and ".sa" are matched on their stem; only stems that
    have both files are reported, in sorted stem order.

    Args:
        data_root: Directory to search.

    Returns:
        A list of dicts with keys "folder" (directory relative to
        ``data_root``, as a string), "stem", "en_path" and "sa_path".
    """
    found = []
    for current, _subdirs, names in os.walk(data_root):
        here = Path(current)

        # Index the two language sides of this directory by file stem.
        english = {}
        sanskrit = {}
        for name in names:
            if name.endswith(".en"):
                english[Path(name).stem] = here / name
            if name.endswith(".sa"):
                sanskrit[Path(name).stem] = here / name

        # Only stems present on both sides form a parallel pair.
        for shared in sorted(english.keys() & sanskrit.keys()):
            entry = {}
            entry["folder"] = str(here.relative_to(data_root))
            entry["stem"] = shared
            entry["en_path"] = english[shared]
            entry["sa_path"] = sanskrit[shared]
            found.append(entry)
    return found

In [None]:
from sacrebleu.metrics import BLEU, CHRF

# Shared metric objects used by eval_metrics.
# BLEU configured with the "13a" tokenizer.
bleu_metric = BLEU(tokenize="13a")
# chrF configured with word_order=2.
# NOTE(review): per the sacrebleu documentation, word_order=2 is the chrF++
# configuration, while the "2" in "chrF2" refers to beta -- confirm that the
# "chrF2" label used when printing results is the intended name.
chrf2_metric = CHRF(word_order=2)

def read_lines(path: Path):
    """Read a UTF-8 text file and return its non-blank lines, stripped.

    Args:
        path: File to read.

    Returns:
        A list of lines with leading/trailing whitespace removed; lines that
        are empty after stripping are omitted.
    """
    kept = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()
            if text:
                kept.append(text)
    return kept

def eval_metrics(hyps, refs):
    """Score hypotheses against a single reference set.

    Args:
        hyps: List of hypothesis strings.
        refs: List of reference strings, passed to each metric as the only
            reference stream.

    Returns:
        A ``(bleu, chrf2)`` tuple holding the ``.score`` of the corpus-level
        results from ``bleu_metric`` and ``chrf2_metric`` respectively.
    """
    return tuple(
        metric.corpus_score(hyps, [refs]).score
        for metric in (bleu_metric, chrf2_metric)
    )

In [None]:
import os, json, time
from openai import OpenAI
from tqdm import tqdm
from google.colab import userdata

# Copy the value stored under the name "gpt" in Colab's userdata into the
# OPENAI_API_KEY environment variable before the client is constructed.
# NOTE(review): presumably OpenAI() picks the key up from that variable --
# confirm against the openai client documentation.
os.environ["OPENAI_API_KEY"] = userdata.get("gpt")

client = OpenAI()

# Model name sent with every translation request.
MODEL = "gpt-4o-mini"
# Intended sampling temperature for the translation requests.
TEMP = 0.2

# Per-dataset translation caches are written under this directory.
# NOTE(review): CACHE_DIR is also assigned in the path-configuration cell
# (there built from PROJECT_ROOT); this assignment re-binds it to a path
# relative to the current working directory.
CACHE_DIR = Path("gpt_cache")
CACHE_DIR.mkdir(exist_ok=True)

def cache_path(folder, stem):
    """Build the JSONL cache file path for one dataset.

    The folder (with "/" replaced by "__") and the stem are joined with "__";
    any ".." in the joined name is then replaced by "_" so the result stays a
    single flat file name inside ``CACHE_DIR``.

    Args:
        folder: Dataset folder name, possibly containing "/".
        stem: Dataset stem.

    Returns:
        ``CACHE_DIR / "<flattened name>.jsonl"``.
    """
    flat_folder = folder.replace("/", "__")
    flat_name = "__".join((flat_folder, stem))
    flat_name = flat_name.replace("..", "_")
    return CACHE_DIR / (flat_name + ".jsonl")

def load_cache(path: Path):
    """Load cached translations from a JSONL file.

    Each usable line is a JSON object with keys "i" (sentence index) and
    "hyp" (translation). When an index occurs more than once, the last
    occurrence wins.

    Blank lines are ignored. Lines that cannot be parsed or that lack the
    expected keys (for example a final line truncated by an interrupted run)
    are skipped with a warning instead of aborting the load, so the affected
    sentences are simply translated again.

    Args:
        path: Cache file to read; it does not have to exist.

    Returns:
        A dict mapping sentence index to cached translation; empty when the
        file does not exist.
    """
    import warnings

    cache = {}
    if not path.exists():
        return cache

    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
                cache[obj["i"]] = obj["hyp"]
            except (ValueError, KeyError, TypeError) as exc:
                # json.JSONDecodeError is a ValueError; KeyError/TypeError
                # cover records that parse but do not have the expected shape.
                warnings.warn(
                    f"{path}: skipping unreadable cache line {lineno} ({exc!r})"
                )
    return cache

def append_cache(path: Path, i: int, hyp: str):
    """Append one translation record to a JSONL cache file.

    Args:
        path: Cache file, opened in append mode (created if missing).
        i: Sentence index, stored under key "i".
        hyp: Translation text, stored under key "hyp"; non-ASCII characters
            are written as-is rather than escaped.
    """
    with open(path, "a", encoding="utf-8") as sink:
        record = json.dumps({"i": i, "hyp": hyp}, ensure_ascii=False)
        sink.write(f"{record}\n")

def gpt_translate_one(sa_text: str) -> str:
    """Translate one Sanskrit segment into English with one chat-completion call.

    The request uses the notebook-level ``client``, ``MODEL`` and ``TEMP``
    settings.

    Args:
        sa_text: Sanskrit source text, inserted verbatim into the user prompt.

    Returns:
        The text of the first choice with surrounding whitespace removed, or
        an empty string when the response carries no text content.
    """
    prompt = (
        "Translate the following Sanskrit into natural English.\n"
        "Rules:\n"
        "1) Keep proper names consistent.\n"
        "2) Do not add information not in the source.\n"
        "3) Output only the English translation.\n\n"
        f"Sanskrit:\n{sa_text}\n"
    )
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a careful Sanskrit-to-English translator."},
            {"role": "user", "content": prompt},
        ],
        # Use the configured temperature instead of repeating the literal, so
        # changing TEMP actually affects the requests.
        temperature=TEMP,
    )
    # The API types message content as optional; guard so a response without
    # text yields an empty hypothesis instead of AttributeError on None.
    content = resp.choices[0].message.content
    return (content or "").strip()

# Itihasa

In [None]:
from pathlib import Path
from tqdm import tqdm
import time

# Evaluate GPT translations on the Itihasa test subset: load the parallel
# files, translate each Sanskrit line (reusing cached results), then score.
ITIHASA_DIR = DATA_ROOT / "itihasa" / "testset_subset_42"

EN_PATH = ITIHASA_DIR / "testset.en"
SA_PATH = ITIHASA_DIR / "testset.sa"

# Stop early, showing the offending path, if either side is missing.
assert EN_PATH.exists(), EN_PATH
assert SA_PATH.exists(), SA_PATH


MAX_N = None  # None = use every pair; an int caps the number of pairs
FOLDER_NAME = "itihasa"
MODEL_TAG = "gpt-4o-mini"
# FOLDER_NAME and STEM_NAME together determine the cache file name.
STEM_NAME = f"testset_subset_42_{MODEL_TAG}"


en_lines = read_lines(EN_PATH)
sa_lines = read_lines(SA_PATH)

# Pair the two sides by position and truncate to the shorter one.
# NOTE(review): read_lines drops blank lines on each side independently, so
# differing lengths would suggest misaligned pairs -- worth checking.
n = min(len(en_lines), len(sa_lines))
if MAX_N is not None:
    n = min(n, MAX_N)

refs = en_lines[:n]
srcs = sa_lines[:n]

print(f"Loaded {n} sentence pairs")


cpath = cache_path(FOLDER_NAME, STEM_NAME)
cache = load_cache(cpath)

hyps = []

# The cache is keyed by sentence index, so only indices missing from it
# trigger an API call; each new result is appended to the cache right away.
for i in tqdm(range(n), desc=f"{FOLDER_NAME}/{STEM_NAME} n={n}"):
    if i in cache:
        hyp = cache[i]
    else:
        hyp = gpt_translate_one(srcs[i])
        append_cache(cpath, i, hyp)
        # Short pause after each API call.
        time.sleep(0.25)
    hyps.append(hyp)


bleu, chrf2 = eval_metrics(hyps, refs)

print("\nBLEU :", round(bleu, 2))
print("chrF2:", round(chrf2, 2))

Loaded 100 sentence pairs


itihasa/testset_subset_42_gpt-4o-mini n=100: 100%|██████████| 100/100 [02:19<00:00,  1.40s/it]


BLEU : 5.08
chrF2: 29.08





# Bible

In [None]:
# Evaluate GPT translations on the Bible test subset: load the parallel
# files, translate each Sanskrit line (reusing cached results), then score.
DOMAIN = "bible"
SUBSET = "testset_subset_42"

DOMAIN_DIR = DATA_ROOT / DOMAIN / SUBSET

EN_PATH = DOMAIN_DIR / "testset.en"
SA_PATH = DOMAIN_DIR / "testset.sa"

# Stop early, showing the offending path, if either side is missing.
assert EN_PATH.exists(), EN_PATH
assert SA_PATH.exists(), SA_PATH

MAX_N = None  # None = use every pair; an int caps the number of pairs
FOLDER_NAME = DOMAIN
# Define the cache tag in this cell. The API model global (MODEL) is left
# alone, and STEM_NAME no longer depends on a value set by an earlier cell.
MODEL_TAG = "gpt-4o-mini"
# FOLDER_NAME and STEM_NAME together determine the cache file name.
STEM_NAME = f"{SUBSET}_{MODEL_TAG}"


en_lines = read_lines(EN_PATH)
sa_lines = read_lines(SA_PATH)

# Pair the two sides by position and truncate to the shorter one.
n = min(len(en_lines), len(sa_lines))
if MAX_N is not None:
    n = min(n, MAX_N)

refs = en_lines[:n]
srcs = sa_lines[:n]

print(f"Loaded {n} sentence pairs")


cpath = cache_path(FOLDER_NAME, STEM_NAME)
cache = load_cache(cpath)

hyps = []

# The cache is keyed by sentence index, so only indices missing from it
# trigger an API call; each new result is appended to the cache right away.
for i in tqdm(range(n), desc=f"{FOLDER_NAME}/{STEM_NAME} n={n}"):
    if i in cache:
        hyp = cache[i]
    else:
        hyp = gpt_translate_one(srcs[i])
        append_cache(cpath, i, hyp)
        # Short pause after each API call.
        time.sleep(0.25)
    hyps.append(hyp)


bleu, chrf2 = eval_metrics(hyps, refs)

print("\nBLEU :", round(bleu, 2))
print("chrF2:", round(chrf2, 2))

Loaded 100 sentence pairs


bible/testset_subset_42_gpt-4o-mini n=100: 100%|██████████| 100/100 [02:14<00:00,  1.35s/it]


BLEU : 8.96
chrF2: 31.95





# Gitasopanam

In [None]:
# Evaluate GPT translations on the Gitasopanam test subset: load the parallel
# files, translate each Sanskrit line (reusing cached results), then score.
DOMAIN = "gitasopanam"
SUBSET = "testset_subset_42"

DOMAIN_DIR = DATA_ROOT / DOMAIN / SUBSET

EN_PATH = DOMAIN_DIR / "testset.en"
SA_PATH = DOMAIN_DIR / "testset.sa"

# Stop early, showing the offending path, if either side is missing.
assert EN_PATH.exists(), EN_PATH
assert SA_PATH.exists(), SA_PATH

MAX_N = None  # None = use every pair; an int caps the number of pairs
# Derive the cache naming from DOMAIN / SUBSET so the literals cannot drift.
FOLDER_NAME = DOMAIN
MODEL_TAG = "gpt-4o-mini"
# FOLDER_NAME and STEM_NAME together determine the cache file name.
STEM_NAME = f"{SUBSET}_{MODEL_TAG}"


en_lines = read_lines(EN_PATH)
sa_lines = read_lines(SA_PATH)

# Pair the two sides by position and truncate to the shorter one.
n = min(len(en_lines), len(sa_lines))
if MAX_N is not None:
    n = min(n, MAX_N)

refs = en_lines[:n]
srcs = sa_lines[:n]

print(f"Loaded {n} sentence pairs")


cpath = cache_path(FOLDER_NAME, STEM_NAME)
cache = load_cache(cpath)

hyps = []

# The cache is keyed by sentence index, so only indices missing from it
# trigger an API call; each new result is appended to the cache right away.
for i in tqdm(range(n), desc=f"{FOLDER_NAME}/{STEM_NAME} n={n}"):
    if i in cache:
        hyp = cache[i]
    else:
        hyp = gpt_translate_one(srcs[i])
        append_cache(cpath, i, hyp)
        # Short pause after each API call.
        time.sleep(0.25)
    hyps.append(hyp)


bleu, chrf2 = eval_metrics(hyps, refs)

print("\nBLEU :", round(bleu, 2))
print("chrF2:", round(chrf2, 2))

Loaded 100 sentence pairs


gitasopanam/testset_subset_42_gpt-4o-mini n=100: 100%|██████████| 100/100 [01:27<00:00,  1.14it/s]


BLEU : 15.75
chrF2: 42.81





# Mkb

In [None]:
# Evaluate GPT translations on the Mkb test subset: load the parallel
# files, translate each Sanskrit line (reusing cached results), then score.
DOMAIN = "mkb"
SUBSET = "testset_subset_42"

DOMAIN_DIR = DATA_ROOT / DOMAIN / SUBSET

EN_PATH = DOMAIN_DIR / "testset.en"
SA_PATH = DOMAIN_DIR / "testset.sa"

# Stop early, showing the offending path, if either side is missing.
assert EN_PATH.exists(), EN_PATH
assert SA_PATH.exists(), SA_PATH

MAX_N = None  # None = use every pair; an int caps the number of pairs
# Derive the cache naming from DOMAIN / SUBSET so the literals cannot drift.
FOLDER_NAME = DOMAIN
MODEL_TAG = "gpt-4o-mini"
# FOLDER_NAME and STEM_NAME together determine the cache file name.
STEM_NAME = f"{SUBSET}_{MODEL_TAG}"


en_lines = read_lines(EN_PATH)
sa_lines = read_lines(SA_PATH)

# Pair the two sides by position and truncate to the shorter one.
n = min(len(en_lines), len(sa_lines))
if MAX_N is not None:
    n = min(n, MAX_N)

refs = en_lines[:n]
srcs = sa_lines[:n]

print(f"Loaded {n} sentence pairs")


cpath = cache_path(FOLDER_NAME, STEM_NAME)
cache = load_cache(cpath)

hyps = []

# The cache is keyed by sentence index, so only indices missing from it
# trigger an API call; each new result is appended to the cache right away.
for i in tqdm(range(n), desc=f"{FOLDER_NAME}/{STEM_NAME} n={n}"):
    if i in cache:
        hyp = cache[i]
    else:
        hyp = gpt_translate_one(srcs[i])
        append_cache(cpath, i, hyp)
        # Short pause after each API call.
        time.sleep(0.25)
    hyps.append(hyp)


bleu, chrf2 = eval_metrics(hyps, refs)

print("\nBLEU :", round(bleu, 2))
print("chrF2:", round(chrf2, 2))

Loaded 100 sentence pairs


mkb/testset_subset_42_gpt-4o-mini n=100: 100%|██████████| 100/100 [01:58<00:00,  1.18s/it]


BLEU : 12.63
chrF2: 37.86





# Spoken-tutorials

In [None]:
# Evaluate GPT translations on the Spoken-tutorials test subset: load the
# parallel files, translate each Sanskrit line (reusing cached results), then
# score.
DOMAIN = "spoken-tutorials"
SUBSET = "testset_subset_42"

DOMAIN_DIR = DATA_ROOT / DOMAIN / SUBSET

EN_PATH = DOMAIN_DIR / "testset.en"
SA_PATH = DOMAIN_DIR / "testset.sa"

# Stop early, showing the offending path, if either side is missing.
assert EN_PATH.exists(), EN_PATH
assert SA_PATH.exists(), SA_PATH

MAX_N = None  # None = use every pair; an int caps the number of pairs
# Derive the cache naming from DOMAIN / SUBSET so the literals cannot drift.
FOLDER_NAME = DOMAIN
MODEL_TAG = "gpt-4o-mini"
# FOLDER_NAME and STEM_NAME together determine the cache file name.
STEM_NAME = f"{SUBSET}_{MODEL_TAG}"


en_lines = read_lines(EN_PATH)
sa_lines = read_lines(SA_PATH)

# Pair the two sides by position and truncate to the shorter one.
n = min(len(en_lines), len(sa_lines))
if MAX_N is not None:
    n = min(n, MAX_N)

refs = en_lines[:n]
srcs = sa_lines[:n]

print(f"Loaded {n} sentence pairs")


cpath = cache_path(FOLDER_NAME, STEM_NAME)
cache = load_cache(cpath)

hyps = []

# The cache is keyed by sentence index, so only indices missing from it
# trigger an API call; each new result is appended to the cache right away.
for i in tqdm(range(n), desc=f"{FOLDER_NAME}/{STEM_NAME} n={n}"):
    if i in cache:
        hyp = cache[i]
    else:
        hyp = gpt_translate_one(srcs[i])
        append_cache(cpath, i, hyp)
        # Short pause after each API call.
        time.sleep(0.25)
    hyps.append(hyp)


bleu, chrf2 = eval_metrics(hyps, refs)

print("\nBLEU :", round(bleu, 2))
print("chrF2:", round(chrf2, 2))

Loaded 100 sentence pairs


spoken-tutorials/testset_subset_42_gpt-4o-mini n=100: 100%|██████████| 100/100 [01:42<00:00,  1.02s/it]


BLEU : 15.81
chrF2: 42.42



