# LLM Translation Baseline + Evaluation

This notebook translates the UI strings in `data/en.json` with several LLMs and compares their baseline translation quality, speed, and estimated cost:
- **Gemini 2.5 Flash**
- **Claude 3.5 Sonnet**  
- **GPT-4o Mini**

**Target Languages:** French (`fr`), Japanese (`ja`), Italian (`it`)

---

## 1. Environment Setup & Configuration

In [59]:
import os
import re
import json
import time
import random
import warnings
from pathlib import Path
from typing import Any, List, Tuple, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by any imported
# library are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")
# Display all DataFrame columns and let pandas pick the display width.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# Load variables from a local .env file; API keys and overrides are read
# later through os.getenv.
load_dotenv()
print("LLM Translation Baseline + Evaluation ready!")
print("=" * 72)

LLM Translation Baseline + Evaluation ready!


## 2. Configs & Constants

In [60]:
# Language codes to translate into, in processing order, plus display names.
TARGET_LANGUAGES = ["fr", "ja", "it"]
LANGUAGE_NAMES = {"fr": "French", "ja": "Japanese", "it": "Italian"}

# Concurrency (safe default for notebook clients)
# MAX_WORKERS is the fallback pool size for providers without their own limit;
# MAX_RETRIES is the number of attempts made per string before giving up.
MAX_WORKERS = int(os.getenv("BASELINE_MAX_WORKERS", "4"))
MAX_RETRIES = int(os.getenv("BASELINE_MAX_RETRIES", "3"))

# Pricing (override via env if needed)
# Values are USD per 1M tokens, keyed by the same model keys used in BASELINE_MODELS.
# NOTE(review): the defaults are hard-coded list prices — verify them against the
# providers' current pricing pages (TODO confirm the Gemini 2.5 Flash figures).
# NOTE: the PRICE_O4MINI_* variables configure the "gpt-4o-mini" entry.
PRICING_PER_1M_TOKENS = {
    "gemini-2.5-flash": {"input": float(os.getenv("PRICE_GEMINI_IN", 0.075)), "output": float(os.getenv("PRICE_GEMINI_OUT", 0.30))},
    "gpt-4o-mini": {"input": float(os.getenv("PRICE_O4MINI_IN", 0.15)), "output": float(os.getenv("PRICE_O4MINI_OUT", 0.60))},
    "claude-3-5-sonnet": {"input": float(os.getenv("PRICE_SONNET_IN", 3.00)), "output": float(os.getenv("PRICE_SONNET_OUT", 15.00))},
}

# Output roots (relative to the working directory): translations are written to
# OUT_BASE/<model>/<lang>.json and metrics to EVAL_BASE/<model>/metrics_<lang>.json.
OUT_BASE = Path("translations/baseline")
EVAL_BASE = Path("eval/baseline")
OUT_BASE.mkdir(parents=True, exist_ok=True)
EVAL_BASE.mkdir(parents=True, exist_ok=True)

# --- Provider-aware concurrency & pricing helpers ---

# Thread-pool size per provider; every entry can be overridden through its env var.
PROVIDER_MAX_WORKERS = dict(
    openai=int(os.getenv("OPENAI_MAX_WORKERS", "4")),
    anthropic=int(os.getenv("ANTHROPIC_MAX_WORKERS", "1")),  # keep Claude concurrency low
    google=int(os.getenv("GOOGLE_MAX_WORKERS", "2")),
)


def get_max_workers(provider: str, default: int = 2) -> int:
    """Return the worker count configured for `provider`.

    Args:
        provider: Provider key looked up in PROVIDER_MAX_WORKERS.
        default: Value used when the provider has no entry.

    Returns:
        The configured (or default) worker count, raised to 1 if it is lower.
    """
    configured = PROVIDER_MAX_WORKERS.get(provider, default)
    return configured if configured >= 1 else 1

## 3. Data Loading & Preprocessing

In [61]:
def flatten_json_strings(obj: Any, prefix: str = "") -> List[Tuple[str, str]]:
    """Collect every non-blank string in a JSON-like structure with its path.

    Dict keys are joined with "." and list positions are written as "[i]",
    e.g. ``nav.items[0].label``. Non-string leaves and whitespace-only strings
    are skipped.

    Args:
        obj: Parsed JSON value (dict, list, str, or any other leaf).
        prefix: Path of `obj` within the enclosing document ("" for the root).

    Returns:
        (path, text) pairs in document order.
    """

    def _walk(node: Any, path: str):
        # Depth-first traversal that yields leaves in the order they appear.
        if isinstance(node, dict):
            for key, child in node.items():
                yield from _walk(child, f"{path}.{key}" if path else key)
        elif isinstance(node, list):
            for idx, child in enumerate(node):
                yield from _walk(child, f"{path}[{idx}]")
        elif isinstance(node, str) and node.strip():
            yield (path, node)

    return list(_walk(obj, prefix))

# English source strings; the notebook cannot run without this file.
SRC_FILE = Path("data/en.json")
if not SRC_FILE.exists():
    raise FileNotFoundError("Missing required file: data/en.json")

with open(SRC_FILE, "r", encoding="utf-8") as f:
    en_json = json.load(f)

# Flatten the nested JSON into (path, text) pairs and preview the first three.
en_segments: List[Tuple[str, str]] = flatten_json_strings(en_json)
print(f"Total string segments: {len(en_segments)}")
if en_segments:
    for i, (p, t) in enumerate(en_segments[:3], 1):
        print(f"  {i}. {p}: '{t}'")

# Build mappings for dedupe/expansion
# SRC_TO_PATHS maps each distinct source text to every path where it occurs, so
# a repeated string is translated once and the result is copied to all its paths.
SRC_TO_PATHS: Dict[str, List[str]] = {}
for path, src in en_segments:
    SRC_TO_PATHS.setdefault(src, []).append(path)
# Distinct source texts in first-seen order (dict insertion order).
UNIQUE_SOURCES = list(SRC_TO_PATHS.keys())
print(f"Unique source strings: {len(UNIQUE_SOURCES)} (dedup from {len(en_segments)})")

Total string segments: 76
  1. nav.design: 'Design'
  2. nav.about: 'About Us'
  3. nav.faq: 'FAQ'
Unique source strings: 73 (dedup from 76)


## 4. LLM Client Initialization & Model Configuration

In [62]:
# Provider API keys come from the environment (populated from .env by load_dotenv).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

# At least one provider must be configured; providers without a key are skipped.
if not (GOOGLE_API_KEY or OPENAI_API_KEY or ANTHROPIC_API_KEY):
    raise RuntimeError("No API keys found! Add at least one provider key to .env")

print("Initializing clients…")

# Google: prefer new google.genai, fallback to google.generativeai
# GOOGLE_CLIENT ends up as either a genai.Client instance or the legacy module
# itself; translate_one tells them apart by checking for a `.models` attribute.
GOOGLE_CLIENT = None
GOOGLE_MODELS = {}
if GOOGLE_API_KEY:
    try:
        import google.genai as genai
        GOOGLE_CLIENT = genai.Client(api_key=GOOGLE_API_KEY)
        GOOGLE_MODELS["gemini-2.5-flash"] = {"model": os.getenv("GOOGLE_BASELINE_MODEL", "gemini-2.5-flash")}
        print("  ✅ Google (genai) ready")
    except Exception as e:
        try:
            import google.generativeai as genai_legacy
            genai_legacy.configure(api_key=GOOGLE_API_KEY)
            GOOGLE_CLIENT = genai_legacy
            GOOGLE_MODELS["gemini-2.5-flash"] = {"model": os.getenv("GOOGLE_BASELINE_MODEL", "gemini-2.5-flash")}
            print("  ⚠️ Using legacy google.generativeai client")
        except Exception as e2:
            # Both client libraries failed: report both errors and disable the provider.
            print(f"  ❌ Google init failed: {e} | {e2}")
            GOOGLE_CLIENT = None

# OpenAI
OPENAI_CLIENT = None
OPENAI_MODELS = {}
if OPENAI_API_KEY:
    try:
        from openai import OpenAI
        OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)
        OPENAI_MODELS["gpt-4o-mini"] = {"model": os.getenv("OPENAI_BASELINE_MODEL", "gpt-4o-mini")}
        print("  ✅ OpenAI ready")
    except Exception as e:
        # Best-effort: a failed init disables this provider instead of stopping the notebook.
        print(f"  ❌ OpenAI init failed: {e}")
        OPENAI_CLIENT = None

# Anthropic
ANTHROPIC_CLIENT = None
ANTHROPIC_MODELS = {}
if ANTHROPIC_API_KEY:
    try:
        from anthropic import Anthropic
        ANTHROPIC_CLIENT = Anthropic(api_key=ANTHROPIC_API_KEY)
        ANTHROPIC_MODELS["claude-3-5-sonnet"] = {"model": os.getenv("ANTHROPIC_BASELINE_MODEL", "claude-3-5-sonnet-latest")}
        print("  ✅ Anthropic ready")
    except Exception as e:
        # Best-effort: a failed init disables this provider instead of stopping the notebook.
        print(f"  ❌ Anthropic init failed: {e}")
        ANTHROPIC_CLIENT = None

# Compose BASELINE_MODELS registry actually available
# Maps model key -> (provider name, {"model": <provider model id>}); only providers
# whose client was created above are registered.
BASELINE_MODELS: Dict[str, Tuple[str, Dict[str, str]]] = {}
if GOOGLE_CLIENT:
    BASELINE_MODELS.update({"gemini-2.5-flash": ("google", GOOGLE_MODELS["gemini-2.5-flash"])})
if OPENAI_CLIENT:
    BASELINE_MODELS.update({"gpt-4o-mini": ("openai", OPENAI_MODELS["gpt-4o-mini"])})
if ANTHROPIC_CLIENT:
    BASELINE_MODELS.update({"claude-3-5-sonnet": ("anthropic", ANTHROPIC_MODELS["claude-3-5-sonnet"])})

print("Baseline models configured:")
for m, (prov, cfg) in BASELINE_MODELS.items():
    print(f"  • {m} ({prov}): {cfg['model']}")

Initializing clients…
  ✅ Google (genai) ready
  ✅ OpenAI ready
  ✅ Anthropic ready
Baseline models configured:
  • gemini-2.5-flash (google): gemini-2.5-flash
  • gpt-4o-mini (openai): gpt-4o-mini
  • claude-3-5-sonnet (anthropic): claude-3-5-sonnet-latest


## 5. Prompting Helpers

In [63]:
# Few-shot (source, translation) pairs per target language. The same five English
# sources are used for every language and include HTML and the brand name "NaiLit".
FEWSHOT_EXAMPLES = {
    "fr": [
        ("Welcome to <strong>NaiLit</strong>", "Bienvenue sur <strong>NaiLit</strong>"),
        ("Contact Us", "Nous contacter"),
        ("<a href='/home'>Home</a>", "<a href='/home'>Accueil</a>"),
        ("Sign up for free", "Inscrivez-vous gratuitement"),
        ("Privacy Policy", "Politique de confidentialité"),
    ],
    "ja": [
        ("Welcome to <strong>NaiLit</strong>", "<strong>NaiLit</strong>へようこそ"),
        ("Contact Us", "お問い合わせ"),
        ("<a href='/home'>Home</a>", "<a href='/home'>ホーム</a>"),
        ("Sign up for free", "無料でサインアップ"),
        ("Privacy Policy", "プライバシーポリシー"),
    ],
    "it": [
        ("Welcome to <strong>NaiLit</strong>", "Benvenuto su <strong>NaiLit</strong>"),
        ("Contact Us", "Contattaci"),
        ("<a href='/home'>Home</a>", "<a href='/home'>Home</a>"),
        ("Sign up for free", "Registrati gratuitamente"),
        ("Privacy Policy", "Informativa sulla privacy"),
    ],
}

# Display names used inside the prompt text (same content as LANGUAGE_NAMES in the config cell).
LANGUAGE_NAMES_HUMAN = {"fr": "French", "ja": "Japanese", "it": "Italian"}

# One prompt prefix per language: instructions + few-shot examples, ending in
# "Source: " so that translate_one only has to append the text to translate.
# NOTE(review): the examples are rendered as "Translation: ..." while the
# instructions ask for <translation>...</translation> — confirm this mismatch is intended.
PROMPT_HEADERS: Dict[str, str] = {}
for lang, pairs in FEWSHOT_EXAMPLES.items():
    lang_name = LANGUAGE_NAMES_HUMAN.get(lang, lang)
    examples = "".join([f"\nExample {i}:\nSource: {src}\nTranslation: {tgt}\n" for i, (src, tgt) in enumerate(pairs, 1)])
    PROMPT_HEADERS[lang] = (
        f"You are a professional translator. Translate the following text into {lang_name}.\n\n"
        "IMPORTANT REQUIREMENTS:\n"
        "- Preserve all HTML tags exactly (do not add/remove/reorder tags)\n"
        "- Keep brand names and DNT terms as-is (e.g., NaiLit)\n"
        "- Maintain the original tone and style\n"
        "- Return ONLY the translation between <translation> and </translation> — no notes\n\n"
        f"Here are some examples:{examples}\n\n"
        "Now translate this:\nSource: "
    )

# Matches opening, closing and self-closing tags: <b>, </b>, <a href='…'>, <br/>, <br />.
# The optional "/" before ">" is required for self-closing tags written without a
# space (e.g. <br/>), which would otherwise be invisible to the check below.
HTML_TAG = re.compile(r"</?\w+(?:\s+[^>]*?)?/?>", re.IGNORECASE)


def tags_preserved(src: str, tgt: str) -> bool:
    """Return True when `tgt` has exactly the same HTML tags as `src`, in the same order.

    Tags are compared as literal strings, attributes included. ``None`` is treated
    as an empty string.

    Args:
        src: Source text.
        tgt: Translated text.

    Returns:
        True if the ordered tag lists are identical, False otherwise.
    """
    return HTML_TAG.findall(src or "") == HTML_TAG.findall(tgt or "")


In [64]:
def _backoff_sleep(attempt: int, base: float = 0.5, jitter: float = 0.3):
    time.sleep(base * (2 ** attempt) + random.random() * jitter)


def _extract_translation(raw_text: str) -> str:
    m = re.search(r"<translation>([\s\S]*?)</translation>", raw_text or "")
    return (m.group(1).strip() if m else (raw_text or "")).strip()


def translate_one(provider: str, cfg: Dict[str, str], src_text: str, target_lang: str) -> str:
    """Translate one string with the given provider, retrying on failure.

    Args:
        provider: "google", "openai" or "anthropic".
        cfg: Model configuration; ``cfg["model"]`` is the provider model id.
        src_text: Source text to translate.
        target_lang: Language code used to look up the prompt header.

    Returns:
        The extracted translation, or the sentinel "[TRANSLATION_ERROR]" when
        the provider is unknown or every attempt failed. API errors are never
        propagated to the caller.

    Raises:
        KeyError: If `target_lang` has no entry in PROMPT_HEADERS.
    """
    header = PROMPT_HEADERS[target_lang]
    prompt = f"{header}{src_text}\n\n<translation>"

    # An unknown provider is a configuration error, not a transient failure:
    # report it once instead of retrying with backoff.
    if provider not in ("google", "openai", "anthropic"):
        print(f"  ⚠️ Final failure for provider={provider}: Unknown provider: {provider}")
        return "[TRANSLATION_ERROR]"

    # Always make at least one attempt, even if MAX_RETRIES is configured as 0 or
    # negative; otherwise the loop would be skipped and None returned.
    attempts = max(1, MAX_RETRIES)

    for attempt in range(attempts):
        try:
            if provider == "google":
                # New google.genai client
                if hasattr(GOOGLE_CLIENT, "models"):
                    resp = GOOGLE_CLIENT.models.generate_content(model=cfg["model"], contents=prompt)
                    raw = getattr(resp, "text", "") or ""
                # Legacy google.generativeai
                else:
                    model = GOOGLE_CLIENT.GenerativeModel(cfg["model"])  # type: ignore[attr-defined]
                    resp = model.generate_content(prompt)
                    raw = "".join([c.text for c in getattr(resp, "candidates", []) if hasattr(c, "text")]) or getattr(resp, "text", "") or ""

            elif provider == "openai":
                resp = OPENAI_CLIENT.chat.completions.create(
                    model=cfg["model"],
                    messages=[
                        {"role": "system", "content": "You are a professional translator. Follow the examples and output only the translation enclosed in <translation> tags."},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.2,
                    max_tokens=2048,
                )
                raw = resp.choices[0].message.content or ""

            else:  # provider == "anthropic"
                resp = ANTHROPIC_CLIENT.messages.create(
                    model=cfg["model"],
                    max_tokens=2048,
                    temperature=0.2,
                    messages=[{"role": "user", "content": prompt}],
                )
                # Concatenate the text of all content blocks; blocks without text contribute "".
                raw = "".join(getattr(b, "text", "") for b in resp.content).strip()

            return _extract_translation(raw)

        except Exception as e:
            # Retry on rate limit / transient failures
            if attempt == attempts - 1:
                print(f"  ⚠️ Final failure for provider={provider}: {e}")
                return "[TRANSLATION_ERROR]"
            _backoff_sleep(attempt)

    # Not reachable (the loop runs at least once and always returns); kept so the
    # declared `str` return type holds on every path.
    return "[TRANSLATION_ERROR]"

In [65]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import os, json, time
from pathlib import Path
from typing import Dict, List, Tuple, Any
from tqdm import tqdm

# Edited to fix the Claude overload issue
PROVIDER_MAX_WORKERS = {
    "openai":    int(os.getenv("OPENAI_MAX_WORKERS", "4")),
    "anthropic": int(os.getenv("ANTHROPIC_MAX_WORKERS", "1")),  # Claude: keep low
    "google":    int(os.getenv("GOOGLE_MAX_WORKERS", "2")),
}
def _get_max_workers(provider: str, default_workers: int) -> int:
    return max(1, PROVIDER_MAX_WORKERS.get(provider, default_workers))

def run_translation_batch(model_key: str, provider: str, cfg: Dict[str, str], target_lang: str,
                          unique_sources: List[str], src_to_paths: Dict[str, List[str]]
                          ) -> Tuple[List[Dict], float]:
    """Translate all unique source strings for one model/language and save the result.

    Translations already present in ``OUT_BASE/<model_key>/<target_lang>.json``
    are reused. Rows that hold the "[TRANSLATION_ERROR]" sentinel are not
    reused, so earlier failures are retried on the next run.

    Args:
        model_key: Registry key of the model; also the output sub-directory name.
        provider: Provider name passed to `translate_one`.
        cfg: Model configuration passed to `translate_one`.
        target_lang: Target language code; also the output file stem.
        unique_sources: Distinct source strings to translate.
        src_to_paths: Maps each source string to every path where it occurs.

    Returns:
        (results, duration): one record per path (path, source, translation,
        model, target_lang, timestamp) and the wall-clock seconds spent loading
        prior output and translating. Reused translations add no translation time.
    """
    start = time.time()

    # --- Resume support
    out_dir = OUT_BASE / model_key
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"{target_lang}.json"

    existing_map: Dict[str, str] = {}
    if out_file.exists():
        try:
            # Context manager so the handle is closed even if parsing fails.
            with open(out_file, "r", encoding="utf-8") as prev_file:
                prev = json.load(prev_file)
            by_src: Dict[str, str] = {}
            for row in prev:
                if "source" not in row or "translation" not in row:
                    continue
                prior = row["translation"]
                # Failed (or malformed) rows are not "done": leave them in the to-do set.
                if not isinstance(prior, str) or "[TRANSLATION_ERROR]" in prior:
                    continue
                # First usable translation per unique source wins.
                by_src.setdefault(row["source"], prior)
            existing_map = by_src
            print(f"Resuming: found {len(existing_map)} prior unique translations")
        except Exception as e:
            # Unreadable previous output only disables resume; everything is re-translated.
            print(f"  warn: failed to parse previous output ({out_file}): {e}")

    # --- Filter to-do set
    todo = [s for s in unique_sources if s not in existing_map]
    results_map: Dict[str, str] = dict(existing_map)

    # Provider-aware workers + gentle submit delay for Anthropic
    max_workers = _get_max_workers(provider, default_workers=MAX_WORKERS)
    submit_delay = 0.05 if provider == "anthropic" else 0.0

    print(f"Translating {len(todo)} unique strings with {model_key} → {target_lang}… (workers={max_workers})")

    # --- Execution
    if len(todo) > 0:
        if max_workers == 1:
            # Sequential path (stable for Claude)
            for src in tqdm(todo, total=len(todo), desc=f"{model_key} → {target_lang} (seq)"):
                try:
                    results_map[src] = translate_one(provider, cfg, src, target_lang)
                except Exception as e:
                    print(f"    warn: failed src≈{src[:48]!r}… | {e}")
                    results_map[src] = "[TRANSLATION_ERROR]"
                if submit_delay:
                    time.sleep(submit_delay)
        else:
            # Parallel path
            with ThreadPoolExecutor(max_workers=max_workers) as ex:
                futures = {}
                for src in todo:
                    fut = ex.submit(translate_one, provider, cfg, src, target_lang)
                    futures[fut] = src
                    if submit_delay:
                        time.sleep(submit_delay)  # soften burst for Anthropic
                for fut in tqdm(as_completed(futures), total=len(futures), desc=f"{model_key} → {target_lang}"):
                    src = futures[fut]
                    try:
                        results_map[src] = fut.result()
                    except Exception as e:
                        print(f"    warn: failed src≈{src[:48]!r}… | {e}")
                        results_map[src] = "[TRANSLATION_ERROR]"

    duration = time.time() - start

    # --- Expand back to all segments
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    results: List[Dict[str, Any]] = []
    for src, paths in src_to_paths.items():
        # Sources that were never attempted (absent from unique_sources) are marked as errors.
        tr = results_map.get(src, "[TRANSLATION_ERROR]")
        for path in paths:
            results.append({
                "path": path,
                "source": src,
                "translation": tr,
                "model": model_key,
                "target_lang": target_lang,
                "timestamp": now,
            })

    # --- Save
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"✅ {model_key} → {target_lang}: {len(results)} segments in {duration:.1f}s | saved: {out_file}")
    return results, duration

## 6. Metrics Helpers

In [66]:
def estimate_tokens(text: str) -> int:
    """Roughly estimate the token count of `text` as one token per 4 characters.

    Returns:
        ``len(text) // 4``, but never less than 1 (even for an empty string).
    """
    approx = len(text) // 4
    return approx if approx >= 1 else 1


def estimate_cost(results: List[Dict], model_key: str) -> Optional[float]:
    """Estimate the USD cost of a batch from PRICING_PER_1M_TOKENS.

    Only the ``source`` and ``translation`` text of each record is counted; the
    prompt header sent with every request is not part of the estimate.

    Args:
        results: Records with "source" and "translation" string fields.
        model_key: Key into PRICING_PER_1M_TOKENS.

    Returns:
        The estimated cost rounded to 4 decimals, or None when no pricing is
        configured for `model_key`.
    """
    try:
        pricing = PRICING_PER_1M_TOKENS[model_key]
    except KeyError:
        return None

    input_tokens = 0
    output_tokens = 0
    for record in results:
        input_tokens += estimate_tokens(record["source"])
        output_tokens += estimate_tokens(record["translation"])

    input_cost = (input_tokens / 1000000) * pricing["input"]
    output_cost = (output_tokens / 1000000) * pricing["output"]
    return round(input_cost + output_cost, 4)


def calculate_length_ratio(src: str, tgt: str) -> float:
    """Return the character-length ratio of `tgt` to `src`.

    Empty or ``None`` inputs count as length 0, and the source length is
    clamped to at least 1 so the division is always defined.
    """
    source_chars = len(src) if src else 0
    target_chars = len(tgt) if tgt else 0
    divisor = source_chars if source_chars > 1 else 1
    return target_chars / divisor


def evaluate_translation_results(results: List[Dict], target_lang: str, model_key: str, duration: float) -> Dict:
    """Compute summary metrics for one model/language batch.

    Args:
        results: Records with "source" and "translation" string fields.
        target_lang: Target language code, stored in the metrics.
        model_key: Model key, stored in the metrics and used for cost lookup.
        duration: Wall-clock seconds the batch took.

    Returns:
        A dict with counts, error rate, tag-preservation rate, length-ratio
        mean/std, timing figures (duration divided by the number of records),
        estimated cost and a timestamp. Rates and averages are 0 for an empty batch.
    """
    n = len(results)
    text_pairs = [(row["source"], row["translation"]) for row in results]

    tag_flags = [tags_preserved(source, target) for source, target in text_pairs]
    length_ratios = [calculate_length_ratio(source, target) for source, target in text_pairs]
    error_count = len([target for _, target in text_pairs if "[TRANSLATION_ERROR]" in target])

    def _stat(reducer, values):
        # numpy reductions are only meaningful for a non-empty batch.
        return round(float(reducer(values)), 3) if n else 0

    if n:
        seconds_per_segment = duration / n
    else:
        seconds_per_segment = 0.0

    if duration > 0:
        throughput = round(n / (duration / 60), 1)
    else:
        throughput = 0

    return {
        "model": model_key,
        "target_language": target_lang,
        "n_segments": n,
        "error_count": error_count,
        "error_rate": round(error_count / n, 3) if n else 0,
        "tag_preservation_rate": _stat(np.mean, tag_flags),
        "length_ratio_avg": _stat(np.mean, length_ratios),
        "length_ratio_std": _stat(np.std, length_ratios),
        "total_duration_sec": round(duration, 2),
        "avg_latency_sec": round(seconds_per_segment, 3),
        "segments_per_minute": throughput,
        "estimated_cost_usd": estimate_cost(results, model_key),
        "few_shot_prompting": True,
        "evaluation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }


def save_metrics(metrics: Dict, model_key: str, target_lang: str):
    """Write `metrics` as JSON to ``EVAL_BASE/<model_key>/metrics_<target_lang>.json``.

    The model directory is created if needed and an existing file is overwritten.
    """
    model_dir = EVAL_BASE / model_key
    model_dir.mkdir(parents=True, exist_ok=True)
    destination = model_dir / f"metrics_{target_lang}.json"
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(metrics, handle, ensure_ascii=False, indent=2)
    print(f"Saved metrics: {destination}")


## 7. Main Translation + Evaluation Loop

In [67]:
print("\nStarting translation + evaluation…")
print("=" * 72)

# Accumulators across every model/language combination:
# all_results holds the per-path translation records, all_metrics one metrics dict per combo.
all_results: List[Dict] = []
all_metrics: List[Dict] = []

# Run every available model against every target language. A failure in one
# combination is reported and skipped so the remaining combinations still run.
for model_key, (provider, cfg) in BASELINE_MODELS.items():
    print(f"\n▶ Model: {model_key} ({provider})")
    for lang in TARGET_LANGUAGES:
        print(f"  → Language: {LANGUAGE_NAMES[lang]} ({lang})")
        try:
            results, duration = run_translation_batch(model_key, provider, cfg, lang, UNIQUE_SOURCES, SRC_TO_PATHS)
            metrics = evaluate_translation_results(results, lang, model_key, duration)
            save_metrics(metrics, model_key, lang)
            all_results.extend(results)
            all_metrics.append(metrics)
            # Cost is None when the model has no pricing entry; print "n/a" in that case.
            print(
                "    Key metrics — "
                f"Tag: {metrics['tag_preservation_rate']:.1%} | "
                f"Latency: {metrics['avg_latency_sec']:.2f}s | "
                f"Speed: {metrics['segments_per_minute']:.1f} seg/min | "
                f"Cost: {('$' + format(metrics['estimated_cost_usd'], '.4f')) if metrics['estimated_cost_usd'] is not None else 'n/a'}"
            )
        except Exception as e:
            print(f"  ❌ Failed {model_key} → {lang}: {e}")
            continue

print(f"\n✅ Pipeline done. Model-language combos processed: {len(all_metrics)}")


Starting translation + evaluation…

▶ Model: gemini-2.5-flash (google)
  → Language: French (fr)
Translating 73 unique strings with gemini-2.5-flash → fr… (workers=2)


gemini-2.5-flash → fr: 100%|███████████████████████████████████████████████████████████| 73/73 [01:53<00:00,  1.55s/it]


✅ gemini-2.5-flash → fr: 76 segments in 113.2s | saved: translations\baseline\gemini-2.5-flash\fr.json
Saved metrics: eval\baseline\gemini-2.5-flash\metrics_fr.json
    Key metrics — Tag: 71.1% | Latency: 1.49s | Speed: 40.3 seg/min | Cost: $0.0005
  → Language: Japanese (ja)
Translating 73 unique strings with gemini-2.5-flash → ja… (workers=2)


gemini-2.5-flash → ja: 100%|███████████████████████████████████████████████████████████| 73/73 [02:42<00:00,  2.22s/it]


✅ gemini-2.5-flash → ja: 76 segments in 162.1s | saved: translations\baseline\gemini-2.5-flash\ja.json
Saved metrics: eval\baseline\gemini-2.5-flash\metrics_ja.json
    Key metrics — Tag: 93.4% | Latency: 2.13s | Speed: 28.1 seg/min | Cost: $0.0003
  → Language: Italian (it)
Translating 73 unique strings with gemini-2.5-flash → it… (workers=2)


gemini-2.5-flash → it: 100%|███████████████████████████████████████████████████████████| 73/73 [01:40<00:00,  1.38s/it]


✅ gemini-2.5-flash → it: 76 segments in 100.6s | saved: translations\baseline\gemini-2.5-flash\it.json
Saved metrics: eval\baseline\gemini-2.5-flash\metrics_it.json
    Key metrics — Tag: 96.1% | Latency: 1.32s | Speed: 45.3 seg/min | Cost: $0.0004

▶ Model: gpt-4o-mini (openai)
  → Language: French (fr)
Translating 73 unique strings with gpt-4o-mini → fr… (workers=4)


gpt-4o-mini → fr: 100%|████████████████████████████████████████████████████████████████| 73/73 [00:17<00:00,  4.23it/s]


✅ gpt-4o-mini → fr: 76 segments in 17.3s | saved: translations\baseline\gpt-4o-mini\fr.json
Saved metrics: eval\baseline\gpt-4o-mini\metrics_fr.json
    Key metrics — Tag: 98.7% | Latency: 0.23s | Speed: 263.9 seg/min | Cost: $0.0009
  → Language: Japanese (ja)
Translating 73 unique strings with gpt-4o-mini → ja… (workers=4)


gpt-4o-mini → ja: 100%|████████████████████████████████████████████████████████████████| 73/73 [00:16<00:00,  4.42it/s]


✅ gpt-4o-mini → ja: 76 segments in 16.5s | saved: translations\baseline\gpt-4o-mini\ja.json
Saved metrics: eval\baseline\gpt-4o-mini\metrics_ja.json
    Key metrics — Tag: 97.4% | Latency: 0.22s | Speed: 275.8 seg/min | Cost: $0.0005
  → Language: Italian (it)
Translating 73 unique strings with gpt-4o-mini → it… (workers=4)


gpt-4o-mini → it: 100%|████████████████████████████████████████████████████████████████| 73/73 [00:17<00:00,  4.16it/s]


✅ gpt-4o-mini → it: 76 segments in 17.6s | saved: translations\baseline\gpt-4o-mini\it.json
Saved metrics: eval\baseline\gpt-4o-mini\metrics_it.json
    Key metrics — Tag: 98.7% | Latency: 0.23s | Speed: 259.6 seg/min | Cost: $0.0008

▶ Model: claude-3-5-sonnet (anthropic)
  → Language: French (fr)
Translating 73 unique strings with claude-3-5-sonnet → fr… (workers=1)


claude-3-5-sonnet → fr (seq): 100%|████████████████████████████████████████████████████| 73/73 [04:23<00:00,  3.61s/it]


✅ claude-3-5-sonnet → fr: 76 segments in 263.6s | saved: translations\baseline\claude-3-5-sonnet\fr.json
Saved metrics: eval\baseline\claude-3-5-sonnet\metrics_fr.json
    Key metrics — Tag: 68.4% | Latency: 3.47s | Speed: 17.3 seg/min | Cost: $0.0216
  → Language: Japanese (ja)
Translating 73 unique strings with claude-3-5-sonnet → ja… (workers=1)


claude-3-5-sonnet → ja (seq): 100%|████████████████████████████████████████████████████| 73/73 [03:01<00:00,  2.49s/it]


✅ claude-3-5-sonnet → ja: 76 segments in 182.0s | saved: translations\baseline\claude-3-5-sonnet\ja.json
Saved metrics: eval\baseline\claude-3-5-sonnet\metrics_ja.json
    Key metrics — Tag: 72.4% | Latency: 2.39s | Speed: 25.1 seg/min | Cost: $0.0132
  → Language: Italian (it)
Translating 73 unique strings with claude-3-5-sonnet → it… (workers=1)


claude-3-5-sonnet → it (seq): 100%|████████████████████████████████████████████████████| 73/73 [03:08<00:00,  2.58s/it]

✅ claude-3-5-sonnet → it: 76 segments in 188.3s | saved: translations\baseline\claude-3-5-sonnet\it.json
Saved metrics: eval\baseline\claude-3-5-sonnet\metrics_it.json
    Key metrics — Tag: 75.0% | Latency: 2.48s | Speed: 24.2 seg/min | Cost: $0.0204

✅ Pipeline done. Model-language combos processed: 9





## 8. Comparison Summary

In [68]:
# Print a per-language comparison of all models and export the full metrics table.
if all_metrics:
    # One row per model/language combination, columns = metric keys.
    comparison_df = pd.DataFrame(all_metrics)
    print("\nTRANSLATION QUALITY COMPARISON")
    print("=" * 72)
    for lang in TARGET_LANGUAGES:
        sub = comparison_df[comparison_df["target_language"] == lang]
        if not sub.empty:
            print(f"\n{LANGUAGE_NAMES[lang]} ({lang.upper()})")
            print("-" * 40)
            for _, row in sub.iterrows():
                # A missing cost (model without pricing) becomes NaN in the frame; show "n/a".
                cost = (f"${row['estimated_cost_usd']:.4f}" if pd.notna(row['estimated_cost_usd']) else "n/a")
                print(
                    f"Model: {row['model']}\n"
                    f"  Tag preservation: {row['tag_preservation_rate']:.1%}\n"
                    f"  Avg latency: {row['avg_latency_sec']:.2f}s\n"
                    f"  Speed: {row['segments_per_minute']:.1f} seg/min\n"
                    f"  Estimated cost: {cost}\n"
                )
    # Persist every metric (not only the printed ones) for later analysis.
    comp_path = EVAL_BASE / "comparison_results.csv"
    comparison_df.to_csv(comp_path, index=False, encoding="utf-8")
    print(f"Comparison saved: {comp_path}")
else:
    print("⚠️ No metrics available for comparison")


TRANSLATION QUALITY COMPARISON

French (FR)
----------------------------------------
Model: gemini-2.5-flash
  Tag preservation: 71.1%
  Avg latency: 1.49s
  Speed: 40.3 seg/min
  Estimated cost: $0.0005

Model: gpt-4o-mini
  Tag preservation: 98.7%
  Avg latency: 0.23s
  Speed: 263.9 seg/min
  Estimated cost: $0.0009

Model: claude-3-5-sonnet
  Tag preservation: 68.4%
  Avg latency: 3.47s
  Speed: 17.3 seg/min
  Estimated cost: $0.0216


Japanese (JA)
----------------------------------------
Model: gemini-2.5-flash
  Tag preservation: 93.4%
  Avg latency: 2.13s
  Speed: 28.1 seg/min
  Estimated cost: $0.0003

Model: gpt-4o-mini
  Tag preservation: 97.4%
  Avg latency: 0.22s
  Speed: 275.8 seg/min
  Estimated cost: $0.0005

Model: claude-3-5-sonnet
  Tag preservation: 72.4%
  Avg latency: 2.39s
  Speed: 25.1 seg/min
  Estimated cost: $0.0132


Italian (IT)
----------------------------------------
Model: gemini-2.5-flash
  Tag preservation: 96.1%
  Avg latency: 1.32s
  Speed: 45.3 seg/

## 9. Highlights

In [69]:
# Report the best model/language combination for each headline metric.
if all_metrics:
    print("\nPERFORMANCE SUMMARY")
    print("=" * 72)
    # (metric key, display label, higher_is_better, number format)
    metrics_to_analyze = [
        ("tag_preservation_rate", "Tag Preservation", True, ".3f"),
        ("segments_per_minute", "Speed (Higher=Better)", True, ".3f"),
        ("avg_latency_sec", "Latency (Lower=Better)", False, ".3f"),
        # Costs are stored with 4 decimals and are often below $0.001, so three
        # decimals would print an uninformative 0.000.
        ("estimated_cost_usd", "Cost (Lower=Better)", False, ".4f"),
    ]

    for key, name, higher_better, fmt in metrics_to_analyze:
        # Skip combinations where the metric is missing (e.g. cost without pricing).
        data = [(m["model"], m["target_language"], m.get(key)) for m in all_metrics if m.get(key) is not None]
        if not data:
            continue
        # On ties, max/min keep the first combination in run order.
        best = (max if higher_better else min)(data, key=lambda x: x[2])
        print(f"Best {name}: {best[0]} ({best[1]}) — {best[2]:{fmt}}")


PERFORMANCE SUMMARY
Best Tag Preservation: gpt-4o-mini (fr) — 0.987
Best Speed (Higher=Better): gpt-4o-mini (ja) — 275.800
Best Latency (Lower=Better): gpt-4o-mini (ja) — 0.218
Best Cost (Lower=Better): gemini-2.5-flash (ja) — 0.000
