# LLM Translation Baseline + Evaluation

This notebook evaluates baseline translation quality across multiple LLM models:
- **Gemini 2.5 Flash**
- **Claude 3.5 Sonnet**  
- **GPT-4o Mini**

**Target Languages:** French, Italian, Japanese

---

## 1. Environment Setup & Configuration

In [1]:
import os
import re
import json
import time
import random
import warnings
from pathlib import Path
from typing import Any, List, Tuple, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

load_dotenv()
print("LLM Translation Baseline + Evaluation ready!")
print("=" * 72)

LLM Translation Baseline + Evaluation ready!


## 2. Configs & Constants

In [2]:
TARGET_LANGUAGES = ["fr", "ja", "it"]
LANGUAGE_NAMES = {"fr": "French", "ja": "Japanese", "it": "Italian"}

# Concurrency
MAX_WORKERS = int(os.getenv("BASELINE_MAX_WORKERS", "4"))
MAX_RETRIES = int(os.getenv("BASELINE_MAX_RETRIES", "3"))

# Pricing
PRICING_PER_1M_TOKENS = {
    "gemini-2.5-flash": {"input": float(os.getenv("PRICE_GEMINI_IN", 0.075)), "output": float(os.getenv("PRICE_GEMINI_OUT", 0.30))},
    "gpt-4o-mini": {"input": float(os.getenv("PRICE_O4MINI_IN", 0.15)), "output": float(os.getenv("PRICE_O4MINI_OUT", 0.60))},
    "claude-3-5-sonnet": {"input": float(os.getenv("PRICE_SONNET_IN", 3.00)), "output": float(os.getenv("PRICE_SONNET_OUT", 15.00))},
}

OUT_BASE = Path("translations/baseline")
EVAL_BASE = Path("eval/baseline")
OUT_BASE.mkdir(parents=True, exist_ok=True)
EVAL_BASE.mkdir(parents=True, exist_ok=True)

# Per-provider safe concurrency
PROVIDER_MAX_WORKERS = {
    "openai":    int(os.getenv("OPENAI_MAX_WORKERS", "4")),
    "anthropic": int(os.getenv("ANTHROPIC_MAX_WORKERS", "1")),  # Claude: keep low because kept running into overload issues
    "google":    int(os.getenv("GOOGLE_MAX_WORKERS", "2")),
}

def get_max_workers(provider: str, default: int = 2) -> int:
    return max(1, PROVIDER_MAX_WORKERS.get(provider, default))

## 3. Data Loading & Preprocessing

In [3]:
def flatten_json_strings(obj: Any, prefix: str = "") -> List[Tuple[str, str]]:
    out: List[Tuple[str, str]] = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            new_prefix = f"{prefix}.{k}" if prefix else k
            out.extend(flatten_json_strings(v, new_prefix))
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            new_prefix = f"{prefix}[{i}]"
            out.extend(flatten_json_strings(v, new_prefix))
    elif isinstance(obj, str) and obj.strip():
        out.append((prefix, obj))
    return out

SRC_FILE = Path("data/en.json")
if not SRC_FILE.exists():
    raise FileNotFoundError("Missing required file: data/en.json")

with open(SRC_FILE, "r", encoding="utf-8") as f:
    en_json = json.load(f)

en_segments: List[Tuple[str, str]] = flatten_json_strings(en_json)
print(f"Total string segments: {len(en_segments)}")
if en_segments:
    for i, (p, t) in enumerate(en_segments[:3], 1):
        print(f"  {i}. {p}: '{t}'")

# Build mappings for dedupe/expansion
SRC_TO_PATHS: Dict[str, List[str]] = {}
for path, src in en_segments:
    SRC_TO_PATHS.setdefault(src, []).append(path)
UNIQUE_SOURCES = list(SRC_TO_PATHS.keys())
print(f"Unique source strings: {len(UNIQUE_SOURCES)} (dedup from {len(en_segments)})")

Total string segments: 76
  1. nav.design: 'Design'
  2. nav.about: 'About Us'
  3. nav.faq: 'FAQ'
Unique source strings: 73 (dedup from 76)


## 4. LLM Client Initialization & Model Configuration

In [4]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

if not (GOOGLE_API_KEY or OPENAI_API_KEY or ANTHROPIC_API_KEY):
    raise RuntimeError("No API keys found! Add at least one provider key to .env")

print("Initializing clients…")

# Google: prefer new google.genai, fallback to google.generativeai
GOOGLE_CLIENT = None
GOOGLE_MODELS = {}
if GOOGLE_API_KEY:
    try:
        import google.genai as genai
        GOOGLE_CLIENT = genai.Client(api_key=GOOGLE_API_KEY)
        GOOGLE_MODELS["gemini-2.5-flash"] = {"model": os.getenv("GOOGLE_BASELINE_MODEL", "gemini-2.5-flash")}
        print("Google (genai) ready")
    except Exception as e:
        try:
            import google.generativeai as genai_legacy
            genai_legacy.configure(api_key=GOOGLE_API_KEY)
            GOOGLE_CLIENT = genai_legacy
            GOOGLE_MODELS["gemini-2.5-flash"] = {"model": os.getenv("GOOGLE_BASELINE_MODEL", "gemini-2.5-flash")}
            print("Using legacy google.generativeai client")
        except Exception as e2:
            print(f"Google init failed: {e} | {e2}")
            GOOGLE_CLIENT = None

# OpenAI
OPENAI_CLIENT = None
OPENAI_MODELS = {}
if OPENAI_API_KEY:
    try:
        from openai import OpenAI
        OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)
        OPENAI_MODELS["gpt-4o-mini"] = {"model": os.getenv("OPENAI_BASELINE_MODEL", "gpt-4o-mini")}
        print("OpenAI ready")
    except Exception as e:
        print(f"OpenAI init failed: {e}")
        OPENAI_CLIENT = None

# Anthropic
ANTHROPIC_CLIENT = None
ANTHROPIC_MODELS = {}
if ANTHROPIC_API_KEY:
    try:
        from anthropic import Anthropic
        ANTHROPIC_CLIENT = Anthropic(api_key=ANTHROPIC_API_KEY)
        ANTHROPIC_MODELS["claude-3-5-sonnet"] = {"model": os.getenv("ANTHROPIC_BASELINE_MODEL", "claude-3-5-sonnet-latest")}
        print("Anthropic ready")
    except Exception as e:
        print(f"Anthropic init failed: {e}")
        ANTHROPIC_CLIENT = None

# Compose BASELINE_MODELS registry actually available
BASELINE_MODELS: Dict[str, Tuple[str, Dict[str, str]]] = {}
if GOOGLE_CLIENT:
    BASELINE_MODELS.update({"gemini-2.5-flash": ("google", GOOGLE_MODELS["gemini-2.5-flash"])})
if OPENAI_CLIENT:
    BASELINE_MODELS.update({"gpt-4o-mini": ("openai", OPENAI_MODELS["gpt-4o-mini"])})
if ANTHROPIC_CLIENT:
    BASELINE_MODELS.update({"claude-3-5-sonnet": ("anthropic", ANTHROPIC_MODELS["claude-3-5-sonnet"])})

print("Baseline models configured:")
for m, (prov, cfg) in BASELINE_MODELS.items():
    print(f"  • {m} ({prov}): {cfg['model']}")

Initializing clients…
Google (genai) ready
OpenAI ready
Anthropic ready
Baseline models configured:
  • gemini-2.5-flash (google): gemini-2.5-flash
  • gpt-4o-mini (openai): gpt-4o-mini
  • claude-3-5-sonnet (anthropic): claude-3-5-sonnet-latest


## 5. Prompting Helpers

In [5]:
FEWSHOT_EXAMPLES = {
    "fr": [
        ("Each beautiful press-on nail is carefully crafted by <strong>our in-house</strong> nail techs.",
         "Chaque beau press-on nail est soigneusement conçu par nos stylistes ongulaires <strong>maison</strong>."),
        ("Instead of using brittle acrylic nail bases, NaiLit uses Gel-X soft gel nail bases that provide just the right curvature and comfort for your natural nails.",
         "Au lieu d’utiliser des bases d’ongles fragiles en acrylic, NaiLit utilise des bases en gel souple Gel-X qui offrent la courbure et le confort idéals pour vos ongles naturels."),
        ("Whether you feel like elegant nude French coffin nails or trendy seafoam cat eye stiletto nails, NaiLit has just the right options for you to fully customize.",
         "Que vous ayez envie de nude français ballerine élégant ou de vert d’eau cat eye pointu tendance, NaiLit vous propose les options idéales pour une personnalisation totale."),
    ],
    "ja": [
        ("Each beautiful press-on nail is carefully crafted by <strong>our in-house</strong> nail techs.",
         "それぞれの美しい press-on nail は、<strong>当社専属</strong>のネイリストが丁寧に仕上げています。"),
        ("Instead of using brittle acrylic nail bases, NaiLit uses Gel-X soft gel nail bases that provide just the right curvature and comfort for your natural nails.",
         "割れやすいアクリルベースの代わりに、NaiLitは柔らかいジェルのGel-Xベースを使用し、地爪に最適なカーブと快適さを提供します。"),
        ("Whether you feel like elegant nude French coffin nails or trendy seafoam cat eye stiletto nails, NaiLit has just the right options for you to fully customize.",
         "エレガントなヌーディーバレリーナフレンチネールや、トレンディなミントマグネットポイントネールのどちらの気分でも、NaiLitなら思い通りにカスタマイズできます。"),
    ],
    "it": [
        ("Each beautiful press-on nail is carefully crafted by <strong>our in-house</strong> nail techs.",
         "Ogni bellissima press-on nail è accuratamente realizzata dalle nostre onicotecniche <strong>della casa</strong>."),
        ("Instead of using brittle acrylic nail bases, NaiLit uses Gel-X soft gel nail bases that provide just the right curvature and comfort for your natural nails.",
         "Invece di utilizzare basi per unghie fragili in acrylic, NaiLit impiega basi in gel morbido Gel-X che garantiscono la curvatura e il comfort perfetti per le tue unghie naturali."),
        ("Whether you feel like elegant nude French coffin nails or trendy seafoam cat eye stiletto nails, NaiLit has le opzioni perfette per personalizzare al massimo.",
         "Che tu abbia voglia di un’elegante nude French ballerina o di un trendy verde acqua cat-eye stiletto, NaiLit ha le opzioni perfette per personalizzare al massimo."),
    ],
}

LANGUAGE_NAMES_HUMAN = {"fr": "French", "ja": "Japanese", "it": "Italian"}


PROMPT_HEADERS: Dict[str, str] = {}
for lang, pairs in FEWSHOT_EXAMPLES.items():
    lang_name = LANGUAGE_NAMES_HUMAN.get(lang, lang)
    examples = "".join([f"\nExample {i}:\nSource: {src}\nTranslation: {tgt}\n" for i, (src, tgt) in enumerate(pairs, 1)])
    PROMPT_HEADERS[lang] = (
        f"You are a professional UX translator. Translate the following text into {lang_name}.\n\n"
        "IMPORTANT REQUIREMENTS:\n"
        "- Preserve all HTML tags exactly (do not add/remove/reorder tags)\n"
        "- Keep brand names and DNT terms as-is (e.g., NaiLit)\n"
        "- Maintain the original tone and style\n"
        "- Return ONLY the translation between <translation> and </translation> — no notes\n\n"
        f"Here are some examples:{examples}\n\n"
        "Now translate this:\nSource: "
    )

HTML_TAG = re.compile(r"</?\w+(?:\s+[^>]*?)?>", re.IGNORECASE)

def tags_preserved(src: str, tgt: str) -> bool:
    return HTML_TAG.findall(src or "") == HTML_TAG.findall(tgt or "")

In [6]:
def _backoff_sleep(attempt: int, base: float = 0.5, jitter: float = 0.3):
    time.sleep(base * (2 ** attempt) + random.random() * jitter)


def _extract_translation(raw_text: str) -> str:
    m = re.search(r"<translation>([\s\S]*?)</translation>", raw_text or "")
    return (m.group(1).strip() if m else (raw_text or "")).strip()


def translate_one(provider: str, cfg: Dict[str, str], src_text: str, target_lang: str) -> str:
    header = PROMPT_HEADERS[target_lang]
    prompt = f"{header}{src_text}\n\n<translation>"

    for attempt in range(MAX_RETRIES):
        try:
            if provider == "google":
                # New google.genai client
                if hasattr(GOOGLE_CLIENT, "models"):
                    resp = GOOGLE_CLIENT.models.generate_content(model=cfg["model"], contents=prompt)
                    raw = getattr(resp, "text", "") or ""
                # Legacy google.generativeai
                else:
                    model = GOOGLE_CLIENT.GenerativeModel(cfg["model"])  # type: ignore[attr-defined]
                    resp = model.generate_content(prompt)
                    raw = "".join([c.text for c in getattr(resp, "candidates", []) if hasattr(c, "text")]) or getattr(resp, "text", "") or ""
                return _extract_translation(raw)

            elif provider == "openai":
                resp = OPENAI_CLIENT.chat.completions.create(
                    model=cfg["model"],
                    messages=[
                        {"role": "system", "content": "You are a professional translator. Follow the examples and output only the translation enclosed in <translation> tags."},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.2,
                    max_tokens=2048,
                )
                raw = resp.choices[0].message.content or ""
                return _extract_translation(raw)

            elif provider == "anthropic":
                resp = ANTHROPIC_CLIENT.messages.create(
                    model=cfg["model"],
                    max_tokens=2048,
                    temperature=0.2,
                    messages=[{"role": "user", "content": prompt}],
                )
                raw = "".join(getattr(b, "text", "") for b in resp.content).strip()
                return _extract_translation(raw)

            else:
                raise ValueError(f"Unknown provider: {provider}")

        except Exception as e:
            # Retry on rate limit / transient failures
            if attempt == MAX_RETRIES - 1:
                print(f"Final failure for provider={provider}: {e}")
                return "[TRANSLATION_ERROR]"
            _backoff_sleep(attempt)

In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import os, json, time
from pathlib import Path
from typing import Dict, List, Tuple, Any
from tqdm import tqdm

# Edited to fix the Claude overload issue
PROVIDER_MAX_WORKERS = {
    "openai":    int(os.getenv("OPENAI_MAX_WORKERS", "4")),
    "anthropic": int(os.getenv("ANTHROPIC_MAX_WORKERS", "1")),  # Claude: keep low
    "google":    int(os.getenv("GOOGLE_MAX_WORKERS", "2")),
}
def _get_max_workers(provider: str, default_workers: int) -> int:
    return max(1, PROVIDER_MAX_WORKERS.get(provider, default_workers))

def run_translation_batch(model_key: str, provider: str, cfg: Dict[str, str], target_lang: str,
                          unique_sources: List[str], src_to_paths: Dict[str, List[str]]
                          ) -> Tuple[List[Dict], float]:
    start = time.time()

    out_dir = OUT_BASE / model_key
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"{target_lang}.json"

    existing_map: Dict[str, str] = {}
    if out_file.exists():
        try:
            prev = json.load(open(out_file, "r", encoding="utf-8"))
            by_src: Dict[str, str] = {}
            for row in prev:
                # first translation per unique "source" wins (consistent with your format)
                if "source" in row and "translation" in row and row["source"] not in by_src:
                    by_src[row["source"]] = row["translation"]
            existing_map = by_src
            print(f"Resuming: found {len(existing_map)} prior unique translations")
        except Exception as e:
            print(f"  warn: failed to parse previous output ({out_file}): {e}")

    todo = [s for s in unique_sources if s not in existing_map]
    results_map: Dict[str, str] = dict(existing_map)

    # Provider-aware workers + gentle submit delay for Anthropic
    max_workers = _get_max_workers(provider, default_workers=MAX_WORKERS)
    submit_delay = 0.05 if provider == "anthropic" else 0.0

    print(f"Translating {len(todo)} unique strings with {model_key} → {target_lang}… (workers={max_workers})")

    if len(todo) > 0:
        if max_workers == 1:
            # Sequential path (stable for Claude)
            for src in tqdm(todo, total=len(todo), desc=f"{model_key} → {target_lang} (seq)"):
                try:
                    results_map[src] = translate_one(provider, cfg, src, target_lang)
                except Exception as e:
                    print(f"    warn: failed src≈{src[:48]!r}… | {e}")
                    results_map[src] = "[TRANSLATION_ERROR]"
                if submit_delay:
                    time.sleep(submit_delay)
        else:
            from concurrent.futures import ThreadPoolExecutor, as_completed
            with ThreadPoolExecutor(max_workers=max_workers) as ex:
                futures = {}
                for src in todo:
                    fut = ex.submit(translate_one, provider, cfg, src, target_lang)
                    futures[fut] = src
                    if submit_delay:
                        time.sleep(submit_delay)  # soften burst for Anthropic
                for fut in tqdm(as_completed(futures), total=len(futures), desc=f"{model_key} → {target_lang}"):
                    src = futures[fut]
                    try:
                        results_map[src] = fut.result()
                    except Exception as e:
                        print(f"    warn: failed src≈{src[:48]!r}… | {e}")
                        results_map[src] = "[TRANSLATION_ERROR]"

    duration = time.time() - start

    now = time.strftime("%Y-%m-%d %H:%M:%S")
    results: List[Dict[str, Any]] = []
    for src, paths in src_to_paths.items():
        tr = results_map.get(src, "[TRANSLATION_ERROR]")
        for path in paths:
            results.append({
                "path": path,
                "source": src,
                "translation": tr,
                "model": model_key,
                "target_lang": target_lang,
                "timestamp": now,
            })

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"{model_key} → {target_lang}: {len(results)} segments in {duration:.1f}s | saved: {out_file}")
    return results, duration

## 6. Metrics Helpers

In [8]:
def estimate_tokens(text: str) -> int:
    # rough approximation: 4 chars/token
    return max(1, len(text) // 4)


def estimate_cost(results: List[Dict], model_key: str) -> Optional[float]:
    if model_key not in PRICING_PER_1M_TOKENS:
        return None
    pricing = PRICING_PER_1M_TOKENS[model_key]
    in_tok = sum(estimate_tokens(r["source"]) for r in results)
    out_tok = sum(estimate_tokens(r["translation"]) for r in results)
    cost = (in_tok / 1000000) * pricing["input"] + (out_tok / 1000000) * pricing["output"]
    return round(cost, 4)


def calculate_length_ratio(src: str, tgt: str) -> float:
    src_len = max(len(src or ""), 1)
    return len(tgt or "") / src_len


def evaluate_translation_results(results: List[Dict], target_lang: str, model_key: str, duration: float) -> Dict:
    n = len(results)
    tag_ok = [tags_preserved(r["source"], r["translation"]) for r in results]
    ratios = [calculate_length_ratio(r["source"], r["translation"]) for r in results]
    errors = sum(1 for r in results if "[TRANSLATION_ERROR]" in r["translation"])
    avg_latency = duration / n if n else 0.0

    metrics = {
        "model": model_key,
        "target_language": target_lang,
        "n_segments": n,
        "error_count": errors,
        "error_rate": round(errors / n, 3) if n else 0,
        "tag_preservation_rate": round(float(np.mean(tag_ok)), 3) if n else 0,
        "length_ratio_avg": round(float(np.mean(ratios)), 3) if n else 0,
        "length_ratio_std": round(float(np.std(ratios)), 3) if n else 0,
        "total_duration_sec": round(duration, 2),
        "avg_latency_sec": round(avg_latency, 3),
        "segments_per_minute": round(n / (duration / 60), 1) if duration > 0 else 0,
        "estimated_cost_usd": estimate_cost(results, model_key),
        "few_shot_prompting": True,
        "evaluation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    return metrics


def save_metrics(metrics: Dict, model_key: str, target_lang: str):
    out_dir = EVAL_BASE / model_key
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"metrics_{target_lang}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)
    print(f"Saved metrics: {out_path}")


## 7. Main Translation + Evaluation Loop

In [10]:
#Begin the LLM Battle Royale

print("\nStarting translation + evaluation…")
print("=" * 72)

all_results: List[Dict] = []
all_metrics: List[Dict] = []

for model_key, (provider, cfg) in BASELINE_MODELS.items():
    print(f"\n▶ Model: {model_key} ({provider})")
    for lang in TARGET_LANGUAGES:
        print(f"  → Language: {LANGUAGE_NAMES[lang]} ({lang})")
        try:
            results, duration = run_translation_batch(model_key, provider, cfg, lang, UNIQUE_SOURCES, SRC_TO_PATHS)
            metrics = evaluate_translation_results(results, lang, model_key, duration)
            save_metrics(metrics, model_key, lang)
            all_results.extend(results)
            all_metrics.append(metrics)
            print(
                "    Key metrics — "
                f"Tag: {metrics['tag_preservation_rate']:.1%} | "
                f"Latency: {metrics['avg_latency_sec']:.2f}s | "
                f"Speed: {metrics['segments_per_minute']:.1f} seg/min | "
                f"Cost: {('$' + format(metrics['estimated_cost_usd'], '.4f')) if metrics['estimated_cost_usd'] is not None else 'n/a'}"
            )
        except Exception as e:
            print(f"Failed {model_key} → {lang}: {e}")
            continue

print(f"\nPipeline done. Model-language combos processed: {len(all_metrics)}")


Starting translation + evaluation…

▶ Model: gemini-2.5-flash (google)
  → Language: French (fr)
Translating 73 unique strings with gemini-2.5-flash → fr… (workers=2)


gemini-2.5-flash → fr:  16%|█████████▋                                                 | 12/73 [00:26<01:51,  1.83s/it]

Final failure for provider=google: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10\nPlease retry in 6.011507153s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '10'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.Ret

gemini-2.5-flash → fr:  19%|███████████▎                                               | 14/73 [00:29<01:21,  1.38s/it]

Final failure for provider=google: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10\nPlease retry in 3.704801524s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.5-flash', 'location': 'global'}, 'quotaValue': '10'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.Ret

gemini-2.5-flash → fr: 100%|███████████████████████████████████████████████████████████| 73/73 [02:06<00:00,  1.73s/it]


gemini-2.5-flash → fr: 76 segments in 126.1s | saved: translations\baseline\gemini-2.5-flash\fr.json
Saved metrics: eval\baseline\gemini-2.5-flash\metrics_fr.json
    Key metrics — Tag: 98.7% | Latency: 1.66s | Speed: 36.2 seg/min | Cost: $0.0004
  → Language: Japanese (ja)
Translating 73 unique strings with gemini-2.5-flash → ja… (workers=2)


gemini-2.5-flash → ja: 100%|███████████████████████████████████████████████████████████| 73/73 [03:21<00:00,  2.76s/it]


gemini-2.5-flash → ja: 76 segments in 201.3s | saved: translations\baseline\gemini-2.5-flash\ja.json
Saved metrics: eval\baseline\gemini-2.5-flash\metrics_ja.json
    Key metrics — Tag: 97.4% | Latency: 2.65s | Speed: 22.7 seg/min | Cost: $0.0003
  → Language: Italian (it)
Translating 73 unique strings with gemini-2.5-flash → it… (workers=2)


gemini-2.5-flash → it: 100%|███████████████████████████████████████████████████████████| 73/73 [02:31<00:00,  2.08s/it]


gemini-2.5-flash → it: 76 segments in 151.8s | saved: translations\baseline\gemini-2.5-flash\it.json
Saved metrics: eval\baseline\gemini-2.5-flash\metrics_it.json
    Key metrics — Tag: 100.0% | Latency: 2.00s | Speed: 30.0 seg/min | Cost: $0.0004

▶ Model: gpt-4o-mini (openai)
  → Language: French (fr)
Translating 73 unique strings with gpt-4o-mini → fr… (workers=4)


gpt-4o-mini → fr: 100%|████████████████████████████████████████████████████████████████| 73/73 [00:32<00:00,  2.23it/s]


gpt-4o-mini → fr: 76 segments in 32.7s | saved: translations\baseline\gpt-4o-mini\fr.json
Saved metrics: eval\baseline\gpt-4o-mini\metrics_fr.json
    Key metrics — Tag: 100.0% | Latency: 0.43s | Speed: 139.5 seg/min | Cost: $0.0008
  → Language: Japanese (ja)
Translating 73 unique strings with gpt-4o-mini → ja… (workers=4)


gpt-4o-mini → ja: 100%|████████████████████████████████████████████████████████████████| 73/73 [00:22<00:00,  3.20it/s]


gpt-4o-mini → ja: 76 segments in 22.8s | saved: translations\baseline\gpt-4o-mini\ja.json
Saved metrics: eval\baseline\gpt-4o-mini\metrics_ja.json
    Key metrics — Tag: 100.0% | Latency: 0.30s | Speed: 199.9 seg/min | Cost: $0.0005
  → Language: Italian (it)
Translating 73 unique strings with gpt-4o-mini → it… (workers=4)


gpt-4o-mini → it: 100%|████████████████████████████████████████████████████████████████| 73/73 [00:21<00:00,  3.34it/s]


gpt-4o-mini → it: 76 segments in 21.9s | saved: translations\baseline\gpt-4o-mini\it.json
Saved metrics: eval\baseline\gpt-4o-mini\metrics_it.json
    Key metrics — Tag: 100.0% | Latency: 0.29s | Speed: 208.1 seg/min | Cost: $0.0008

▶ Model: claude-3-5-sonnet (anthropic)
  → Language: French (fr)
Translating 73 unique strings with claude-3-5-sonnet → fr… (workers=1)


claude-3-5-sonnet → fr (seq): 100%|████████████████████████████████████████████████████| 73/73 [01:13<00:00,  1.01s/it]


claude-3-5-sonnet → fr: 76 segments in 73.9s | saved: translations\baseline\claude-3-5-sonnet\fr.json
Saved metrics: eval\baseline\claude-3-5-sonnet\metrics_fr.json
    Key metrics — Tag: 17.1% | Latency: 0.97s | Speed: 61.7 seg/min | Cost: $0.0236
  → Language: Japanese (ja)
Translating 73 unique strings with claude-3-5-sonnet → ja… (workers=1)


claude-3-5-sonnet → ja (seq): 100%|████████████████████████████████████████████████████| 73/73 [01:52<00:00,  1.54s/it]


claude-3-5-sonnet → ja: 76 segments in 112.6s | saved: translations\baseline\claude-3-5-sonnet\ja.json
Saved metrics: eval\baseline\claude-3-5-sonnet\metrics_ja.json
    Key metrics — Tag: 32.9% | Latency: 1.48s | Speed: 40.5 seg/min | Cost: $0.0151
  → Language: Italian (it)
Translating 73 unique strings with claude-3-5-sonnet → it… (workers=1)


claude-3-5-sonnet → it (seq): 100%|████████████████████████████████████████████████████| 73/73 [01:38<00:00,  1.35s/it]

claude-3-5-sonnet → it: 76 segments in 98.3s | saved: translations\baseline\claude-3-5-sonnet\it.json
Saved metrics: eval\baseline\claude-3-5-sonnet\metrics_it.json
    Key metrics — Tag: 18.4% | Latency: 1.29s | Speed: 46.4 seg/min | Cost: $0.0230

Pipeline done. Model-language combos processed: 9





## 8. Comparison Summary

In [13]:
if all_metrics:
    comparison_df = pd.DataFrame(all_metrics)
    print("\nTRANSLATION QUALITY COMPARISON")
    print("=" * 72)
    for lang in TARGET_LANGUAGES:
        sub = comparison_df[comparison_df["target_language"] == lang]
        if not sub.empty:
            print(f"\n{LANGUAGE_NAMES[lang]} ({lang.upper()})")
            print("-" * 40)
            for _, row in sub.iterrows():
                cost = (f"${row['estimated_cost_usd']:.4f}" if pd.notna(row['estimated_cost_usd']) else "n/a")
                print(
                    f"Model: {row['model']}\n"
                    f"  Tag preservation: {row['tag_preservation_rate']:.1%}\n"
                    f"  Avg latency: {row['avg_latency_sec']:.2f}s\n"
                    f"  Speed: {row['segments_per_minute']:.1f} seg/min\n"
                    f"  Estimated cost: {cost}\n"
                )
    comp_path = EVAL_BASE / "comparison_results.csv"
    comparison_df.to_csv(comp_path, index=False, encoding="utf-8")
    print(f"Comparison saved: {comp_path}")
else:
    print("No metrics available for comparison")


TRANSLATION QUALITY COMPARISON

French (FR)
----------------------------------------
Model: gemini-2.5-flash
  Tag preservation: 98.7%
  Avg latency: 1.66s
  Speed: 36.2 seg/min
  Estimated cost: $0.0004

Model: gpt-4o-mini
  Tag preservation: 100.0%
  Avg latency: 0.43s
  Speed: 139.5 seg/min
  Estimated cost: $0.0008

Model: claude-3-5-sonnet
  Tag preservation: 17.1%
  Avg latency: 0.97s
  Speed: 61.7 seg/min
  Estimated cost: $0.0236


Japanese (JA)
----------------------------------------
Model: gemini-2.5-flash
  Tag preservation: 97.4%
  Avg latency: 2.65s
  Speed: 22.7 seg/min
  Estimated cost: $0.0003

Model: gpt-4o-mini
  Tag preservation: 100.0%
  Avg latency: 0.30s
  Speed: 199.9 seg/min
  Estimated cost: $0.0005

Model: claude-3-5-sonnet
  Tag preservation: 32.9%
  Avg latency: 1.48s
  Speed: 40.5 seg/min
  Estimated cost: $0.0151


Italian (IT)
----------------------------------------
Model: gemini-2.5-flash
  Tag preservation: 100.0%
  Avg latency: 2.00s
  Speed: 30.0 s