In [1]:
import torch

# Report the installed PyTorch build and the CUDA devices it can see.
print(torch.__version__)
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")

2.8.0+cu128
CUDA available: True
Device count: 2


# Run execution

In [2]:
# pip install sentence-transformers tqdm pandas   (cuVS / CuPy install commands are listed in the cuVS cell below)

import json
import math
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm
# (the ANN backend is not imported here; cuVS is imported in its own cell below)
from sentence_transformers import SentenceTransformer
import re
import gc, torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === cuVS (CAGRA) fast GPU ANN index ===
# Requires: pip install cuvs-cu12 cupy-cuda12x  (use NVIDIA's PyPI index for cuvs)
#   pip install cuvs-cu12 --extra-index-url https://pypi.nvidia.com
#   pip install cupy-cuda12x   # choose the 'cuda12x' build that matches your CUDA 12 runtime
# Probe for the optional GPU stack; CUVS_AVAILABLE records whether both imports worked.
try:
    import cupy as cp
    from cuvs.neighbors import cagra
except Exception as e:
    # Any failure (missing package, broken CUDA runtime, ...) disables the cuVS path.
    CUVS_AVAILABLE = False
    print("cuVS not available:", e)
else:
    CUVS_AVAILABLE = True

class CuVSIndex:
    """
    Thin adapter around a cuVS CAGRA index exposing a ``search(xq, k)`` method
    that returns ``(distances, indices)`` as NumPy arrays.

    Input vectors are converted to float32; for cosine-style ranking they are
    expected to be L2-normalized by the caller.

    Parameters
    ----------
    xb : array-like
        Database vectors, one row per item. Copied once to the GPU.
    device_id : int
        CUDA device on which the index is built and searched.
    graph_degree : int
        Forwarded to ``cagra.IndexParams`` (build-time parameter).
    itopk_size : int
        Forwarded to ``cagra.SearchParams`` (search-time parameter).

    Raises
    ------
    RuntimeError
        If cuVS / CuPy could not be imported.
    """
    def __init__(self, xb: "np.ndarray", device_id: int = 0, graph_degree: int = 64, itopk_size: int = 64):
        if not CUVS_AVAILABLE:
            raise RuntimeError("cuVS not available; please install cuvs-cu12 and cupy-cuda12x.")
        self.device_id = device_id
        # itopk_size is a search-time knob, so it is stored here and applied in search().
        self.itopk_size = itopk_size
        with cp.cuda.Device(self.device_id):
            self.xb = cp.asarray(xb, dtype=cp.float32, order="C")
            # cuVS expects build parameters wrapped in IndexParams, passed before the dataset.
            build_params = cagra.IndexParams(graph_degree=graph_degree)
            self.index = cagra.build(build_params, self.xb)

    def search(self, xq: "np.ndarray", k: int):
        """Return ``(distances, indices)`` of the ``k`` nearest database rows for each query row."""
        with cp.cuda.Device(self.device_id):
            q = cp.asarray(xq, dtype=cp.float32, order="C")
            # NOTE(review): the candidate list is kept at least as large as k;
            # confirm the exact itopk_size/k constraint for the installed cuVS version.
            search_params = cagra.SearchParams(itopk_size=max(int(self.itopk_size), int(k)))
            D, I = cagra.search(search_params, self.index, q, k)
            return cp.asnumpy(D), cp.asnumpy(I)

def build_gpu_index_with_cuvs(xb: "np.ndarray", device_id: int = 0):
    """
    Create a CuVSIndex on one GPU and return it together with the flag ``True``.

    ``xb`` is first coerced to a C-contiguous float32 host array; CuVSIndex then
    transfers it to the device selected by ``device_id`` (0 unless overridden).
    """
    host_vectors = np.asarray(xb, dtype=np.float32, order="C")
    return CuVSIndex(host_vectors, device_id=device_id), True

In [4]:
# === Single-GPU performance flags (PyTorch 2.8 on Ada/L40) ===
import torch, os
# Process-wide PyTorch settings; they affect every model created afterwards in this kernel.
# Turn on the TF32 switch for CUDA matmuls.
torch.backends.cuda.matmul.allow_tf32 = True
# Turn on cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True
# NOTE(review): this and `allow_tf32` above both appear to control float32 matmul
# precision — confirm whether both are needed on the installed PyTorch version.
torch.set_float32_matmul_precision("high")
# Use half of the reported CPU count for torch threads, but never fewer than 1;
# os.cpu_count() may return None, in which case 4 is assumed.
torch.set_num_threads(max(1, (os.cpu_count() or 4)//2))

# Optional: compile models constructed after this (speeds up forward pass)
def torch_compile_if_available(m):
    """Return ``torch.compile(m)``; if that call raises, hand back ``m`` unchanged."""
    try:
        compiled = torch.compile(m)
    except Exception:
        # Best-effort: any failure while wrapping falls back to the original object.
        return m
    return compiled

In [5]:
# === Fast embedding helper (batched, normalized, no_grad) ===
from typing import List
import numpy as np
from sentence_transformers import SentenceTransformer

def embed_texts(model: "SentenceTransformer", texts: List[str], batch_size: int = 2048,
                normalize: bool = True) -> np.ndarray:
    """
    Encode ``texts`` with ``model`` and return a C-contiguous float32 matrix.

    Parameters
    ----------
    model : SentenceTransformer
        Object providing ``encode(...)`` and ``get_sentence_embedding_dimension()``.
    texts : list of str
        Inputs to embed; batching is delegated to ``model.encode``.
    batch_size : int
        Forwarded to ``model.encode``.
    normalize : bool
        Forwarded as ``normalize_embeddings``. Defaults to True, matching the
        previous hard-coded behaviour, and lets callers pass ``normalize=...``
        as the retrieval loop in this notebook does.

    Returns
    -------
    np.ndarray
        Shape ``(len(texts), dim)``; an empty ``(0, dim)`` array for no input.
    """
    if len(texts) == 0:
        # Keep a 2-D result so downstream code can rely on the shape.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.zeros((0, dim), dtype=np.float32)

    # Prefer the GPU as before, but let the model keep its own device when CUDA is absent.
    target_device = "cuda" if torch.cuda.is_available() else None
    with torch.inference_mode():
        embs = model.encode(
            texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=True,
            normalize_embeddings=normalize,
            device=target_device,
        )
    # cuVS needs C-contiguous float32 input.
    return np.asarray(embs, dtype=np.float32, order="C")

In [None]:
# --- Run configuration: input files, output folder, retrieval and batching settings ---
DOCS_GLOB          = "../Data/docs_subset.jsonl"   # glob pattern (resolved from the working directory) for doc files; lines look like {"doc_id", "<lang>": {"text": ...}}
QRELS_PATH         = "../Data/qrels_subset.jsonl"     # filtered qrels (only for the selected queries)
MERGED_QUERIES     = "../Data/queries_subset.jsonl"  # merged query variants in one file
OUT_ROOT           = Path("runs_cuvs_test")            # outputs will be saved here
TOP_K              = 10         # number of neighbours requested per query from the index
BATCH_SIZE_DOCS    = 64         # batch size passed to embed_texts for documents
BATCH_SIZE_QUERIES = 64         # batch size passed to embed_texts for queries
USE_GPU_IF_AVAIL   = True       # NOTE(review): not referenced by any cell visible here — confirm it is still needed
NORMALIZE_L2       = True       # passed to embed_texts as `normalize` (cosine-style ranking on unit vectors)
PILOT_IDS_FILE     = None       # e.g., "pilot_base_ids.txt" to restrict queries, or None

# Model names handed to SentenceTransformer(...) in the retrieval loop; one output folder per model.
MODELS = [
    "jinaai/jina-embeddings-v3",
    "intfloat/multilingual-e5-large",
    "intfloat/multilingual-e5-small",
    "upskyy/bge-m3-korean",
    "intfloat/multilingual-e5-base",
    "Alibaba-NLP/gte-multilingual-base"
]


In [7]:
from pathlib import Path
from typing import List, Tuple, Dict, Set, Optional
import json

def load_docs_for_lang(
    doc_lang: str,
    doc_id_whitelist: Optional[Set[str]] = None,
    tag_ids_with_lang: bool = True,
    docs_glob: Optional[str] = None,
) -> Tuple[List[str], List[str]]:
    """Return (doc_ids, texts) for the chosen language.

    Parameters
    ----------
    doc_lang : key of the per-language object inside each JSONL record (e.g. "en").
    doc_id_whitelist : if given, only documents whose raw doc_id is in the set are loaded.
    tag_ids_with_lang : if True, returned IDs are '<doc_id>|<lang>' so the GT and retrieval IDs match.
    docs_glob : relative glob pattern of the JSONL files to read, resolved from the
        current working directory. Defaults to the module-level ``DOCS_GLOB``.

    Records are skipped when they are blank, are not JSON objects, have no
    ``doc_id``/``id``, fail the whitelist, or have no non-empty string under
    ``<doc_lang>.text``. Texts are returned stripped.
    """
    pattern = DOCS_GLOB if docs_glob is None else docs_glob
    ids: List[str] = []
    texts: List[str] = []
    # sorted() keeps the file order, and therefore the output order, deterministic.
    for p in sorted(Path(".").glob(pattern)):
        with p.open("r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                o = json.loads(line)
                if not isinstance(o, dict):
                    # A JSON array/scalar line cannot carry a document; skip it.
                    continue
                did_raw = str(o.get("doc_id") or o.get("id") or "").strip()
                if not did_raw:
                    continue
                if (doc_id_whitelist is not None) and (did_raw not in doc_id_whitelist):
                    continue
                lang_obj = o.get(doc_lang)
                if not isinstance(lang_obj, dict):
                    # Missing, null or malformed language entry: nothing to load.
                    continue
                txt = lang_obj.get("text")
                if isinstance(txt, str) and txt.strip():
                    did = f"{did_raw}|{doc_lang}" if tag_ids_with_lang else did_raw
                    ids.append(did)
                    texts.append(txt.strip())
    return ids, texts

def collect_gt_doc_ids_for_queries(
    qrels: Dict[str, List[str]],
    qids: List[str]
) -> Set[str]:
    """Union of the raw ground-truth doc_ids (as strings) listed in ``qrels`` for ``qids``.

    Query ids absent from ``qrels`` contribute nothing.
    """
    return {str(doc_id) for query_id in qids for doc_id in qrels.get(query_id, [])}

def expand_gt_to_all_langs(
    qrels: Dict[str, List[str]],
    qids: List[str],
    langs: Tuple[str, ...] = ("en", "es", "zh")
) -> Dict[str, Set[str]]:
    """Map each qid to the set of '<doc_id>|<lang>' tags for every relevant doc and every language.

    A qid without an entry in ``qrels`` maps to an empty set.
    """
    return {
        query_id: {
            f"{doc_id}|{lang}"
            for doc_id in qrels.get(query_id, [])
            for lang in langs
        }
        for query_id in qids
    }


def load_qrels(path: str) -> Dict[str, List[str]]:
    """Read a JSONL qrels file into {query_id: [doc_id, ...]}, keeping rows with relevance > 0.

    Blank lines are ignored. A row without a ``relevance`` field counts as
    relevance 1. Rows whose ``query_id`` or ``doc_id`` is empty are dropped.
    """
    relevant = defaultdict(list)
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            if raw.strip():
                record = json.loads(raw)
                query_id = str(record.get("query_id") or "").strip()
                doc_id = str(record.get("doc_id") or "").strip()
                grade = int(record.get("relevance", 1))
                if not (query_id and doc_id):
                    continue
                if grade <= 0:
                    continue
                relevant[query_id].append(doc_id)
    return relevant

def load_keep_ids(path: Optional[str]) -> Optional[set]:
    """Load a set of ids (one per non-blank line, stripped) from a UTF-8 text file.

    Returns None when ``path`` is falsy or the file does not exist.
    """
    if not path:
        return None
    ids_file = Path(path)
    if ids_file.exists():
        stripped = (entry.strip() for entry in ids_file.read_text(encoding="utf-8").splitlines())
        return {entry for entry in stripped if entry}
    return None

def load_queries_cases(merged_jsonl: str, doc_lang: str, keep_ids: Optional[set]=None):
    """
    Read the merged query file and bucket every query variant into retrieval cases for ``doc_lang``.

    Returns ``(cases, sw_map)``:
      cases:  { case_name : { query_id : query_text } }
      sw_map: { case_name : { query_id : sw_rate_or_None } }   # filled for code-switched cases only

    For a target doc language X the case names are:
      'X'        monolingual query written in X
      'Y->X'     monolingual query written in each other language Y
      'A-B->X'   code-switched query for each of the pairs es-en, zh-en, zh-es

    Record fields read: id / query_id, eng_q, spa_q (or esp_q), zh_q,
    es_en_q, zh_en_q, zh_es_q and the matching *_sw_rate fields.
    Queries that are not non-blank strings are skipped; texts are stored stripped.
    If ``keep_ids`` is given, only those query ids are kept.

    Raises ValueError when ``doc_lang`` is not one of en / es / zh.
    """
    CASES_BY_DOC = {
        "en": ["en", "es->en", "zh->en", "es-en->en", "zh-en->en", "zh-es->en"],
        "es": ["es", "en->es", "zh->es", "es-en->es", "zh-en->es", "zh-es->es"],
        "zh": ["zh", "en->zh", "es->zh", "es-en->zh", "zh-en->zh", "zh-es->zh"],
    }
    if doc_lang not in CASES_BY_DOC:
        raise ValueError(f"Unsupported doc_lang={doc_lang}")

    case_names = CASES_BY_DOC[doc_lang]
    cases = {name: {} for name in case_names}
    sw_map = {name: {} for name in case_names}

    # (pair label, query field, switch-rate field) for the code-switched variants.
    cs_fields = (
        ("es-en", "es_en_q", "es_en_sw_rate"),
        ("zh-en", "zh_en_q", "zh_en_sw_rate"),
        ("zh-es", "zh_es_q", "zh_es_sw_rate"),
    )

    def to_float_or_none(value):
        # Missing, blank or unparsable rates become None.
        try:
            return float(value) if value is not None and str(value).strip() != "" else None
        except Exception:
            return None

    def cleaned(value):
        # Stripped text for non-blank strings, otherwise None.
        if isinstance(value, str) and value.strip():
            return value.strip()
        return None

    with open(merged_jsonl, "r", encoding="utf-8") as handle:
        for raw in handle:
            if not raw.strip():
                continue
            record = json.loads(raw)
            qid = str(record.get("id") or record.get("query_id") or "").strip()
            if not qid:
                continue
            if keep_ids is not None and qid not in keep_ids:
                continue

            # Monolingual variants: same language -> 'X', other language -> 'Y->X'.
            monolingual = (
                ("en", record.get("eng_q")),
                ("es", record.get("spa_q") or record.get("esp_q")),
                ("zh", record.get("zh_q")),
            )
            for src_lang, query in monolingual:
                text = cleaned(query)
                if text is None:
                    continue
                case = src_lang if src_lang == doc_lang else f"{src_lang}->{doc_lang}"
                cases[case][qid] = text

            # Code-switched variants: every pair is routed to the current doc_lang.
            for pair, query_field, rate_field in cs_fields:
                text = cleaned(record.get(query_field))
                if text is None:
                    continue
                case = f"{pair}->{doc_lang}"
                cases[case][qid] = text
                sw_map[case][qid] = to_float_or_none(record.get(rate_field))

    return cases, sw_map


def print_case_counts(cases: Dict[str, Dict[str,str]], label: str):
    """Print a header for ``label`` followed by one line per case (sorted by name) with its query count."""
    print(f"--- Query availability for doc_lang={label} ---")
    # Keys are unique, so sorting the items orders them by case name alone.
    for k, queries in sorted(cases.items()):
        print(f"{k:12s}: {len(queries):6d}")

def embed_texts(model: "SentenceTransformer", texts: List[str], batch_size: int=256, normalize: bool=True) -> np.ndarray:
    """
    Encode ``texts`` chunk by chunk and return a float32 matrix of shape (len(texts), dim).

    Parameters
    ----------
    model : object providing ``encode(...)`` and ``get_sentence_embedding_dimension()``.
    texts : list of str to embed.
    batch_size : size of each chunk and the ``batch_size`` forwarded to ``model.encode``.
    normalize : forwarded as ``normalize_embeddings``; when True, rows whose norm is
        not ~1 are additionally re-normalized here.

    Notes
    -----
    Rows with zero norm are left as zeros rather than divided by zero, so the
    result never contains NaN introduced by this helper.
    For empty input a (0, dim) array is returned.
    """
    embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embeddings", leave=False):
        chunk = texts[i:i+batch_size]
        v = model.encode(chunk, batch_size=batch_size, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=normalize)
        if normalize:
            # Compute the norms once and reuse them for both the check and the division.
            norms = np.linalg.norm(v, axis=1, keepdims=True)
            if not np.allclose(norms, 1.0, atol=1e-3):
                # Guard zero-norm rows: dividing by 1 keeps them at zero instead of NaN.
                v = v / np.where(norms > 0.0, norms, 1.0)
        embs.append(v.astype("float32"))
    if not embs:
        return np.zeros((0, model.get_sentence_embedding_dimension()), dtype="float32")
    return np.vstack(embs)


In [8]:
# === cuVS (CAGRA) fast GPU ANN index — version-robust adapter ===
import numpy as np

# Optional GPU stack: CUVS_AVAILABLE is True only when both imports succeed.
try:
    import cupy as cp
    from cuvs.neighbors import cagra
except Exception as e:
    # Import problems of any kind switch the cuVS path off and are reported once.
    CUVS_AVAILABLE = False
    print("cuVS not available:", e)
else:
    CUVS_AVAILABLE = True

class CuVSIndex:
    """
    Adapter around a cuVS CAGRA index exposing ``search(xq, k)`` that returns
    ``(distances, indices)`` as NumPy arrays.

    The constructor and ``search`` try more than one cuVS call signature so the
    class keeps working across cuVS releases whose parameter objects or
    argument order differ.

    Parameters
    ----------
    xb : array-like, 2-D
        Database vectors, one row per item; converted to C-contiguous float32
        and copied to the GPU.
    device_id : int
        CUDA device used for building and searching.
    graph_degree : int
        Requested CAGRA graph degree (build-time).
    itopk_size : int
        Requested CAGRA candidate-list size (search-time).

    Raises
    ------
    RuntimeError
        If cuVS / CuPy could not be imported.
    ValueError
        If ``xb`` is not a non-empty 2-D matrix.
    """
    def __init__(self, xb: "np.ndarray", device_id: int = 0,
                 graph_degree: int = 64, itopk_size: int = 64):
        if not CUVS_AVAILABLE:
            raise RuntimeError("cuVS not available; install cuvs-cu12 and cupy-cuda12x.")
        self.device_id = device_id
        self.itopk_size = itopk_size
        xb = np.asarray(xb, dtype=np.float32, order="C")
        if xb.ndim != 2 or xb.shape[0] == 0:
            # Fail early with a readable message instead of an opaque GPU-side error.
            raise ValueError(f"xb must be a non-empty 2-D matrix, got shape {xb.shape}")
        with cp.cuda.Device(self.device_id):
            self.xb = cp.asarray(xb, dtype=cp.float32, order="C")

            # Build params (handle API differences across cuVS versions)
            try:
                ip = cagra.IndexParams(graph_degree=graph_degree)
            except TypeError:
                ip = cagra.IndexParams()
                try:
                    ip.graph_degree = graph_degree
                except Exception as e:
                    # Still best-effort, but no longer silent: the caller's setting is being dropped.
                    print(f"cuVS: could not set graph_degree={graph_degree}; using library default ({e})")

            # Build index (try preferred then fallback signature)
            try:
                self.index = cagra.build(ip, self.xb)
            except TypeError:
                self.index = cagra.build(self.xb, ip)

    def search(self, xq: "np.ndarray", k: int):
        """Return ``(distances, indices)`` of the ``k`` nearest database rows for each query row."""
        with cp.cuda.Device(self.device_id):
            q = cp.asarray(np.asarray(xq, dtype=np.float32, order="C"))

            # NOTE(review): the candidate list is kept at least as large as k;
            # confirm the exact itopk_size/k constraint for the installed cuVS version.
            itopk = max(int(self.itopk_size), int(k))

            # Search params (handle API differences)
            try:
                sp = cagra.SearchParams(itopk_size=itopk)
            except TypeError:
                sp = cagra.SearchParams()
                try:
                    sp.itopk_size = itopk
                except Exception as e:
                    # Best-effort fallback, reported so the dropped setting is visible.
                    print(f"cuVS: could not set itopk_size={itopk}; using library default ({e})")

            # Try both call orders
            try:
                D, I = cagra.search(sp, self.index, q, k)
            except TypeError:
                D, I = cagra.search(self.index, q, k, sp)

            return cp.asnumpy(D), cp.asnumpy(I)

def build_gpu_index_with_cuvs(xb: "np.ndarray", device_id: int = 0):
    """Wrap ``xb`` (coerced to C-contiguous float32) in a CuVSIndex on ``device_id``; returns (index, True)."""
    host_matrix = np.asarray(xb, dtype=np.float32, order="C")
    gpu_index = CuVSIndex(host_matrix, device_id=device_id)
    return gpu_index, True

def build_cuvs_index(d: int, xb: "np.ndarray"):
    """Legacy-style entry point: build the index on device 0 and return (index, True).

    ``d`` is accepted for call-site compatibility and is not used.
    """
    gpu_index, _unused_flag = build_gpu_index_with_cuvs(xb, device_id=0)
    return gpu_index, True


In [9]:
def safe_case_name(case: str) -> str:
    r"""
    Turn a case label into something usable as a Windows filename.

    Steps, in order:
    - every '->' becomes '__to__'
    - each character from the set < > : " / \ | ? * becomes '_'
    - each run of whitespace becomes a single '_'
    """
    without_arrows = case.replace("->", "__to__")
    without_forbidden = re.sub(r'[<>:"/\\|?*]', "_", without_arrows)
    return re.sub(r"\s+", "_", without_forbidden)

In [10]:
# === RETRIEVAL ONLY (no metric computation): GLOBAL CORPUS (all GT in en/es/zh) ===
# Driver: for every model in MODELS, embed one shared document pool, build one GPU index,
# run every query case against it and write one CSV per case. No metrics are computed here.
OUT_ROOT.mkdir(parents=True, exist_ok=True)
keep_ids = load_keep_ids(PILOT_IDS_FILE)   # None means "no query restriction"
qrels = load_qrels(QRELS_PATH)

ALL_LANGS = ("en", "es", "zh")

for model_name in MODELS:
    print(f"\n=== MODEL: {model_name} ===")
    # '/' in the model name is replaced so the name can be used as a folder.
    model_slug = model_name.replace("/", "__")
    model_out = OUT_ROOT / model_slug
    model_out.mkdir(parents=True, exist_ok=True)

    # load model once
    model = SentenceTransformer(model_name, trust_remote_code=True, device='cuda')
    dim = model.get_sentence_embedding_dimension()

    # ------------------------------------------------------------
    # 1) Build ALL cases once (for each routing doc_lang), collect ALL qids
    # ------------------------------------------------------------
    # NOTE(review): none of the inputs of this step depend on `model`, yet it is
    # repeated for every model — candidate for hoisting above the model loop.
    cases_all: dict[str, dict[str, str]] = {}
    sw_map_all: dict[str, dict[str, float]] = {}   # values may also be None (see load_queries_cases)
    qids_all: set[str] = set()

    for doc_lang in ["es", "en", "zh"]:
        cases, sw_map = load_queries_cases(MERGED_QUERIES, doc_lang, keep_ids)
        print_case_counts(cases, f"{doc_lang} (eligible)")
        for case_name, qmap in cases.items():
            if not qmap:
                continue
            # Keep only queries that have at least one GT
            qids_case = [qid for qid in qmap.keys() if (qid in qrels and len(qrels[qid]) > 0)]
            if not qids_case:
                continue
            # Merge into global structures
            cases_all.setdefault(case_name, {})
            sw_map_all.setdefault(case_name, {})
            for qid in qids_case:
                cases_all[case_name][qid] = qmap[qid]
                if case_name in sw_map and qid in sw_map[case_name]:
                    sw_map_all[case_name][qid] = sw_map[case_name][qid]
            qids_all.update(qids_case)

    if not qids_all:
        print("No queries with qrels found across all cases. Skipping model.")
        # Release the model before moving on to the next one.
        del model
        torch.cuda.empty_cache(); gc.collect()
        continue

    # From here on qids_all is a sorted list rather than a set.
    qids_all = sorted(qids_all)
    print(f"Total unique queries (with qrels) across all cases: {len(qids_all):,}")

    # ------------------------------------------------------------
    # 2) Build global GT-only whitelist from ALL qids, load docs in en/es/zh
    # ------------------------------------------------------------
    gt_whitelist_all = collect_gt_doc_ids_for_queries(qrels, qids_all)
    if not gt_whitelist_all:
        print("Global GT whitelist empty. Skipping model.")
        del model
        torch.cuda.empty_cache(); gc.collect()
        continue

    # One pass over the document files per language; IDs come back tagged as '<doc_id>|<lang>'.
    ids_en, txt_en = load_docs_for_lang("en", doc_id_whitelist=gt_whitelist_all, tag_ids_with_lang=True)
    ids_es, txt_es = load_docs_for_lang("es", doc_id_whitelist=gt_whitelist_all, tag_ids_with_lang=True)
    ids_zh, txt_zh = load_docs_for_lang("zh", doc_id_whitelist=gt_whitelist_all, tag_ids_with_lang=True)

    # doc_ids and doc_text stay position-aligned; index row i corresponds to doc_ids[i].
    doc_ids  = ids_en + ids_es + ids_zh
    doc_text = txt_en + txt_es + txt_zh
    print(f"[GLOBAL] Doc pool (GT-only, 3-langs): {len(doc_ids):,}")

    if not doc_ids:
        print("No documents loaded for global corpus. Skipping model.")
        del model
        torch.cuda.empty_cache(); gc.collect()
        continue

    # ------------------------------------------------------------
    # 3) Embed & index docs ONCE per model (global index)
    # ------------------------------------------------------------
    xb = embed_texts(model, doc_text, batch_size=BATCH_SIZE_DOCS, normalize=NORMALIZE_L2)
    # build_cuvs_index ignores its first argument and returns (index, True),
    # so gpu_res is just that boolean flag.
    index, gpu_res = build_cuvs_index(dim, xb)
    print(f"[GLOBAL] Index ready. Docs: {len(doc_ids)}, dim={dim}")

    # ------------------------------------------------------------
    # 4) Run retrieval for each case against the SAME global index
    # ------------------------------------------------------------
    for case_name, qmap in cases_all.items():
        if not qmap:
            print(f"[{case_name}] No queries. Skipping.")
            continue

        # Sorted so row order in the CSV is deterministic.
        qids_eval = sorted(qmap.keys())
        q_texts = [qmap[qid] for qid in qids_eval]

        # Embed queries and search
        qv = embed_texts(model, q_texts, batch_size=BATCH_SIZE_QUERIES, normalize=NORMALIZE_L2)
        D, I = index.search(qv, TOP_K)

        # NOTE(review): doc_ids does not change inside this loop, so this array is
        # rebuilt identically for every case — could be created once before the loop.
        doc_id_arr = np.array(doc_ids)
        # Assumes every index returned by the search is a valid row of doc_id_arr —
        # TODO confirm cuVS never returns a sentinel/out-of-range index here.
        retrieved_lists = [doc_id_arr[I[i]].tolist() for i in range(len(qids_eval))]

        # Expanded GT for the qids in THIS case (so hits on any language count)
        expanded_gt = expand_gt_to_all_langs(qrels, qids_eval, langs=ALL_LANGS)

        # Write CSV rows (no metrics)
        rows = []
        # Query language label = the part of the case name before '->' (whole name if there is no arrow).
        qlang = case_name.split("->")[0] if "->" in case_name else case_name

        for i, qid in enumerate(qids_eval):
            gt_tagged = sorted(list(expanded_gt.get(qid, set())))  # 'doc_id|lang'
            ret = [str(x) for x in retrieved_lists[i]]             # 'doc_id|lang'
            sw = sw_map_all.get(case_name, {}).get(qid)            # only for CS cases

            # optional: convenience column for language preference (top-1)
            ret_top1_lang = ret[0].split("|")[-1] if ret else None

            # List-valued columns are stored as JSON strings.
            rows.append({
                "query_id": qid,
                "query_language": qlang,
                "query": qmap[qid],
                "ground_truth_doc_ids": json.dumps(gt_tagged, ensure_ascii=False),
                "retrieved_doc_ids": json.dumps(ret, ensure_ascii=False),
                "sw_rate": sw,
                "ret_top1_lang": ret_top1_lang,
            })

        out_dir = model_out / "D_ALL_GT3L"  # single global corpus
        out_dir.mkdir(parents=True, exist_ok=True)

        safe_name = safe_case_name(case_name)
        out_csv = out_dir / f"Q_{safe_name}.csv"

        pd.DataFrame(rows).to_csv(out_csv, index=False, encoding="utf-8")
        print(f"[{case_name}] Saved {len(rows)} rows → {out_csv}")

    # cleanup per model
    # Drop references to the index, embeddings and model before clearing the CUDA cache.
    del index, xb, doc_ids, doc_text
    if gpu_res is not None:
        del gpu_res
    del model
    torch.cuda.empty_cache()
    gc.collect()

print("\nRETRIEVAL TEST RUN COMPLETE (GLOBAL corpus, no metrics).")



=== MODEL: jinaai/jina-embeddings-v3 ===


`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention impl

--- Query availability for doc_lang=es (eligible) ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000
--- Query availability for doc_lang=en (eligible) ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000
--- Query availability for doc_lang=zh (eligible) ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000
Total unique queries (with qrels) across all cases: 10,000
[GLOBAL] Doc pool (GT-only, 3-langs): 31,884


[454601][15:00:46:913049][info  ] optimizing graph           
[454601][15:00:46:973109][info  ] Graph optimized, creating index


[GLOBAL] Index ready. Docs: 31884, dim=1024


                                                             

[es] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_es.csv


                                                             

[en->es] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_en__to__es.csv


                                                             

[zh->es] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh__to__es.csv


                                                             

[es-en->es] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_es-en__to__es.csv


                                                             

[zh-en->es] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh-en__to__es.csv


                                                             

[zh-es->es] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh-es__to__es.csv


                                                             

[en] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_en.csv


                                                             

[es->en] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_es__to__en.csv


                                                             

[zh->en] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh__to__en.csv


                                                             

[es-en->en] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_es-en__to__en.csv


                                                             

[zh-en->en] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh-en__to__en.csv


                                                             

[zh-es->en] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh-es__to__en.csv


                                                             

[zh] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh.csv


                                                             

[en->zh] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_en__to__zh.csv


                                                             

[es->zh] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_es__to__zh.csv


                                                             

[es-en->zh] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_es-en__to__zh.csv


                                                             

[zh-en->zh] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh-en__to__zh.csv


                                                             

[zh-es->zh] Saved 10000 rows → runs_cuvs_test/jinaai__jina-embeddings-v3/D_ALL_GT3L/Q_zh-es__to__zh.csv

=== MODEL: intfloat/multilingual-e5-large ===
--- Query availability for doc_lang=es (eligible) ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000
--- Query availability for doc_lang=en (eligible) ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000
--- Query availability for doc_lang=zh (eligible) ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000
Total unique queries (with qrels) across all cases: 10,000
[GLOBAL] Doc pool (GT-only, 3-langs): 31,884


[454601][15:04:30:209842][info  ] optimizing graph           


[GLOBAL] Index ready. Docs: 31884, dim=1024


[454601][15:04:30:277203][info  ] Graph optimized, creating index
                                                             

[es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_es.csv


                                                             

[en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_en__to__es.csv


                                                             

[zh->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh__to__es.csv


                                                             

[es-en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_es-en__to__es.csv


                                                             

[zh-en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh-en__to__es.csv


                                                             

[zh-es->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh-es__to__es.csv


                                                             

[en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_en.csv


                                                             

[es->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_es__to__en.csv


                                                             

[zh->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh__to__en.csv


                                                             

[es-en->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_es-en__to__en.csv


                                                             

[zh-en->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh-en__to__en.csv


                                                             

[zh-es->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh-es__to__en.csv


                                                             

[zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh.csv


                                                             

[en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_en__to__zh.csv


                                                             

[es->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_es__to__zh.csv


                                                             

[es-en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_es-en__to__zh.csv


                                                             

[zh-en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh-en__to__zh.csv


                                                             

[zh-es->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-large/D_ALL_GT3L/Q_zh-es__to__zh.csv

=== MODEL: intfloat/multilingual-e5-small ===
--- Query availability for doc_lang=es (eligible) ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000
--- Query availability for doc_lang=en (eligible) ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000
--- Query availability for doc_lang=zh (eligible) ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000
Total unique queries (with qrels) across all cases: 10,000
[GLOBAL] Doc pool (GT-only, 3-langs): 31,884


[454601][15:06:26:099387][info  ] optimizing graph           
[454601][15:06:26:165533][info  ] Graph optimized, creating index


[GLOBAL] Index ready. Docs: 31884, dim=384


                                                             

[es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_es.csv


                                                             

[en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_en__to__es.csv


                                                             

[zh->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh__to__es.csv


                                                             

[es-en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_es-en__to__es.csv


                                                             

[zh-en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh-en__to__es.csv


                                                             

[zh-es->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh-es__to__es.csv


                                                             

[en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_en.csv


                                                             

[es->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_es__to__en.csv


                                                             

[zh->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh__to__en.csv


                                                             

[es-en->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_es-en__to__en.csv


                                                             

[zh-en->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh-en__to__en.csv


                                                             

[zh-es->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh-es__to__en.csv


                                                             

[zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh.csv


                                                             

[en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_en__to__zh.csv


                                                             

[es->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_es__to__zh.csv


                                                             

[es-en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_es-en__to__zh.csv


                                                             

[zh-en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh-en__to__zh.csv


                                                             

[zh-es->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-small/D_ALL_GT3L/Q_zh-es__to__zh.csv

=== MODEL: upskyy/bge-m3-korean ===
--- Query availability for doc_lang=es (eligible) ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000
--- Query availability for doc_lang=en (eligible) ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000
--- Query availability for doc_lang=zh (eligible) ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000
Total unique queries (with qrels) across all cases: 10,000
[GLOBAL] Doc pool (GT-only, 3-langs): 31,884


[454601][15:09:03:565133][info  ] optimizing graph           
[454601][15:09:03:627500][info  ] Graph optimized, creating index


[GLOBAL] Index ready. Docs: 31884, dim=1024


                                                             

[es] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_es.csv


                                                             

[en->es] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_en__to__es.csv


                                                             

[zh->es] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh__to__es.csv


                                                             

[es-en->es] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_es-en__to__es.csv


                                                             

[zh-en->es] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh-en__to__es.csv


                                                             

[zh-es->es] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh-es__to__es.csv


                                                             

[en] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_en.csv


                                                             

[es->en] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_es__to__en.csv


                                                             

[zh->en] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh__to__en.csv


                                                             

[es-en->en] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_es-en__to__en.csv


                                                             

[zh-en->en] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh-en__to__en.csv


                                                             

[zh-es->en] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh-es__to__en.csv


                                                             

[zh] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh.csv


                                                             

[en->zh] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_en__to__zh.csv


                                                             

[es->zh] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_es__to__zh.csv


                                                             

[es-en->zh] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_es-en__to__zh.csv


                                                             

[zh-en->zh] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh-en__to__zh.csv


                                                             

[zh-es->zh] Saved 10000 rows → runs_cuvs_test/upskyy__bge-m3-korean/D_ALL_GT3L/Q_zh-es__to__zh.csv

=== MODEL: intfloat/multilingual-e5-base ===
--- Query availability for doc_lang=es (eligible) ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000
--- Query availability for doc_lang=en (eligible) ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000
--- Query availability for doc_lang=zh (eligible) ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000
Total unique queries (with qrels) across all cases: 10,000
[GLOBAL] Doc pool (GT-only, 3-langs): 31,884


[454601][15:11:18:284380][info  ] optimizing graph           
[454601][15:11:18:350493][info  ] Graph optimized, creating index


[GLOBAL] Index ready. Docs: 31884, dim=768


                                                             

[es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_es.csv


                                                             

[en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_en__to__es.csv


                                                             

[zh->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh__to__es.csv


                                                             

[es-en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_es-en__to__es.csv


                                                             

[zh-en->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh-en__to__es.csv


                                                             

[zh-es->es] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh-es__to__es.csv


                                                             

[en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_en.csv


                                                             

[es->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_es__to__en.csv


                                                             

[zh->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh__to__en.csv


                                                             

[es-en->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_es-en__to__en.csv


                                                             

[zh-en->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh-en__to__en.csv


                                                             

[zh-es->en] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh-es__to__en.csv


                                                             

[zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh.csv


                                                             

[en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_en__to__zh.csv


                                                             

[es->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_es__to__zh.csv


                                                             

[es-en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_es-en__to__zh.csv


                                                             

[zh-en->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh-en__to__zh.csv


                                                             

[zh-es->zh] Saved 10000 rows → runs_cuvs_test/intfloat__multilingual-e5-base/D_ALL_GT3L/Q_zh-es__to__zh.csv

=== MODEL: Alibaba-NLP/gte-multilingual-base ===


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


--- Query availability for doc_lang=es (eligible) ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000
--- Query availability for doc_lang=en (eligible) ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000
--- Query availability for doc_lang=zh (eligible) ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000
Total unique queries (with qrels) across all cases: 10,000
[GLOBAL] Doc pool (GT-only, 3-langs): 31,884


[454601][15:13:07:351093][info  ] optimizing graph           
[454601][15:13:07:418436][info  ] Graph optimized, creating index


[GLOBAL] Index ready. Docs: 31884, dim=768


                                                             

[es] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_es.csv


                                                             

[en->es] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_en__to__es.csv


                                                             

[zh->es] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh__to__es.csv


                                                             

[es-en->es] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_es-en__to__es.csv


                                                             

[zh-en->es] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh-en__to__es.csv


                                                             

[zh-es->es] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh-es__to__es.csv


                                                             

[en] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_en.csv


                                                             

[es->en] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_es__to__en.csv


                                                             

[zh->en] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh__to__en.csv


                                                             

[es-en->en] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_es-en__to__en.csv


                                                             

[zh-en->en] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh-en__to__en.csv


                                                             

[zh-es->en] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh-es__to__en.csv


                                                             

[zh] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh.csv


                                                             

[en->zh] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_en__to__zh.csv


                                                             

[es->zh] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_es__to__zh.csv


                                                             

[es-en->zh] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_es-en__to__zh.csv


                                                             

[zh-en->zh] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh-en__to__zh.csv


                                                             

[zh-es->zh] Saved 10000 rows → runs_cuvs_test/Alibaba-NLP__gte-multilingual-base/D_ALL_GT3L/Q_zh-es__to__zh.csv

RETRIEVAL TEST RUN COMPLETE (GLOBAL corpus, no metrics).


# Run metric computation

In [15]:
# 01_compute_metrics.py
# pip install pandas numpy tqdm

import json, math
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

In [16]:
# Root folder scanned by main(): one sub-directory per model, each holding
# D_<doc_lang>/Q_*.csv retrieval result files. The Excel summary is written here too.
RUNS = Path("runs_cuvs_test")
# Rank cutoffs K; every metric is computed once per cutoff (P@K, R@K, MAP@K, ...).
TOPK_LIST = [1, 3, 5, 10]
# Query-language tags that trigger the per-bin (sw_rate) breakdown in main().
CS_LANGS = {"es-en", "zh-en", "zh-es"}
# sw_rate buckets, treated as half-open [lo, hi) by main(); a value of exactly 1.0
# is additionally accepted into the last bucket.
CS_BINS = [(0.0,0.2),(0.2,0.4),(0.4,0.6),(0.6,0.8),(0.8,1.0)]

In [17]:
def bin_label(lo, hi):
    """Render a (lo, hi) bin as text with one decimal per edge, e.g. ``0.2-0.4``."""
    lower_edge = f"{lo:.1f}"
    upper_edge = f"{hi:.1f}"
    return f"{lower_edge}-{upper_edge}"

# ===== Metrics =====
def metrics_for_row(gt_ids, ret_ids, K):
    """Score one ranked list against its ground truth at cutoff ``K``.

    Relevance is binary: a retrieved id is a hit when it appears in ``gt_ids``.

    Args:
        gt_ids: iterable of relevant document ids (duplicates are collapsed).
        ret_ids: ranked sequence of retrieved document ids, best first.
        K: rank cutoff; only ``ret_ids[:K]`` is scored.

    Returns:
        Tuple ``(P, R, AP, RR, nDCG)``:
        precision (hits / max(K, 1)), recall (hits / number of relevant ids),
        average precision (normalised by the number of relevant ids),
        reciprocal rank of the first hit, and nDCG with binary gains.
        Recall, AP and nDCG are 0.0 when there are no relevant ids.
    """
    relevant = set(gt_ids)
    n_relevant = len(relevant)
    flags = [int(doc in relevant) for doc in ret_ids[:K]]
    n_hits = sum(flags)

    precision = n_hits / max(K, 1)
    recall = n_hits / n_relevant if n_relevant > 0 else 0.0

    # One pass over the ranks collects both the AP numerator and the first-hit rank.
    precision_total = 0.0
    hits_so_far = 0
    first_hit_rank = None
    for rank, flag in enumerate(flags, 1):
        if not flag:
            continue
        hits_so_far += 1
        precision_total += hits_so_far / rank
        if first_hit_rank is None:
            first_hit_rank = rank

    avg_precision = (precision_total / n_relevant) if n_relevant > 0 else 0.0
    reciprocal_rank = 0.0 if first_hit_rank is None else 1.0 / first_hit_rank

    # nDCG with binary gains: the ideal ranking puts min(n_relevant, K) hits on top.
    dcg = sum((1.0 / math.log2(rank + 1)) for rank, flag in enumerate(flags, 1) if flag)
    ideal_depth = min(n_relevant, K)
    idcg = sum((1.0 / math.log2(rank + 1)) for rank in range(1, ideal_depth + 1))
    ndcg = (dcg / idcg) if idcg > 0 else 0.0

    return precision, recall, avg_precision, reciprocal_rank, ndcg

# ===== Helpers =====
def parse_json_list(x):
    """Coerce a CSV cell into a list of string ids.

    Accepted inputs:
      * a list            -> every element converted with ``str``
      * a JSON list text  -> decoded, every element converted with ``str``
      * any other text    -> split on commas, blank pieces dropped
      * blank text or a non-string, non-list value (e.g. NaN/None) -> ``[]``
    """
    if isinstance(x, list):
        return [str(item) for item in x]
    if not isinstance(x, str):
        return []

    text = x.strip()
    if not text:
        return []

    try:
        decoded = json.loads(text)
    except Exception:
        decoded = None  # not valid JSON: handled by the comma-split fallback below

    if isinstance(decoded, list):
        return [str(item) for item in decoded]

    # Fallback for invalid JSON and for JSON values that are not lists.
    pieces = (piece.strip() for piece in text.split(","))
    return [piece for piece in pieces if piece]

def load_rows_from_df(df: pd.DataFrame):
    """Turn a retrieval-results frame into ``(qid, gt_ids, ret_ids, sw_rate)`` tuples.

    * ``query_id`` is stringified; when the column is missing, synthetic ids
      ``row_0``, ``row_1``, ... are generated instead.
    * ``ground_truth_doc_ids`` and ``retrieved_doc_ids`` are required and are parsed
      cell by cell with ``parse_json_list``.
    * ``sw_rate`` is optional; values are coerced to numbers (unparseable -> NaN),
      and a missing column yields NaN for every row.

    Raises:
        ValueError: if either of the two required id-list columns is absent.
    """
    n_rows = len(df)
    columns = df.columns

    if "query_id" in columns:
        query_ids = df["query_id"].astype(str).tolist()
    else:
        query_ids = [f"row_{i}" for i in range(n_rows)]

    required = ("ground_truth_doc_ids", "retrieved_doc_ids")
    if any(name not in columns for name in required):
        raise ValueError("CSV must contain 'ground_truth_doc_ids' and 'retrieved_doc_ids' columns.")
    truth_lists = [parse_json_list(cell) for cell in df["ground_truth_doc_ids"].tolist()]
    retrieved_lists = [parse_json_list(cell) for cell in df["retrieved_doc_ids"].tolist()]

    if "sw_rate" in columns:
        switch_rates = pd.to_numeric(df["sw_rate"], errors="coerce").tolist()
    else:
        switch_rates = [np.nan] * n_rows

    return list(zip(query_ids, truth_lists, retrieved_lists, switch_rates))

def detect_query_lang(qcsv_path: Path, df: pd.DataFrame) -> str:
    """Work out the query-language tag for one results CSV.

    The CSV's ``query_language`` column is preferred: its single value, or the
    most frequent one when several occur. When the column is missing *or holds
    no non-null value*, the tag is derived from the file name instead.

    File names follow ``Q_<lang>.csv`` or ``Q_<lang>__to__<target>.csv``; only the
    ``<lang>`` part is returned, so ``Q_es-en__to__en.csv`` yields ``es-en``.

    Args:
        qcsv_path: path of the CSV the frame was read from.
        df: the loaded frame.

    Returns:
        The language tag with surrounding whitespace removed.
    """
    if "query_language" in df.columns:
        vals = df["query_language"].dropna().astype(str)
        # An all-NaN column leaves nothing to pick from; .mode().iat[0] would raise
        # IndexError, so fall through to the filename in that case.
        if not vals.empty:
            if len(vals.unique()) == 1:
                ql = vals.iloc[0]
            else:
                ql = vals.mode().iat[0]
            return str(ql).strip()

    # Filename fallback: drop the leading "Q_" and any "__to__<target>" suffix.
    stem = qcsv_path.stem
    tail = stem.split("_", 1)[-1]  # [-1] keeps a stem without "_" from raising
    return tail.split("__to__", 1)[0].strip()

In [18]:
# ===== Main =====
def _mean_metrics(pairs):
    """Average P/R/MAP/MRR/nDCG at every cutoff in TOPK_LIST over (gt_ids, ret_ids) pairs."""
    sums = {f"{m}@{k}": 0.0 for m in ["P", "R", "MAP", "MRR", "nDCG"] for k in TOPK_LIST}
    for gt, ret in pairs:
        for K in TOPK_LIST:
            P, R, AP, RR, nD = metrics_for_row(gt, ret, K)
            sums[f"P@{K}"] += P
            sums[f"R@{K}"] += R
            sums[f"MAP@{K}"] += AP
            sums[f"MRR@{K}"] += RR
            sums[f"nDCG@{K}"] += nD
    count = len(pairs)
    return {k: v / count for k, v in sums.items()}


def _sw_bin_label(sw):
    """Return the CS_BINS label containing ``sw``, or None if it is missing or out of range."""
    if sw is None or (isinstance(sw, float) and math.isnan(sw)):
        return None
    try:
        swv = float(sw)
    except (TypeError, ValueError):
        return None
    for lo, hi in CS_BINS:
        # Bins are half-open [lo, hi); an exact 1.0 is accepted into the last bin.
        if (lo <= swv < hi) or (math.isclose(swv, 1.0) and math.isclose(hi, 1.0)):
            return bin_label(lo, hi)
    return None


def _unique_sheet_name(base, used, max_len=31):
    """Clip ``base`` to Excel's 31-character sheet-name limit and make it unique.

    ``used`` is a set of lower-cased names already taken; it is updated in place.
    The first caller keeps the plain clipped name, later collisions get ``_2``, ``_3``, ...
    """
    name = base[:max_len]
    counter = 2
    while name.lower() in used:
        suffix = f"_{counter}"
        name = base[:max_len - len(suffix)] + suffix
        counter += 1
    used.add(name.lower())
    return name


def main():
    """Compute retrieval metrics for every result CSV under RUNS and save them to Excel.

    Layout expected on disk: ``RUNS/<model>/D_<doc_lang>/Q_*.csv``.

    For each CSV the mean P/R/MAP/MRR/nDCG at every cutoff in TOPK_LIST is added
    to the "All Results" sheet. When the query language is in CS_LANGS the queries
    are additionally bucketed by ``sw_rate`` (CS_BINS) and the per-bucket means go
    to the "CodeSwitch Curves" sheet. Every output row also records ``query_file``,
    because several CSVs of one model share the same (doc_lang, query_lang) pair
    (e.g. ``Q_es-en__to__en.csv`` / ``Q_es-en__to__es.csv``) and would otherwise be
    indistinguishable.

    Files that cannot be read, parsed, or assigned a language are reported and
    skipped; they do not abort the pass.

    Output: ``RUNS/summaries_mix.xlsx`` plus a short console summary.
    """
    print("=== Starting IR evaluation summary pass ===")
    rows_sum = []
    rows_cs = []
    total_runs = 0

    # Filter before handing the list to tqdm so the bar counts models only
    # (RUNS also contains the previously written summary files).
    model_dirs = [p for p in sorted(RUNS.glob("*")) if p.is_dir()]
    total_models = len(model_dirs)

    for model_dir in tqdm(model_dirs, desc="models"):
        model_slug = model_dir.name
        print(f"\n--- Processing model: {model_slug} ---")

        for ddir in sorted(model_dir.glob("D_*")):
            if not ddir.is_dir():
                continue
            doc_lang = ddir.name.split("_", 1)[1]
            print(f"  [Doc lang: {doc_lang}]")

            for qcsv in sorted(ddir.glob("Q_*.csv")):
                # Read once, reuse
                try:
                    df_tmp = pd.read_csv(qcsv)
                except Exception as e:
                    print(f"    !! Failed to read {qcsv.name}: {e}")
                    continue

                try:
                    query_lang = detect_query_lang(qcsv, df_tmp)
                except Exception as e:
                    print(f"    !! Could not determine query language for {qcsv.name}: {e}")
                    continue
                norm_lang = query_lang.replace("_", "-").strip()

                print(f"    > Query file: {qcsv.name} | Query lang: {query_lang} -> normalized: {norm_lang}")

                try:
                    data = load_rows_from_df(df_tmp)
                except Exception as e:
                    print(f"      !! Skipping {qcsv.name}: {e}")
                    continue

                if not data:
                    print(f"      !! Skipping: no rows found in {qcsv.name}")
                    continue

                N = len(data)
                print(f"      Loaded {N} rows")

                # aggregate metrics over all rows
                summary = {
                    "model": model_slug,
                    "doc_lang": doc_lang,
                    "query_lang": norm_lang,
                    "N": N,
                }
                summary.update(_mean_metrics([(gt, ret) for _, gt, ret, _ in data]))
                # Appended last so the position of the pre-existing columns is unchanged.
                summary["query_file"] = qcsv.name
                rows_sum.append(summary)
                total_runs += 1

                # === Code-switched bucket analysis ===
                if norm_lang not in CS_LANGS:
                    continue

                print(f"      [CS bucket analysis enabled for {norm_lang}]")
                bins = {bin_label(lo, hi): [] for lo, hi in CS_BINS}
                for _, gt, ret, sw in data:
                    label = _sw_bin_label(sw)
                    if label is not None:
                        bins[label].append((gt, ret))

                for b, pairs in bins.items():
                    if not pairs:
                        # Still log empty bins for visibility
                        print(f"        - Bin {b}: 0 queries")
                        continue

                    B = len(pairs)
                    print(f"        - Bin {b}: {B} queries")

                    row = {
                        "model": model_slug,
                        "doc_lang": doc_lang,
                        "query_lang": norm_lang,
                        "sw_bin": b,
                        "N": B,
                    }
                    row.update(_mean_metrics(pairs))
                    row["query_file"] = qcsv.name
                    rows_cs.append(row)

    # === Write results to Excel ===
    out_dir = RUNS
    out_dir.mkdir(parents=True, exist_ok=True)
    excel_path = out_dir / "summaries_mix.xlsx"

    # Names of the fixed sheets are reserved so no per-model sheet can shadow them.
    used_sheet_names = {"all results", "es queries", "zh queries", "codeswitch curves"}

    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
        df_all = pd.DataFrame(rows_sum)
        if not df_all.empty:
            # All Results
            df_all.to_excel(writer, sheet_name="All Results", index=False)

            # ES-related queries (plain 'es' and CS 'es-en')
            df_es = df_all[df_all["query_lang"].isin(["es", "es-en"])]
            df_es.to_excel(writer, sheet_name="ES Queries", index=False)

            # ZH-related queries (plain 'zh' and CS 'zh-en', 'zh-es')
            df_zh = df_all[df_all["query_lang"].isin(["zh", "zh-en", "zh-es"])]
            df_zh.to_excel(writer, sheet_name="ZH Queries", index=False)

            # Per-model sheets. Slugs are clipped to 25 characters, so models sharing
            # a long prefix (e.g. "intfloat__multilingual-e5-base/-large/-small")
            # would get the same sheet name and overwrite each other's rows;
            # _unique_sheet_name gives each of them its own sheet.
            for model in sorted(df_all["model"].dropna().unique()):
                df_model = df_all[df_all["model"] == model]
                sheet = _unique_sheet_name(f"Model_{str(model)[:25]}", used_sheet_names)
                df_model.to_excel(writer, sheet_name=sheet, index=False)
        else:
            # still create empty sheets
            pd.DataFrame().to_excel(writer, sheet_name="All Results", index=False)
            pd.DataFrame().to_excel(writer, sheet_name="ES Queries", index=False)
            pd.DataFrame().to_excel(writer, sheet_name="ZH Queries", index=False)

        # CodeSwitch Curves sheet (always write, even if empty)
        df_cs = pd.DataFrame(rows_cs)
        df_cs.to_excel(writer, sheet_name="CodeSwitch Curves", index=False)

    # Console summary
    print(f"\n=== Excel saved to: {excel_path} ===")
    print(f"Total models: {total_models} | Total runs: {total_runs}")
    print(f"Per-bin rows: {len(rows_cs)}")
    if len(rows_cs) > 0:
        try:
            # Note: N is summed over every query file sharing the same key.
            print(
                df_cs.groupby(["model","doc_lang","query_lang","sw_bin"])["N"]
                .sum()
                .sort_values(ascending=False)
                .head(10)
            )
        except Exception as e:
            # Best-effort printout: report the problem instead of hiding it.
            print(f"(could not print per-bin counts: {e})")

if __name__ == "__main__":
    main()


=== Starting IR evaluation summary pass ===


models:   0%|          | 0/7 [00:00<?, ?it/s]


--- Processing model: Alibaba-NLP__gte-multilingual-base ---
  [Doc lang: ALL_GT3L]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__es.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__zh.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es-en__to__es.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 5

models:  14%|█▍        | 1/7 [00:10<01:00, 10.12s/it]

    > Query file: Q_zh.csv | Query lang: zh -> normalized: zh
      Loaded 10000 rows

--- Processing model: intfloat__multilingual-e5-base ---
  [Doc lang: ALL_GT3L]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__es.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__zh.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es-en__to__es.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin

models:  29%|██▊       | 2/7 [00:19<00:48,  9.76s/it]


--- Processing model: intfloat__multilingual-e5-large ---
  [Doc lang: ALL_GT3L]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__es.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__zh.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es-en__to__es.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 

models:  43%|████▎     | 3/7 [00:29<00:38,  9.69s/it]


--- Processing model: intfloat__multilingual-e5-small ---
  [Doc lang: ALL_GT3L]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__es.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__zh.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es-en__to__es.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 

models:  57%|█████▋    | 4/7 [00:39<00:29,  9.74s/it]

    > Query file: Q_zh.csv | Query lang: zh -> normalized: zh
      Loaded 10000 rows

--- Processing model: jinaai__jina-embeddings-v3 ---
  [Doc lang: ALL_GT3L]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__es.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__zh.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es-en__to__es.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4

models:  71%|███████▏  | 5/7 [00:48<00:19,  9.71s/it]

        - Bin 0.6-0.8: 994 queries
        - Bin 0.8-1.0: 1059 queries
    > Query file: Q_zh.csv | Query lang: zh -> normalized: zh
      Loaded 10000 rows

--- Processing model: upskyy__bge-m3-korean ---
  [Doc lang: ALL_GT3L]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__es.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_en__to__zh.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es-en__to__es.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 

models: 100%|██████████| 7/7 [00:58<00:00,  8.35s/it]

        - Bin 0.4-0.6: 1729 queries
        - Bin 0.6-0.8: 994 queries
        - Bin 0.8-1.0: 1059 queries
    > Query file: Q_zh.csv | Query lang: zh -> normalized: zh
      Loaded 10000 rows






=== Excel saved to: runs_cuvs_test/summaries_mix.xlsx ===
Total models: 6 | Total runs: 216
Per-bin rows: 540
model                               doc_lang  query_lang  sw_bin 
Alibaba-NLP__gte-multilingual-base  ALL_GT3L  es-en       0.0-0.2    13125
upskyy__bge-m3-korean               ALL_GT3L  es-en       0.0-0.2    13125
intfloat__multilingual-e5-base      ALL_GT3L  es-en       0.0-0.2    13125
jinaai__jina-embeddings-v3          ALL_GT3L  es-en       0.0-0.2    13125
intfloat__multilingual-e5-small     ALL_GT3L  es-en       0.0-0.2    13125
intfloat__multilingual-e5-large     ALL_GT3L  es-en       0.0-0.2    13125
jinaai__jina-embeddings-v3          ALL_GT3L  zh-en       0.0-0.2    11751
intfloat__multilingual-e5-small     ALL_GT3L  zh-en       0.0-0.2    11751
intfloat__multilingual-e5-base      ALL_GT3L  zh-en       0.0-0.2    11751
Alibaba-NLP__gte-multilingual-base  ALL_GT3L  zh-en       0.0-0.2    11751
Name: N, dtype: int64


# Lang Eval

In [2]:
#!/usr/bin/env python3
import json, math
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from tqdm.auto import tqdm  # notebook/terminal-friendly

# ========= Config =========
# Root directory that holds one sub-directory of retrieval results per model.
RUNS = Path("runs_cuvs_test")

# Cut-offs K at which the top-K statistics are computed.
TOPK_LIST = [1, 3, 5, 10]

# (low, high) bounds used by sw_bin_of() to bucket a query's `sw_rate`.
CS_BINS = [
    (0.0, 0.2),
    (0.2, 0.4),
    (0.4, 0.6),
    (0.6, 0.8),
    (0.8, 1.0),
]

# Document languages expected in the suffix of a 'doc_id|lang' identifier.
LANGS = ("en", "es", "zh")

# ----- Outputs: language preference -----
OUT_XLSX           = RUNS.joinpath("lang_pref.xlsx")
# per model × qlang × (bin|mono) × K
OUT_CSV_PREF       = RUNS.joinpath("lang_pref.csv")
# per model × qlang × K (avg across CS bins; mono included)
OUT_CSV_MODEL      = RUNS.joinpath("lang_pref_by_model_avg.csv")
# across models × qlang × (bin|mono) × K
OUT_CSV_BIN        = RUNS.joinpath("lang_pref_by_bin_across_models.csv")
# per model × qlang × qkind × K
OUT_CSV_KIND_MODEL = RUNS.joinpath("lang_pref_by_model_kind.csv")
# across models × qlang × qkind × K
OUT_CSV_KIND_ALL   = RUNS.joinpath("lang_pref_by_kind_all_models.csv")

# ----- Outputs: ground-truth 3-language statistics (same breakdowns) -----
# per model × qlang × (bin|mono) × K
OUT_CSV_GT            = RUNS.joinpath("gt3lang_stats.csv")
# per model × qlang × K (avg across CS bins; mono included)
OUT_CSV_GT_MODEL      = RUNS.joinpath("gt3lang_by_model_avg.csv")
# across models × qlang × (bin|mono) × K
OUT_CSV_GT_BIN        = RUNS.joinpath("gt3lang_by_bin_across_models.csv")
# per model × qlang × qkind × K
OUT_CSV_GT_KIND_MODEL = RUNS.joinpath("gt3lang_by_model_kind.csv")
# across models × qlang × qkind × K
OUT_CSV_GT_KIND_ALL   = RUNS.joinpath("gt3lang_by_kind_all_models.csv")

# ---------- helpers ----------
def bin_label(lo, hi):
    """Render a bin's bounds as 'lo-hi', each with one decimal (e.g. '0.2-0.4')."""
    return "{:.1f}-{:.1f}".format(lo, hi)

def parse_json_list(x):
    """Coerce a CSV cell into a list of items.

    Accepted inputs:
      * a ``list``              -> returned unchanged;
      * a JSON array string     -> the decoded list;
      * a JSON string scalar    -> a one-element list with the decoded string;
      * a bare finite number    -> a one-element list with the original text
                                   (consistent with the comma-split fallback,
                                   which also yields strings);
      * any other non-JSON text -> split on commas, blanks dropped.

    Everything else (``None``, NaN floats, empty strings, JSON ``null``,
    booleans, objects, non-finite numbers) yields an empty list.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []

    s = x.strip()
    if not s:
        return []

    try:
        v = json.loads(s)
    except Exception:
        # Not JSON at all: treat the cell as a comma-separated list.
        return [t.strip() for t in s.split(",") if t.strip()]

    if isinstance(v, list):
        return v
    if isinstance(v, str):
        # A single quoted id, e.g. '"doc7|en"'.
        v = v.strip()
        return [v] if v else []
    if isinstance(v, (int, float)) and not isinstance(v, bool) and math.isfinite(v):
        # A single bare numeric id such as '123' is valid JSON; previously it
        # was dropped, while a non-numeric bare id was kept by the fallback.
        return [s]
    return []

def parse_base_lang(docid: str):
    """Split a 'base|lang' identifier at its last '|'.

    Returns ``(base, lang)`` with surrounding whitespace removed and ``lang``
    lower-cased.  When the identifier contains no '|', ``lang`` is ``None``.
    The input is converted with ``str()`` first, so non-string ids are accepted.
    """
    text = str(docid)
    head, sep, tail = text.rpartition("|")
    if not sep:
        # No separator present: the whole text is the base id.
        return text.strip(), None
    return head.strip(), tail.strip().lower()

def lang_from_docid(docid: str) -> str:
    """Return the language suffix of a 'doc_id|lang' id, or 'unk'.

    'unk' is returned both when the id has no '|lang' suffix and when the
    suffix is not one of ``LANGS``.
    """
    lang = parse_base_lang(docid)[1]
    if lang in LANGS:
        return lang
    return "unk"

def sw_bin_of(v: float):
    """Map a rate value to the label of the ``CS_BINS`` bin that contains it.

    Bins are half-open ``[lo, hi)``; a value approximately equal to 1.0 is
    placed in the bin whose upper bound is approximately 1.0.  Returns ``None``
    for ``None``, NaN, values that cannot be converted to ``float``, and values
    that fall in no bin.
    """
    if v is None:
        return None
    if isinstance(v, float) and math.isnan(v):
        return None
    try:
        rate = float(v)
    except Exception:
        return None

    at_top_edge = math.isclose(rate, 1.0)
    labels = (
        bin_label(low, high)
        for low, high in CS_BINS
        if (low <= rate < high) or (at_top_edge and math.isclose(high, 1.0))
    )
    # First matching bin wins; None when nothing matches.
    return next(labels, None)

def collect_csv_files(model_dir: Path) -> list[Path]:
    """List the ``Q_*.csv`` result files of one model directory.

    If ``model_dir/D_ALL_GT3L`` is a directory, only its ``Q_*.csv`` files are
    returned (sorted).  Otherwise the ``Q_*.csv`` files of every ``D_*``
    sub-directory are returned, directories in sorted order and files sorted
    within each directory.
    """
    global_dir = model_dir / "D_ALL_GT3L"
    if global_dir.is_dir():  # False for a missing path as well
        return sorted(global_dir.glob("Q_*.csv"))

    return [
        csv_file
        for corpus_dir in sorted(model_dir.glob("D_*"))
        if corpus_dir.is_dir()
        for csv_file in sorted(corpus_dir.glob("Q_*.csv"))
    ]

def query_kind(qlang: str) -> str:
    """Classify a query language tag as 'mono' or 'cs'.

    The tag is stripped, lower-cased and has '_' replaced by '-' before the
    check.  'en', 'es' and 'zh' are 'mono'; every other value, including an
    empty or ``None`` tag, is 'cs'.
    """
    normalized = (qlang or "").strip().lower().replace("_", "-")
    if normalized in ("en", "es", "zh"):
        return "mono"
    return "cs"

# ---------- accumulators ----------
def _new_langpref_accum():
    """Create a zeroed accumulator for language-preference counts.

    ``N`` counts accumulated queries, ``top1_*_count`` counts the language of
    the first retrieved document, and ``topK_*_count_sum`` sums per-language
    document counts within the top K.
    """
    return dict(
        N=0,
        top1_en_count=0,
        top1_es_count=0,
        top1_zh_count=0,
        top1_other_count=0,
        topK_en_count_sum=0,
        topK_es_count_sum=0,
        topK_zh_count_sum=0,
        topK_other_count_sum=0,
    )

def _new_gt_accum():
    """Create a zeroed accumulator for the ground-truth language statistics.

    ``N`` counts accumulated queries, ``all3_count`` counts queries for which
    all 3 GT languages appear within top-K, and ``gt_top{1,2,3}_{lang}`` count
    which language supplied the 1st / 2nd / 3rd ground-truth hit.
    """
    counter_names = (
        "N",
        "all3_count",
        "gt_top1_en", "gt_top1_es", "gt_top1_zh", "gt_top1_other",
        "gt_top2_en", "gt_top2_es", "gt_top2_zh", "gt_top2_other",
        "gt_top3_en", "gt_top3_es", "gt_top3_zh", "gt_top3_other",
    )
    return dict.fromkeys(counter_names, 0)

def accumulate_langpref(groups, key, K, top1_lang, topK_langs):
    """Add one query's retrieved-language counts to ``groups[(key, K)]``.

    Args:
        groups: mapping from ``(key, K)`` to an accumulator dict shaped like the
            one built by ``_new_langpref_accum``.
        key: grouping key (tuple) the query belongs to.
        K: the top-K cut-off this update is recorded under.
        top1_lang: language of the first retrieved document.
        topK_langs: languages of the retrieved documents already cut to top K.

    Languages outside ``LANGS`` are counted in the ``*_other_*`` counters.
    """
    acc = groups[(key, K)]
    acc["N"] += 1

    top1_bucket = top1_lang if top1_lang in LANGS else "other"
    acc[f"top1_{top1_bucket}_count"] += 1

    tally = Counter(topK_langs)
    known_total = 0
    for lang in LANGS:
        hits = tally.get(lang, 0)
        acc[f"topK_{lang}_count_sum"] += hits
        known_total += hits
    # Whatever is left after the known languages belongs to "other".
    acc["topK_other_count_sum"] += sum(tally.values()) - known_total

def accumulate_gt(groups_gt, key, K, gt_bases_set, ret_ids):
    """Add one query's ground-truth (GT) language statistics to ``groups_gt[(key, K)]``.

    Args:
        groups_gt: mapping from ``(key, K)`` to an accumulator dict shaped like
            the one built by ``_new_gt_accum``.
        key: grouping key (tuple) the query belongs to.
        K: only the first ``K`` entries of ``ret_ids`` are inspected.
        gt_bases_set: base ids (language suffix stripped) of the GT documents.
        ret_ids: retrieved document ids in rank order, formatted 'base|lang'.

    For every language in ``LANGS`` the rank of its first GT hit within the top
    K is recorded.  ``all3_count`` is incremented when every language has a hit;
    ``gt_top{n}_{lang}`` is incremented for the language holding the n-th
    earliest hit (n = 1..3).  Retrieved ids whose language is not in ``LANGS``
    are skipped, so the ``gt_top*_other`` counters are never incremented here.
    """
    g = groups_gt[(key, K)]
    g["N"] += 1

    # First (1-based) rank at which each language contributes a GT document.
    first_pos = {}
    for rank, rid in enumerate(ret_ids[:K], start=1):
        base, lg = parse_base_lang(rid)
        if lg not in LANGS or lg in first_pos:
            continue
        if base in gt_bases_set:
            first_pos[lg] = rank

    if all(lg in first_pos for lg in LANGS):
        g["all3_count"] += 1

    # Languages ordered by their first GT hit.  Every key of first_pos is a
    # member of LANGS, so a plain increment is enough; the previous conditional
    # expression had an unreachable else-arm that would have stored None in the
    # counter.
    order_langs = sorted(first_pos, key=first_pos.get)
    for pos, lg in enumerate(order_langs[:3], start=1):
        counter = f"gt_top{pos}_{lg}"
        g[counter] = g.get(counter, 0) + 1

def finalize_langpref_df(groups, key_names):
    """Turn language-preference accumulators into a DataFrame of rates.

    Args:
        groups: mapping ``(key_tuple, K) -> accumulator`` filled by
            ``accumulate_langpref``.
        key_names: column names for the elements of ``key_tuple``.

    Returns:
        One row per group with ``N > 0``: the key columns, ``K``, ``N``, the
        ``top1_*_pct`` shares (count / N) and the ``topK_*_avg_frac`` shares
        (count sum / (N * K)).
    """
    buckets = ("en", "es", "zh", "other")
    records = []
    for (key_tuple, K), acc in groups.items():
        total = acc["N"]
        if total <= 0:
            continue
        record = dict(zip(key_names, key_tuple))
        record.update(K=K, N=total)
        for tag in buckets:
            record[f"top1_{tag}_pct"] = acc[f"top1_{tag}_count"] / total
        slots = total * K  # number of top-K positions over all queries
        for tag in buckets:
            record[f"topK_{tag}_avg_frac"] = acc[f"topK_{tag}_count_sum"] / slots
        records.append(record)
    return pd.DataFrame(records)

def finalize_gt_df(groups_gt, key_names):
    """Turn ground-truth accumulators into a DataFrame of rates.

    Args:
        groups_gt: mapping ``(key_tuple, K) -> accumulator`` filled by
            ``accumulate_gt``.
        key_names: column names for the elements of ``key_tuple``.

    Returns:
        One row per group with ``N > 0``: the key columns, ``K``, ``N``,
        ``all3_pct`` (reported as 0.0 when ``K < 3``) and, for positions 1-3,
        the ``gt_top{pos}_{lang}_pct`` / ``gt_top{pos}_other_pct`` shares.
    """
    records = []
    for (key_tuple, K), acc in groups_gt.items():
        total = acc["N"]
        if total <= 0:
            continue
        record = dict(zip(key_names, key_tuple))
        record["K"] = K
        record["N"] = total
        if K >= 3:
            record["all3_pct"] = acc["all3_count"] / total
        else:
            record["all3_pct"] = 0.0
        for pos in (1, 2, 3):
            # Known languages first, then the catch-all bucket.
            for tag in (*LANGS, "other"):
                record[f"gt_top{pos}_{tag}_pct"] = acc.get(f"gt_top{pos}_{tag}", 0) / total
        records.append(record)
    return pd.DataFrame(records)

# ---------- main ----------
def _query_lang_from_stem(stem: str) -> str:
    """Derive the query language from a result-file stem.

    'Q_es-en__to__en' -> 'es-en', 'Q_en__to__es' -> 'en', 'Q_zh' -> 'zh'.
    The text after the first '_' is taken, a trailing '__to__<target>' part is
    dropped, and remaining underscores become hyphens.  Returns 'unknown' when
    nothing usable is left.
    """
    if "_" not in stem:
        return "unknown"
    tail = stem.split("_", 1)[1]
    lang = tail.split("__to__", 1)[0].strip().replace("_", "-")
    return lang or "unknown"


def main():
    """Aggregate language-preference and ground-truth statistics over all runs.

    Walks every model directory under ``RUNS``, reads the ``Q_*.csv`` result
    files selected by ``collect_csv_files`` and, for each row and each K in
    ``TOPK_LIST``, updates five language-preference accumulators and five
    ground-truth accumulators (per model x bin, per model, per bin across
    models, per model x query kind, per query kind across models).

    The finalized tables are written to ``OUT_XLSX`` (one sheet each, plus two
    convenience pivots when data exists) and mirrored to the ``OUT_CSV_*``
    files; the written paths are printed at the end.

    Files that cannot be read are reported and skipped; files without a
    ``retrieved_doc_ids`` column and rows whose ``retrieved_doc_ids`` parses to
    an empty list are skipped silently.
    """
    # A) Language preference accumulators
    # per model × qlang × (bin|mono) × K
    langpref_by_model_bin = defaultdict(_new_langpref_accum)
    # per model × qlang × K (avg across bins for CS; mono contributes directly)
    langpref_by_model_nobin = defaultdict(_new_langpref_accum)
    # across models × qlang × (bin|mono) × K
    langpref_by_bin_across_models = defaultdict(_new_langpref_accum)
    # per model × qlang × qkind × K (qkind ∈ {mono, cs})
    langpref_by_model_kind = defaultdict(_new_langpref_accum)
    # across models × qlang × qkind × K
    langpref_by_kind_all_models = defaultdict(_new_langpref_accum)

    # B) GT 3-language accumulators (same shapes)
    gt_by_model_bin = defaultdict(_new_gt_accum)
    gt_by_model_nobin = defaultdict(_new_gt_accum)
    gt_by_bin_across_models = defaultdict(_new_gt_accum)
    gt_by_model_kind = defaultdict(_new_gt_accum)
    gt_by_kind_all_models = defaultdict(_new_gt_accum)

    model_dirs = [d for d in sorted(RUNS.glob("*")) if d.is_dir()]
    pbar_models = tqdm(model_dirs, desc="Models", unit="model")

    for model_dir in pbar_models:
        model_slug = model_dir.name
        csv_files = collect_csv_files(model_dir)
        if not csv_files:
            continue

        pbar_files = tqdm(csv_files, desc=f"{model_slug}: files", unit="file", leave=False)
        for csv_path in pbar_files:
            try:
                df = pd.read_csv(csv_path)
            except Exception as e:
                pbar_files.write(f"!! Failed to read {csv_path}: {e}")
                continue

            if "retrieved_doc_ids" not in df.columns:
                continue

            # Normalize query_language, or fall back to the file name.
            if "query_language" in df.columns:
                df["query_language"] = df["query_language"].astype(str).str.replace("_", "-")
            else:
                # The stem may carry a '__to__<target>' suffix; keeping it would
                # turn e.g. 'Q_en__to__es' into a bogus code-switched language.
                df["query_language"] = _query_lang_from_stem(csv_path.stem)

            if "sw_rate" not in df.columns:
                df["sw_rate"] = np.nan

            # Wrapping the iterator lets tqdm close the bar even if a row raises.
            pbar_rows = tqdm(
                df.iterrows(),
                total=len(df),
                desc=f"{model_slug}: {csv_path.name} rows",
                unit="row",
                leave=False,
            )
            for _, row in pbar_rows:
                ret = parse_json_list(row.get("retrieved_doc_ids"))
                if not ret:
                    continue

                qlang = str(row.get("query_language", "unknown")).strip()
                kind = query_kind(qlang)  # mono or cs
                # Monolingual queries get the bin label "mono" so they are not
                # dropped; code-switched queries use their sw_rate bin.
                # NOTE(review): a cs row without a usable sw_rate also ends up
                # under "mono" here — confirm this is intended.
                swb = sw_bin_of(row.get("sw_rate"))
                swb_label = swb if (kind == "cs" and swb is not None) else "mono"

                # Language list for overall preference
                ret_langs = [lang_from_docid(x) for x in ret]
                top1_lang = ret_langs[0] if ret_langs else "unk"

                # GT base-id set (strip language suffixes)
                gt_list = parse_json_list(row.get("ground_truth_doc_ids"))
                gt_bases = set(parse_base_lang(gid)[0] for gid in gt_list if gid is not None)

                # (language-preference groups, GT groups, key) per aggregation
                # level; none of these depend on K, so build them once per row.
                targets = [
                    (langpref_by_model_bin, gt_by_model_bin, (model_slug, qlang, swb_label)),
                    (langpref_by_model_nobin, gt_by_model_nobin, (model_slug, qlang)),
                    (langpref_by_bin_across_models, gt_by_bin_across_models, (qlang, swb_label)),
                    (langpref_by_model_kind, gt_by_model_kind, (model_slug, qlang, kind)),
                    (langpref_by_kind_all_models, gt_by_kind_all_models, (qlang, kind)),
                ]

                for K in TOPK_LIST:
                    kk = min(K, len(ret_langs))
                    if kk <= 0:
                        continue
                    top_langs = ret_langs[:kk]
                    for pref_groups, gt_groups, key in targets:
                        accumulate_langpref(pref_groups, key, K, top1_lang, top_langs)
                        accumulate_gt(gt_groups, key, K, gt_bases, ret)

    # -------- finalize dataframes --------
    # A) Language preference
    df_pref       = finalize_langpref_df(langpref_by_model_bin, ["model","query_lang","sw_bin"])
    df_model_avg  = finalize_langpref_df(langpref_by_model_nobin, ["model","query_lang"])
    df_bin_avg    = finalize_langpref_df(langpref_by_bin_across_models, ["query_lang","sw_bin"])
    df_kind_model = finalize_langpref_df(langpref_by_model_kind, ["model","query_lang","qkind"])
    df_kind_all   = finalize_langpref_df(langpref_by_kind_all_models, ["query_lang","qkind"])

    # B) GT representativeness
    df_gt            = finalize_gt_df(gt_by_model_bin, ["model","query_lang","sw_bin"])
    df_gt_model_avg  = finalize_gt_df(gt_by_model_nobin, ["model","query_lang"])
    df_gt_bin_avg    = finalize_gt_df(gt_by_bin_across_models, ["query_lang","sw_bin"])
    df_gt_kind_model = finalize_gt_df(gt_by_model_kind, ["model","query_lang","qkind"])
    df_gt_kind_all   = finalize_gt_df(gt_by_kind_all_models, ["query_lang","qkind"])

    # -------- save --------
    OUT_XLSX.parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(OUT_XLSX, engine="xlsxwriter") as writer:
        # Primary (with mono rows included via sw_bin=="mono")
        df_pref.to_excel(writer, sheet_name="LangPref", index=False)
        df_model_avg.to_excel(writer, sheet_name="ByModel_NoBin", index=False)
        df_bin_avg.to_excel(writer, sheet_name="ByBin_AllModels", index=False)
        # Mono vs CS splits
        df_kind_model.to_excel(writer, sheet_name="ByModel_Kind", index=False)
        df_kind_all.to_excel(writer, sheet_name="ByKind_AllModels", index=False)

        # Convenience pivots (optional)
        if not df_pref.empty:
            piv1 = df_pref.pivot_table(index=["model","query_lang","sw_bin","K"],
                                       values=["top1_en_pct","top1_es_pct","top1_zh_pct"],
                                       aggfunc="first").reset_index()
            piv1.to_excel(writer, sheet_name="Top1Pct", index=False)
            piv2 = df_pref.pivot_table(index=["model","query_lang","sw_bin","K"],
                                       values=["topK_en_avg_frac","topK_es_avg_frac","topK_zh_avg_frac"],
                                       aggfunc="first").reset_index()
            piv2.to_excel(writer, sheet_name="TopKAvgFrac", index=False)

        # GT 3-language presence & order
        df_gt.to_excel(writer, sheet_name="GT3Lang", index=False)
        df_gt_model_avg.to_excel(writer, sheet_name="GT3Lang_NoBin", index=False)
        df_gt_bin_avg.to_excel(writer, sheet_name="GT3Lang_AllModels", index=False)
        df_gt_kind_model.to_excel(writer, sheet_name="GT3Lang_ByModel_Kind", index=False)
        df_gt_kind_all.to_excel(writer, sheet_name="GT3Lang_ByKind_AllModels", index=False)

    # CSV mirrors
    csv_outputs = [
        (df_pref, OUT_CSV_PREF),
        (df_model_avg, OUT_CSV_MODEL),
        (df_bin_avg, OUT_CSV_BIN),
        (df_kind_model, OUT_CSV_KIND_MODEL),
        (df_kind_all, OUT_CSV_KIND_ALL),
        (df_gt, OUT_CSV_GT),
        (df_gt_model_avg, OUT_CSV_GT_MODEL),
        (df_gt_bin_avg, OUT_CSV_GT_BIN),
        (df_gt_kind_model, OUT_CSV_GT_KIND_MODEL),
        (df_gt_kind_all, OUT_CSV_GT_KIND_ALL),
    ]
    for frame, path in csv_outputs:
        frame.to_csv(path, index=False, encoding="utf-8")

    print("\nSaved:")
    for p in [OUT_XLSX] + [path for _, path in csv_outputs]:
        print(f"  {p}")

# Entry point: run the evaluation when this code is executed directly
# (as a script, or as a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    main()


Models:   0%|          | 0/6 [00:00<?, ?model/s]
Alibaba-NLP__gte-multilingual-base: files:   0%|          | 0/18 [00:00<?, ?file/s][A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:   0%|          | 0/10000 [00:00<?, ?row/s][A[A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:   3%|▎         | 348/10000 [00:00<00:02, 3476.56row/s][A[A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:   7%|▋         | 696/10000 [00:00<00:02, 3413.38row/s][A[A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:  11%|█         | 1076/10000 [00:00<00:02, 3587.43row/s][A[A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:  17%|█▋        | 1717/10000 [00:00<00:01, 4692.15row/s][A[A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:  24%|██▎       | 2356/10000 [00:00<00:01, 5301.17row/s][A[A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:  30%|██▉       | 2997/10000 [00:00<00:01, 5676.06row/s][A[A

Alibaba-NLP__gte-multilingual-base: Q_en.csv rows:  36%|███▋      | 3636/10000 [


Saved:
  runs_cuvs_test/lang_pref.xlsx
  runs_cuvs_test/lang_pref.csv
  runs_cuvs_test/lang_pref_by_model_avg.csv
  runs_cuvs_test/lang_pref_by_bin_across_models.csv
  runs_cuvs_test/lang_pref_by_model_kind.csv
  runs_cuvs_test/lang_pref_by_kind_all_models.csv
  runs_cuvs_test/gt3lang_stats.csv
  runs_cuvs_test/gt3lang_by_model_avg.csv
  runs_cuvs_test/gt3lang_by_bin_across_models.csv
  runs_cuvs_test/gt3lang_by_model_kind.csv
  runs_cuvs_test/gt3lang_by_kind_all_models.csv
