In [1]:
!nvidia-smi

Tue Sep 23 15:03:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.09              Driver Version: 580.82.09      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40                     Off |   00000000:0D:00.0 Off |                    0 |
| N/A   56C    P0             74W /  300W |       5MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA L40                     Off |   00

In [2]:
import os
# 100% disable NVML usage in the CUDA caching allocator
os.environ["PYTORCH_NO_NVML"] = "0"

# (optional but recommended) Use expandable segments allocator — fewer large reallocs
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# pick GPUs (keep multi-GPU if you want)
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"   # or "0" to sanity-check single GPU


In [3]:
import torch
import os
print("Torch:", torch.__version__)
print("NVML disabled:", os.environ.get("PYTORCH_NO_NVML"))
print("Allocator:", os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))
print("Visible GPUs:", os.environ.get("CUDA_VISIBLE_DEVICES"))

Torch: 2.8.0+cu128
NVML disabled: 0
Allocator: expandable_segments:True
Visible GPUs: 0,1


In [4]:
print(torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())

2.8.0+cu128
CUDA available: True
Device count: 2


# Run execution

In [5]:
# pip install sentence-transformers cuvs-cpu tqdm pandas

import json
import math
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Union
from collections import defaultdict
from dataclasses import dataclass

import numpy as np
import pandas as pd
from tqdm import tqdm
# (cuvs import removed; using cuVS)
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import gc, torch

In [6]:
# === cuVS (CAGRA) fast GPU ANN index ===
# Requires: pip install cuvs-cu12 cupy-cuda12x  (use NVIDIA's PyPI index for cuvs)
#   pip install cuvs-cu12 --extra-index-url https://pypi.nvidia.com
#   pip install cupy-cuda12x   # choose the 'cuda12x' build that matches your CUDA 12 runtime
try:
    import cupy as cp
    from cuvs.neighbors import cagra
    CUVS_AVAILABLE = True
except Exception as e:
    CUVS_AVAILABLE = False
    print("cuVS not available:", e)

class CuVSIndex:
    """
    Minimal adapter to mimic cuvs .search() API with cuVS CAGRA.
    Assumes input vectors are float32 and (for cosine) already L2-normalized.
    """
    def __init__(self, xb: "np.ndarray", device_id: int = 0, graph_degree: int = 64, itopk_size: int = 64):
        if not CUVS_AVAILABLE:
            raise RuntimeError("cuVS not available; please install cuvs-cu12 and cupy-cuda12x.")
        self.device_id = device_id
        with cp.cuda.Device(self.device_id):
            self.xb = cp.asarray(xb, dtype=cp.float32, order="C")
            # Build CAGRA graph index on GPU
            self.index = cagra.build(self.xb, graph_degree=graph_degree, itopk_size=itopk_size)

    def search(self, xq: "np.ndarray", k: int):
        with cp.cuda.Device(self.device_id):
            q = cp.asarray(xq, dtype=cp.float32, order="C")
            D, I = cagra.search(self.index, q, k)
            return cp.asnumpy(D), cp.asnumpy(I)

def build_gpu_index_with_cuvs(xb: "np.ndarray", device_id: int = 0):
    """
    Build a single-GPU cuVS CAGRA index (device_id 0 by default) and return (index, True).
    Ensures xb is float32, C-contiguous, and lives in host memory; it will be moved once to GPU.
    """
    xb = np.asarray(xb, dtype=np.float32, order="C")
    idx = CuVSIndex(xb, device_id=device_id)
    return idx, True

In [7]:
# === Single-GPU performance flags (PyTorch 2.8 on Ada/L40) ===
import torch, os
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("high")
torch.set_num_threads(max(1, (os.cpu_count() or 4)//2))

# Optional: compile models constructed after this (speeds up forward pass)


In [8]:
try:
    import pynvml
    _NVML_OK = True
except Exception:
    _NVML_OK = False

@dataclass
class HFInstructBackend:
    model_name: str
    model: torch.nn.Module
    tokenizer: any
    family: str  # 'llama' / 'qwen' / 'misc'

def _nvml_free_map():
    if not _NVML_OK:
        return {}
    try:
        pynvml.nvmlInit()
        cnt = pynvml.nvmlDeviceGetCount()
        res = {}
        for i in range(cnt):
            h = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem = pynvml.nvmlDeviceGetMemoryInfo(h)
            res[i] = (mem.free, mem.total)
        pynvml.nvmlShutdown()
        return res
    except Exception:
        return {}

def get_embedding_dim(backend: HFInstructBackend) -> int:
    return int(getattr(backend.model.config, "hidden_size"))

In [None]:
DOCS_GLOB          = "../Data/subset_outputs_stratified/docs_subset.jsonl"   # docs with {"doc_id", "<lang>": {"text": ...}}
QRELS_PATH         = "../Data/subset_outputs_stratified/qrels_subset.jsonl"     # filtered qrels (only for the selected queries)
MERGED_QUERIES     = "../Data/subset_outputs_stratified/queries_subset.jsonl"  # merged query variants in one file
OUT_ROOT           = Path("runs_cuvs_test_instruct_v2")
HF_TOKEN = os.getenv("HF_TOKEN", "...")  # set your HF token or use env var            # outputs will be saved here
TOP_K              = 10
BATCH_SIZE_DOCS    = 16
BATCH_SIZE_QUERIES = 16
USE_GPU_IF_AVAIL   = True       # cuvs GPU if available
NORMALIZE_L2       = True       # cosine via inner product after L2-normalization
PILOT_IDS_FILE     = None       # e.g., "pilot_base_ids.txt" to restrict queries, or None

# Replace with your actual 6 models
MODELS = [
    #LLAMA
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    #QWEN
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    #PHI
    "microsoft/Phi-3-mini-4k-instruct"
]


In [10]:
def load_docs_for_lang(doc_lang: str) -> Tuple[List[str], List[str]]:
    """Return (doc_ids, texts) for the chosen language."""
    ids, texts = [], []
    for p in sorted(Path(".").glob(DOCS_GLOB)):
        with p.open("r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                o = json.loads(line)
                did = str(o.get("doc_id") or o.get("id") or "").strip()
                lang_obj = o.get(doc_lang) or {}
                txt = lang_obj.get("text")
                if did and isinstance(txt, str) and txt.strip():
                    ids.append(did)
                    texts.append(txt.strip())
    return ids, texts

def load_qrels(path: str) -> Dict[str, List[str]]:
    """Return {query_id: [all relevant doc_ids]} (relevance>0)."""
    rels = defaultdict(list)
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            o = json.loads(line)
            qid = str(o.get("query_id") or "").strip()
            did = str(o.get("doc_id") or "").strip()
            rel = int(o.get("relevance", 1))
            if qid and did and rel > 0:
                rels[qid].append(did)
    return rels

def load_keep_ids(path: Optional[str]) -> Optional[set]:
    if not path:
        return None
    p = Path(path)
    if not p.exists():
        return None
    return {line.strip() for line in p.read_text(encoding="utf-8").splitlines() if line.strip()}

def load_queries_cases(merged_jsonl: str, doc_lang: str, keep_ids: Optional[set]=None):
    """
    Return:
      cases: { case_name : { query_id : query_text } }
      sw_map: { case_name : { query_id : sw_rate_or_None } }  # only for CS cases

    Now loads ALL query languages for ANY target doc language, including ALL CS pairs routed to that doc_lang.
    Cases per doc_lang (expanded):
      en: 'en', 'es->en', 'zh->en', 'es-en->en', 'zh-en->en', 'zh-es->en'
      es: 'es', 'en->es', 'zh->es', 'es-en->es', 'zh-en->es', 'zh-es->es'
      zh: 'zh', 'en->zh', 'es->zh', 'es-en->zh', 'zh-en->zh', 'zh-es->zh'
    """
    CASES_BY_DOC = {
        "en": ["en", "es->en", "zh->en", "es-en->en", "zh-en->en", "zh-es->en"],
        "es": ["es", "en->es", "zh->es", "es-en->es", "zh-en->es", "zh-es->es"],
        "zh": ["zh", "en->zh", "es->zh", "es-en->zh", "zh-en->zh", "zh-es->zh"],
    }
    if doc_lang not in CASES_BY_DOC:
        raise ValueError(f"Unsupported doc_lang={doc_lang}")

    cases = {k: {} for k in CASES_BY_DOC[doc_lang]}
    sw_map = {k: {} for k in CASES_BY_DOC[doc_lang]}

    with open(merged_jsonl, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            o = json.loads(line)
            qid = str(o.get("id") or o.get("query_id") or "").strip()
            if not qid:
                continue
            if keep_ids is not None and qid not in keep_ids:
                continue

            # monolingual
            en_q = o.get("eng_q")
            es_q = o.get("spa_q") or o.get("esp_q")
            zh_q = o.get("zh_q")
            # code-switched (pair-specific)
            es_en_q = o.get("es_en_q"); es_en_sw = o.get("es_en_sw_rate")
            zh_en_q = o.get("zh_en_q"); zh_en_sw = o.get("zh_en_sw_rate")
            zh_es_q = o.get("zh_es_q"); zh_es_sw = o.get("zh_es_sw_rate")

            def as_float(x):
                try:
                    return float(x) if x is not None and str(x).strip() != "" else None
                except Exception:
                    return None
            es_en_sw = as_float(es_en_sw)
            zh_en_sw = as_float(zh_en_sw)
            zh_es_sw = as_float(zh_es_sw)

            # ---- Monolingual routed to doc_lang ----
            if doc_lang == "en" and isinstance(en_q, str) and en_q.strip():
                cases["en"][qid] = en_q.strip()
            if doc_lang == "es" and isinstance(es_q, str) and es_q.strip():
                cases["es"][qid] = es_q.strip()
            if doc_lang == "zh" and isinstance(zh_q, str) and zh_q.strip():
                cases["zh"][qid] = zh_q.strip()

            # ---- Cross-lingual routed to doc_lang (ALL sources) ----
            if doc_lang == "en":
                if isinstance(es_q, str) and es_q.strip(): cases["es->en"][qid] = es_q.strip()
                if isinstance(zh_q, str) and zh_q.strip(): cases["zh->en"][qid] = zh_q.strip()
            if doc_lang == "es":
                if isinstance(en_q, str) and en_q.strip(): cases["en->es"][qid] = en_q.strip()
                if isinstance(zh_q, str) and zh_q.strip(): cases["zh->es"][qid] = zh_q.strip()
            if doc_lang == "zh":
                if isinstance(en_q, str) and en_q.strip(): cases["en->zh"][qid] = en_q.strip()
                if isinstance(es_q, str) and es_q.strip(): cases["es->zh"][qid] = es_q.strip()

            # ---- Code-switched routed to doc_lang (ALL pairs) ----
            # es-en
            if isinstance(es_en_q, str) and es_en_q.strip():
                if doc_lang == "en":
                    cases["es-en->en"][qid] = es_en_q.strip(); sw_map["es-en->en"][qid] = es_en_sw
                if doc_lang == "es":
                    cases["es-en->es"][qid] = es_en_q.strip(); sw_map["es-en->es"][qid] = es_en_sw
                if doc_lang == "zh":
                    cases["es-en->zh"][qid] = es_en_q.strip(); sw_map["es-en->zh"][qid] = es_en_sw

            # zh-en
            if isinstance(zh_en_q, str) and zh_en_q.strip():
                if doc_lang == "en":
                    cases["zh-en->en"][qid] = zh_en_q.strip(); sw_map["zh-en->en"][qid] = zh_en_sw
                if doc_lang == "es":
                    cases["zh-en->es"][qid] = zh_en_q.strip(); sw_map["zh-en->es"][qid] = zh_en_sw
                if doc_lang == "zh":
                    cases["zh-en->zh"][qid] = zh_en_q.strip(); sw_map["zh-en->zh"][qid] = zh_en_sw

            # zh-es
            if isinstance(zh_es_q, str) and zh_es_q.strip():
                if doc_lang == "en":
                    cases["zh-es->en"][qid] = zh_es_q.strip(); sw_map["zh-es->en"][qid] = zh_es_sw
                if doc_lang == "es":
                    cases["zh-es->es"][qid] = zh_es_q.strip(); sw_map["zh-es->es"][qid] = zh_es_sw
                if doc_lang == "zh":
                    cases["zh-es->zh"][qid] = zh_es_q.strip(); sw_map["zh-es->zh"][qid] = zh_es_sw

    return cases, sw_map


def print_case_counts(cases: Dict[str, Dict[str,str]], label: str):
    print(f"--- Query availability for doc_lang={label} ---")
    for k in sorted(cases.keys()):
        print(f"{k:12s}: {len(cases[k]):6d}")

In [11]:
try:
    import pynvml
    _NVML_OK = True
except Exception:
    _NVML_OK = False

def build_max_memory_map(reserve_gb: int = 4, min_gb: int = 6):
    fm = _nvml_free_map()
    if not fm:
        return {}
    mm = {}
    for i, (_free, total) in fm.items():
        total_gb = int(total // (1024**3))
        allow_gb = max(min_gb, total_gb - reserve_gb)
        mm[i] = f"{allow_gb}GiB"
    return mm

def _infer_family(model_name: str) -> str:
    m = model_name.lower()
    if "llama" in m: return "llama"
    if "qwen"  in m: return "qwen"
    return "misc"

def _ensure_pad_token(tok, mdl):
    """
    Guarantees a valid pad token. Prefer EOS if present; otherwise add a new [PAD]
    and resize embeddings. Safe for inference.
    """
    if tok.pad_token_id is not None:
        return
    if getattr(tok, "eos_token", None) is not None:
        tok.pad_token = tok.eos_token  # share id, no resize needed
    else:
        # add a dedicated [PAD] token and resize model embeddings
        tok.add_special_tokens({"pad_token": "[PAD]"})
        try:
            mdl.resize_token_embeddings(len(tok))
        except Exception:
            # Some models manage embeddings lazily; ignore if not needed
            pass

def load_model_backend(model_name: str, hf_token: str = HF_TOKEN) -> HFInstructBackend:
    tok = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
    mm = build_max_memory_map()
    if mm:
        mdl = AutoModelForCausalLM.from_pretrained(
            model_name, token=hf_token, dtype=dtype,
            device_map="auto", max_memory=mm,
        )
    else:
        mdl = AutoModelForCausalLM.from_pretrained(
            model_name, token=hf_token, dtype=dtype,
            device_map="auto",
        )
    mdl.eval()
    _ensure_pad_token(tok, mdl)
    return HFInstructBackend(
        model_name=model_name,
        model=mdl,
        tokenizer=tok,
        family=_infer_family(model_name),
    )


# Language-matched, architecture-agnostic prompts
PROMPTS_S = {
    "llama": {
        "en": 'You are an AI assistant that can understand human language.',
        "es": 'Eres un asistente de IA que entiende el lenguaje humano.',
        "zh": '你是一名能理解人类语言的AI助手。'
    },
    "qwen": {
        "en": 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.',
        "es": 'Eres Qwen, creado por Alibaba Cloud. Eres un asistente útil.',
        "zh": '你是 Qwen，由阿里云创建。你是一个有用的助手。'
    },
    "mistral": { # Mistral doesn't use a specific system intro, so we use a simple one
        "en": 'You are a helpful AI assistant.',
        "es": 'Eres un asistente de IA útil.',
        "zh": '你是一名乐于助人的AI助手。'
    },
    "phi": {
        "en": 'You are a helpful AI assistant.',
        "es": 'Eres un asistente de IA útil.',
        "zh": '你是一名乐于助人的AI助手。'
    },
    "unknown": {
        "en": 'You are a helpful AI assistant.',
        "es": 'Eres un asistente de IA útil.',
        "zh": '你是一名乐于助人的AI助手。'
    }
}

PROMPTS_U = {
    "doc": {
       "en": 'Passage: "{text}". Use one word to represent the query in a retrieval task. Make sure your word is in lowercase.',
        "es": 'Pasaje: "{text}". Usa una sola palabra para representar la consulta en una tarea de recuperación. Asegúrate de que la palabra esté en minúsculas.',
        "zh": '段落：" {text} "。请用一个词来表示该查询用于检索任务。务必使用小写字母。'
    },
    "query": {
        "en": 'Query: "{text}". Use one word to represent the query in a retrieval task. Make sure your word is in lowercase.',
        "es": 'Consulta: "{text}". Usa una sola palabra para representar la consulta en una tarea de recuperación. Asegúrate de que la palabra esté en minúsculas.',
        "zh": '查询：" {text} "。请用一个词来表示该查询用于检索任务。务必使用小写字母。'
    },
}

import re
from typing import Literal

Arch = Literal["llama", "qwen", "mistral", "phi", "unknown"]

def detect_architecture(model_name: str) -> Arch:
    """
    Heuristically detect the architecture family from a model name / repo id.

    Recognizes:
      - Llama family:   "meta-llama/Meta-Llama-3-8B...", "Llama-2-7b", "llama3.1"
      - Qwen family:    "Qwen/Qwen2.5-3B-Instruct", "qwen1.8b"
      - Mistral family: "mistralai/Mistral-7B-Instruct", "Mixtral-8x7B"
      - Phi family:     "microsoft/phi-3-mini-4k-instruct", "phi-2"

    Returns: "llama" | "qwen" | "mistral" | "phi" | "unknown"
    """
    n = model_name.lower().strip()

    # Normalize common separators to help with boundary checks
    # We'll treat "/", "-", "_", and spaces as boundaries.
    # Use regex with those as non-capturing boundaries.
    b = r"(?:^|[\/\-\_\s])"
    e = r"(?:$|[\/\-\_\s])"

    # --- LLaMA ---
    # Matches "meta-llama", "llama", "llama-3", "llama3.1", etc.
    if re.search(b + r"(meta-)?llama(?:[\-\s_]*\d+(?:\.\d+)*)?" + e, n):
        return "llama"

    # --- Qwen ---
    # Matches "qwen", "qwen2", "qwen2.5", etc.
    if re.search(b + r"qwen(?:\d+(?:\.\d+)*)?" + e, n):
        return "qwen"

    # --- Mistral / Mixtral ---
    # Matches "mistral", "mistral-7b", and "mixtral-8x7b" (MoE variant)
    if re.search(b + r"(mixtral|mistral)(?:[\-\s_]*\w*)?" + e, n):
        return "mistral"

    # --- Phi ---
    # Matches "phi", "phi-3", "phi-3-mini", etc., while avoiding words like "trophy"
    if re.search(b + r"phi(?:\d+(?:\.\d+)*)?(?:[\-\s_]*\w*)?" + e, n):
        return "phi"

    return "unknown"


def _norm_lang(tag: Optional[str]) -> str:
    if not tag: return "en"
    tag = tag.lower()
    if "-" in tag:  # code-switched => pick the first
        tag = tag.split("-", 1)[0]
    return tag if tag in {"en","es","zh"} else "en"

def _messages(text: str, mode: str, model_name: str, lang: Optional[str]):
    #prompt_system = PROMPTS_S[mode][_norm_lang(lang)].format(text=text)
    #prompt_user = PROMPTS_U[mode][_norm_lang(lang)].format(text=text)
    model_arch = detect_architecture(model_name)
    #print(model_arch)
    prompt_system = PROMPTS_S[model_arch][_norm_lang(lang)].format(text=text)
    prompt_user = PROMPTS_U[mode][_norm_lang(lang)].format(text=text)
    return [
        {"role": "system", "content": prompt_system},
        {"role": "user",  "content": prompt_user},
        {"role": "assistant", "content": "The word is: "}
    ]

def _embed_batch_hf(backend: HFInstructBackend, texts: List[str], mode: str, lang: Optional[str]) -> np.ndarray:
    tok, mdl = backend.tokenizer, backend.model
    device = next(mdl.parameters()).device

    # Build chat-formatted inputs (with the assistant tail already included by _messages)
    enc = tok.apply_chat_template(
        [_messages(t, mode, backend.model_name, lang) for t in texts],
        add_generation_prompt=False,
        return_tensors="pt",
        padding=True,
    )

    # Unpack tokenizer outputs
    if isinstance(enc, dict):
        input_ids = enc["input_ids"]           # [B, T]
        attn      = enc.get("attention_mask")  # [B, T] or None
    else:
        input_ids = enc
        attn      = None

    # --- Paper behavior ---
    # Slice off the final special token so that the *new* last token is the quote (")
    input_ids = input_ids[:, :-1]              # [B, T-1]
    if attn is not None:
        attn = attn[:, :-1]                    # keep masks aligned

    # If no attention mask was returned, build one from pad_token_id
    if attn is None:
        pad_id = tok.pad_token_id
        if pad_id is None:
            # common safe default: use eos as pad for masking purposes
            pad_id = tok.eos_token_id
        attn = (input_ids != pad_id).long()

    input_ids = input_ids.to(device)
    attn      = attn.to(device)

    with torch.inference_mode():
        outputs = mdl(
            input_ids=input_ids,
            attention_mask=attn,                # ok to pass mask; paper’s [:, -1, :] assumes no pads
            return_dict=True,
            output_hidden_states=True,
        )
        last_h = outputs.hidden_states[-1]      # [B, T-1, H]

        # Per-row index of the *last* real token after trimming one token above.
        # This equals the paper's [:, -1, :] when there is no padding.
        idx = attn.sum(dim=1) - 1              # [B]
        reps = last_h[torch.arange(last_h.size(0), device=device), idx]  # [B, H]

    reps = torch.nn.functional.normalize(reps.float(), p=2, dim=1)
    return reps.detach().cpu().numpy().astype(np.float32, copy=False)

def embed_texts(
    model_or_backend: HFInstructBackend,
    texts: List[str],
    batch_size: int = 64,
    normalize: bool = True,   # kept for signature compatibility; output is already L2-normalized
    mode: str = "doc",
    lang: Optional[str] = None
) -> np.ndarray:
    out = []
    from math import ceil  # local import (safe)
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Embedding {mode}s ({lang})", total=total_batches, leave=False):
        out.append(_embed_batch_hf(model_or_backend, texts[i:i+batch_size], mode=mode, lang=lang))
    return np.concatenate(out, axis=0) if out else np.zeros((0, get_embedding_dim(model_or_backend)), dtype=np.float32)

In [12]:
# === cuVS (CAGRA) fast GPU ANN index — version-robust adapter ===
import numpy as np

try:
    import cupy as cp
    from cuvs.neighbors import cagra
    CUVS_AVAILABLE = True
except Exception as e:
    CUVS_AVAILABLE = False
    print("cuVS not available:", e)

class CuVSIndex:
    """Minimal adapter to mimic cuvs .search() with cuVS CAGRA."""
    def __init__(self, xb: "np.ndarray", device_id: int = 0,
                 graph_degree: int = 64, itopk_size: int = 64):
        if not CUVS_AVAILABLE:
            raise RuntimeError("cuVS not available; install cuvs-cu12 and cupy-cuda12x.")
        self.device_id = device_id
        self.itopk_size = itopk_size
        with cp.cuda.Device(self.device_id):
            xb = np.asarray(xb, dtype=np.float32, order="C")
            self.xb = cp.asarray(xb, dtype=cp.float32, order="C")

            # Build params (handle API differences across cuVS versions)
            try:
                ip = cagra.IndexParams(graph_degree=graph_degree)
            except TypeError:
                ip = cagra.IndexParams()
                try:
                    ip.graph_degree = graph_degree
                except Exception:
                    pass

            # Build index (try preferred then fallback signature)
            try:
                self.index = cagra.build(ip, self.xb)
            except TypeError:
                self.index = cagra.build(self.xb, ip)

    def search(self, xq: "np.ndarray", k: int):
        with cp.cuda.Device(self.device_id):
            q = cp.asarray(np.asarray(xq, dtype=np.float32, order="C"))

            # Search params (handle API differences)
            try:
                sp = cagra.SearchParams(itopk_size=self.itopk_size)
            except TypeError:
                sp = cagra.SearchParams()
                try:
                    sp.itopk_size = self.itopk_size
                except Exception:
                    pass

            # Try both call orders
            try:
                D, I = cagra.search(sp, self.index, q, k)
            except TypeError:
                D, I = cagra.search(self.index, q, k, sp)

            return cp.asnumpy(D), cp.asnumpy(I)

def build_cuvs_index(d: int, xb: "np.ndarray"):
    """Compatibility wrapper so existing code can call build_cuvs_index(...)."""
    idx, _ = build_gpu_index_with_cuvs(xb, device_id=0)
    return idx, True


In [13]:
def safe_case_name(case: str) -> str:
    """
    Make a case string safe for Windows filenames.
    - Replace '->' with '__to__'
    - Remove/replace characters forbidden on Windows: <>:"/\\|?*
    - Collapse spaces
    """
    s = case.replace("->", "__to__")
    s = re.sub(r'[<>:"/\\|?*]', "_", s)
    s = re.sub(r"\s+", "_", s)
    return s

# --- Helpers for language tags from column names ---

def tag_for_embed(tag: str) -> str:
    """For code-switched tags like 'es-en', return the first ('es')."""
    if not tag:
        return "en"
    tag = tag.lower()
    return tag.split("-", 1)[0] if "-" in tag else tag



In [14]:
# NVML-based GPU picker (safe if NVML not available)
def pick_best_gpu_by_nvml() -> int:
    try:
        import pynvml
        try:
            pynvml.nvmlInit()
            n = pynvml.nvmlDeviceGetCount()
            best, best_free = 0, -1
            for i in range(n):
                h = pynvml.nvmlDeviceGetHandleByIndex(i)
                mem = pynvml.nvmlDeviceGetMemoryInfo(h)
                if mem.free > best_free:
                    best, best_free = i, mem.free
        finally:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass
        return best
    except Exception:
        # Fallback if NVML missing or errors out
        return 0

In [15]:
from pathlib import Path

def _results_dir_for(model_out: Path, doc_lang: str) -> Path:
    return model_out / f"D_{doc_lang}"

def _list_done_cases(model_out: Path, doc_lang: str) -> set[str]:
    """
    Return set of safe_case_name strings already completed for this (model, doc_lang).
    Looks for files: {model_out}/D_{doc_lang}/Q_{safe_case}.csv
    """
    p = _results_dir_for(model_out, doc_lang)
    done = set()
    if p.exists():
        for f in p.glob("Q_*.csv"):
            name = f.stem  # 'Q_xxx'
            if name.startswith("Q_"):
                done.add(name[2:])
    return done

def _all_cases_done(model_out: Path, doc_lang: str, expected_cases: dict) -> bool:
    """Check if all expected cases for a doc_lang are already present on disk."""
    done = _list_done_cases(model_out, doc_lang)
    needed = {safe_case_name(c) for c in expected_cases.keys()}
    return needed.issubset(done)

def _filter_pending_cases(model_out: Path, doc_lang: str, cases: dict) -> dict:
    """Return a filtered dict keeping only cases that have not been saved yet."""
    done = _list_done_cases(model_out, doc_lang)
    pending = {c: q for c, q in cases.items() if safe_case_name(c) not in done}
    return pending

from tqdm import tqdm
# === RETRIEVAL ONLY (no metric computation) ===
OUT_ROOT.mkdir(parents=True, exist_ok=True)
keep_ids = load_keep_ids(PILOT_IDS_FILE)
qrels = load_qrels(QRELS_PATH)

for model_name in MODELS:
    print(f"\n=== MODEL: {model_name} ===")
    model_slug = model_name.replace("/", "__")
    model_out = OUT_ROOT / model_slug
    model_out.mkdir(parents=True, exist_ok=True)

    # load model once
    model = load_model_backend(model_name, hf_token=HF_TOKEN)
    dim = get_embedding_dim(model)

    for doc_lang in tqdm(["es","en","zh"], desc="Doc languages", leave=False):
        # Resume/skip: compute all cases, skip doc_lang if complete, filter to pending
        cases_all, sw_map = load_queries_cases(MERGED_QUERIES, doc_lang, keep_ids)
        if _all_cases_done(model_out, doc_lang, cases_all):
            print(f"[{doc_lang}] All cases already done — skipping doc embeddings and index.")
            continue
        cases = _filter_pending_cases(model_out, doc_lang, cases_all)
        if not cases:
            print(f"[{doc_lang}] No pending cases — skipping.")
            continue
        print(f"--- Building/Loading doc index for doc_lang={doc_lang} ---")
        doc_ids, doc_texts = load_docs_for_lang(doc_lang)
        if not doc_ids:
            print(f"[{doc_lang}] No documents. Skipping.")
            continue
        xb = embed_texts(model, doc_texts, batch_size=BATCH_SIZE_DOCS, mode="doc", lang=doc_lang)
        try:
            index, gpu_res = build_cuvs_index(dim, xb, pick_best_gpu_by_nvml())
        except TypeError:
            index, gpu_res = build_cuvs_index(dim, xb)
        print(f"Index ready. Docs: {len(doc_ids)}, dim={dim}")
        print_case_counts(cases, doc_lang)

        # build query cases for this doc language
        cases, sw_map = load_queries_cases(MERGED_QUERIES, doc_lang, keep_ids)
        print_case_counts(cases, doc_lang)

        # run retrieval per case (store outputs for later metric eval)
        for case_name, qmap in tqdm(list(cases.items()), desc=f"Cases for {doc_lang}", leave=False):
            out_dir = model_out / f"D_{doc_lang}"
            out_dir.mkdir(parents=True, exist_ok=True)
            safe_name = safe_case_name(case_name)
            out_csv = out_dir / f"Q_{safe_name}.csv"
            if out_csv.exists():
                continue

            if not qmap:
                print(f"[{case_name}] No queries. Skipping.")
                continue

            # Optional: only keep queries that have at least one GT in qrels
            qids_all = list(qmap.keys())
            qids_eval = [qid for qid in qids_all if qid in qrels and len(qrels[qid]) > 0]
            if not qids_eval:
                print(f"[{case_name}] No queries with qrels. Skipping.")
                continue

            q_texts = [qmap[qid] for qid in qids_eval]
            qlang = case_name.split("->")[0] if "->" in case_name else case_name
            query_lang = qlang.split("-", 1)[0]   # en/es/zh
            qv = embed_texts(model, q_texts,  batch_size=BATCH_SIZE_QUERIES, normalize=NORMALIZE_L2, mode="query", lang=query_lang)

            # search
            D, I = index.search(qv, TOP_K)
            doc_id_arr = np.array(doc_ids)
            retrieved_lists = [doc_id_arr[I[i]].tolist() for i in range(len(qids_eval))]

            # write CSV rows (no metrics computed here)
            rows = []
            # query_language tag
            qlang = case_name.split("->")[0] if "->" in case_name else case_name

            for i, qid in enumerate(qids_eval):
                gt = list(set(qrels[qid]))  # keep all relevant GT docs
                ret = [str(x) for x in retrieved_lists[i]]
                sw = sw_map.get(case_name, {}).get(qid)  # only for CS cases

                rows.append({
                    "query_id": qid,
                    "query_language": qlang,
                    "query": qmap[qid],
                    "ground_truth_doc_ids": json.dumps(gt, ensure_ascii=False),
                    "retrieved_doc_ids": json.dumps(ret, ensure_ascii=False),
                    "sw_rate": sw,
                })

            out_dir = model_out / f"D_{doc_lang}"
            out_dir.mkdir(parents=True, exist_ok=True)
            
            safe_name = safe_case_name(case_name)
            out_csv = out_dir / f"Q_{safe_name}.csv"
            
            pd.DataFrame(rows).to_csv(out_csv, index=False, encoding="utf-8")
            print(f"[{case_name}] Saved {len(rows)} rows → {out_csv}")

        # cleanup
        del index, xb, doc_ids, doc_texts
        if gpu_res is not None:
            del gpu_res
        torch.cuda.empty_cache()     # clear cached blocks
        gc.collect()                 # force Python garbage collector
    
    del model
    torch.cuda.empty_cache()
    gc.collect()

print("\nRETRIEVAL TEST RUN COMPLETE (no metrics).")


=== MODEL: meta-llama/Llama-3.2-1B-Instruct ===


                                                            

[es] All cases already done — skipping doc embeddings and index.
[en] All cases already done — skipping doc embeddings and index.
[zh] All cases already done — skipping doc embeddings and index.





=== MODEL: meta-llama/Llama-3.1-8B-Instruct ===


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

                                                            

[es] All cases already done — skipping doc embeddings and index.
[en] All cases already done — skipping doc embeddings and index.
[zh] All cases already done — skipping doc embeddings and index.

=== MODEL: Qwen/Qwen2.5-3B-Instruct ===


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

                                                            

[es] All cases already done — skipping doc embeddings and index.
[en] All cases already done — skipping doc embeddings and index.
[zh] All cases already done — skipping doc embeddings and index.

=== MODEL: Qwen/Qwen2.5-7B-Instruct ===


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

                                                            

[es] All cases already done — skipping doc embeddings and index.
[en] All cases already done — skipping doc embeddings and index.
[zh] All cases already done — skipping doc embeddings and index.

=== MODEL: microsoft/Phi-3-mini-4k-instruct ===


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Doc languages:   0%|          | 0/3 [00:00<?, ?it/s]

--- Building/Loading doc index for doc_lang=es ---



Embedding docs (es):   0%|          | 0/1250 [00:00<?, ?it/s][A
Embedding docs (es):   0%|          | 1/1250 [00:00<14:03,  1.48it/s][A
Embedding docs (es):   0%|          | 2/1250 [00:00<09:06,  2.29it/s][A
Embedding docs (es):   0%|          | 3/1250 [00:01<08:53,  2.34it/s][A
Embedding docs (es):   0%|          | 4/1250 [00:01<08:43,  2.38it/s][A
Embedding docs (es):   0%|          | 5/1250 [00:02<08:13,  2.52it/s][A
Embedding docs (es):   0%|          | 6/1250 [00:02<08:49,  2.35it/s][A
Embedding docs (es):   1%|          | 7/1250 [00:02<08:20,  2.48it/s][A
Embedding docs (es):   1%|          | 8/1250 [00:03<08:38,  2.40it/s][A
Embedding docs (es):   1%|          | 9/1250 [00:03<08:00,  2.58it/s][A
Embedding docs (es):   1%|          | 10/1250 [00:04<07:36,  2.71it/s][A
Embedding docs (es):   1%|          | 11/1250 [00:04<07:28,  2.76it/s][A
Embedding docs (es):   1%|          | 12/1250 [00:04<07:24,  2.78it/s][A
Embedding docs (es):   1%|          | 13/1250 [00:05<07

Index ready. Docs: 20000, dim=3072
--- Query availability for doc_lang=es ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000
--- Query availability for doc_lang=es ---
en->es      :  10000
es          :  10000
es-en->es   :  10000
zh->es      :  10000
zh-en->es   :  10000
zh-es->es   :  10000



Cases for es:   0%|          | 0/6 [00:00<?, ?it/s][A

Embedding querys (es):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (es):   0%|          | 1/625 [00:00<01:15,  8.31it/s][A[A

Embedding querys (es):   0%|          | 2/625 [00:00<01:25,  7.27it/s][A[A

Embedding querys (es):   0%|          | 3/625 [00:00<01:23,  7.41it/s][A[A

Embedding querys (es):   1%|          | 4/625 [00:00<01:20,  7.71it/s][A[A

Embedding querys (es):   1%|          | 5/625 [00:00<01:20,  7.74it/s][A[A

Embedding querys (es):   1%|          | 6/625 [00:00<01:21,  7.56it/s][A[A

Embedding querys (es):   1%|          | 7/625 [00:00<01:23,  7.37it/s][A[A

Embedding querys (es):   1%|▏         | 8/625 [00:01<01:26,  7.16it/s][A[A

Embedding querys (es):   1%|▏         | 9/625 [00:01<01:26,  7.13it/s][A[A

Embedding querys (es):   2%|▏         | 10/625 [00:01<01:23,  7.32it/s][A[A

Embedding querys (es):   2%|▏         | 11/625 [00:01<01:29,  6.84it/s][A[A

Embedding que

[es] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_es/Q_es.csv




Embedding querys (en):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (en):   0%|          | 2/625 [00:00<01:00, 10.30it/s][A[A

Embedding querys (en):   1%|          | 4/625 [00:00<00:56, 10.93it/s][A[A

Embedding querys (en):   1%|          | 6/625 [00:00<00:59, 10.45it/s][A[A

Embedding querys (en):   1%|▏         | 8/625 [00:00<00:58, 10.55it/s][A[A

Embedding querys (en):   2%|▏         | 10/625 [00:00<00:58, 10.52it/s][A[A

Embedding querys (en):   2%|▏         | 12/625 [00:01<01:00, 10.16it/s][A[A

Embedding querys (en):   2%|▏         | 14/625 [00:01<00:59, 10.22it/s][A[A

Embedding querys (en):   3%|▎         | 16/625 [00:01<00:59, 10.18it/s][A[A

Embedding querys (en):   3%|▎         | 18/625 [00:01<00:59, 10.23it/s][A[A

Embedding querys (en):   3%|▎         | 20/625 [00:01<00:57, 10.50it/s][A[A

Embedding querys (en):   4%|▎         | 22/625 [00:02<00:59, 10.14it/s][A[A

Embedding querys (en):   4%|▍         | 24/625 [00:02<00:58, 10

[en->es] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_es/Q_en__to__es.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:18,  7.99it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:39,  6.28it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:34,  6.61it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:31,  6.75it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:31,  6.75it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:35,  6.48it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:34,  6.56it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:35,  6.48it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:36,  6.40it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:33,  6.55it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:41,  6.08it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:35,  6.41it

[zh->es] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_es/Q_zh__to__es.csv




Embedding querys (es):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (es):   0%|          | 1/625 [00:00<01:16,  8.17it/s][A[A

Embedding querys (es):   0%|          | 2/625 [00:00<01:22,  7.53it/s][A[A

Embedding querys (es):   0%|          | 3/625 [00:00<01:18,  7.97it/s][A[A

Embedding querys (es):   1%|          | 4/625 [00:00<01:19,  7.84it/s][A[A

Embedding querys (es):   1%|          | 5/625 [00:00<01:20,  7.73it/s][A[A

Embedding querys (es):   1%|          | 6/625 [00:00<01:22,  7.52it/s][A[A

Embedding querys (es):   1%|          | 7/625 [00:00<01:22,  7.47it/s][A[A

Embedding querys (es):   1%|▏         | 8/625 [00:01<01:25,  7.22it/s][A[A

Embedding querys (es):   1%|▏         | 9/625 [00:01<01:24,  7.26it/s][A[A

Embedding querys (es):   2%|▏         | 10/625 [00:01<01:23,  7.39it/s][A[A

Embedding querys (es):   2%|▏         | 11/625 [00:01<01:30,  6.79it/s][A[A

Embedding querys (es):   2%|▏         | 12/625 [00:01<01:27,  7.02it

[es-en->es] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_es/Q_es-en__to__es.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:13,  8.55it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:34,  6.56it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:31,  6.79it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:29,  6.98it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:28,  6.97it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:32,  6.70it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:32,  6.69it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:34,  6.53it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:32,  6.67it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:30,  6.77it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:35,  6.41it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:30,  6.75it

[zh-en->es] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_es/Q_zh-en__to__es.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:21,  7.69it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:40,  6.22it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:34,  6.60it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:30,  6.83it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:30,  6.88it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:34,  6.54it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:32,  6.66it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:36,  6.41it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:35,  6.43it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:34,  6.52it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:42,  5.97it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:36,  6.32it

[zh-es->es] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_es/Q_zh-es__to__es.csv


Doc languages:  33%|███▎      | 1/3 [17:04<34:08, 1024.36s/it]

--- Building/Loading doc index for doc_lang=en ---



Embedding docs (en):   0%|          | 0/1250 [00:00<?, ?it/s][A
Embedding docs (en):   0%|          | 1/1250 [00:00<04:37,  4.50it/s][A
Embedding docs (en):   0%|          | 2/1250 [00:00<04:41,  4.43it/s][A
Embedding docs (en):   0%|          | 3/1250 [00:00<05:28,  3.79it/s][A
Embedding docs (en):   0%|          | 4/1250 [00:00<05:11,  4.01it/s][A
Embedding docs (en):   0%|          | 5/1250 [00:01<05:25,  3.82it/s][A
Embedding docs (en):   0%|          | 6/1250 [00:01<05:51,  3.54it/s][A
Embedding docs (en):   1%|          | 7/1250 [00:01<05:37,  3.68it/s][A
Embedding docs (en):   1%|          | 8/1250 [00:02<05:58,  3.46it/s][A
Embedding docs (en):   1%|          | 9/1250 [00:02<05:30,  3.75it/s][A
Embedding docs (en):   1%|          | 10/1250 [00:02<05:25,  3.81it/s][A
Embedding docs (en):   1%|          | 11/1250 [00:02<05:14,  3.93it/s][A
Embedding docs (en):   1%|          | 12/1250 [00:03<05:14,  3.93it/s][A
Embedding docs (en):   1%|          | 13/1250 [00:03<05

Index ready. Docs: 20000, dim=3072
--- Query availability for doc_lang=en ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000
--- Query availability for doc_lang=en ---
en          :  10000
es->en      :  10000
es-en->en   :  10000
zh->en      :  10000
zh-en->en   :  10000
zh-es->en   :  10000



Cases for en:   0%|          | 0/6 [00:00<?, ?it/s][A

Embedding querys (en):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (en):   0%|          | 2/625 [00:00<00:58, 10.63it/s][A[A

Embedding querys (en):   1%|          | 4/625 [00:00<00:55, 11.14it/s][A[A

Embedding querys (en):   1%|          | 6/625 [00:00<00:58, 10.65it/s][A[A

Embedding querys (en):   1%|▏         | 8/625 [00:00<00:57, 10.68it/s][A[A

Embedding querys (en):   2%|▏         | 10/625 [00:00<00:56, 10.79it/s][A[A

Embedding querys (en):   2%|▏         | 12/625 [00:01<01:00, 10.17it/s][A[A

Embedding querys (en):   2%|▏         | 14/625 [00:01<00:59, 10.29it/s][A[A

Embedding querys (en):   3%|▎         | 16/625 [00:01<00:59, 10.26it/s][A[A

Embedding querys (en):   3%|▎         | 18/625 [00:01<00:59, 10.27it/s][A[A

Embedding querys (en):   3%|▎         | 20/625 [00:01<00:57, 10.47it/s][A[A

Embedding querys (en):   4%|▎         | 22/625 [00:02<00:59, 10.12it/s][A[A

Embeddin

[en] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_en/Q_en.csv




Embedding querys (es):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (es):   0%|          | 1/625 [00:00<01:18,  7.99it/s][A[A

Embedding querys (es):   0%|          | 2/625 [00:00<01:25,  7.25it/s][A[A

Embedding querys (es):   0%|          | 3/625 [00:00<01:25,  7.25it/s][A[A

Embedding querys (es):   1%|          | 4/625 [00:00<01:24,  7.38it/s][A[A

Embedding querys (es):   1%|          | 5/625 [00:00<01:22,  7.54it/s][A[A

Embedding querys (es):   1%|          | 6/625 [00:00<01:24,  7.35it/s][A[A

Embedding querys (es):   1%|          | 7/625 [00:00<01:26,  7.16it/s][A[A

Embedding querys (es):   1%|▏         | 8/625 [00:01<01:27,  7.02it/s][A[A

Embedding querys (es):   1%|▏         | 9/625 [00:01<01:26,  7.09it/s][A[A

Embedding querys (es):   2%|▏         | 10/625 [00:01<01:24,  7.26it/s][A[A

Embedding querys (es):   2%|▏         | 11/625 [00:01<01:30,  6.76it/s][A[A

Embedding querys (es):   2%|▏         | 12/625 [00:01<01:28,  6.92it

[es->en] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_en/Q_es__to__en.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:24,  7.43it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:36,  6.44it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:33,  6.66it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:31,  6.82it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:32,  6.74it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:35,  6.45it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:35,  6.49it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:36,  6.38it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:37,  6.34it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:34,  6.53it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:40,  6.10it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:35,  6.43it

[zh->en] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_en/Q_zh__to__en.csv




Embedding querys (es):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (es):   0%|          | 1/625 [00:00<01:15,  8.31it/s][A[A

Embedding querys (es):   0%|          | 2/625 [00:00<01:22,  7.58it/s][A[A

Embedding querys (es):   0%|          | 3/625 [00:00<01:20,  7.70it/s][A[A

Embedding querys (es):   1%|          | 4/625 [00:00<01:21,  7.64it/s][A[A

Embedding querys (es):   1%|          | 5/625 [00:00<01:20,  7.68it/s][A[A

Embedding querys (es):   1%|          | 6/625 [00:00<01:23,  7.42it/s][A[A

Embedding querys (es):   1%|          | 7/625 [00:00<01:24,  7.32it/s][A[A

Embedding querys (es):   1%|▏         | 8/625 [00:01<01:25,  7.19it/s][A[A

Embedding querys (es):   1%|▏         | 9/625 [00:01<01:23,  7.35it/s][A[A

Embedding querys (es):   2%|▏         | 10/625 [00:01<01:22,  7.42it/s][A[A

Embedding querys (es):   2%|▏         | 11/625 [00:01<01:29,  6.87it/s][A[A

Embedding querys (es):   2%|▏         | 12/625 [00:01<01:26,  7.05it

[es-en->en] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_en/Q_es-en__to__en.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:23,  7.43it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:37,  6.39it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:32,  6.75it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:29,  6.96it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:28,  7.01it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:31,  6.80it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:31,  6.75it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:35,  6.48it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:33,  6.61it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:31,  6.71it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:36,  6.40it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:30,  6.75it

[zh-en->en] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_en/Q_zh-en__to__en.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:23,  7.51it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:39,  6.28it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:35,  6.52it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:30,  6.83it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:29,  6.91it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:33,  6.61it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:31,  6.76it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:34,  6.55it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:33,  6.56it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:32,  6.66it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:40,  6.10it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:35,  6.39it

[zh-es->en] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_en/Q_zh-es__to__en.csv


Doc languages:  67%|██████▋   | 2/3 [31:24<15:27, 927.85s/it] 

--- Building/Loading doc index for doc_lang=zh ---



Embedding docs (zh):   0%|          | 0/1250 [00:00<?, ?it/s][A
Embedding docs (zh):   0%|          | 1/1250 [00:00<07:56,  2.62it/s][A
Embedding docs (zh):   0%|          | 2/1250 [00:00<07:28,  2.78it/s][A
Embedding docs (zh):   0%|          | 3/1250 [00:01<09:21,  2.22it/s][A
Embedding docs (zh):   0%|          | 4/1250 [00:01<09:03,  2.29it/s][A
Embedding docs (zh):   0%|          | 5/1250 [00:02<09:33,  2.17it/s][A
Embedding docs (zh):   0%|          | 6/1250 [00:02<10:33,  1.96it/s][A
Embedding docs (zh):   1%|          | 7/1250 [00:03<10:23,  1.99it/s][A
Embedding docs (zh):   1%|          | 8/1250 [00:03<10:21,  2.00it/s][A
Embedding docs (zh):   1%|          | 9/1250 [00:04<09:11,  2.25it/s][A
Embedding docs (zh):   1%|          | 10/1250 [00:04<09:07,  2.26it/s][A
Embedding docs (zh):   1%|          | 11/1250 [00:05<09:23,  2.20it/s][A
Embedding docs (zh):   1%|          | 12/1250 [00:05<09:13,  2.24it/s][A
Embedding docs (zh):   1%|          | 13/1250 [00:05<08

Index ready. Docs: 20000, dim=3072
--- Query availability for doc_lang=zh ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000
--- Query availability for doc_lang=zh ---
en->zh      :  10000
es->zh      :  10000
es-en->zh   :  10000
zh          :  10000
zh-en->zh   :  10000
zh-es->zh   :  10000



Cases for zh:   0%|          | 0/6 [00:00<?, ?it/s][A

Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:21,  7.70it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:37,  6.36it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:32,  6.73it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:29,  6.94it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:30,  6.84it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:34,  6.54it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:34,  6.56it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:35,  6.48it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:35,  6.46it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:32,  6.66it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:39,  6.19it/s][A[A

Embedding que

[zh] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_zh/Q_zh.csv




Embedding querys (en):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (en):   0%|          | 2/625 [00:00<00:58, 10.72it/s][A[A

Embedding querys (en):   1%|          | 4/625 [00:00<00:55, 11.09it/s][A[A

Embedding querys (en):   1%|          | 6/625 [00:00<00:58, 10.57it/s][A[A

Embedding querys (en):   1%|▏         | 8/625 [00:00<00:58, 10.58it/s][A[A

Embedding querys (en):   2%|▏         | 10/625 [00:00<00:58, 10.56it/s][A[A

Embedding querys (en):   2%|▏         | 12/625 [00:01<00:59, 10.23it/s][A[A

Embedding querys (en):   2%|▏         | 14/625 [00:01<00:59, 10.34it/s][A[A

Embedding querys (en):   3%|▎         | 16/625 [00:01<00:59, 10.26it/s][A[A

Embedding querys (en):   3%|▎         | 18/625 [00:01<00:59, 10.21it/s][A[A

Embedding querys (en):   3%|▎         | 20/625 [00:01<00:57, 10.45it/s][A[A

Embedding querys (en):   4%|▎         | 22/625 [00:02<00:59, 10.14it/s][A[A

Embedding querys (en):   4%|▍         | 24/625 [00:02<00:58, 10

[en->zh] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_zh/Q_en__to__zh.csv




Embedding querys (es):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (es):   0%|          | 1/625 [00:00<01:19,  7.87it/s][A[A

Embedding querys (es):   0%|          | 2/625 [00:00<01:26,  7.18it/s][A[A

Embedding querys (es):   0%|          | 3/625 [00:00<01:25,  7.27it/s][A[A

Embedding querys (es):   1%|          | 4/625 [00:00<01:23,  7.42it/s][A[A

Embedding querys (es):   1%|          | 5/625 [00:00<01:22,  7.49it/s][A[A

Embedding querys (es):   1%|          | 6/625 [00:00<01:24,  7.36it/s][A[A

Embedding querys (es):   1%|          | 7/625 [00:00<01:25,  7.25it/s][A[A

Embedding querys (es):   1%|▏         | 8/625 [00:01<01:27,  7.08it/s][A[A

Embedding querys (es):   1%|▏         | 9/625 [00:01<01:26,  7.14it/s][A[A

Embedding querys (es):   2%|▏         | 10/625 [00:01<01:24,  7.26it/s][A[A

Embedding querys (es):   2%|▏         | 11/625 [00:01<01:30,  6.78it/s][A[A

Embedding querys (es):   2%|▏         | 12/625 [00:01<01:28,  6.94it

[es->zh] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_zh/Q_es__to__zh.csv




Embedding querys (es):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (es):   0%|          | 1/625 [00:00<01:14,  8.38it/s][A[A

Embedding querys (es):   0%|          | 2/625 [00:00<01:22,  7.57it/s][A[A

Embedding querys (es):   0%|          | 3/625 [00:00<01:21,  7.66it/s][A[A

Embedding querys (es):   1%|          | 4/625 [00:00<01:21,  7.58it/s][A[A

Embedding querys (es):   1%|          | 5/625 [00:00<01:21,  7.61it/s][A[A

Embedding querys (es):   1%|          | 6/625 [00:00<01:22,  7.46it/s][A[A

Embedding querys (es):   1%|          | 7/625 [00:00<01:23,  7.38it/s][A[A

Embedding querys (es):   1%|▏         | 8/625 [00:01<01:25,  7.22it/s][A[A

Embedding querys (es):   1%|▏         | 9/625 [00:01<01:23,  7.39it/s][A[A

Embedding querys (es):   2%|▏         | 10/625 [00:01<01:22,  7.44it/s][A[A

Embedding querys (es):   2%|▏         | 11/625 [00:01<01:29,  6.88it/s][A[A

Embedding querys (es):   2%|▏         | 12/625 [00:01<01:26,  7.05it

[es-en->zh] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_zh/Q_es-en__to__zh.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:24,  7.39it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:36,  6.43it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:33,  6.68it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:29,  6.93it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:29,  6.95it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:32,  6.72it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:31,  6.78it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:32,  6.64it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:31,  6.74it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:29,  6.86it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:34,  6.47it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:30,  6.78it

[zh-en->zh] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_zh/Q_zh-en__to__zh.csv




Embedding querys (zh):   0%|          | 0/625 [00:00<?, ?it/s][A[A

Embedding querys (zh):   0%|          | 1/625 [00:00<01:24,  7.42it/s][A[A

Embedding querys (zh):   0%|          | 2/625 [00:00<01:42,  6.10it/s][A[A

Embedding querys (zh):   0%|          | 3/625 [00:00<01:34,  6.60it/s][A[A

Embedding querys (zh):   1%|          | 4/625 [00:00<01:30,  6.87it/s][A[A

Embedding querys (zh):   1%|          | 5/625 [00:00<01:29,  6.89it/s][A[A

Embedding querys (zh):   1%|          | 6/625 [00:00<01:34,  6.55it/s][A[A

Embedding querys (zh):   1%|          | 7/625 [00:01<01:33,  6.64it/s][A[A

Embedding querys (zh):   1%|▏         | 8/625 [00:01<01:36,  6.38it/s][A[A

Embedding querys (zh):   1%|▏         | 9/625 [00:01<01:35,  6.44it/s][A[A

Embedding querys (zh):   2%|▏         | 10/625 [00:01<01:33,  6.55it/s][A[A

Embedding querys (zh):   2%|▏         | 11/625 [00:01<01:42,  6.02it/s][A[A

Embedding querys (zh):   2%|▏         | 12/625 [00:01<01:36,  6.35it

[zh-es->zh] Saved 10000 rows → runs_cuvs_test_instruct_v2/microsoft__Phi-3-mini-4k-instruct/D_zh/Q_zh-es__to__zh.csv


                                                              


=== MODEL: Qwen/Qwen2.5-32B-Instruct ===


Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

Doc languages:   0%|          | 0/3 [00:00<?, ?it/s]

--- Building/Loading doc index for doc_lang=es ---



Embedding docs (es):   0%|          | 0/1250 [00:00<?, ?it/s][A
Embedding docs (es):   0%|          | 1/1250 [00:02<53:36,  2.58s/it][A
Embedding docs (es):   0%|          | 2/1250 [00:04<51:32,  2.48s/it][A
Embedding docs (es):   0%|          | 3/1250 [00:08<1:02:32,  3.01s/it][A
Embedding docs (es):   0%|          | 4/1250 [00:11<1:04:50,  3.12s/it][A
Embedding docs (es):   0%|          | 5/1250 [00:15<1:04:52,  3.13s/it][A
Embedding docs (es):   0%|          | 6/1250 [00:19<1:10:58,  3.42s/it][A
Embedding docs (es):   1%|          | 7/1250 [00:21<1:05:27,  3.16s/it][A
Embedding docs (es):   1%|          | 8/1250 [00:25<1:09:56,  3.38s/it][A
Embedding docs (es):   1%|          | 9/1250 [00:28<1:05:39,  3.17s/it][A
Embedding docs (es):   1%|          | 10/1250 [00:30<1:02:43,  3.04s/it][A
Embedding docs (es):   1%|          | 11/1250 [00:33<1:00:42,  2.94s/it][A
Embedding docs (es):   1%|          | 12/1250 [00:36<1:00:38,  2.94s/it][A
Embedding docs (es):   1%|         

KeyboardInterrupt: 

# Run metric computation

In [16]:
# 01_compute_metrics.py
# pip install pandas numpy tqdm

import json, math
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

In [17]:
RUNS = Path("runs_cuvs_test_instruct_v2")
TOPK_LIST = [1, 3, 5, 10]
CS_LANGS = {"es-en", "zh-en", "zh-es"}
CS_BINS = [(0.0,0.2),(0.2,0.4),(0.4,0.6),(0.6,0.8),(0.8,1.0)]

In [18]:
def bin_label(lo, hi): 
    return f"{lo:.1f}-{hi:.1f}"

# ===== Metrics =====
def metrics_for_row(gt_ids, ret_ids, K):
    gt = set(gt_ids)
    top = ret_ids[:K]
    hits = [1 if d in gt else 0 for d in top]

    # P@K
    P = sum(hits)/max(K,1)

    # R@K
    pos = len(gt)
    R = sum(hits)/pos if pos>0 else 0.0

    # AP@K
    ap, seen = 0.0, 0
    for i,h in enumerate(hits,1):
        if h:
            seen += 1
            ap += seen/i
    AP = (ap/pos) if pos>0 else 0.0

    # MRR@K
    RR = 0.0
    for i,h in enumerate(hits,1):
        if h:
            RR = 1.0/i
            break

    # nDCG@K (binary gains)
    dcg = sum((1.0/math.log2(i+1)) for i,h in enumerate(hits,1) if h)
    idcg = sum((1.0/math.log2(i+1)) for i in range(1, min(pos,K)+1))
    nDCG = (dcg/idcg) if idcg>0 else 0.0

    return P, R, AP, RR, nDCG

# ===== Helpers =====
def load_rows_from_df(df: pd.DataFrame):
    """Return list of (qid, gt_ids, ret_ids, sw_rate). Ensures types and safe parsing."""
    # Ensure query_id
    if "query_id" in df.columns:
        qid = df["query_id"].astype(str)
    else:
        qid = pd.Series((f"row_{i}" for i in range(len(df))), index=df.index, dtype="object")

    # Parse JSON list columns
    if "ground_truth_doc_ids" not in df.columns or "retrieved_doc_ids" not in df.columns:
        raise ValueError("CSV must contain 'ground_truth_doc_ids' and 'retrieved_doc_ids' columns.")
    gt = df["ground_truth_doc_ids"].apply(parse_json_list)
    ret = df["retrieved_doc_ids"].apply(parse_json_list)

    # sw_rate numeric (optional)
    if "sw_rate" in df.columns:
        sw = pd.to_numeric(df["sw_rate"], errors="coerce")
    else:
        sw = pd.Series([np.nan]*len(df), index=df.index)

    return list(zip(qid.tolist(), gt.tolist(), ret.tolist(), sw.tolist()))

def detect_query_lang(qcsv_path: Path, df: pd.DataFrame) -> str:
    """Prefer the CSV's query_language column; fallback to filename (Q_<lang>.csv)."""
    if "query_language" in df.columns:
        vals = df["query_language"].dropna().astype(str)
        if len(vals.unique()) == 1:
            ql = vals.iloc[0]
        else:
            ql = vals.mode().iat[0]
        return str(ql).strip()
    # fallback: Q_<lang>.csv
    return qcsv_path.stem.split("_", 1)[1]

In [19]:
# ===== Helpers =====
def parse_json_list(x):
    """Parse a JSON list from a cell that may already be a list, a JSON string, or empty."""
    if isinstance(x, list):
        return [str(i) for i in x]
    if isinstance(x, str):
        s = x.strip()
        if not s:
            return []
        try:
            obj = json.loads(s)
            if isinstance(obj, list):
                return [str(i) for i in obj]
            # fallback: try comma-split if not json list
            return [t.strip() for t in s.split(",") if t.strip()]
        except Exception:
            # fallback: comma-split
            return [t.strip() for t in s.split(",") if t.strip()]
    return []

def load_rows_from_df(df: pd.DataFrame):
    """Return list of (qid, gt_ids, ret_ids, sw_rate). Ensures types and safe parsing."""
    # Ensure query_id
    if "query_id" in df.columns:
        qid = df["query_id"].astype(str)
    else:
        qid = pd.Series((f"row_{i}" for i in range(len(df))), index=df.index, dtype="object")

    # Parse JSON list columns
    if "ground_truth_doc_ids" not in df.columns or "retrieved_doc_ids" not in df.columns:
        raise ValueError("CSV must contain 'ground_truth_doc_ids' and 'retrieved_doc_ids' columns.")
    gt = df["ground_truth_doc_ids"].apply(parse_json_list)
    ret = df["retrieved_doc_ids"].apply(parse_json_list)

    # sw_rate numeric (optional)
    if "sw_rate" in df.columns:
        sw = pd.to_numeric(df["sw_rate"], errors="coerce")
    else:
        sw = pd.Series([np.nan]*len(df), index=df.index)

    return list(zip(qid.tolist(), gt.tolist(), ret.tolist(), sw.tolist()))

def detect_query_lang(qcsv_path: Path, df: pd.DataFrame) -> str:
    """Prefer the CSV's query_language column; fallback to filename (Q_<lang>.csv)."""
    if "query_language" in df.columns:
        vals = df["query_language"].dropna().astype(str)
        if len(vals.unique()) == 1:
            ql = vals.iloc[0]
        else:
            ql = vals.mode().iat[0]
        return str(ql).strip()
    # fallback: Q_<lang>.csv
    return qcsv_path.stem.split("_", 1)[1]

In [20]:
# ===== Main =====
def main():
    print("=== Starting IR evaluation summary pass ===")
    rows_sum = []
    rows_cs = []
    total_models = 0
    total_runs = 0

    for model_dir in tqdm(sorted(RUNS.glob("*")), desc="models"):
        if not model_dir.is_dir():
            continue
        model_slug = model_dir.name
        total_models += 1
        print(f"\n--- Processing model: {model_slug} ---")

        for ddir in sorted(model_dir.glob("D_*")):
            if not ddir.is_dir():
                continue
            doc_lang = ddir.name.split("_", 1)[1]
            print(f"  [Doc lang: {doc_lang}]")

            for qcsv in sorted(ddir.glob("Q_*.csv")):
                # Read once, reuse
                try:
                    df_tmp = pd.read_csv(qcsv)
                except Exception as e:
                    print(f"    !! Failed to read {qcsv.name}: {e}")
                    continue

                query_lang = detect_query_lang(qcsv, df_tmp)
                norm_lang = query_lang.replace("_", "-").strip()

                print(f"    > Query file: {qcsv.name} | Query lang: {query_lang} -> normalized: {norm_lang}")

                try:
                    data = load_rows_from_df(df_tmp)
                except Exception as e:
                    print(f"      !! Skipping {qcsv.name}: {e}")
                    continue

                if not data:
                    print(f"      !! Skipping: no rows found in {qcsv.name}")
                    continue

                N = len(data)
                print(f"      Loaded {N} rows")

                # aggregate metrics over all rows
                sums = {f"{m}@{k}": 0.0 for m in ["P","R","MAP","MRR","nDCG"] for k in TOPK_LIST}
                for _, gt, ret, _ in data:
                    for K in TOPK_LIST:
                        P, R, AP, RR, nD = metrics_for_row(gt, ret, K)
                        sums[f"P@{K}"]   += P
                        sums[f"R@{K}"]   += R
                        sums[f"MAP@{K}"] += AP
                        sums[f"MRR@{K}"] += RR
                        sums[f"nDCG@{K}"]+= nD

                summary = {
                    "model": model_slug,
                    "doc_lang": doc_lang,
                    "query_lang": norm_lang,
                    "N": N,
                }
                summary.update({k: v/N for k, v in sums.items()})
                rows_sum.append(summary)
                total_runs += 1

                # === Code-switched bucket analysis ===
                if norm_lang in CS_LANGS:
                    print(f"      [CS bucket analysis enabled for {norm_lang}]")
                    bins = {bin_label(lo,hi): [] for lo,hi in CS_BINS}

                    for _, gt, ret, sw in data:
                        if sw is None or (isinstance(sw, float) and math.isnan(sw)):
                            continue
                        try:
                            swv = float(sw)
                        except Exception:
                            continue

                        for lo, hi in CS_BINS:
                            # include exact 1.0 in the last bin
                            if (lo <= swv < hi) or (math.isclose(swv, 1.0) and math.isclose(hi, 1.0)):
                                bins[bin_label(lo, hi)].append((gt, ret))
                                break

                    for b, pairs in bins.items():
                        if not pairs:
                            # Still log empty bins for visibility
                            print(f"        - Bin {b}: 0 queries")
                            continue

                        B = len(pairs)
                        print(f"        - Bin {b}: {B} queries")

                        sumsB = {f"{m}@{k}": 0.0 for m in ["P","R","MAP","MRR","nDCG"] for k in TOPK_LIST}
                        for gt, ret in pairs:
                            for K in TOPK_LIST:
                                P, R, AP, RR, nD = metrics_for_row(gt, ret, K)
                                sumsB[f"P@{K}"]   += P
                                sumsB[f"R@{K}"]   += R
                                sumsB[f"MAP@{K}"] += AP
                                sumsB[f"MRR@{K}"] += RR
                                sumsB[f"nDCG@{K}"]+= nD

                        row = {
                            "model": model_slug,
                            "doc_lang": doc_lang,
                            "query_lang": norm_lang,
                            "sw_bin": b,
                            "N": B,
                        }
                        row.update({k: v/B for k, v in sumsB.items()})
                        rows_cs.append(row)

    # === Write results to Excel ===
    out_dir = RUNS
    out_dir.mkdir(parents=True, exist_ok=True)
    excel_path = out_dir / "summaries.xlsx"

    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
        df_all = pd.DataFrame(rows_sum)
        if not df_all.empty:
            # All Results
            df_all.to_excel(writer, sheet_name="All Results", index=False)

            # ES-related queries (plain 'es' and CS 'es-en')
            df_es = df_all[df_all["query_lang"].isin(["es", "es-en"])]
            df_es.to_excel(writer, sheet_name="ES Queries", index=False)

            # ZH-related queries (plain 'zh' and CS 'zh-en', 'zh-es')
            df_zh = df_all[df_all["query_lang"].isin(["zh", "zh-en", "zh-es"])]
            df_zh.to_excel(writer, sheet_name="ZH Queries", index=False)

            # Per-model sheets
            for model in sorted(df_all["model"].dropna().unique()):
                df_model = df_all[df_all["model"] == model]
                df_model.to_excel(writer, sheet_name=f"Model_{str(model)[:25]}", index=False)
        else:
            # still create empty sheets
            pd.DataFrame().to_excel(writer, sheet_name="All Results", index=False)
            pd.DataFrame().to_excel(writer, sheet_name="ES Queries", index=False)
            pd.DataFrame().to_excel(writer, sheet_name="ZH Queries", index=False)

        # CodeSwitch Curves sheet (always write, even if empty)
        df_cs = pd.DataFrame(rows_cs)
        df_cs.to_excel(writer, sheet_name="CodeSwitch Curves", index=False)

    # Console summary
    print(f"\n=== Excel saved to: {excel_path} ===")
    print(f"Total models: {total_models} | Total runs: {total_runs}")
    print(f"Per-bin rows: {len(rows_cs)}")
    if len(rows_cs) > 0:
        try:
            print(
                df_cs.groupby(["model","doc_lang","query_lang","sw_bin"])["N"]
                .sum()
                .sort_values(ascending=False)
                .head(10)
            )
        except Exception:
            pass

if __name__ == "__main__":
    main()


=== Starting IR evaluation summary pass ===


models:   0%|          | 0/9 [00:00<?, ?it/s]


--- Processing model: .ipynb_checkpoints ---

--- Processing model: Qwen__Qwen2.5-32B-Instruct ---

--- Processing model: Qwen__Qwen2.5-3B-Instruct ---
  [Doc lang: en]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es__to__en.csv | Query lang: es -> normalized: es
      Loaded 10000 rows
    > Query file: Q_zh-en__to__en.csv | Query lang: zh-en -> normalized: zh-en
      Loaded 10000 rows
      [CS bucket analysis enabled for zh-en]
        - Bin 0.0-0.2: 3917 queries
        - Bin 0.2-0.4: 2337 queries
        - Bin 0.4-0.6: 1736 queries
        - Bin 0.6-0.8: 952 queries
        - Bin 0.8-1.0: 1058 queri

models:  33%|███▎      | 3/9 [00:04<00:09,  1.59s/it]

        - Bin 0.4-0.6: 1729 queries
        - Bin 0.6-0.8: 994 queries
        - Bin 0.8-1.0: 1059 queries
    > Query file: Q_zh.csv | Query lang: zh -> normalized: zh
      Loaded 10000 rows

--- Processing model: Qwen__Qwen2.5-7B-Instruct ---
  [Doc lang: en]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es__to__en.csv | Query lang: es -> normalized: es
      Loaded 10000 rows
    > Query file: Q_zh-en__to__en.csv | Query lang: zh-en -> normalized: zh-en
      Loaded 10000 rows
      [CS bucket analysis enabled for zh-en]
        - Bin 0.0-0.2: 3917 queries
        - Bin 0.2-0.4: 2337 queries
        - B

models:  44%|████▍     | 4/9 [00:09<00:12,  2.53s/it]


--- Processing model: meta-llama__Llama-3.1-8B-Instruct ---
  [Doc lang: en]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es__to__en.csv | Query lang: es -> normalized: es
      Loaded 10000 rows
    > Query file: Q_zh-en__to__en.csv | Query lang: zh-en -> normalized: zh-en
      Loaded 10000 rows
      [CS bucket analysis enabled for zh-en]
        - Bin 0.0-0.2: 3917 queries
        - Bin 0.2-0.4: 2337 queries
        - Bin 0.4-0.6: 1736 queries
        - Bin 0.6-0.8: 952 queries
        - Bin 0.8-1.0: 1058 queries
    > Query file: Q_zh-es__to__en.csv | Query lang: zh-es -> normalized: zh-es
      Load

models:  56%|█████▌    | 5/9 [00:13<00:12,  3.14s/it]


--- Processing model: meta-llama__Llama-3.2-1B-Instruct ---
  [Doc lang: en]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es__to__en.csv | Query lang: es -> normalized: es
      Loaded 10000 rows
    > Query file: Q_zh-en__to__en.csv | Query lang: zh-en -> normalized: zh-en
      Loaded 10000 rows
      [CS bucket analysis enabled for zh-en]
        - Bin 0.0-0.2: 3917 queries
        - Bin 0.2-0.4: 2337 queries
        - Bin 0.4-0.6: 1736 queries
        - Bin 0.6-0.8: 952 queries
        - Bin 0.8-1.0: 1058 queries
    > Query file: Q_zh-es__to__en.csv | Query lang: zh-es -> normalized: zh-es
      Load

models:  67%|██████▋   | 6/9 [00:18<00:10,  3.62s/it]

      Loaded 10000 rows

--- Processing model: microsoft__Phi-3-mini-4k-instruct ---
  [Doc lang: en]
    > Query file: Q_en.csv | Query lang: en -> normalized: en
      Loaded 10000 rows
    > Query file: Q_es-en__to__en.csv | Query lang: es-en -> normalized: es-en
      Loaded 10000 rows
      [CS bucket analysis enabled for es-en]
        - Bin 0.0-0.2: 4375 queries
        - Bin 0.2-0.4: 3368 queries
        - Bin 0.4-0.6: 1269 queries
        - Bin 0.6-0.8: 406 queries
        - Bin 0.8-1.0: 582 queries
    > Query file: Q_es__to__en.csv | Query lang: es -> normalized: es
      Loaded 10000 rows
    > Query file: Q_zh-en__to__en.csv | Query lang: zh-en -> normalized: zh-en
      Loaded 10000 rows
      [CS bucket analysis enabled for zh-en]
        - Bin 0.0-0.2: 3917 queries
        - Bin 0.2-0.4: 2337 queries
        - Bin 0.4-0.6: 1736 queries
        - Bin 0.6-0.8: 952 queries
        - Bin 0.8-1.0: 1058 queries
    > Query file: Q_zh-es__to__en.csv | Query lang: zh-es -> norm

models: 100%|██████████| 9/9 [00:22<00:00,  2.54s/it]


--- Processing model: mistralai__Mistral-7B-Instruct-v0.3 ---

=== Excel saved to: runs_cuvs_test_instruct_v2/summaries.xlsx ===
Total models: 8 | Total runs: 90
Per-bin rows: 225
model                              doc_lang  query_lang  sw_bin 
Qwen__Qwen2.5-3B-Instruct          en        es-en       0.0-0.2    4375
                                   es        es-en       0.0-0.2    4375
                                   zh        es-en       0.0-0.2    4375
Qwen__Qwen2.5-7B-Instruct          es        es-en       0.0-0.2    4375
                                   en        es-en       0.0-0.2    4375
                                   zh        es-en       0.0-0.2    4375
meta-llama__Llama-3.1-8B-Instruct  en        es-en       0.0-0.2    4375
                                   zh        es-en       0.0-0.2    4375
                                   es        es-en       0.0-0.2    4375
meta-llama__Llama-3.2-1B-Instruct  zh        es-en       0.0-0.2    4375
Name: N, dtype: int64



