In [2]:
import pandas as pd
df = pd.read_parquet("../data/dataset_tg.parquet")

In [3]:
df["id_channel"].value_counts()

id_channel
3     15747
1      8552
6      6105
4      5073
2      3916
18     2891
5      2476
Name: count, dtype: int64

In [4]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,message_id,views_o0,views_o1,views_o2,views_o3,forwards_o0,forwards_o1,forwards_o2,...,best_ctr_reactions_0_3,viral_ml,viral_final,llm_json,is_economic,topic,confidence,reason,economic_signals,noise_signals
0,0,0,00027151-a524-4d93-a820-116398fb81bb,492127.0,603269.0,619397.0,625475.0,381.0,415.0,449.0,...,,-0.173209,0.406649,"{'confidence': 0.96, 'economic_signals': array...",False,,0.96,Текст содержит лишь политические и военные выс...,,
1,1,2,0003b835-cf4a-43ff-b155-a144cf56b7f8,486195.0,507393.0,514469.0,517508.0,273.0,273.0,273.0,...,,-0.150123,0.445509,"{'confidence': 0.9, 'economic_signals': array(...",False,,0.9,Новость посвящена уголовному делу о хищении ср...,,
2,2,4,0005535a-74a9-4cb9-853f-ce04612d2f94,455967.0,609468.0,627631.0,634731.0,448.0,551.0,551.0,...,,-0.170353,0.396754,"{'confidence': 0.95, 'economic_signals': array...",False,,0.95,Новость описывает процесс поступления детей в ...,,
3,3,7,0007e2f8-787d-404f-91ff-e2582096a4a7,22376.0,23594.0,23735.0,23825.0,5.0,9.0,9.0,...,,0.005396,0.703622,"{'confidence': 0.96, 'economic_signals': array...",True,Санкции и геополитика,0.96,Новость посвящена позиции Сербии по санкциям Е...,,
4,4,8,000884a5-8291-4ec1-805f-ac131112aaf7,19493.0,20605.0,,,64.0,67.0,,...,0.00041,0.024601,0.680369,"{'confidence': 0.96, 'economic_signals': array...",True,Рынки капитала,0.96,Новость описывает падение фондового рынка и ра...,,


In [5]:
df.shape

(44760, 95)

In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/cleaned_news_exp.csv")
texts = df["message"].fillna("").astype(str)

df_len = pd.DataFrame({
    "chars": texts.str.len(),
    "words": texts.str.split().map(len),
})

def qstats(s):
    return {
        "count": int(s.count()),
        "mean": float(s.mean()),
        "p50": float(s.quantile(0.50)),
        "p75": float(s.quantile(0.75)),
        "p90": float(s.quantile(0.90)),
        "p95": float(s.quantile(0.95)),
        "p99": float(s.quantile(0.99)),
        "max": float(s.max()),
    }

print("CHARS:", qstats(df_len["chars"]))
print("WORDS:", qstats(df_len["words"]))


CHARS: {'count': 18618, 'mean': 451.36663443978944, 'p50': 334.0, 'p75': 558.0, 'p90': 908.0, 'p95': 1127.1499999999978, 'p99': 2348.129999999979, 'max': 4067.0}
WORDS: {'count': 18618, 'mean': 63.3464926415297, 'p50': 47.0, 'p75': 77.0, 'p90': 124.29999999999927, 'p95': 158.0, 'p99': 317.8299999999981, 'max': 634.0}


In [7]:
from transformers import AutoTokenizer
from tqdm import tqdm

tok_e5 = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")

def token_lens(tokenizer, texts, batch_size=256):
    lens = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=False, truncation=False, add_special_tokens=True)
        lens.extend([len(ids) for ids in enc["input_ids"]])
    return np.array(lens)

e5_lens = token_lens(tok_e5, texts.tolist(), batch_size=256)

def qstats_np(a):
    return {
        "count": int(a.size),
        "mean": float(a.mean()),
        "p50": float(np.quantile(a, 0.50)),
        "p75": float(np.quantile(a, 0.75)),
        "p90": float(np.quantile(a, 0.90)),
        "p95": float(np.quantile(a, 0.95)),
        "p99": float(np.quantile(a, 0.99)),
        "max": float(a.max()),
    }

print("E5 token lens:", qstats_np(e5_lens))


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Tokenizing:   0%|          | 0/73 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
Tokenizing: 100%|██████████| 73/73 [00:01<00:00, 48.27it/s]

E5 token lens: {'count': 18618, 'mean': 115.6950263186164, 'p50': 87.0, 'p75': 140.0, 'p90': 221.29999999999927, 'p95': 286.0, 'p99': 587.0, 'max': 1226.0}





In [8]:
tok_gte = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
gte_lens = token_lens(tok_gte, texts.tolist(), batch_size=256)
print("GTE token lens:", qstats_np(gte_lens))


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Tokenizing: 100%|██████████| 73/73 [00:01<00:00, 46.58it/s]

GTE token lens: {'count': 18618, 'mean': 115.6950263186164, 'p50': 87.0, 'p75': 140.0, 'p90': 221.29999999999927, 'p95': 286.0, 'p99': 587.0, 'max': 1226.0}





### intfloat/multilingual-e5-small 
python embed_st.py   --input ../data/cleaned_news_exp.csv   --model intfloat/multilingual-e5-small   --prefix "passage: "   --out embeddings/emb_e5_small_fp16.npy --batch_size 16   --max_len 512

### intfloat/multilingual-e5-base
python embed_st.py   --input ../data/cleaned_news_exp.csv   --model intfloat/multilingual-e5-base   --prefix "passage: "   --out embeddings/emb_e5_base_fp16.npy --batch_size 16   --max_len 512

### intfloat/multilingual-e5-large
python embed_st.py   --input ../data/cleaned_news_exp.csv   --model intfloat/multilingual-e5-large   --prefix "passage: "   --out embeddings/emb_e5_large_fp16.npy  --batch_size 16   --max_len 512

### sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 
python embed_st.py   --input ../data/cleaned_news_exp.csv   --model sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2   --out embeddings/emb_minilm_fp16.npy  --batch_size 32   --max_len 512

### Alibaba-NLP/gte-multilingual-base
python embed_gte.py   --input ../data/cleaned_news_exp.csv   --out embeddings/emb_gte_fp16.npy   --rowmap embeddings/rowmap_gte.csv   --batch_size 4 --max_len 256

In [7]:
df["message_id"].nunique()

18618

In [8]:
df = df[["message_id", "id_channel", "message", "date", "topic"]]
df.head()

Unnamed: 0,message_id,id_channel,message,date,topic
0,0007e2f8-787d-404f-91ff-e2582096a4a7,18,Сербия согласна поддержать санкции Евросоюза п...,2025-07-26 07:01:09,Санкции и геополитика
1,000884a5-8291-4ec1-805f-ac131112aaf7,6,Китайский рынок акций упал сильнее всего с апр...,2025-09-04 10:16:56,Рынки капитала
2,000b0331-92a9-4eb4-9f58-d00811257758,18,Министерство труда США отменило рекомендации 2...,2025-05-29 04:05:09,Государственная экономическая политика
3,000b8df7-d902-41eb-b668-900614902f0a,6,Чистая прибыль Московской биржи по МСФО во вто...,2025-08-26 11:40:55,Корпоративные финансы
4,0011adea-7a98-4dcc-b753-905597b42788,4,"США хотят получить нефть и «всё, что угодно» о...",2025-02-22 20:57:18,Сырьевые рынки


In [9]:
rowmap = pd.read_csv("embeddings/rowmap.csv")
rowmap.head()

Unnamed: 0,row_id,message_id,date
0,0,0007e2f8-787d-404f-91ff-e2582096a4a7,2025-07-26 07:01:09
1,1,000884a5-8291-4ec1-805f-ac131112aaf7,2025-09-04 10:16:56
2,2,000b0331-92a9-4eb4-9f58-d00811257758,2025-05-29 04:05:09
3,3,000b8df7-d902-41eb-b668-900614902f0a,2025-08-26 11:40:55
4,4,0011adea-7a98-4dcc-b753-905597b42788,2025-02-22 20:57:18


In [10]:
# проверка согласованности
import numpy as np
import pandas as pd
from pathlib import Path

rowmap = pd.read_csv("embeddings/rowmap.csv")
N = len(rowmap)

emb_dir = Path("embeddings")
for p in sorted(emb_dir.glob("emb_*_fp16.npy")):
    E = np.load(p, mmap_mode="r")
    print(p.name, E.shape, "OK" if E.shape[0] == N else "MISMATCH")

emb_e5_base_fp16.npy (18618, 768) OK
emb_e5_large_fp16.npy (18618, 1024) OK
emb_e5_small_fp16.npy (18618, 384) OK
emb_gte_fp16.npy (18618, 768) OK
emb_minilm_fp16.npy (18618, 384) OK


In [18]:
import json, re, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

JUDGE_MODEL = "Qwen/Qwen2.5-7B-Instruct"

judge_tokenizer = AutoTokenizer.from_pretrained(JUDGE_MODEL, trust_remote_code=True)
judge_model = AutoModelForCausalLM.from_pretrained(
    JUDGE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
judge_model.eval()

def _extract_first_json_obj(text: str):
    text = (text or "").strip()
    if text.startswith("{") and text.endswith("}"):
        try:
            return json.loads(text)
        except Exception:
            pass
    for m in re.finditer(r"\{.*?\}", text, flags=re.DOTALL):
        cand = m.group(0)
        try:
            return json.loads(cand)
        except Exception:
            continue
    return None

@torch.inference_mode()
def judge_pair(query: str, snippet: str, max_new_tokens: int = 40) -> int:
    user = f"Запрос:\n{query}\n\nДокумент:\n{snippet}\n"
    messages = [
        {"role": "system", "content": JUDGE_SYSTEM},
        {"role": "user", "content": user},
    ]
    prompt = judge_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = judge_tokenizer(prompt, return_tensors="pt").to(judge_model.device)

    out = judge_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=judge_tokenizer.eos_token_id,
        pad_token_id=judge_tokenizer.eos_token_id,
    )
    gen = judge_tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    data = _extract_first_json_obj(gen)

    if isinstance(data, dict) and data.get("relevance") in (0, 1, 2):
        return int(data["relevance"])

    return 0

def judge_with_llm(query: str, docs: list[str]) -> list[int]:
    return [judge_pair(query, d) for d in docs]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
JUDGE_SYSTEM = """Ты - строгий эксперт по информационному поиску по новостям об экономике.

Оцени, насколько найденный документ релевантен запросу.

Запрос:
{query}

Документ:
{snippet}

Оценка:
2 — документ напрямую отвечает на запрос или описывает то же событие.
1 — документ тематически близок, но не отвечает напрямую запросу.
0 — документ нерелевантен.

Верни ТОЛЬКО JSON и ничего более:
{"relevance": 0|1|2}
"""

In [20]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re

DATA_PATH = Path("cleaned_news_exp.csv")    
ROWMAP_PATH = Path("embeddings/rowmap.csv")
EMB_DIR = Path("embeddings")

K = 10
SNIPPET_CHARS = 1000


QUERY_SPECS = [
    "курс рубля к доллару",
    "укрепление рубля причины",
    "ослабление рубля после решений ФРС",
    "курс евро к рублю прогноз",
    "валютный рынок интервенции",
    "ключевая ставка ЦБ решение",
    "повышение ключевой ставки последствия",
    "снижение ключевой ставки эффект на кредиты",
    "инфляция в России ускорилась",
    "инфляция замедлилась причины",
    "индекс потребительских цен ИПЦ",
    "нефть Brent цена рост",
    "нефть Brent падение причины",
    "ОПЕК решение по добыче",
    "газ цены в Европе",
    "фондовый рынок России падение",
    "Мосбиржа индекс IMOEX рост",
    "акции банков динамика",
    "санкции влияние на экономику",
    "бюджет дефицит доходы расходы",
]


df = pd.read_csv(DATA_PATH, usecols=["message_id", "message"])
df["message"] = df["message"].fillna("").astype(str)

rowmap = pd.read_csv(ROWMAP_PATH, usecols=["row_id", "message_id", "date"])
rowmap["date"] = pd.to_datetime(rowmap["date"], errors="coerce")

m = rowmap.merge(df, on="message_id", how="left")
assert m["message"].notna().all(), "Some messages missing after merge"
texts = m["message"].tolist()

emb_files = sorted(EMB_DIR.glob("emb_*.npy"))
print("Found embeddings:", [p.name for p in emb_files])

from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModel

MODEL_REGISTRY = {
    "emb_e5_small_fp16.npy": ("st",  "intfloat/multilingual-e5-small", "query: "),
    "emb_e5_base_fp16.npy":  ("st",  "intfloat/multilingual-e5-base",  "query: "),
    "emb_e5_large_fp16.npy": ("st",  "intfloat/multilingual-e5-large", "query: "),
    "emb_minilm_fp16.npy":   ("st",  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", ""),
    "emb_gte_fp16.npy":      ("gte", "Alibaba-NLP/gte-multilingual-base", ""),
}

_st_cache = {}
_gte_cache = None

def encode_query_st(model_name, prefix, query_text):
    if model_name not in _st_cache:
        mod = SentenceTransformer(model_name)
        _st_cache[model_name] = mod
    mod = _st_cache[model_name]
    q = (prefix + query_text) if prefix else query_text
    v = mod.encode([q], normalize_embeddings=True, show_progress_bar=False)
    return v.astype(np.float32)[0]

def mean_pool(last_hidden, attention_mask):
    mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
    summed = (last_hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-6)
    return summed / counts

@torch.inference_mode()
def encode_query_gte(model_name, query_text, max_len=256):
    global _gte_cache
    if _gte_cache is None:
        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        mdl = AutoModel.from_pretrained(
            model_name, trust_remote_code=True, dtype=torch.float16, low_cpu_mem_usage=True
        )
        mdl.eval()
        _gte_cache = (tok, mdl)

    tok, mdl = _gte_cache
    inputs = tok([query_text], padding=True, truncation=True, max_length=max_len, return_tensors="pt")
    out = mdl(**inputs)
    emb = mean_pool(out.last_hidden_state, inputs["attention_mask"])
    emb = torch.nn.functional.normalize(emb, p=2, dim=1)
    return emb.cpu().numpy().astype(np.float32)[0]

def encode_query_for_embfile(embfile_name, query_text):
    typ, model_name, prefix = MODEL_REGISTRY[embfile_name]
    if typ == "st":
        return encode_query_st(model_name, prefix, query_text)
    elif typ == "gte":
        return encode_query_gte(model_name, query_text)
    else:
        raise ValueError("Unknown type")


def load_and_normalize_emb(path: Path):
    E = np.load(path, mmap_mode="r")
    X = E.astype(np.float32)
    X /= (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)
    return X

def topk_cosine(qvec, X, k=10):
    sims = X @ qvec
    idx = np.argpartition(-sims, kth=np.arange(k))[:k]
    idx = idx[np.argsort(-sims[idx])]
    return idx, sims[idx]

def snippet(t, n=SNIPPET_CHARS):
    t = (t or "").strip()
    t = re.sub(r"\s+", " ", t)
    return t[:n]



def dcg(rels):
    rels = np.array(rels, dtype=float)
    denom = np.log2(np.arange(2, len(rels) + 2))
    return float(np.sum((2**rels - 1) / denom))

def ndcg_at_k(rels, k=10):
    rels = rels[:k]
    ideal = sorted(rels, reverse=True)
    denom = dcg(ideal)
    return 0.0 if denom == 0 else dcg(rels) / denom

def precision_at_k(rels, k=10, thr=1):
    rels = np.array(rels[:k])
    return float(np.mean(rels >= thr))

def mrr_at_k(rels, k=10, thr=2):
    rels = rels[:k]
    for i, r in enumerate(rels, start=1):
        if r >= thr:
            return 1.0 / i
    return 0.0




results = []

for emb_path in tqdm(emb_files, desc="Embedders"):
    emb_name = emb_path.name
    if emb_name not in MODEL_REGISTRY:
        print("Skip (no registry entry):", emb_name)
        continue

    X = load_and_normalize_emb(emb_path)
    assert X.shape[0] == len(texts), f"N mismatch: {emb_name} {X.shape[0]} vs {len(texts)}"

    for q in tqdm(QUERY_SPECS, desc=f"Queries for {emb_name}", leave=False):
        qv = encode_query_for_embfile(emb_name, q)

        idx, sims = topk_cosine(qv, X, k=K)
        docs = [snippet(texts[i]) for i in idx]

        #LLM judge
        scores = judge_with_llm(q, docs) 
        assert isinstance(scores, list) and len(scores) == K
        assert all(s in (0,1,2) for s in scores)

        results.append({
            "emb": emb_name,
            "query": q,
            "P@10(rel>=1)": precision_at_k(scores, k=K, thr=1),
            "P@10(rel=2)": precision_at_k(scores, k=K, thr=2),
            "nDCG@10": ndcg_at_k(scores, k=K),
            "MRR@10(rel=2)": mrr_at_k(scores, k=K, thr=2),
        })

res = pd.DataFrame(results)


Found embeddings: ['emb_e5_base_fp16.npy', 'emb_e5_large_fp16.npy', 'emb_e5_small_fp16.npy', 'emb_gte_fp16.npy', 'emb_minilm_fp16.npy']


Embedders:   0%|          | 0/5 [00:00<?, ?it/s]
Queries for emb_e5_base_fp16.npy:   0%|          | 0/20 [00:00<?, ?it/s][AThe following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

Queries for emb_e5_base_fp16.npy:   5%|▌         | 1/20 [00:07<02:29,  7.86s/it][A
Queries for emb_e5_base_fp16.npy:  10%|█         | 2/20 [00:09<01:20,  4.48s/it][A
Queries for emb_e5_base_fp16.npy:  15%|█▌        | 3/20 [00:12<00:57,  3.37s/it][A
Queries for emb_e5_base_fp16.npy:  20%|██        | 4/20 [00:14<00:45,  2.85s/it][A
Queries for emb_e5_base_fp16.npy:  25%|██▌       | 5/20 [00:16<00:39,  2.61s/it][A
Queries for emb_e5_base_fp16.npy:  30%|███       | 6/20 [00:18<00:33,  2.42s/it][A
Queries for emb_e5_base_fp16.npy:  35%|███▌      | 7/20 [00:20<00:29,  2.30s/it][A
Queries for emb_e5_base_fp16.npy:  40%|████      | 8/20 [00:22<00:26,  2.21s/it][A
Queries for emb_e5_base_fp16.npy:  45%|████▌     | 

In [21]:
summary = (res.groupby("emb")[["P@10(rel>=1)", "P@10(rel=2)", "nDCG@10", "MRR@10(rel=2)"]]
             .mean()
             .sort_values("nDCG@10", ascending=False))

display(summary)

Unnamed: 0_level_0,P@10(rel>=1),P@10(rel=2),nDCG@10,MRR@10(rel=2)
emb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
emb_e5_small_fp16.npy,0.92,0.595,0.954822,0.866667
emb_e5_large_fp16.npy,0.955,0.665,0.930934,0.8725
emb_e5_base_fp16.npy,0.92,0.685,0.9277,0.854167
emb_gte_fp16.npy,0.84,0.425,0.882691,0.622917
emb_minilm_fp16.npy,0.82,0.44,0.863247,0.708333


### Протестим на топ-50 две топовых модели еще

In [24]:
@torch.inference_mode()
def judge_pairs_batched(pairs, batch_size=32, max_new_tokens=40):
    out_scores = []
    for i in range(0, len(pairs), batch_size):
        chunk = pairs[i:i+batch_size]

        prompts = []
        for q, sn in chunk:
            user = f"Запрос:\n{q}\n\nДокумент:\n{sn}\n"
            messages = [
                {"role": "system", "content": JUDGE_SYSTEM},
                {"role": "user", "content": user},
            ]
            prompts.append(
                judge_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            )

        enc = judge_tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(judge_model.device)

        gen_ids = judge_model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=judge_tokenizer.eos_token_id,
            pad_token_id=judge_tokenizer.eos_token_id,
        )

        for b in range(len(chunk)):
            prompt_len = int(enc["attention_mask"][b].sum().item())
            gen_txt = judge_tokenizer.decode(gen_ids[b][prompt_len:], skip_special_tokens=True)
            data = _extract_first_json_obj(gen_txt)
            rel = int(data["relevance"]) if isinstance(data, dict) and data.get("relevance") in (0,1,2) else 0
            out_scores.append(rel)

    return out_scores


SEED = 42
N_QUERIES_TOTAL = 50
N_MANUAL = 20
N_AUTO = N_QUERIES_TOTAL - N_MANUAL

def first_sentence(text: str) -> str:
    t = re.sub(r"\s+", " ", (text or "").strip())
    parts = re.split(r"[.!?]", t, maxsplit=1)
    s = (parts[0] if parts else t).strip()
    return (s if s else t)[:140]

rng = np.random.default_rng(SEED)
auto_idx = rng.choice(np.arange(len(texts)), size=N_AUTO, replace=False)
auto_queries = [first_sentence(texts[i]) for i in auto_idx]

queries = QUERY_SPECS[:N_MANUAL] + auto_queries
assert len(queries) == N_QUERIES_TOTAL


K = 50

def topk_cosine(qvec, X, k=50):
    sims = X @ qvec
    idx = np.argpartition(-sims, kth=k-1)[:k]
    idx = idx[np.argsort(-sims[idx])]
    return idx

X_base  = load_and_normalize_emb(EMB_DIR / "emb_e5_base_fp16.npy")
X_large = load_and_normalize_emb(EMB_DIR / "emb_e5_large_fp16.npy")

assert X_base.shape[0] == len(texts),  f"X_base rows != texts: {X_base.shape[0]} vs {len(texts)}"
assert X_large.shape[0] == len(texts), f"X_large rows != texts: {X_large.shape[0]} vs {len(texts)}"


enc_base = SentenceTransformer("intfloat/multilingual-e5-base")
enc_large = SentenceTransformer("intfloat/multilingual-e5-large")

def encode_e5_query(encoder, q: str) -> np.ndarray:
    v = encoder.encode(["query: " + q], normalize_embeddings=True, show_progress_bar=False)
    return v.astype(np.float32)[0]


rows = []

for q in tqdm(queries, desc="Queries"):
    qv_b = encode_e5_query(enc_base, q)
    qv_l = encode_e5_query(enc_large, q)

    idx_b = topk_cosine(qv_b, X_base,  k=K)
    idx_l = topk_cosine(qv_l, X_large, k=K)

    docs_b = [snippet(texts[i]) for i in idx_b]
    docs_l = [snippet(texts[i]) for i in idx_l]

    # pairwise judge (батчами) — используем твою judge_pairs_batched
    scores_b = judge_pairs_batched([(q, d) for d in docs_b], batch_size=32)
    scores_l = judge_pairs_batched([(q, d) for d in docs_l], batch_size=32)

    rows.append({
        "query": q,
        "ndcg_base": ndcg_at_k(scores_b, K),
        "ndcg_large": ndcg_at_k(scores_l, K),
        "mrr_base": mrr_at_k(scores_b, K, thr=2),
        "mrr_large": mrr_at_k(scores_l, K, thr=2),
        "p_base_2": precision_at_k(scores_b, K, thr=2),
        "p_large_2": precision_at_k(scores_l, K, thr=2),
        "p_base_1": precision_at_k(scores_b, K, thr=1),
        "p_large_1": precision_at_k(scores_l, K, thr=1),
    })

res = pd.DataFrame(rows)

Queries: 100%|██████████| 50/50 [07:35<00:00,  9.11s/it]


In [25]:
summary = pd.DataFrame({
    "metric": ["nDCG@50", "MRR@50(rel=2)", "P@50(rel=2)", "P@50(rel>=1)"],
    "e5_base": [
        res["ndcg_base"].mean(),
        res["mrr_base"].mean(),
        res["p_base_2"].mean(),
        res["p_base_1"].mean(),
    ],
    "e5_large": [
        res["ndcg_large"].mean(),
        res["mrr_large"].mean(),
        res["p_large_2"].mean(),
        res["p_large_1"].mean(),
    ],
})
display(summary)

print("Win-rate base > large | nDCG:", float((res["ndcg_base"] > res["ndcg_large"]).mean()))
print("Win-rate base > large | MRR :", float((res["mrr_base"]  > res["mrr_large"]).mean()))
print("Win-rate base > large | P@50(2):", float((res["p_base_2"] > res["p_large_2"]).mean()))

Unnamed: 0,metric,e5_base,e5_large
0,nDCG@50,0.870425,0.89024
1,MRR@50(rel=2),0.891667,0.927222
2,P@50(rel=2),0.234,0.23
3,P@50(rel>=1),0.4136,0.426


Win-rate base > large | nDCG: 0.36
Win-rate base > large | MRR : 0.06
Win-rate base > large | P@50(2): 0.3


## Выбираем e5_large как эмбеддер для TemporalRAG