In [1]:
import pandas as pd

def _norm_query(s: str) -> str:
    if s is None:
        return ""
    return " ".join(str(s).strip().lower().split())

def load_and_intersect_by_query(path_rag: str, path_rag_filtered: str, path_temporal_filtered: str):
    """Load three parquet files and keep only rows whose query occurs in all of them.

    Queries are compared through the key produced by ``_norm_query``; empty
    keys are ignored when building the intersection.

    Args:
        path_rag: parquet path for the "rag" frame.
        path_rag_filtered: parquet path for the "rag_filtered" frame.
        path_temporal_filtered: parquet path for the "temporal_filtered" frame.

    Returns:
        A 5-tuple ``(rag_rows, rag_filtered_rows, temporal_filtered_rows,
        common_q, counts)`` where the first three items are the filtered
        frames (without the helper key column), ``common_q`` is the sorted
        list of shared normalized queries and ``counts`` is a dict of
        unique-query and kept-row counts.

    Raises:
        KeyError: if any of the frames has no ``query`` column.
    """
    labels = ("rag", "rag_filtered", "temporal_filtered")
    loaded = [
        pd.read_parquet(path)
        for path in (path_rag, path_rag_filtered, path_temporal_filtered)
    ]

    # Validate every frame before doing any work on them.
    for label, frame in zip(labels, loaded):
        if "query" not in frame.columns:
            raise KeyError(f"{label}: missing column 'query'")

    keyed_frames = []
    key_sets = []
    for frame in loaded:
        # Work on a copy so the temporary key column never touches the loaded frame.
        keyed = frame.copy()
        keyed["_q"] = keyed["query"].map(_norm_query)
        keyed_frames.append(keyed)
        key_sets.append(set(keyed["_q"].dropna().tolist()) - {""})

    common_q = sorted(set.intersection(*key_sets))

    kept = [
        keyed[keyed["_q"].isin(common_q)].drop(columns=["_q"])
        for keyed in keyed_frames
    ]

    counts = {f"{label}_unique_queries": len(keys) for label, keys in zip(labels, key_sets)}
    counts["common_unique_queries"] = len(common_q)
    for label, rows in zip(labels, kept):
        counts[f"{label}_rows_kept"] = int(len(rows))

    return kept[0], kept[1], kept[2], common_q, counts

# Parquet inputs handed to load_and_intersect_by_query, one per evaluated frame.
PATH_RAG = "eval_df_rag_default.parquet"
PATH_RAG_F = "eval_df_rag_filtered.parquet"
PATH_TEMP_F = "eval_df_temporalrag.parquet"

# Keep only the rows whose (normalized) query is present in all three files.
(
    rag_df_common,
    ragf_df_common,
    tempf_df_common,
    common_queries,
    counts,
) = load_and_intersect_by_query(
    path_rag=PATH_RAG,
    path_rag_filtered=PATH_RAG_F,
    path_temporal_filtered=PATH_TEMP_F,
)


In [2]:
# Display the unique-query and kept-row counts returned by load_and_intersect_by_query.
counts

{'rag_unique_queries': 200,
 'rag_filtered_unique_queries': 200,
 'temporal_filtered_unique_queries': 200,
 'common_unique_queries': 200,
 'rag_rows_kept': 200,
 'rag_filtered_rows_kept': 200,
 'temporal_filtered_rows_kept': 200}

In [4]:
from vllm import LLM
from transformers import AutoTokenizer
import torch

# Hugging Face model id shared by the vLLM engine and the tokenizer below.
JUDGE_MODEL = "Qwen/Qwen2.5-32B-Instruct"

# vLLM engine that generates the judge verdicts (bf16 weights, 32768-token window).
judge = LLM(
    model=JUDGE_MODEL,
    dtype="bfloat16",
    max_model_len=32768,
    gpu_memory_utilization=0.90,
)

# Tokenizer used to render chat-template prompts for the judge.
# trust_remote_code is intentionally left at its default (False): the engine
# above loads this same checkpoint without remote code, so enabling it here
# would only allow Hub-hosted Python to execute locally for no benefit.
tokenizer = AutoTokenizer.from_pretrained(JUDGE_MODEL)

INFO 01-11 20:23:12 [utils.py:253] non-default args: {'dtype': 'bfloat16', 'max_model_len': 32768, 'disable_log_stats': True, 'model': 'Qwen/Qwen2.5-32B-Instruct'}
INFO 01-11 20:23:14 [model.py:514] Resolved architecture: Qwen2ForCausalLM
INFO 01-11 20:23:14 [model.py:1661] Using max model len 32768
INFO 01-11 20:23:14 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:23:15 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='Qwen/Qwen2.5-32B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-32B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda

[0;36m(EngineCore_DP0 pid=21352)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:23:20 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')


Loading safetensors checkpoint shards:   0% Completed | 0/17 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:23:33 [default_loader.py:308] Loading weights took 11.62 seconds
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:23:34 [gpu_model_runner.py:3659] Model loading took 61.0375 GiB memory and 15.667408 seconds
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:23:46 [backends.py:643] Using cache directory: /home/mlcore/.cache/vllm/torch_compile_cache/ff352c8093/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:23:46 [backends.py:703] Dynamo bytecode transform time: 12.34 s
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:23:53 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:24:12 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 18.51 s
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:24:12 [monitor.py:34] torch.compile takes 30.85 s in total
[0;36m(EngineCore_DP0 pid=213

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:06<00:00,  7.35it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:03<00:00,  9.80it/s]


[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:24:25 [gpu_model_runner.py:4587] Graph capturing finished in 12 secs, took 4.11 GiB
[0;36m(EngineCore_DP0 pid=21352)[0;0m INFO 01-11 20:24:25 [core.py:259] init engine (profile, create kv cache, warmup model) took 51.87 seconds
INFO 01-11 20:24:26 [llm.py:360] Supported tasks: ['generate']


In [5]:
import os, subprocess, textwrap, time, torch

# Report the name and total memory (in GiB, one decimal) of CUDA device 0.
print("GPU:", torch.cuda.get_device_name(0))
print("VRAM(GB):", round(torch.cuda.get_device_properties(0).total_memory / (1024**3), 1))

# Ask nvidia-smi (run through a login bash shell) for the compute processes on the GPU:
# pid, process name and used memory, as headerless CSV.
print(subprocess.check_output(["bash","-lc","nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader"]).decode())

GPU: NVIDIA A100-SXM4-80GB
VRAM(GB): 79.3
3066119, [Not Found], 77902 MiB



In [6]:
import json, re
import pandas as pd
from vllm import SamplingParams

# System prompt for the LLM judge (the prompt text itself is in Russian).
# It describes the inputs (QUERY, ANCHOR_DATE, CONTEXT, SUMMARY), a 1..5 scale,
# and five criteria: relevance, groundedness, facts_numbers, dates_timeline and
# recency. It asks for a strict JSON reply with the keys "scores", "confidence",
# "issues", "evidence" and "reasoning".
# NOTE(review): the score-5 line of the recency rubric mixes plural and singular
# ("at least a couple ... fact/event"). It is left untouched because rewording
# the prompt would change the judge's behaviour and invalidate earlier scores.
SYSTEM_PROMPT_JUDGE = """
Ты — строгий судья качества RAG-саммари.

Вход:
- QUERY: запрос пользователя
- ANCHOR_DATE: актуальная дата обзора
- CONTEXT: набор документов; каждый документ имеет строку вида date=YYYY-MM-DD channel=...
- SUMMARY: саммари по этим документам

Правила:
- Оценивай ТОЛЬКО по CONTEXT. Внешние знания запрещены.
- Не оценивай стиль. Только содержательную полезность и проверяемость.
- Для recency ориентируйся на “самые свежие документы” в CONTEXT: это документы с максимальной датой date=YYYY-MM-DD (наиболее близкие к ANCHOR_DATE).
- Если SUMMARY не использует свежие документы (не упоминает свежий факт/событие, не делает акцент “что нового”), это сильный минус recency.

Шкала 1..5:
- 5 = отлично
- 4 = хорошо (незначительные проблемы)
- 3 = средне (заметные проблемы, но жить можно)
- 2 = плохо (проблемы мешают доверять/использовать)
- 1 = очень плохо

Оцени 5 критериев (с приоритетом recency и дат):

1) relevance (мягкий критерий: штрафуй только явный оффтоп)
5: В целом по теме QUERY (даже если раскрытие неполное).
4: В целом по теме, но местами уходит в сторону или слишком общо.
3: Частично по теме: заметное количество лишнего/общих рассуждений, но связь с QUERY есть.
2: Слабая связь с QUERY: основная часть текста не помогает ответить на запрос.
1: Почти полностью оффтоп.

2) groundedness (насколько SUMMARY ↔ CONTEXT)
5: Все существенные утверждения проверяемы по CONTEXT; нет “додумок”.
4: Почти всё проверяемо; 1–2 небольших обобщения без явной опоры.
3: Несколько неподтвержденных утверждений/интерпретаций, но основа опирается на CONTEXT.
2: Много неподтвержденного или есть противоречия с CONTEXT.
1: Большая часть не опирается на CONTEXT или противоречит ему.

3) facts_numbers (точность фактов/чисел в SUMMARY относительно фактов/чисел в CONTEXT)
5: Числа/факты корректны относительно CONTEXT.
4: В целом корректно; 1 мелкая неточность/округление без смены смысла.
3: Несколько неточностей/расплывчатостей, которые слегка искажают понимание.
2: Есть как минимум 1 явная ошибка в цифрах/факте или приписывание значения, которого нет в CONTEXT.
1: Много ошибок в фактах/числах.

4) dates_timeline (даты и порядок событий в таймлайне)
5: Даты и порядок событий соответствуют CONTEXT; “позже/раньше” корректно.
4: В целом корректно; 1 небольшой огрех во второстепенной детали.
3: Есть заметные огрехи в датах/порядке, но общий временной смысл читается.
2: Существенная путаница дат/порядка или приписаны даты, которых нет в CONTEXT.
1: Временная картина в основном неверна.

5) recency (СТРОГИЙ критерий актуальности)
Определи “самое свежее” в CONTEXT как документы с максимальной датой date=YYYY-MM-DD.

5: SUMMARY явно делает акцент “что нового”: содержит хотя бы парочку конкретный факт/событие из самых свежих документов (за последние 30 дней) и корректно подает его как последнее/новое относительно ANCHOR_DATE.
4: SUMMARY использует свежие документы, но акцент “что нового” выражен умеренно (свежие события есть, но не в фокусе).
3: SUMMARY упоминает свежие документы вскользь или неявно; основная часть про более старые события (более трети документов про события дальше 30 дней).
2: SUMMARY почти не использует самые свежие документы (нет явного свежего факта/события), даже если они есть в CONTEXT.
1: SUMMARY искажает актуальность (например, подает старое как новое) или полностью игнорирует свежие документы, или свежих документов вообще нет в CONTEXT.

Вывод:
Верни СТРОГО JSON без Markdown и без лишнего текста:

{
  "scores": {"relevance":1..5,"groundedness":1..5,"facts_numbers":1..5,"dates_timeline":1..5,"recency":1..5},
  "confidence": 0..1,
  "issues": ["..."],
  "evidence": [{"summary_quote":"≤25 слов","context_quote":"≤25 слов","note":"..."}],
  "reasoning": ["..."]
}

Ограничения:
- issues: 0..7
- evidence: 0..4 (обязательно включи хотя бы 1 evidence про свежий факт/событие, если ставишь recency >= 4)
- reasoning: 3..8 коротких пунктов
""".strip()



import json, re
import pandas as pd
from vllm import SamplingParams

def _safe_json_load(s: str):
    s = (s or "").strip()
    try:
        return json.loads(s), None
    except Exception as e1:
        m = re.search(r"\{.*\}", s, flags=re.DOTALL)
        if m:
            try:
                return json.loads(m.group(0)), None
            except Exception as e2:
                return None, f"{repr(e1)} | {repr(e2)}"
        return None, repr(e1)

def _as_int_1_5(x):
    try:
        v = int(x)
        if 1 <= v <= 5:
            return v
    except Exception:
        pass
    return None

def _as_float_0_1(x):
    try:
        v = float(x)
        if 0.0 <= v <= 1.0:
            return v
    except Exception:
        pass
    return None

def judge_pointwise_on_df(
    judge_model,
    judge_tokenizer,
    df_eval: pd.DataFrame,
    system_name: str,
    gen_batch_size: int = 6,
    max_out_tokens: int = 768,
):
    """Score every row of ``df_eval`` with the LLM judge.

    For each row a chat prompt is built from ``SYSTEM_PROMPT_JUDGE`` plus the
    row's query, anchor date, context and summary. Prompts are generated in
    chunks of ``gen_batch_size`` with greedy decoding, and each reply is
    parsed as JSON and validated.

    Args:
        judge_model: object exposing ``generate(prompts, sampling_params)``
            whose results provide ``.outputs[0].text``.
        judge_tokenizer: tokenizer exposing ``apply_chat_template``.
        df_eval: frame with ``query``, ``anchor_date``, ``context`` and
            ``summary`` columns.
        system_name: label stored in the ``system`` column of the result.
        gen_batch_size: number of prompts per ``generate`` call (>= 1).
        max_out_tokens: generation budget per reply.

    Returns:
        DataFrame with one row per input row and the columns ``system``,
        ``query``, ``anchor_date``, ``judge_raw``, ``judge_parse_error``, the
        five score columns, ``confidence``, ``issues``, ``evidence`` and
        ``reasoning``. Fields that are missing or invalid are ``None``. The
        columns are present even when ``df_eval`` is empty.

    Raises:
        KeyError: if a required column is missing from ``df_eval``.
        ValueError: if ``gen_batch_size`` is smaller than 1.
    """
    need = {"query", "anchor_date", "context", "summary"}
    miss = need - set(df_eval.columns)
    if miss:
        raise KeyError(f"{system_name}: missing {miss}")

    batch_size = int(gen_batch_size)
    if batch_size < 1:
        # A negative step would make range() empty and silently skip all generation.
        raise ValueError(f"gen_batch_size must be >= 1, got {gen_batch_size!r}")

    score_keys = ("relevance", "groundedness", "facts_numbers", "dates_timeline", "recency")
    list_keys = ("issues", "evidence", "reasoning")

    prompts, meta = [], []
    for r in df_eval.itertuples(index=False):
        q = str(getattr(r, "query"))
        ad = str(getattr(r, "anchor_date"))
        ctx = str(getattr(r, "context"))
        sm = str(getattr(r, "summary"))

        user = f"QUERY: {q}\nANCHOR_DATE: {ad}\n\nCONTEXT:\n{ctx}\n\nSUMMARY:\n{sm}\n"
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT_JUDGE},
            {"role": "user", "content": user},
        ]
        prompt = judge_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        prompts.append(prompt)
        meta.append({"system": system_name, "query": q, "anchor_date": ad})

    # temperature=0 makes the judge greedy, i.e. repeatable for a given prompt.
    sampling = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        max_tokens=int(max_out_tokens),
    )

    outs_text = []
    for start in range(0, len(prompts), batch_size):
        chunk = prompts[start:start + batch_size]
        outs = judge_model.generate(chunk, sampling)
        outs_text.extend(o.outputs[0].text.strip() for o in outs)

    rows = []
    for m, raw in zip(meta, outs_text):
        parsed, err = _safe_json_load(raw)

        row = dict(m)
        row["judge_raw"] = raw
        row["judge_parse_error"] = err

        # Pre-fill every output field so all rows share the same keys and order.
        for key in score_keys:
            row[key] = None
        row["confidence"] = None
        for key in list_keys:
            row[key] = None

        if isinstance(parsed, dict):
            sc = parsed.get("scores", {})
            if isinstance(sc, dict):
                for key in score_keys:
                    row[key] = _as_int_1_5(sc.get(key))

            row["confidence"] = _as_float_0_1(parsed.get("confidence"))

            for key in list_keys:
                value = parsed.get(key)
                row[key] = value if isinstance(value, list) else None

            if row["judge_parse_error"] is None and any(row[key] is None for key in score_keys):
                row["judge_parse_error"] = "Missing or invalid scores fields"
        elif err is None:
            # Valid JSON whose root is a list or scalar: record it as a failure
            # instead of leaving a row with no scores and no error.
            row["judge_parse_error"] = f"JSON root is not an object (got {type(parsed).__name__})"

        rows.append(row)

    columns = [
        "system", "query", "anchor_date", "judge_raw", "judge_parse_error",
        *score_keys, "confidence", *list_keys,
    ]
    # Explicit columns keep the schema stable when there are no rows at all.
    return pd.DataFrame(rows, columns=columns)



In [7]:
# Judge each system's summaries on the shared query set
# (4 prompts per generate call, up to 1024 output tokens per verdict).
jud_rag = judge_pointwise_on_df(judge, tokenizer, rag_df_common,  "rag",               gen_batch_size=4, max_out_tokens=1024)
jud_ragf = judge_pointwise_on_df(judge, tokenizer, ragf_df_common, "rag_filtered",     gen_batch_size=4, max_out_tokens=1024)
jud_temp = judge_pointwise_on_df(judge, tokenizer, tempf_df_common,"temporal_filtered",gen_batch_size=4, max_out_tokens=1024)

# Stack the per-system results into one frame.
jud_all = pd.concat([jud_rag, jud_ragf, jud_temp], ignore_index=True)

# Persist the raw judge output.
# NOTE(review): this runs before "overall" is added below, so the saved file
# does not contain that column — confirm this is intended.
jud_all.to_parquet("judge_scores.parquet", index=False)
print("Saved judge_scores.parquet, rows:", len(jud_all))

# Columns averaged per system in the summary table.
score_cols = ["relevance","groundedness","facts_numbers","dates_timeline","recency","confidence"]

# Weighted overall score; the weights sum to 1.0 and favour recency.
# Rows with a missing score presumably end up with a NaN overall (None -> NaN after astype(float)).
jud_all["overall"] = (
    0.45 * jud_all["recency"].astype(float) +
    0.20 * jud_all["dates_timeline"].astype(float) +
    0.20 * jud_all["groundedness"].astype(float) +
    0.10 * jud_all["facts_numbers"].astype(float) +
    0.05 * jud_all["relevance"].astype(float)
)

# Per-system means, best system first (ties broken by recency, dates_timeline, groundedness).
summary = (
    jud_all.groupby("system")[score_cols + ["overall"]]
    .mean(numeric_only=True)
    .sort_values(["overall", "recency", "dates_timeline", "groundedness"], ascending=False)
)

# Share of rows per system with a non-null judge_parse_error.
parse_fail_rate = (
    jud_all.assign(parse_fail=jud_all["judge_parse_error"].notna())
    .groupby("system")["parse_fail"]
    .mean()
)

# Share of rows per system in which all five criterion scores are present.
valid_rate = (
    jud_all.assign(valid_scores=jud_all[["relevance","groundedness","facts_numbers","dates_timeline","recency"]].notna().all(axis=1))
    .groupby("system")["valid_scores"]
    .mean()
)

print("MEAN SCORES:")
display(summary)

print("\nVALID SCORES RATE:")
display(valid_rate)

print("\nPARSE FAIL RATE:")
display(parse_fail_rate)


Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Saved judge_scores.parquet, rows: 600
MEAN SCORES:


Unnamed: 0_level_0,relevance,groundedness,facts_numbers,dates_timeline,recency,confidence,overall
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
temporal_filtered,4.316583,4.005025,4.236181,4.20603,3.59799,0.884171,3.900754
rag_filtered,4.378788,4.0,4.121212,4.090909,3.30303,0.882828,3.735606
rag,4.282828,3.954545,4.075758,3.979798,3.146465,0.867424,3.624495



VALID SCORES RATE:


system
rag                  0.990
rag_filtered         0.990
temporal_filtered    0.995
Name: valid_scores, dtype: float64


PARSE FAIL RATE:


system
rag                  0.010
rag_filtered         0.010
temporal_filtered    0.005
Name: parse_fail, dtype: float64

In [8]:
import re
import numpy as np
import pandas as pd

# Matches the `date=YYYY-MM-DD` marker attached to documents in a context blob.
DATE_RE = re.compile(r"date=(\d{4}-\d{2}-\d{2})")

def extract_doc_dates_from_context(ctx: str):
    """Return every ``date=YYYY-MM-DD`` value found in a context blob.

    The context is stringified before matching, so non-string values are
    accepted; ``None`` yields an empty list. Dates come back as strings in
    order of appearance and are not checked for calendar validity.
    """
    return [] if ctx is None else DATE_RE.findall(str(ctx))

def compute_time_metrics(df: pd.DataFrame, system_name: str) -> pd.DataFrame:
    """Compute, per row, how old the dated context documents are vs. the anchor date.

    For every row the ``date=YYYY-MM-DD`` markers are extracted from
    ``context`` and converted to whole-day ages relative to ``anchor_date``.
    Dates after the anchor (negative age) are excluded from the statistics.

    Parameters
    ----------
    df : DataFrame with at least ``query``, ``anchor_date`` and ``context``.
    system_name : label written to the ``system`` column of every output row.

    Returns
    -------
    DataFrame with one row per input row and the columns ``system``,
    ``query``, ``anchor_date``, ``n_docs_parsed``, ``min_age_days``,
    ``p50_age_days``, ``share_docs_7d``, ``share_docs_30d``. The four metric
    columns are NaN when the anchor date cannot be parsed or no document date
    on or before the anchor is available.

    ``n_docs_parsed`` is the number of successfully parsed document dates in
    NaN rows, and the number of dates on or before the anchor in populated
    rows (this mirrors the original behaviour).

    Raises
    ------
    KeyError if a required column is missing.
    """
    need = {"query", "anchor_date", "context"}
    miss = need - set(df.columns)
    if miss:
        raise KeyError(f"{system_name}: missing columns {miss}")

    # Fixed column list so that an empty input still yields a frame the
    # downstream groupby("system") can work with.
    columns = [
        "system",
        "query",
        "anchor_date",
        "n_docs_parsed",
        "min_age_days",
        "p50_age_days",
        "share_docs_7d",
        "share_docs_30d",
    ]

    out_rows = []
    for r in df.itertuples(index=False):
        query = str(getattr(r, "query"))
        anchor_date = str(getattr(r, "anchor_date"))

        dates = extract_doc_dates_from_context(getattr(r, "context"))
        # Explicit object dtype: the result must not depend on the default
        # dtype pandas picks for an empty Series.
        dts = (
            pd.to_datetime(pd.Series(dates, dtype="object"), errors="coerce", utc=True)
            .dt.normalize()
            .dropna()
        )

        # Start from the "no usable data" row and fill metrics in only if
        # every precondition holds.
        row = {
            "system": system_name,
            "query": query,
            "anchor_date": anchor_date,
            "n_docs_parsed": int(len(dts)),
            "min_age_days": np.nan,
            "p50_age_days": np.nan,
            "share_docs_7d": np.nan,
            "share_docs_30d": np.nan,
        }

        # Check for NaT *before* normalizing: an unparseable anchor coerces to
        # NaT, and calling .normalize() on it would raise instead of reaching
        # the NaN-row path.
        ad = pd.to_datetime(anchor_date, errors="coerce", utc=True)
        if not pd.isna(ad) and len(dts) > 0:
            ages = (ad.normalize() - dts).dt.days.astype("float32")
            ages = ages[np.isfinite(ages)]
            ages = ages[ages >= 0]  # ignore documents dated after the anchor

            if len(ages) > 0:
                row.update({
                    "n_docs_parsed": int(len(ages)),
                    "min_age_days": float(np.min(ages)),
                    "p50_age_days": float(np.median(ages)),
                    "share_docs_7d": float(np.mean(ages <= 7)),
                    "share_docs_30d": float(np.mean(ages <= 30)),
                })

        out_rows.append(row)

    return pd.DataFrame(out_rows, columns=columns)

# Per-query context-age metrics for each system, computed on the shared query set.
t_rag, t_rag_f, t_temp = [
    compute_time_metrics(frame, label)
    for frame, label in [
        (rag_df_common, "rag"),
        (ragf_df_common, "rag_filtered"),
        (tempf_df_common, "temporal_filtered"),
    ]
]

time_all = pd.concat([t_rag, t_rag_f, t_temp], ignore_index=True)
display(time_all.head())

# Per-system summary: mean and median of every metric.
cols = [
    "n_docs_parsed",
    "min_age_days",
    "p50_age_days",
    "share_docs_7d",
    "share_docs_30d",
]
summary = time_all.groupby("system")[cols].agg(["mean", "median"])
display(summary)

# Compact variant of the summary: means only.
mean_summary = time_all.groupby("system")[cols].mean(numeric_only=True)
display(mean_summary)


Unnamed: 0,system,query,anchor_date,n_docs_parsed,min_age_days,p50_age_days,share_docs_7d,share_docs_30d
0,rag,Актуальный курс доллара,2025-09-04,30,38.0,222.5,0.0,0.0
1,rag,"Кофе дорожает, почему?",2025-09-04,30,34.0,192.5,0.0,0.0
2,rag,Почему масло дорожает?Доллар растет к рублю: п...,2025-09-04,30,38.0,267.0,0.0,0.0
3,rag,Доллар падает к рублю: причины и факты,2025-09-04,30,38.0,203.0,0.0,0.0
4,rag,Евро к рублю: что происходит,2025-09-04,30,136.0,280.5,0.0,0.0


Unnamed: 0_level_0,n_docs_parsed,n_docs_parsed,min_age_days,min_age_days,p50_age_days,p50_age_days,share_docs_7d,share_docs_7d,share_docs_30d,share_docs_30d
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median
system,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
rag,30.0,30.0,22.575,10.5,209.35,207.5,0.012833,0.0,0.062667,0.066667
rag_filtered,27.725,30.0,30.787879,23.0,214.977273,215.25,0.015237,0.0,0.051817,0.033333
temporal_filtered,27.14,30.0,7.61809,1.0,123.437186,118.0,0.145563,0.1,0.293405,0.25


Unnamed: 0_level_0,n_docs_parsed,min_age_days,p50_age_days,share_docs_7d,share_docs_30d
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rag,30.0,22.575,209.35,0.012833,0.062667
rag_filtered,27.725,30.787879,214.977273,0.015237,0.051817
temporal_filtered,27.14,7.61809,123.437186,0.145563,0.293405


In [9]:
import re
import numpy as np
import pandas as pd

# `date=YYYY-MM-DD` markers attached to documents inside a context blob.
CTX_DATE_RE = re.compile(r"date=(\d{4}-\d{2}-\d{2})")
# Any standalone ISO-style `YYYY-MM-DD` token in free text.
ANY_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")

def extract_ctx_doc_dates(ctx: str):
    """List the ``date=YYYY-MM-DD`` values in ``ctx`` in order of appearance.

    ``ctx`` is stringified before matching; ``None`` gives an empty list.
    """
    if ctx is not None:
        return CTX_DATE_RE.findall(str(ctx))
    return []

def extract_summary_dates(summary: str, anchor_date: str):
    """Collect the ``YYYY-MM-DD`` dates mentioned in a summary, minus the anchor.

    ``summary`` is stringified before matching; ``None`` gives an empty list.
    Any date equal to the first ten characters of ``str(anchor_date)`` is
    left out of the result. Order of appearance and duplicates are kept.
    """
    if summary is None:
        return []
    anchor_day = str(anchor_date)[:10]
    # Drop mentions of the anchor date itself.
    return [found for found in ANY_DATE_RE.findall(str(summary)) if found != anchor_day]

def ages_from_dates(dates, anchor_date):
    """Convert date strings into whole-day ages relative to ``anchor_date``.

    Parameters
    ----------
    dates : sequence of date strings (anything ``pd.to_datetime`` accepts);
        unparseable entries are dropped.
    anchor_date : reference date; both sides are parsed as UTC and truncated
        to midnight before subtracting.

    Returns
    -------
    A float32 NumPy array of non-negative ages in days (dates after the
    anchor are discarded), or ``None`` when ``dates`` is empty, the anchor
    cannot be parsed, or no usable date remains.
    """
    # len() rather than truthiness so array-like inputs are accepted too.
    if dates is None or len(dates) == 0:
        return None

    # Check for NaT *before* normalizing: an unparseable anchor coerces to
    # NaT, and calling .normalize() on it would raise instead of returning None.
    ad = pd.to_datetime(anchor_date, errors="coerce", utc=True)
    if pd.isna(ad):
        return None
    ad = ad.normalize()

    dts = pd.to_datetime(pd.Series(dates), errors="coerce", utc=True).dt.normalize().dropna()
    if len(dts) == 0:
        return None

    ages = (ad - dts).dt.days.astype("float32")
    ages = ages[np.isfinite(ages)]
    ages = ages[ages >= 0]  # ignore dates that lie after the anchor
    if len(ages) == 0:
        return None
    return ages.to_numpy()

def compute_ctx_vs_sum_recency(df: pd.DataFrame, system_name: str) -> pd.DataFrame:
    """Compare, per row, the age of dates in the context with those in the summary.

    Context dates come from ``extract_ctx_doc_dates`` and summary dates from
    ``extract_summary_dates``; both are turned into day ages with
    ``ages_from_dates``. Every statistic is NaN when the corresponding ages
    are unavailable.

    Output columns: ``system``, ``query``, ``anchor_date``, ``n_ctx_docs`` and
    ``n_sum_dates`` (raw counts of extracted date strings), ``has_sum_dates``
    (0/1), ``min_age_ctx``, ``p50_age_ctx``, ``min_age_sum``, ``p50_age_sum``,
    ``gap_sum_vs_ctx_days`` (``min_age_sum - min_age_ctx``), ``share_sum_7d``
    and ``share_sum_30d`` (fraction of summary ages within 7 / 30 days).

    Raises ``KeyError`` when a required column is missing.
    """
    required = {"query", "anchor_date", "context", "summary"}
    miss = required - set(df.columns)
    if miss:
        raise KeyError(f"{system_name}: missing {miss}")

    def _stat(ages, reducer):
        # NaN stands in for "no ages available".
        if ages is None:
            return np.nan
        return float(reducer(ages))

    records = []
    for rec in df.itertuples(index=False):
        query_text = str(rec.query)
        anchor = str(rec.anchor_date)

        ctx_dates = extract_ctx_doc_dates(rec.context)
        sum_dates = extract_summary_dates(rec.summary, anchor)

        ctx_ages = ages_from_dates(ctx_dates, anchor)
        sum_ages = ages_from_dates(sum_dates, anchor)

        min_ctx = _stat(ctx_ages, np.min)
        min_sum = _stat(sum_ages, np.min)

        # The gap is only defined when both sides have a freshest date.
        if np.isfinite(min_sum) and np.isfinite(min_ctx):
            gap = min_sum - min_ctx
        else:
            gap = np.nan

        records.append({
            "system": system_name,
            "query": query_text,
            "anchor_date": anchor,
            "n_ctx_docs": int(len(ctx_dates)),
            "n_sum_dates": int(len(sum_dates)),
            "has_sum_dates": int(len(sum_dates) > 0),
            "min_age_ctx": min_ctx,
            "p50_age_ctx": _stat(ctx_ages, np.median),
            "min_age_sum": min_sum,
            "p50_age_sum": _stat(sum_ages, np.median),
            "gap_sum_vs_ctx_days": gap,
            "share_sum_7d": _stat(sum_ages, lambda a: np.mean(a <= 7)),
            "share_sum_30d": _stat(sum_ages, lambda a: np.mean(a <= 30)),
        })

    return pd.DataFrame(records)

# Context-vs-summary recency for each system, computed on the shared query set.
sr_rag, sr_ragf, sr_temp = [
    compute_ctx_vs_sum_recency(frame, label)
    for frame, label in [
        (rag_df_common, "rag"),
        (ragf_df_common, "rag_filtered"),
        (tempf_df_common, "temporal_filtered"),
    ]
]
sr_all = pd.concat([sr_rag, sr_ragf, sr_temp], ignore_index=True)

cols = [
    "has_sum_dates",
    "n_sum_dates",
    "min_age_ctx",
    "min_age_sum",
    "gap_sum_vs_ctx_days",
    "share_sum_7d",
    "share_sum_30d",
]
# Means first, then the mean/median breakdown per system.
display(sr_all.groupby("system")[cols].mean(numeric_only=True))
display(sr_all.groupby("system")[cols].agg(["mean", "median"]))


Unnamed: 0_level_0,has_sum_dates,n_sum_dates,min_age_ctx,min_age_sum,gap_sum_vs_ctx_days,share_sum_7d,share_sum_30d
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rag,1.0,23.65,22.575,33.135,10.56,0.02112,0.082304
rag_filtered,0.99,25.475,30.787879,33.838384,3.050505,0.019467,0.068626
temporal_filtered,0.995,21.525,7.61809,10.492462,2.874372,0.147082,0.319016


Unnamed: 0_level_0,has_sum_dates,has_sum_dates,n_sum_dates,n_sum_dates,min_age_ctx,min_age_ctx,min_age_sum,min_age_sum,gap_sum_vs_ctx_days,gap_sum_vs_ctx_days,share_sum_7d,share_sum_7d,share_sum_30d,share_sum_30d
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
system,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
rag,1.0,1.0,23.65,25.0,22.575,10.5,33.135,23.0,10.56,0.0,0.02112,0.0,0.082304,0.061553
rag_filtered,0.99,1.0,25.475,26.0,30.787879,23.0,33.838384,23.5,3.050505,0.0,0.019467,0.0,0.068626,0.043478
temporal_filtered,0.995,1.0,21.525,22.5,7.61809,1.0,10.492462,2.0,2.874372,1.0,0.147082,0.130435,0.319016,0.304348
