# In [None]:
# F_Combined_V_X (Rewritten to include combined (chat+audio) text in examples)

# LLM1(gen) -> MCQ(Q1/Q2) from Combined
# LLM2(ans) -> Answer Q1 from Combined
# LLM4(ans) -> Answer same Q1 from Chat-only IF LLM2 correct & has_chat (conditional)
# LLM3(ans) -> Answer Q1 from Chat-only regardless (unconditional)
#
# LLM5(ans) -> Answer Q2 from Combined
# LLM7(ans) -> Answer same Q2 from Chat-only IF LLM5 correct & has_chat (conditional)
# LLM6(ans) -> Answer Q2 from Chat-only regardless (unconditional)
#
# Strict leakage control: answerers only see {question, choices}; no GT leaked
# Resume-safe: read existing artifacts; merge run logs; save to GCS + local
# Extra:
#   - Save EXACT API request texts for up to 2 calls per RUN_ID under run dir (local + GCS)
#   - Dump 3 correct and 3 incorrect examples (question, choices, chosen index) to files
#     * and include the 10-minute chat+audio text (combined_text) in each example object
#   - Count invalid-format answers (index not in {0,1,2,3} or unparsable) and report
#
# Requirements: google-cloud-aiplatform, gcsfs, fsspec, pandas, tqdm

from __future__ import annotations
import os
import io
import re
import json
import textwrap
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import fsspec
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig

# ====================== CONFIG ======================
# Vertex AI project and region passed to vertexai.init().
PROJECT_ID = "dena-ai-intern-ds-dev-gcp"
LOCATION   = "us-central1"

# ---- Models (all gemini-2.5-pro as requested) ----
MODEL_NAME_QGEN             = "gemini-2.5-pro"  # LLM1 (MCQ generation) reading Combined
MODEL_NAME_Q1_COMBINED      = "gemini-2.5-pro"  # LLM2 (Q1 answer) reading Combined
MODEL_NAME_Q1_CHAT_UNCOND   = "gemini-2.5-pro"  # LLM3 (Q1 answer) reading Chat-only (unconditional)
MODEL_NAME_Q1_CHAT_COND     = "gemini-2.5-pro"  # LLM4 (Q1 answer) reading Chat-only (conditional)
MODEL_NAME_Q2_COMBINED      = "gemini-2.5-pro"  # LLM5 (Q2 answer) reading Combined
MODEL_NAME_Q2_CHAT_UNCOND   = "gemini-2.5-pro"  # LLM6 (Q2 answer) reading Chat-only (unconditional)
MODEL_NAME_Q2_CHAT_COND     = "gemini-2.5-pro"  # LLM7 (Q2 answer) reading Chat-only (conditional)

# Temperature handed to GenerationConfig for every model call.
GEN_TEMPERATURE = 0

# Input index: local CSV listing comment files
SUFFIX_LIST_CSV = "pococha_comment_sorted_suffixes.csv"
TOP_N_LIVE = 100  # number of lives to process in this run

# Pre-generated 10-min chunk texts (GCS)
COMBINED_PREFIX = "gs://dena-ai-intern-yoshihara-data/yoshi_LLMQA_comment_speeech_combined"
CHATONLY_PREFIX = "gs://dena-ai-intern-yoshihara-data/yoshi_LLMQA_comment_only"

# ====== Run isolation (important) ======
# RUN_TAG is embedded in every local and GCS output path below, so each RUN_ID
# gets its own set of artifacts.
RUN_ID = 1050  # change as appropriate for each run
print(f"RUN_ID:{RUN_ID}, TOP_N_LIVE:{TOP_N_LIVE}")
RUN_TAG = f"run_{RUN_ID}"

# ---- Local mirror ----
# Every directory below lives under llmqa_runs/run_<RUN_ID>/.
LOCAL_OUT_BASE = os.path.join("llmqa_runs", RUN_TAG)
LOCAL_QUESTIONS_FULL_DIR     = os.path.join(LOCAL_OUT_BASE, "questions_full")            # with ground truth (LLM1 output)
LOCAL_QUESTIONS_PUBLIC_DIR   = os.path.join(LOCAL_OUT_BASE, "questions_public")          # public version (question & choices only)
LOCAL_ANS_Q1_COMBINED_DIR    = os.path.join(LOCAL_OUT_BASE, "answers_q1_combined")       # LLM2
LOCAL_ANS_Q1_CHAT_UNCOND_DIR = os.path.join(LOCAL_OUT_BASE, "answers_q1_chat_uncond")    # LLM3
LOCAL_ANS_Q1_CHAT_COND_DIR   = os.path.join(LOCAL_OUT_BASE, "answers_q1_chat_cond")      # LLM4 (conditional)
LOCAL_ANS_Q2_COMBINED_DIR    = os.path.join(LOCAL_OUT_BASE, "answers_q2_combined")       # LLM5
LOCAL_ANS_Q2_CHAT_UNCOND_DIR = os.path.join(LOCAL_OUT_BASE, "answers_q2_chat_uncond")    # LLM6
LOCAL_ANS_Q2_CHAT_COND_DIR   = os.path.join(LOCAL_OUT_BASE, "answers_q2_chat_cond")      # LLM7 (conditional)
LOCAL_RUNLOG_DIR             = os.path.join(LOCAL_OUT_BASE, "run_logs")
LOCAL_DEBUG_INPUTS_DIR       = os.path.join(LOCAL_OUT_BASE, "debug_inputs")
LOCAL_EXAMPLES_DIR           = os.path.join(LOCAL_OUT_BASE, "examples")

# ---- Outputs (GCS, run-scoped) ----
QUESTIONS_PREFIX_BASE = "gs://dena-ai-intern-yoshihara-data/yoshi_LLMQA_mcq_questions"
ANSWERS_PREFIX_BASE   = "gs://dena-ai-intern-yoshihara-data/yoshi_LLMQA_mcq_answers"
RUNLOG_PREFIX_BASE    = "gs://dena-ai-intern-yoshihara-data/yoshi_LLMQA_run_logs"

# Run-scoped prefixes: <base>/run_<RUN_ID>
QUESTIONS_PREFIX   = f"{QUESTIONS_PREFIX_BASE}/{RUN_TAG}"
ANSWERS_PREFIX     = f"{ANSWERS_PREFIX_BASE}/{RUN_TAG}"
RUNLOG_PREFIX      = f"{RUNLOG_PREFIX_BASE}/{RUN_TAG}"
# Run log CSV; it is read back at start-up to resume and rewritten at each checkpoint.
ADOPT_LOG_PATH     = f"{RUNLOG_PREFIX}/adopted_chunks_top{TOP_N_LIVE}.csv"
METRICS_JSON_PATH  = f"{RUNLOG_PREFIX}/metrics_top{TOP_N_LIVE}.json"
DEBUG_INPUTS_PREFIX= f"{RUNLOG_PREFIX}/debug_inputs"
EXAMPLES_PREFIX    = f"{RUNLOG_PREFIX}/examples"

# ---- GCS artifact paths ----
# Each builder maps (live id, chunk index) to the run-scoped GCS object path
# of one artifact.
def Q_FULL_OUT(lid, idx):
    """Path of the full MCQ JSON for one live/chunk."""
    return f"{QUESTIONS_PREFIX}/full/{lid}_{idx}.json"


def Q_PUB_OUT(lid, idx):
    """Path of the public MCQ JSON for one live/chunk."""
    return f"{QUESTIONS_PREFIX}/public/{lid}_{idx}.json"


def A_Q1C_OUT(lid, idx):
    """Path of the Q1 answer read from Combined (LLM2)."""
    return f"{ANSWERS_PREFIX}/q1_combined/{lid}_{idx}.json"


def A_Q1CHATU_OUT(lid, idx):
    """Path of the unconditional Q1 chat-only answer (LLM3)."""
    return f"{ANSWERS_PREFIX}/q1_chat_uncond/{lid}_{idx}.json"


def A_Q1CHATC_OUT(lid, idx):
    """Path of the conditional Q1 chat-only answer (LLM4)."""
    return f"{ANSWERS_PREFIX}/q1_chat_cond/{lid}_{idx}.json"


def A_Q2C_OUT(lid, idx):
    """Path of the Q2 answer read from Combined (LLM5)."""
    return f"{ANSWERS_PREFIX}/q2_combined/{lid}_{idx}.json"


def A_Q2CHATU_OUT(lid, idx):
    """Path of the unconditional Q2 chat-only answer (LLM6)."""
    return f"{ANSWERS_PREFIX}/q2_chat_uncond/{lid}_{idx}.json"


def A_Q2CHATC_OUT(lid, idx):
    """Path of the conditional Q2 chat-only answer (LLM7)."""
    return f"{ANSWERS_PREFIX}/q2_chat_cond/{lid}_{idx}.json"

# Filtering
MIN_COMBINED_CHARS = 500  # skip chunks whose Combined text is too short
CHECKPOINT_EVERY   = 50   # the run log is flushed every this-many recorded rows

# Debug input saving (save exact payloads sent to API): up to 2 per RUN_ID
_MAX_SAVED_INPUTS = 2
_saved_input_count = 0  # number of payloads saved so far in this process

# ====================== Vertex AI / FS Helpers ======================
def init_vertex_ai(model_name: str) -> GenerativeModel:
    """Initialise the Vertex AI SDK and return a handle for ``model_name``.

    Args:
        model_name: Name of the generative model to instantiate.

    Returns:
        A ``GenerativeModel`` created after ``vertexai.init`` was called with
        the configured project and location.
    """
    vertexai.init(location=LOCATION, project=PROJECT_ID)
    handle = GenerativeModel(model_name)
    return handle

def fs_gcs() -> fsspec.AbstractFileSystem:
    """Return the fsspec filesystem registered for the ``gcs`` protocol."""
    gcs_filesystem = fsspec.filesystem("gcs")
    return gcs_filesystem

def write_gcs_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` on GCS as UTF-8.

    The encoding is given explicitly so the stored bytes do not depend on the
    locale of the machine running the script; the local mirror files written
    elsewhere in this file also use UTF-8.

    Args:
        path: Destination object path.
        text: Content to store.
    """
    fs = fs_gcs()
    with fs.open(path, "w", encoding="utf-8") as f:
        f.write(text)

def read_gcs_text(path: str) -> str:
    """Read the object at ``path`` on GCS and return it decoded as UTF-8.

    The encoding is given explicitly so decoding does not depend on the locale
    of the machine running the script.

    Args:
        path: Source object path.

    Returns:
        The full text content of the object.
    """
    fs = fs_gcs()
    with fs.open(path, "r", encoding="utf-8") as f:
        return f.read()

def gcs_glob(prefix: str, pattern: str) -> List[str]:
    """List the GCS paths matching ``<prefix>/<pattern>``, sorted ascending.

    Args:
        prefix: Directory-like prefix to search under.
        pattern: Glob pattern appended to the prefix.

    Returns:
        The sorted matches as returned by the filesystem's ``glob``.
    """
    matches = fs_gcs().glob(f"{prefix}/{pattern}")
    return sorted(matches)

def ensure_local_dirs() -> None:
    """Create every local output directory of this run that does not exist yet."""
    local_dirs = (
        LOCAL_QUESTIONS_FULL_DIR,
        LOCAL_QUESTIONS_PUBLIC_DIR,
        LOCAL_ANS_Q1_COMBINED_DIR,
        LOCAL_ANS_Q1_CHAT_UNCOND_DIR,
        LOCAL_ANS_Q1_CHAT_COND_DIR,
        LOCAL_ANS_Q2_COMBINED_DIR,
        LOCAL_ANS_Q2_CHAT_UNCOND_DIR,
        LOCAL_ANS_Q2_CHAT_COND_DIR,
        LOCAL_RUNLOG_DIR,
        LOCAL_DEBUG_INPUTS_DIR,
        LOCAL_EXAMPLES_DIR,
    )
    for directory in local_dirs:
        os.makedirs(directory, exist_ok=True)

def _save_debug_input_if_needed(content: str, label: str):
    """Save the exact payload sent to the API, for at most ``_MAX_SAVED_INPUTS`` calls.

    The payload is written to the local debug directory first and then uploaded
    to GCS. The upload is best-effort: a failure is reported as a warning and
    does not interrupt the run.

    Args:
        content: Full request text (prompt + document) to store.
        label: Short tag used in the file name after the two-digit sequence number.
    """
    global _saved_input_count
    if _saved_input_count >= _MAX_SAVED_INPUTS:
        return
    import warnings  # only needed to report a failed upload

    slot = _saved_input_count + 1
    filename = f"{slot:02d}_{label}.txt"

    # Local copy. The directory is created here so the function also works when
    # ensure_local_dirs() has not been called yet.
    os.makedirs(LOCAL_DEBUG_INPUTS_DIR, exist_ok=True)
    with open(os.path.join(LOCAL_DEBUG_INPUTS_DIR, filename), "w", encoding="utf-8") as f:
        f.write(content)
    # The slot is consumed only once the local file exists, so a failed write
    # does not use up one of the limited debug slots.
    _saved_input_count = slot

    # GCS copy
    try:
        write_gcs_text(f"{DEBUG_INPUTS_PREFIX}/{filename}", content)
    except Exception as exc:
        warnings.warn(f"debug input upload to GCS failed for {filename}: {exc}")

# ====================== Model Call (robust JSON parsing) ======================
def call_gemini_json_with_content(model: GenerativeModel, content: str, save_label: Optional[str]=None) -> Dict[str, Any]:
    """Send ``content`` to ``model`` and return the JSON object found in the reply.

    The reply text is parsed leniently, trying in order:
      1. the whole text as JSON,
      2. the object inside a fenced ``json`` code block,
      3. the span from the first ``{`` to the last ``}``.
    Only a JSON *object* is accepted. A list or scalar is treated as a failed
    parse so that callers always receive a dict, as the return type declares.

    Args:
        model: Model handle exposing ``generate_content``.
        content: Fully built request text.
        save_label: When given, the payload is offered to the debug-input saver
            under this label before the call.

    Returns:
        The parsed object, or ``{"raw_text": <reply text>}`` when nothing could
        be parsed. The reply text is empty if the call raised or the response
        had no readable ``text``.
    """
    if save_label:
        _save_debug_input_if_needed(content, save_label)

    cfg = GenerationConfig(temperature=GEN_TEMPERATURE)
    try:
        resp = model.generate_content([content], generation_config=cfg)
        raw = resp.text or ""
    except Exception:
        # Best-effort: a failed call, or a response whose ``text`` accessor
        # raises, is handled the same way as an empty reply.
        raw = ""

    def _loads_object(candidate: str) -> Optional[Dict[str, Any]]:
        """Parse ``candidate`` and return it only when it is a JSON object."""
        try:
            parsed = json.loads(candidate)
        except (ValueError, TypeError, RecursionError):
            return None
        return parsed if isinstance(parsed, dict) else None

    # direct JSON
    obj = _loads_object(raw)
    if obj is not None:
        return obj

    # ```json ... ``` extract
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, flags=re.DOTALL)
    if fenced:
        obj = _loads_object(fenced.group(1))
        if obj is not None:
            return obj

    # greedy { ... }: the match already ends at the last closing brace
    braces = re.search(r"(\{.*\})", raw, flags=re.DOTALL)
    if braces:
        obj = _loads_object(braces.group(1))
        if obj is not None:
            return obj

    return {"raw_text": raw}

# ====================== Prompts (MCQ prompt is NOT changed) ======================
def _dedent(s: str) -> str:
    """Remove the common leading indentation of ``s`` and trim both ends."""
    unindented = textwrap.dedent(s)
    return unindented.strip()

def prompt_for_mcq_fixed() -> str:
    """Return the fixed Japanese instruction prompt for generating two 4-choice questions.

    The text ends with the body heading line; the caller appends the log text
    after it. The prompt asks for exactly two questions in a JSON object with a
    ``questions`` list, each entry having ``question``, ``choices``,
    ``answer_index`` and ``explanation``.
    """
    # Kept exactly as provided: this prompt was specified as "do not change".
    # NOTE(review): the rules section forbids direct quotation in `explanation`,
    # while the first entry of the output-format example labels it as a direct
    # quote. Confirm which one is intended before editing the prompt.
    return _dedent("""
        以下の『配信ログ本文』だけを根拠に、ライブ配信に関する4択問題を日本語で作成してください。外部知識の持ち込みは禁止です。

        # 出題ルール（厳守）
        - 問題数: 2 問（ちょうど2問。増減しない）
        - 1問目は、次の固定文言をそのまま問題文に使う：
          『配信中に出てきた話題は、以下の四つの選択肢のうちどれが正しいですか？』
        - 2問目は、次のテンプレートの {TOPIC} を 1問目の正解選択肢のテキスト（短い名詞句）に置換して使う：
          『配信中、{TOPIC}に関して行われた会話の内容は、以下のどれか？』
          ※ 出力時に {TOPIC} を残さないこと。
        - 似た選択肢は作らない。正解は各問ちょうど1つ。
        - あいまい表現や主観的解釈は禁止。本文の根拠のみ。
        - 挨拶など普遍的な内容は題材にしない。
        - 出力は JSON のみ。説明文やコードフェンス（```）は禁止。
        - 以下に与えた具体的な例とは異なる問題と選択肢を作成する。
        - 配信者/リスナーの区別や経過時間を問題文に含めない。
        - 説明(explanation)はRecitation回避のため原文の直接引用禁止。根拠は短い要約（100文字以内）。
        - 配信ログ本文を読んだら、必ず正解が選べるような問題を作ってください。

        # 出力フォーマット（厳守）
        {
          "questions": [
            {
              "question": "配信中に出てきた話題は、以下の四つの選択肢のうちどれが正しいですか？",
              "choices": ["選択肢A", "選択肢B", "選択肢C", "選択肢D"],
              "answer_index": 0,
              "explanation": "本文からの根拠（直接引用）"
            },
            {
              "question": "配信中、{TOPIC}に関して行われた会話の内容は、以下のどれか？",
              "choices": ["選択肢A", "選択肢B", "選択肢C", "選択肢D"],
              "answer_index": 0,
              "explanation": "本文からの根拠"
            }
          ]
        }

        # 例（one-shot）
        一つ目の問い
        問題文：配信中に出てきた話題は、以下の四つの選択肢のうちどれが正しいですか？
        選択肢A：唐揚げ
        選択肢B：不動産投資
        選択肢C：大阪の万博
        選択肢D：オーストラリアでのマラソン
        正解のインデックス：0

        二つ目の問い
        問題文：配信中、{TOPIC}に関して行われた会話の内容は、以下のどれか？
        選択肢A：唐揚げはもも肉より胸肉の方が良い
        選択肢B：唐揚げの値上げが嫌だ
        選択肢C：いつも唐揚げ食べると胃もたれする
        選択肢D：唐揚げにはレモンをかけるべきか否か
        正解のインデックス：3

        # 配信ログ本文
    """)

def _prompt_for_answering(qname: str, source_label: str) -> str:
    """Build the instruction prompt for answering one question from one source.

    Args:
        qname: Question name, "Q1" or "Q2". Its lower-cased form names the
            answer key in the requested output (``q1_index`` / ``q2_index``).
        source_label: Human-readable name of the text the answer must be based
            on (the combined log or the chat-only text).

    Returns:
        The dedented prompt. The question JSON and the body text are appended
        by the caller.
    """
    template = f"""
        次の『{source_label}だけ』を根拠に、与えられた1問({qname})の4択に回答してください。
        - 出力は JSON のみ（コードフェンス禁止）。
        - 回答は 0..3 の整数インデックス（choices 配列の添字）。

        # 出力フォーマット
        {{
          "answers": {{
            "{qname.lower()}_index": 0
          }}
        }}

        # {qname}（question & choices のみ）
        # この後に {qname} のJSONを貼ります（answer_index/explanation は含みません）。

        # 本文
    """
    return _dedent(template)

# ====================== Utilities: parsing & cleaning ======================
# One or more "[...]" tags (each optionally followed by whitespace) at the very start of a line.
_TAG_AT_LINE_START = re.compile(r'^(?:\[[^\]]+\]\s*)+')

def clean_plain_text(s: str) -> str:
    """Strip leading bracket tags and surrounding whitespace from every line.

    Args:
        s: Raw text. Anything that is not a non-empty string yields "".

    Returns:
        The text with CRLF normalised to LF, leading ``[...]`` tags removed
        from each line, each line trimmed, and the whole result trimmed.
    """
    if not (isinstance(s, str) and s):
        return ""
    normalized = s.replace("\r\n", "\n")
    cleaned = [
        _TAG_AT_LINE_START.sub("", raw_line).strip()
        for raw_line in normalized.split("\n")
    ]
    return "\n".join(cleaned).strip()

def parse_chunk_idx_from_path(path: str) -> Optional[int]:
    """Extract the chunk index from a file name shaped like ``<id>_<idx>.<ext>``.

    Args:
        path: File path; only its base name is inspected.

    Returns:
        The integer found between the first underscore and the following
        ``_`` or ``.``, or None when the name does not have that shape.
    """
    filename = os.path.basename(path)
    try:
        second_field = filename.split("_")[1]
        return int(second_field.split(".")[0])
    except Exception:
        return None

def top_n_live_ids_from_suffix_csv(csv_path: str, n: int) -> List[int]:
    """Return the first ``n`` distinct live ids listed in the CSV, in file order.

    Ids are read from the ``suffix_number`` column when present, otherwise
    from the last column. Values that cannot be parsed as numbers are dropped.

    Args:
        csv_path: CSV location, passed to ``pandas.read_csv``.
        n: Maximum number of ids to return. Zero or a negative value yields an
            empty list.

    Returns:
        Up to ``n`` unique integer ids, keeping the first occurrence of each.
    """
    df = pd.read_csv(csv_path)
    if n <= 0:
        # Guard explicitly: Series.head() with a negative count means
        # "all but the last rows", which is not what a caller asking for
        # no ids wants.
        return []
    col = "suffix_number" if "suffix_number" in df.columns else df.columns[-1]
    live_ids = pd.to_numeric(df[col], errors="coerce").dropna().astype(int)
    return live_ids.drop_duplicates().head(n).tolist()

# ====================== MCQ helpers ======================
def ensure_two_questions(mcq_obj: Dict[str, Any]) -> Dict[str, Any]:
    """Truncate ``mcq_obj["questions"]`` to its first two entries, in place.

    Anything that is not a dict holding a ``questions`` list is returned
    untouched.
    """
    if not isinstance(mcq_obj, dict):
        return mcq_obj
    questions = mcq_obj.get("questions")
    if isinstance(questions, list) and len(questions) > 2:
        mcq_obj["questions"] = questions[:2]
    return mcq_obj

def sanitize_mcq_and_fill_topic(mcq: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Substitute Q1's correct choice text into Q2's question placeholder.

    The object is updated in place: Q2's ``question`` has every known
    placeholder replaced, and ``questions`` is cut to the first two entries.

    Args:
        mcq: Parsed generator output, expected to hold a ``questions`` list
            with at least two question dicts.

    Returns:
        The same ``mcq`` object after substitution, or None when the structure
        is unusable: not a dict, fewer than two questions, a question that is
        not a dict, a missing / non-integer / out-of-range ``answer_index`` on
        Q1, or a blank or non-string correct choice.
    """
    if not isinstance(mcq, dict) or "questions" not in mcq:
        return None
    qs = mcq.get("questions") or []
    if not isinstance(qs, list) or len(qs) < 2:
        return None
    q1, q2 = qs[0], qs[1]
    if not isinstance(q1, dict) or not isinstance(q2, dict):
        return None

    choices = q1.get("choices")
    if not isinstance(choices, list):
        return None
    try:
        # The index must be present: silently assuming 0 would invent a
        # ground truth the model never stated.
        aidx = int(q1["answer_index"])
    except (KeyError, TypeError, ValueError):
        return None
    # Reject negative values explicitly; Python indexing would otherwise wrap
    # around and pick a choice from the end of the list.
    if not 0 <= aidx < len(choices):
        return None
    topic = choices[aidx]
    if not isinstance(topic, str) or not topic.strip():
        return None

    q2q = str(q2.get("question", ""))
    for ph in ("{TOPIC}", "<Q1正解>", "{一つ目の問の答え}"):
        q2q = q2q.replace(ph, topic)
    q2["question"] = q2q
    mcq["questions"] = qs[:2]
    return mcq

def to_public_question(q: Dict[str, Any]) -> Dict[str, Any]:
    """Return the answerer-facing view of a question: its text and up to four choices.

    Every other field of ``q`` (such as the answer index or the explanation)
    is left out.
    """
    question_text = str(q.get("question", ""))
    first_four = list(q.get("choices", []))[:4]
    return {"question": question_text, "choices": first_four}

def generate_mcq_for_chunk(model: GenerativeModel, combined_text: str, save_label: Optional[str]=None) -> Optional[Dict[str, Any]]:
    """Ask the generator model for two questions about ``combined_text`` and validate them.

    Args:
        model: Model handle used for generation.
        combined_text: Chunk text appended after the fixed prompt.
        save_label: Optional label for saving the exact request payload.

    Returns:
        The MCQ object with exactly two questions, each having a string
        ``question``, four ``choices`` and an ``answer_index`` in 0..3; or None
        when the model output does not meet those requirements.
    """
    content = prompt_for_mcq_fixed() + "\n" + (combined_text or "")
    obj = call_gemini_json_with_content(model, content, save_label=save_label)
    try:
        obj = sanitize_mcq_and_fill_topic(ensure_two_questions(obj))
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Malformed model output must not abort the whole run.
        return None

    # Explicit checks instead of ``assert`` so validation still runs under -O.
    if not isinstance(obj, dict):
        return None
    qs = obj.get("questions")
    if not isinstance(qs, list) or len(qs) != 2:
        return None
    for q in qs:
        if not isinstance(q, dict):
            return None
        if not isinstance(q.get("question", ""), str):
            return None
        choices = q.get("choices")
        if not isinstance(choices, list) or len(choices) != 4:
            return None
        try:
            # ``answer_index`` is required: evaluation reads it directly, so a
            # missing key must be rejected here rather than defaulted to 0.
            answer_index = int(q["answer_index"])
        except (KeyError, TypeError, ValueError):
            return None
        if not 0 <= answer_index < 4:
            return None
    return obj

# ====================== Answering ======================
def build_answer_input_text(public_q_json: Dict[str, Any], body_text: str, label: str) -> str:
    """Assemble the document part of an answering request.

    The layout is: the label line, the question serialised as one-line JSON
    (non-ASCII kept as-is), a blank line, the body heading, then the body text
    (empty when ``body_text`` is falsy).
    """
    question_json = json.dumps(public_q_json, ensure_ascii=False)
    parts = [label, question_json, "", "# 本文", body_text or ""]
    return "\n".join(parts)

def answer_question(model: GenerativeModel, qname: str, source_label: str, public_q: Dict[str, Any],
                    body_text: str, save_label: Optional[str]=None) -> Optional[Dict[str, Any]]:
    """Ask ``model`` to answer one public question using only ``body_text``.

    Args:
        model: Model handle used for answering.
        qname: "Q1" or "Q2".
        source_label: Name of the source text as shown in the prompt.
        public_q: Question with ``question`` and ``choices`` only.
        body_text: Text the answer must be based on.
        save_label: Optional label for saving the exact request payload.

    Returns:
        The parsed reply when it is a dict, whatever its content; the
        evaluator decides whether the answer is usable. None when the reply
        is not a dict.
    """
    prompt = _prompt_for_answering(qname, source_label)
    doc = build_answer_input_text(public_q, body_text, f"# {qname} JSON")
    obj = call_gemini_json_with_content(model, prompt + "\n" + doc, save_label=save_label)
    # No inspection of obj["answers"] here: the result is returned either way,
    # and probing it would raise when "answers" is not a container.
    return obj if isinstance(obj, dict) else None

# ====================== Evaluation ======================
def _parse_index_field(ans_obj: Optional[Dict[str, Any]], key: str, num_choices: int = 4) -> Tuple[Optional[int], bool, int]:
    """Read the predicted choice index stored under ``answers[key]``.

    Returns:
        ``(pred, is_valid, invalid_format)`` where

        * the key is absent, or the structure is not a dict: ``(None, False, 0)``;
          this case is not counted as an invalid format;
        * the value cannot be converted with ``int``: ``(None, False, 1)``;
        * the value is outside ``0..num_choices-1``: ``(pred, False, 1)``;
        * otherwise: ``(pred, True, 0)``.
    """
    answers = ans_obj.get("answers", {}) if isinstance(ans_obj, dict) else None
    if not isinstance(answers, dict) or key not in answers:
        return (None, False, 0)
    try:
        pred = int(answers.get(key))
    except Exception:
        # key present but its value is not parsable as an integer
        return (None, False, 1)
    in_range = 0 <= pred < num_choices
    return (pred, True, 0) if in_range else (pred, False, 1)

def evaluate_stage(mcq_full: Dict[str, Any], question_idx: int, ans_obj: Optional[Dict[str, Any]], key: str) -> Dict[str, Any]:
    """Compare one stage's answer against the ground truth of one question.

    Args:
        mcq_full: MCQ object whose questions carry ``answer_index``.
        question_idx: Position of the question inside ``mcq_full["questions"]``.
        ans_obj: Parsed answer object of the stage (may be None).
        key: Answer key to read, e.g. ``"q1_index"``.

    Returns:
        A dict with ``gt``, ``pred``, ``correct`` (1/0, or None when the
        prediction is not a valid index) and ``invalid_format`` (0/1).
    """
    question = mcq_full["questions"][question_idx]
    gt = int(question["answer_index"])
    pred, is_valid, invalid_fmt = _parse_index_field(ans_obj, key, num_choices=4)
    if is_valid:
        correct = int(pred == gt)
    else:
        correct = None
    return {"gt": gt, "pred": pred, "correct": correct, "invalid_format": invalid_fmt}

# ====================== Runner (Resume-safe) ======================
def _load_json_artifact(fs: Any, path: str) -> Any:
    """Return the parsed JSON stored at ``path``, or None when absent or unreadable."""
    if not fs.exists(path):
        return None
    try:
        return json.loads(read_gcs_text(path))
    except Exception:
        # A corrupt or unreadable artifact is treated as missing so that it
        # gets regenerated.
        return None


def _persist_json_artifact(obj: Any, gcs_path: str, local_dir: str, live_id: Any, chunk_idx: Any) -> None:
    """Write ``obj`` as indented JSON to GCS (best-effort) and to the local mirror."""
    text = json.dumps(obj, ensure_ascii=False, indent=2)
    try:
        write_gcs_text(gcs_path, text)
    except Exception:
        pass  # the upload is best-effort; the local copy below is still written
    with open(os.path.join(local_dir, f"{live_id}_{chunk_idx}.json"), "w", encoding="utf-8") as f:
        f.write(text)


def _mcq_is_usable(mcq: Any) -> bool:
    """Tell whether ``mcq`` holds two question dicts with an integer-like ``answer_index``."""
    questions = mcq.get("questions") if isinstance(mcq, dict) else None
    if not isinstance(questions, list) or len(questions) != 2:
        return False
    for q in questions:
        if not isinstance(q, dict):
            return False
        try:
            int(q["answer_index"])
        except (KeyError, TypeError, ValueError):
            return False
    return True


def _normalize_log_id(value: Any) -> Any:
    """Map a run-log id to a comparable form: None for missing/NaN, int when possible."""
    if value is None:
        return None
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        pass
    try:
        return int(value)
    except (TypeError, ValueError):
        return value


def _row_key(row: Dict[str, Any]) -> Tuple[Any, Any, Any]:
    """Identity of a run-log row: (live id, chunk index, path when the index is unknown)."""
    chunk = _normalize_log_id(row.get("chunk_idx"))
    path = row.get("combined_path")
    extra = path if (chunk is None and isinstance(path, str)) else None
    return (_normalize_log_id(row.get("live_id")), chunk, extra)


def _stage_targets(qname: str, live_id: Any, chunk_idx: Any) -> Dict[str, Tuple[str, str]]:
    """Return ``{stage: (gcs_path, local_dir)}`` for the three answer stages of ``qname``."""
    if qname == "Q1":
        return {
            "combined": (A_Q1C_OUT(live_id, chunk_idx), LOCAL_ANS_Q1_COMBINED_DIR),
            "chat_cond": (A_Q1CHATC_OUT(live_id, chunk_idx), LOCAL_ANS_Q1_CHAT_COND_DIR),
            "chat_uncond": (A_Q1CHATU_OUT(live_id, chunk_idx), LOCAL_ANS_Q1_CHAT_UNCOND_DIR),
        }
    return {
        "combined": (A_Q2C_OUT(live_id, chunk_idx), LOCAL_ANS_Q2_COMBINED_DIR),
        "chat_cond": (A_Q2CHATC_OUT(live_id, chunk_idx), LOCAL_ANS_Q2_CHAT_COND_DIR),
        "chat_uncond": (A_Q2CHATU_OUT(live_id, chunk_idx), LOCAL_ANS_Q2_CHAT_UNCOND_DIR),
    }


def _run_question_pipeline(
    fs: Any,
    models: Dict[Any, Any],
    qname: str,
    question_idx: int,
    public_q: Dict[str, Any],
    mcq_full: Dict[str, Any],
    combined_text: str,
    chat_text: str,
    has_chat: bool,
    live_id: Any,
    chunk_idx: Any,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Run the combined, conditional-chat and unconditional-chat stages for one question.

    Returns ``(path_columns, eval_columns)`` ready to be merged into the run-log row.
    """
    prefix = qname.lower()
    key = f"{prefix}_index"
    targets = _stage_targets(qname, live_id, chunk_idx)

    def run_stage(stage: str, source_label: str, body_text: str, may_call: bool) -> Any:
        """Reuse the stored answer of ``stage`` if any; otherwise call the model when allowed."""
        gcs_path, local_dir = targets[stage]
        ans = _load_json_artifact(fs, gcs_path)
        if ans is None and may_call:
            ans = answer_question(
                models[(qname, stage)], qname, source_label,
                public_q, body_text,
                save_label=f"answer_{qname}_{stage}_live{live_id}_chunk{chunk_idx}"
            )
            if ans is not None:
                _persist_json_artifact(ans, gcs_path, local_dir, live_id, chunk_idx)
        return ans

    # Combined (LLM2 / LLM5)
    ans_comb = run_stage("combined", "配信ログ本文（チャット＋音声書き起こし）", combined_text, True)
    ev_comb = evaluate_stage(mcq_full, question_idx, ans_comb, key)
    comb_correct = (ev_comb["correct"] == 1)

    # Chat-only, conditional on the combined answer being correct (LLM4 / LLM7)
    ans_cond = None
    ev_cond = {"pred": None, "correct": None, "invalid_format": 0}
    if comb_correct and has_chat:
        ans_cond = run_stage("chat_cond", "チャット本文", chat_text, True)
        ev_cond = evaluate_stage(mcq_full, question_idx, ans_cond, key)

    # Chat-only, unconditional (LLM3 / LLM6): the model is only called when a
    # chat file exists, and the stage is only scored in that case.
    ans_uncond = run_stage("chat_uncond", "チャット本文", chat_text, bool(has_chat))
    if has_chat:
        ev_uncond = evaluate_stage(mcq_full, question_idx, ans_uncond, key)
    else:
        ev_uncond = {"pred": None, "correct": None, "invalid_format": 0}

    path_columns = {
        f"ans_{prefix}_combined_path": targets["combined"][0] if ans_comb is not None else None,
        f"ans_{prefix}_chat_cond_path": targets["chat_cond"][0] if ans_cond is not None else None,
        f"ans_{prefix}_chat_uncond_path": targets["chat_uncond"][0] if (has_chat and ans_uncond is not None) else None,
    }
    eval_columns = {
        f"{prefix}_gt": ev_comb["gt"],
        f"{prefix}_pred_combined": ev_comb["pred"],
        f"{prefix}_correct_combined": ev_comb["correct"],
        f"{prefix}_combined_invalid_format": ev_comb["invalid_format"],

        f"{prefix}_pred_chat_cond": ev_cond["pred"],
        f"{prefix}_chat_cond_correct": ev_cond["correct"],
        f"{prefix}_chat_cond_invalid_format": ev_cond["invalid_format"],

        f"{prefix}_pred_chat_uncond": ev_uncond["pred"],
        f"{prefix}_chat_uncond_correct": ev_uncond["correct"],
        f"{prefix}_chat_uncond_invalid_format": ev_uncond["invalid_format"],
    }
    return path_columns, eval_columns


def _process_chunk(fs: Any, models: Dict[Any, Any], live_id: Any, cp: str, min_chars: int) -> Dict[str, Any]:
    """Process one combined-text chunk end to end and return its run-log row."""
    chunk_idx = parse_chunk_idx_from_path(cp)
    base = {"live_id": live_id, "chunk_idx": chunk_idx}
    if chunk_idx is None:
        # Without an index, every artifact path would end in "_None" and
        # different chunks would overwrite each other's artifacts.
        return {**base, "status": "bad_chunk_name", "combined_path": cp}

    # Read combined
    try:
        combined_text = read_gcs_text(cp)
    except Exception as e:
        return {**base, "status": "read_combined_failed", "err": str(e)}
    combined_len_raw = len((combined_text or "").strip())
    if combined_len_raw < min_chars:
        return {**base, "status": "skip_short", "combined_len_raw": combined_len_raw}

    # Read chat (if exists)
    chat_path = f"{CHATONLY_PREFIX}/{live_id}_{chunk_idx}.txt"
    has_chat = fs.exists(chat_path)
    chat_text = ""
    if has_chat:
        try:
            chat_text = read_gcs_text(chat_path)
        except Exception as e:
            return {**base, "status": "read_chat_failed", "err": str(e)}

    q_full_out = Q_FULL_OUT(live_id, chunk_idx)
    q_public_out = Q_PUB_OUT(live_id, chunk_idx)

    # ---- MCQ (resume-aware): a stored artifact is reused only if it is usable
    mcq_full = _load_json_artifact(fs, q_full_out)
    if not _mcq_is_usable(mcq_full):
        mcq_full = generate_mcq_for_chunk(
            models["gen"], combined_text,
            save_label=f"mcq_gen_combined_live{live_id}_chunk{chunk_idx}"
        )
        if not _mcq_is_usable(mcq_full):
            return {**base, "status": "mcq_invalid"}
        _persist_json_artifact(mcq_full, q_full_out, LOCAL_QUESTIONS_FULL_DIR, live_id, chunk_idx)

    # ---- Public (resume-aware)
    public_q1 = public_q2 = None
    pub = _load_json_artifact(fs, q_public_out)
    if isinstance(pub, dict) and isinstance(pub.get("questions"), list) and len(pub["questions"]) == 2:
        public_q1, public_q2 = pub["questions"]
    if public_q1 is None or public_q2 is None:
        public_q1 = to_public_question(mcq_full["questions"][0])
        public_q2 = to_public_question(mcq_full["questions"][1])
        _persist_json_artifact({"questions": [public_q1, public_q2]}, q_public_out,
                               LOCAL_QUESTIONS_PUBLIC_DIR, live_id, chunk_idx)

    # ---- Q1 pipeline, then Q2 pipeline (same stage order for both)
    q1_paths, q1_evals = _run_question_pipeline(
        fs, models, "Q1", 0, public_q1, mcq_full,
        combined_text, chat_text, has_chat, live_id, chunk_idx,
    )
    q2_paths, q2_evals = _run_question_pipeline(
        fs, models, "Q2", 1, public_q2, mcq_full,
        combined_text, chat_text, has_chat, live_id, chunk_idx,
    )

    # ---- Compose row ----
    return {
        **base,
        "status": "ok",
        "combined_path": cp,
        "chat_path": chat_path if has_chat else None,
        "has_chat": bool(has_chat),

        # artifacts
        "mcq_full_path": q_full_out,
        "mcq_public_path": q_public_out,
        **q1_paths,
        **q2_paths,

        # eval
        **q1_evals,
        **q2_evals,
    }


def process_top_n_lives(
    model_gen: GenerativeModel,
    model_q1_combined: GenerativeModel,
    model_q1_chat_uncond: GenerativeModel,
    model_q1_chat_cond: GenerativeModel,
    model_q2_combined: GenerativeModel,
    model_q2_chat_uncond: GenerativeModel,
    model_q2_chat_cond: GenerativeModel,
    n: int = TOP_N_LIVE,
    min_chars: int = MIN_COMBINED_CHARS,
) -> pd.DataFrame:
    """Run MCQ generation and all answering stages for the first ``n`` lives.

    For every combined-text chunk of every selected live, the function
    generates (or reloads) the two questions, answers them from the combined
    text and from the chat-only text, evaluates each stage and records one row.
    Existing artifacts on GCS are reused, so an interrupted run can be resumed.

    The run log is loaded at start-up, and a row recorded for a live/chunk that
    is already in the log replaces the old row instead of being appended, so a
    resumed run does not produce duplicate rows.

    Args:
        model_gen: LLM1, question generator.
        model_q1_combined: LLM2, answers Q1 from the combined text.
        model_q1_chat_uncond: LLM3, answers Q1 from chat only (unconditional).
        model_q1_chat_cond: LLM4, answers Q1 from chat only when LLM2 was correct.
        model_q2_combined: LLM5, answers Q2 from the combined text.
        model_q2_chat_uncond: LLM6, answers Q2 from chat only (unconditional).
        model_q2_chat_cond: LLM7, answers Q2 from chat only when LLM5 was correct.
        n: Number of lives to take from the suffix CSV.
        min_chars: Chunks whose stripped combined text is shorter are skipped.

    Returns:
        A DataFrame with one row per live/chunk, including skipped and failed ones.
    """
    ensure_local_dirs()
    fs = fs_gcs()
    lives = top_n_live_ids_from_suffix_csv(SUFFIX_LIST_CSV, n)

    models = {
        "gen": model_gen,
        ("Q1", "combined"): model_q1_combined,
        ("Q1", "chat_uncond"): model_q1_chat_uncond,
        ("Q1", "chat_cond"): model_q1_chat_cond,
        ("Q2", "combined"): model_q2_combined,
        ("Q2", "chat_uncond"): model_q2_chat_uncond,
        ("Q2", "chat_cond"): model_q2_chat_cond,
    }

    rows: List[Dict[str, Any]] = []
    row_pos: Dict[Any, int] = {}

    def upsert(row: Dict[str, Any]) -> None:
        """Insert ``row``, replacing an earlier row for the same live/chunk."""
        row_id = _row_key(row)
        if row_id in row_pos:
            rows[row_pos[row_id]] = row
        else:
            row_pos[row_id] = len(rows)
            rows.append(row)

    # resume: load existing run log if any
    if fs.exists(ADOPT_LOG_PATH):
        try:
            txt = read_gcs_text(ADOPT_LOG_PATH)
            if txt.strip():
                for old_row in pd.read_csv(io.StringIO(txt)).to_dict("records"):
                    upsert(old_row)
        except Exception:
            # An unreadable log is ignored; the run starts from an empty log.
            rows.clear()
            row_pos.clear()

    def flush_checkpoint() -> None:
        """Write the current rows to GCS and to the local mirror (both best-effort)."""
        try:
            if rows:
                write_gcs_text(ADOPT_LOG_PATH, pd.DataFrame(rows).to_csv(index=False))
        except Exception:
            pass
        os.makedirs(LOCAL_RUNLOG_DIR, exist_ok=True)
        local_csv = os.path.join(LOCAL_RUNLOG_DIR, f"adopted_chunks_top{TOP_N_LIVE}.csv")
        try:
            pd.DataFrame(rows).to_csv(local_csv, index=False, encoding="utf-8")
        except Exception:
            pass

    recorded = 0

    def record(row: Dict[str, Any]) -> None:
        """Store ``row`` and checkpoint every ``CHECKPOINT_EVERY`` recorded rows."""
        nonlocal recorded
        upsert(row)
        recorded += 1
        if recorded % CHECKPOINT_EVERY == 0:
            flush_checkpoint()

    for live_id in tqdm(lives, desc="lives"):
        combo_paths = gcs_glob(COMBINED_PREFIX, f"{live_id}_*.txt")
        if not combo_paths:
            record({"live_id": live_id, "chunk_idx": None, "status": "no_combined"})
            continue
        for cp in tqdm(combo_paths, desc=f"chunks {live_id}", leave=False):
            record(_process_chunk(fs, models, live_id, cp, min_chars))

    flush_checkpoint()
    return pd.DataFrame(rows)

# ====================== Metrics ======================
def _metrics_column(df: pd.DataFrame, name: str) -> pd.Series:
    """Return ``df[name]``, or an all-missing column aligned to ``df`` when absent."""
    if name in df.columns:
        return df[name]
    return pd.Series(index=df.index, dtype="object")


def _metrics_flag_sum(flags: pd.Series) -> int:
    """Sum a 0/1 (or bool) flag column, counting missing entries as 0."""
    return int(flags.fillna(0).astype(int).sum())


def _metrics_ratio(numerator: int, denominator: int) -> Optional[float]:
    """Return ``numerator / denominator`` as a float, or None when the denominator is 0."""
    return float(numerator / denominator) if denominator else None


def _metrics_answer_stage(df: pd.DataFrame, prefix: str, correct_col: str) -> Dict[str, Any]:
    """Totals/accuracy for a stage where every row with a non-null flag counts as answered."""
    flags = _metrics_column(df, correct_col)
    answered = flags.notna()
    total = int(answered.sum())
    correct_total = _metrics_flag_sum(flags[answered])
    return {
        f"{prefix}_total": total,
        f"{prefix}_correct_total": correct_total,
        f"{prefix}_accuracy": _metrics_ratio(correct_total, total),
    }


def _metrics_conditional_stage(
    df: pd.DataFrame,
    prefix: str,
    combined_correct_col: str,
    answer_path_col: str,
    has_chat: pd.Series,
) -> Dict[str, Any]:
    """Metrics for a chat-only stage that only runs after a correct Combined answer."""
    cond_flags = _metrics_column(df, f"{prefix}_correct")
    # Rows where the conditional stage *could* have run.
    possible = (_metrics_column(df, combined_correct_col) == 1) & has_chat
    # Rows where it *did* run: an answer artifact path or an evaluation exists.
    # In this pipeline "triggered" and "answered" are the same set.
    attempted = _metrics_column(df, answer_path_col).notna() | cond_flags.notna()
    answered_total = int(attempted.sum())
    # Attempted rows without an evaluation flag count as incorrect (flag -> 0).
    correct_total = _metrics_flag_sum(cond_flags[attempted])
    return {
        f"{prefix}_possible_total": int(possible.sum()),
        f"{prefix}_trigger_total": answered_total,
        f"{prefix}_answered_total": answered_total,
        f"{prefix}_correct_total": correct_total,
        f"{prefix}_accuracy": _metrics_ratio(correct_total, answered_total),
        f"{prefix}_not_attempted_total": int((possible & ~attempted).sum()),
    }


def compute_metrics(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Aggregate per-row evaluation flags into run-level metrics.

    Returned keys (in this order):
      - Combined (Q1/Q2): ``*_total``, ``*_correct_total``, ``*_accuracy``
      - Chat-only UNCONDITIONAL (Q1/Q2): ``*_total``, ``*_correct_total``, ``*_accuracy``
      - Chat-only CONDITIONAL (Q1 | Combined-correct & has_chat, same for Q2):
          ``*_possible_total``, ``*_trigger_total`` (== answered), ``*_answered_total``,
          ``*_correct_total``, ``*_accuracy``, ``*_not_attempted_total``
      - ``invalid_format_total`` summed over every ``*_invalid_format`` column present

    Accuracies are None when their denominator is 0. A missing column is treated
    as "no data for any row", so an empty DataFrame (e.g. a run that produced no
    rows) yields zero totals instead of raising KeyError.
    """
    out: Dict[str, Any] = {}

    # ---- Combined, then chat-only UNCONDITIONAL (rows without an answer have a null flag)
    out.update(_metrics_answer_stage(df, "q1_combined", "q1_correct_combined"))
    out.update(_metrics_answer_stage(df, "q2_combined", "q2_correct_combined"))
    out.update(_metrics_answer_stage(df, "q1_chat_uncond", "q1_chat_uncond_correct"))
    out.update(_metrics_answer_stage(df, "q2_chat_uncond", "q2_chat_uncond_correct"))

    # ---- Chat-only CONDITIONAL
    # Force a real boolean mask so `&` / `~` behave the same for bool and object columns.
    has_chat = _metrics_column(df, "has_chat").fillna(False).astype(bool)
    out.update(_metrics_conditional_stage(
        df, "q1_chat_cond", "q1_correct_combined", "ans_q1_chat_cond_path", has_chat))
    out.update(_metrics_conditional_stage(
        df, "q2_chat_cond", "q2_correct_combined", "ans_q2_chat_cond_path", has_chat))

    # ---- invalid-format counts across all answer stages
    invalid_cols = [
        "q1_combined_invalid_format", "q1_chat_uncond_invalid_format", "q1_chat_cond_invalid_format",
        "q2_combined_invalid_format", "q2_chat_uncond_invalid_format", "q2_chat_cond_invalid_format",
    ]
    out["invalid_format_total"] = int(
        sum(_metrics_flag_sum(df[c]) for c in invalid_cols if c in df.columns)
    )

    return out

def save_metrics(metrics: Dict[str, Any]) -> None:
    """
    Persist ``metrics`` as pretty-printed JSON to GCS and to the local run-log dir.

    Targets: ``METRICS_JSON_PATH`` (via ``write_gcs_text``) and
    ``LOCAL_RUNLOG_DIR/metrics.json``. Every step is best-effort: a failure is
    reported on stdout and never raised, and a failure on one target does not
    prevent the attempt on the other.
    """
    # Serialize once so both targets receive identical content and a
    # non-serializable value cannot leave a half-written local file behind.
    try:
        payload = json.dumps(metrics, ensure_ascii=False, indent=2)
    except (TypeError, ValueError) as exc:
        print(f"[WARN] metrics are not JSON-serializable; nothing saved: {exc!r}")
        return

    try:
        write_gcs_text(METRICS_JSON_PATH, payload)
    except Exception as exc:
        print(f"[WARN] failed to write metrics to GCS ({METRICS_JSON_PATH}): {exc!r}")

    local_metrics = os.path.join(LOCAL_RUNLOG_DIR, "metrics.json")
    try:
        # Directory creation is part of the best-effort local write.
        os.makedirs(LOCAL_RUNLOG_DIR, exist_ok=True)
        with open(local_metrics, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as exc:
        print(f"[WARN] failed to write metrics locally ({local_metrics}): {exc!r}")

# ====================== Examples (3 correct / 3 incorrect) ======================
def _load_public_q(q_public_path: str) -> Optional[Dict[str, Any]]:
    try:
        data = json.loads(read_gcs_text(q_public_path))
        if isinstance(data, dict) and "questions" in data and len(data["questions"]) == 2:
            return data
    except Exception:
        # try local
        try:
            with open(os.path.join(LOCAL_QUESTIONS_PUBLIC_DIR, os.path.basename(q_public_path)), "r", encoding="utf-8") as f:
                data = json.load(f)
                if isinstance(data, dict) and "questions" in data and len(data["questions"]) == 2:
                    return data
        except Exception:
            pass
    return None

def _load_combined_text(path: Optional[str]) -> Optional[str]:
    if not isinstance(path, str):
        return None
    try:
        return read_gcs_text(path)
    except Exception:
        return None

def gather_examples(df: pd.DataFrame, stage: str, correct_flag: int, k: int = 3) -> List[Dict[str, Any]]:
    """
    Collect example records for one answer stage.

    stage: one of
      - 'q1_combined', 'q1_chat_uncond', 'q1_chat_cond',
      - 'q2_combined', 'q2_chat_uncond', 'q2_chat_cond'
      (any other value raises KeyError)
    correct_flag: 1 -> correct examples, 0 -> incorrect examples
    k: number of matching rows (taken from the top of ``df``) to consider

    Each example holds the stage, live_id, chunk_idx, the combined (chat+audio)
    text and its path, the question, its choices, the predicted index and the
    text of the predicted choice. Rows whose public MCQ file cannot be loaded
    are skipped, so fewer than ``k`` examples may be returned. Returns [] when
    ``df`` has no correctness column for the stage (e.g. an empty run).
    """
    # stage -> (prediction column, correctness column, index into "questions")
    stage_spec = {
        "q1_combined": ("q1_pred_combined", "q1_correct_combined", 0),
        "q1_chat_uncond": ("q1_pred_chat_uncond", "q1_chat_uncond_correct", 0),
        "q1_chat_cond": ("q1_pred_chat_cond", "q1_chat_cond_correct", 0),
        "q2_combined": ("q2_pred_combined", "q2_correct_combined", 1),
        "q2_chat_uncond": ("q2_pred_chat_uncond", "q2_chat_uncond_correct", 1),
        "q2_chat_cond": ("q2_pred_chat_cond", "q2_chat_cond_correct", 1),
    }
    pred_col, corr_col, qidx = stage_spec[stage]

    # An empty DataFrame built from zero rows has no columns at all.
    if corr_col not in df.columns:
        return []

    def _int_or_none(value: Any) -> Optional[int]:
        return int(value) if pd.notna(value) else None

    subset = df[df[corr_col] == correct_flag]
    examples: List[Dict[str, Any]] = []
    for _, r in subset.head(k).iterrows():
        pub_path = r.get("mcq_public_path")
        pq = _load_public_q(pub_path) if isinstance(pub_path, str) else None
        if not pq:
            continue
        qobj = pq["questions"][qidx]
        choices = qobj.get("choices")

        pred_index = _int_or_none(r.get(pred_col))
        # Look the choice up only when the index really addresses an entry;
        # an out-of-range prediction or a malformed choices list gives None.
        pred_choice_text = None
        if (
            pred_index is not None
            and isinstance(choices, (list, tuple))
            and 0 <= pred_index < len(choices)
        ):
            pred_choice_text = choices[pred_index]

        combined_path = r.get("combined_path")
        examples.append({
            "stage": stage,
            "live_id": _int_or_none(r.get("live_id")),
            "chunk_idx": _int_or_none(r.get("chunk_idx")),
            "combined_path": combined_path,
            # Combined chat+audio text for the chunk, embedded in the example.
            "combined_text": _load_combined_text(combined_path),
            "question": qobj.get("question"),
            "choices": choices,
            "pred_index": pred_index,
            "pred_choice_text": pred_choice_text,
        })
    return examples

def save_examples(examples: Dict[str, List[Dict[str, Any]]]) -> None:
    """
    Write the gathered examples as pretty-printed JSON, locally and to GCS.

    The local file ``LOCAL_EXAMPLES_DIR/examples.json`` is written first and
    errors there propagate to the caller. The GCS copy at
    ``EXAMPLES_PREFIX/examples.json`` is best-effort: a failure is reported on
    stdout and not raised.
    """
    # Local
    os.makedirs(LOCAL_EXAMPLES_DIR, exist_ok=True)
    # Serialize once so the local and GCS copies are byte-identical.
    payload = json.dumps(examples, ensure_ascii=False, indent=2)
    local_path = os.path.join(LOCAL_EXAMPLES_DIR, "examples.json")
    with open(local_path, "w", encoding="utf-8") as f:
        f.write(payload)
    # GCS
    gcs_path = f"{EXAMPLES_PREFIX}/examples.json"
    try:
        write_gcs_text(gcs_path, payload)
    except Exception as exc:
        print(f"[WARN] failed to write examples to GCS ({gcs_path}): {exc!r}")

# ====================== Main ======================
if __name__ == "__main__":
    # One Vertex AI model handle per pipeline role, initialised in role order:
    # LLM1 (generation), LLM2/LLM3/LLM4 (Q1 combined / chat uncond / chat cond),
    # LLM5/LLM6/LLM7 (Q2 combined / chat uncond / chat cond).
    (
        model_gen,
        model_q1_combined, model_q1_chat_uncond, model_q1_chat_cond,
        model_q2_combined, model_q2_chat_uncond, model_q2_chat_cond,
    ) = [
        init_vertex_ai(model_name)
        for model_name in (
            MODEL_NAME_QGEN,
            MODEL_NAME_Q1_COMBINED, MODEL_NAME_Q1_CHAT_UNCOND, MODEL_NAME_Q1_CHAT_COND,
            MODEL_NAME_Q2_COMBINED, MODEL_NAME_Q2_CHAT_UNCOND, MODEL_NAME_Q2_CHAT_COND,
        )
    ]

    # Run the pipeline over the top-N lives and collect one row per processed chunk.
    df = process_top_n_lives(
        model_gen,
        model_q1_combined, model_q1_chat_uncond, model_q1_chat_cond,
        model_q2_combined, model_q2_chat_uncond, model_q2_chat_cond,
        n=TOP_N_LIVE
    )

    # Aggregate, show and persist the run-level metrics.
    metrics = compute_metrics(df)
    print(json.dumps(metrics, ensure_ascii=False, indent=2))
    save_metrics(metrics)

    # ---- Gather & save up to 3 correct / 3 incorrect examples per stage ----
    # Stage order: Combined, unconditional chat-only, then conditional chat-only
    # (the conditional stages are saved as well, in case they are needed).
    example_stages = (
        "q1_combined", "q2_combined",
        "q1_chat_uncond", "q2_chat_uncond",
        "q1_chat_cond", "q2_chat_cond",
    )
    outcomes = (("correct", 1), ("incorrect", 0))
    examples = {
        f"{stage_name}_{outcome}_top3": gather_examples(df, stage_name, correct_flag=flag, k=3)
        for stage_name in example_stages
        for outcome, flag in outcomes
    }
    save_examples(examples)

    # ---- Also print invalid-format total for quick visibility ----
    print("invalid_format_total:", metrics.get("invalid_format_total"))
