# Vanilla Qwen Runner (no RAG)

Runs the fixed evaluation set (`eval_set.jsonl`) through a vanilla Qwen model and saves results to a CSV.

**You only need to edit the PATHS cell.**

In [None]:
# ===== Install (Colab) =====
!pip -q install -U transformers accelerate bitsandbytes pandas tqdm

In [None]:
# ===== PATHS (EDIT ME) =====
# NOTE(review): both paths live under /content/drive — presumably Google Drive must
# already be mounted there; no mount cell is visible in this notebook, so confirm.
# Input: JSONL evaluation set, one JSON object per line (each needs "qid" and "question").
EVAL_SET_PATH = "/content/drive/MyDrive/your_project/data/eval_set.jsonl"  # <- TODO
# Output root: results are written to <OUTPUT_DIR>/vanilla/<run_id>.csv.
OUTPUT_DIR = "/content/drive/MyDrive/your_project/outputs"                # <- TODO

# Model (edit if you want): model id handed to the tokenizer/model from_pretrained() calls.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Generation (keep temperature=0 for reproducibility).
# generate_one() enables sampling only when TEMPERATURE > 0; at 0.0 decoding is not sampled.
TEMPERATURE = 0.0
TOP_P = 1.0
# Passed to generate() as max_new_tokens (cap on generated tokens per question).
MAX_NEW_TOKENS = 256

In [None]:
import os, json, re, time, datetime
import pandas as pd
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
def load_eval_set(jsonl_path: str):
    """Load and validate the evaluation set from a JSONL file.

    Blank lines are skipped. Every other line must be a JSON object containing
    ``qid`` and ``question``; when ``choices`` is present it must be a dict
    such as ``{"A": ..., "B": ..., "C": ..., "D": ...}``.

    Args:
        jsonl_path: Path to the JSONL file (UTF-8, with or without a BOM).

    Returns:
        A list of dicts, one per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: A line is not valid JSON (the message names the
            file and 1-based line number).
        ValueError: A line parses but fails the structural checks above.
    """
    items = []
    # "utf-8-sig" drops a leading BOM if one exists and otherwise behaves like
    # "utf-8"; with plain "utf-8" a BOM makes json.loads reject the first line.
    with open(jsonl_path, "r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            where = f"{jsonl_path}, line {lineno}"
            try:
                item = json.loads(line)
            except json.JSONDecodeError as err:
                # Same exception type as before, but pointing at the bad line.
                raise json.JSONDecodeError(
                    f"{where}: {err.msg}", err.doc, err.pos
                ) from err
            # Explicit raises rather than assert: asserts are removed under
            # `python -O`, which would silently disable the validation.
            if (
                not isinstance(item, dict)
                or "qid" not in item
                or "question" not in item
            ):
                raise ValueError(f"{where}: Each line must include qid and question")
            if "choices" in item and not isinstance(item["choices"], dict):
                raise ValueError(
                    f"{where}: choices must be a dict like {{A:...,B:...,C:...,D:...}}"
                )
            items.append(item)
    return items

def format_mcq_prompt(item: dict) -> str:
    """Build the prompt text for one evaluation item.

    Items with a non-empty ``choices`` mapping get the multiple-choice
    template (instruction, question, one ``KEY. value`` line per choice in
    mapping order, then the answer cue). Items without choices get a short
    free-form prompt.

    Args:
        item: Evaluation item; ``question`` is required, ``choices`` optional.

    Returns:
        The prompt string to feed to the model.
    """
    question = item["question"].strip()
    options = item.get("choices", {})

    if not options:
        # non-MCQ (optional)
        return f"請回答：{question}"

    option_lines = "\n".join(f"{label}. {body}" for label, body in options.items())
    instruction = "你是一位職安衛法規助理。請根據題目選出最正確的選項，只輸出選項字母(A/B/C/D)。"
    return f"{instruction}\n\n題目：{question}\n{option_lines}\n\n答案："

# re.ASCII restricts \b (and \w) to ASCII. Without it, CJK characters count as
# word characters, so there is no word boundary between "是" and "B" in
# "答案是B" and the answer letter is never found.
ANSWER_RE = re.compile(r"\b([ABCD])\b", re.IGNORECASE | re.ASCII)

def parse_choice(text: str):
    """Extract the first standalone choice letter from model output.

    A letter counts as standalone when it is not adjacent to another ASCII
    letter, digit or underscore, so "B", "(b)", "答案是B。" all yield "B"
    while "ABC" yields nothing.

    Args:
        text: Model output to scan; may be empty.

    Returns:
        "A", "B", "C" or "D" (upper-cased), or None when no letter is found.
    """
    if not text:
        return None
    m = ANSWER_RE.search(text)
    return m.group(1).upper() if m else None

In [None]:
# ===== Load model =====
# Imported here so this cell stays self-contained.
from transformers import BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# 4-bit quantization through bitsandbytes (works on Colab T4/L4/A100).
# Passing load_in_4bit=True straight to from_pretrained() is deprecated in
# transformers; the supported form is a BitsAndBytesConfig handed in via
# quantization_config. Default config values are kept, matching what the bare
# flag selected.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype="auto",
    # If quantized loading errors, drop this argument (and bitsandbytes) to
    # load the model unquantized.
    quantization_config=bnb_config,
)
model.eval()

In [None]:
# Load the evaluation set once; every later cell iterates over eval_items.
eval_items = load_eval_set(EVAL_SET_PATH)
if not eval_items:
    # Stop here with a clear message instead of an IndexError on eval_items[0]
    # below (or a missing-column error after the evaluation loop).
    raise ValueError(f"No evaluation items found in {EVAL_SET_PATH}")
# Notebook preview: number of items and the keys of the first one.
len(eval_items), eval_items[0].keys()

In [None]:
def generate_one(prompt: str):
    """Generate a completion for ``prompt`` with the module-level model.

    Uses the notebook-level ``tokenizer``, ``model`` and generation constants
    (``MAX_NEW_TOKENS``, ``TEMPERATURE``, ``TOP_P``). Sampling is enabled only
    when ``TEMPERATURE > 0``.

    Args:
        prompt: Fully formatted prompt text.

    Returns:
        A ``(text, text_tail)`` tuple. ``text`` is the whole decoded sequence
        (prompt followed by completion, special tokens removed). ``text_tail``
        is only the newly generated completion, stripped of surrounding
        whitespace — this is what answer parsing should look at.
    """
    # NOTE(review): the prompt is tokenized as raw text, without the
    # tokenizer's chat template; for an "-Instruct" checkpoint confirm that
    # this is the intended baseline.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=(TEMPERATURE > 0),
            temperature=TEMPERATURE if TEMPERATURE > 0 else None,
            top_p=TOP_P,
            pad_token_id=tokenizer.eos_token_id,
        )
    sequence = out[0]
    text = tokenizer.decode(sequence, skip_special_tokens=True)
    # Slice off the prompt tokens instead of splitting the decoded text on
    # "答案：": a question that itself contains that marker (or a non-MCQ
    # prompt, which has no marker) would otherwise leave prompt text — including
    # the option letters — in the tail and corrupt answer parsing.
    text_tail = tokenizer.decode(
        sequence[prompt_len:], skip_special_tokens=True
    ).strip()
    return text, text_tail

# ===== Run the vanilla evaluation and export one CSV row per question =====
run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
run_id = f"vanilla_{run_ts}"
vanilla_dir = os.path.join(OUTPUT_DIR, "vanilla")
os.makedirs(vanilla_dir, exist_ok=True)

rows = []
for item in tqdm(eval_items, desc="Vanilla eval"):
    qid = item["qid"]
    gold = item.get("answer")
    prompt = format_mcq_prompt(item)

    # perf_counter is monotonic, so the latency cannot go negative or jump if
    # the wall clock is adjusted mid-run.
    t0 = time.perf_counter()
    raw, tail = generate_one(prompt)
    latency_ms = int((time.perf_counter() - t0) * 1000)

    pred = parse_choice(tail)
    # parse_choice returns an upper-case letter; normalize the gold label the
    # same way for the comparison so "b" or " B " in the eval set still match.
    # The CSV keeps the gold value exactly as it appears in the eval set.
    gold_norm = str(gold).strip().upper() if gold is not None else None
    # NOTE(review): rows with an unparseable prediction get correct=None and
    # are therefore skipped by the mean below rather than counted as wrong —
    # confirm this is the intended accuracy definition.
    if pred is not None and gold_norm is not None:
        correct = int(pred == gold_norm)
    else:
        correct = None

    rows.append({
        "run_id": run_id,
        "method": "vanilla",
        "model_name": MODEL_NAME,
        "qid": qid,
        "gold_choice": gold,
        "parsed_choice": pred,
        "correct": correct,
        "raw_output": raw,
        "latency_ms": latency_ms,
        # keep generation params for reproducibility
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "max_new_tokens": MAX_NEW_TOKENS,
        # rag fields kept for schema alignment
        "retrieved_k": None,
        "retrieved_ids": None,
        "context_chars": None,
    })

df = pd.DataFrame(rows)
out_path = os.path.join(vanilla_dir, f"{run_id}.csv")
df.to_csv(out_path, index=False, encoding="utf-8-sig")

# Coerce to numeric so a column made up entirely of None yields NaN instead of
# relying on object-dtype arithmetic; an empty run has no "correct" column.
if "correct" in df:
    accuracy = pd.to_numeric(df["correct"], errors="coerce").mean()
else:
    accuracy = float("nan")
out_path, accuracy