In [2]:
import re
import numpy as np
from tqdm import tqdm
from transformers import pipeline
from datasets import load_dataset
import json
from collections import defaultdict

# ============================================================
#  MODEL + DATA
# ============================================================

print("Loading model...")
pipe = pipeline("text-generation", model="Qwen/Qwen3-1.7B")

print("Loading dataset...")
with open("test_math.json", "r", encoding="utf-8") as f:
    testdata_all = json.load(f)

testdata_dict = defaultdict(list)
for dt in testdata_all:
    testdata_dict[dt['level']].append(dt)


# ============================================================
#   BOXED extraction utilities
# ============================================================

def extract_all_boxed(text):
    results = []
    i = 0
    key = r"\boxed"

    while True:
        start = text.find(key, i)
        if start == -1:
            break

        j = start + len(key)
        while j < len(text) and text[j].isspace():
            j += 1

        if j >= len(text) or text[j] != '{':
            i = start + 1
            continue

        depth = 0
        content_start = j + 1

        for k in range(content_start, len(text)):
            if text[k] == '{':
                depth += 1
            elif text[k] == '}':
                if depth == 0:
                    results.append(text[content_start:k].strip())
                    i = k + 1
                    break
                depth -= 1
        else:
            break

    return results


def extract_last_boxed(text):
    all_boxed = extract_all_boxed(text)
    return all_boxed[-1] if all_boxed else None


# ============================================================
#   GENERATE ANSWER (single)
# ============================================================

def generate_pipe_answer(problem, max_new_tokens=512):
    """
    Áî®ÊôÆÈÄö pipelineÔºàÈùûRAGÔºâÁîüÊàêÁ≠îÊ°à
    Âπ∂ÊâìÂç∞ÊâÄÊúâ boxed ‰∏éÊúÄÁªà boxedÔºàÁî®‰∫é debugÔºâ
    """

    prompt = (
        "You are a math assistant. Provide a concise step-by-step solution in English.\n"
        "Rules:\n"
        "1) Keep steps brief (<=6 lines when possible).\n"
        "2) Use LaTeX for math.\n"
        "3) On the LAST LINE, output ONLY the final result as \\boxed{<answer>} with no extra text.\n"
        f"Problem: {problem}\n"
        "Solution:\n"
    )

    # call model
    result = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]

    # capture boxed
    all_boxes = extract_all_boxed(result)
    last_box = extract_last_boxed(result)

    debug_output = []
    debug_output.append("====== PIPE MODEL RAW OUTPUT ======")
    debug_output.append(result)
    debug_output.append("\n====== ALL COLLECTED BOXED ======")
    if all_boxes:
        for idx, b in enumerate(all_boxes, 1):
            debug_output.append(f"{idx}. {b}")
    else:
        debug_output.append("No boxed answers found.")
    debug_output.append("\n====== LAST BOXED (FINAL) ======")
    debug_output.append(str(last_box))
    debug_output.append("=================================")

    full_debug_text = "\n".join(debug_output)

    if last_box is None or last_box.strip() == "":
        return "Null", full_debug_text

    return last_box.strip(), full_debug_text


# ============================================================
#   ANSWER + METRICS
# ============================================================

def extract_ref_answer(ref_text):
    return extract_last_boxed(ref_text)


def normalize_ans(ans):
    if ans is None:
        return None

    ans = ans.strip()
    ans = ans.replace(" ", "")
    ans = ans.replace("\\frac", "frac")
    ans = ans.replace("\\dfrac", "frac")
    return ans


def exact_match(preds, refs, verbose=True):
    n = len(preds)
    correct = 0
    total = 0

    for pred, ref in zip(preds, refs):
        ref_box = extract_ref_answer(ref)
        pred_norm = normalize_ans(pred)
        ref_norm = normalize_ans(ref_box)

        if pred_norm is None or pred_norm == "Null":
            continue

        total += 1
        if pred_norm == ref_norm:
            correct += 1

    acc = correct / max(total, 1)

    if verbose:
        print(f"\nüìå PIPE Exact Match Accuracy on {n} problems = {acc:.4f} [{correct}/{max(total,1)}]")

    return acc

def run_pipe_eval(n=20, prob_level='Level 2', show_debug=False, save=True, save_path="pipe_debug_log.txt"):

    preds = []
    refs = []
    outputs = []
    debug_lines = []

    for i in tqdm(range(n)):
        item = testdata_dict[prob_level][i]
        problem = item["problem"]
        reference = item["output"]

        pred, full_output = generate_pipe_answer(problem)
        ref_box = extract_ref_answer(reference)

        preds.append(pred)
        refs.append(reference)
        outputs.append(full_output)

        debug_lines.append("\n=====================")
        debug_lines.append(f"Problem {i+1}")
        debug_lines.append("---------------------")
        debug_lines.append(f"Problem:\n{problem}")
        debug_lines.append("")
        debug_lines.append(full_output)
        debug_lines.append("")
        debug_lines.append(f"Final Pred Boxed  : {pred}")
        debug_lines.append(f"Ref Full Solution : {reference}")
        debug_lines.append(f"Ref Boxed         : {ref_box}")
        same = (normalize_ans(pred) == normalize_ans(ref_box))
        debug_lines.append(f"Exact Match       : {same}")
        debug_lines.append("=====================\n")

        if show_debug:
            print("\n".join(debug_lines[-30:]))

    acc = exact_match(preds, refs)

    if save:
        with open(save_path, "w", encoding="utf-8") as f:
            f.write("\n".join(debug_lines))
            f.write(f"\n PIPE Exact Match Accuracy on {n} problems = {acc:.4f}")
        print(f"\nüîç Debug saved to: {save_path}")

    return acc, preds, refs, outputs


Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Loading dataset...


In [4]:
acc, preds, refs, outputs = run_pipe_eval(n=50, prob_level='Level 1', show_debug=False, save=True, save_path="baseline_qwen1.7b_level_1.txt")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [20:44<00:00, 24.88s/it]


üìå PIPE Exact Match Accuracy on 50 problems = 0.8200 [41/50]

üîç Debug saved to: baseline_qwen1.7b_level_1.txt





In [5]:
acc, preds, refs, outputs = run_pipe_eval(n=50, prob_level='Level 2', show_debug=False, save=True, save_path="baseline_qwen1.7b_level_2.txt")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [20:20<00:00, 24.40s/it]


üìå PIPE Exact Match Accuracy on 50 problems = 0.6600 [33/50]

üîç Debug saved to: baseline_qwen1.7b_level_2.txt





In [6]:
acc, preds, refs, outputs = run_pipe_eval(n=50, prob_level='Level 3', show_debug=False, save=True, save_path="baseline_qwen1.7b_level_3.txt")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [20:21<00:00, 24.43s/it]


üìå PIPE Exact Match Accuracy on 50 problems = 0.7200 [36/50]

üîç Debug saved to: baseline_qwen1.7b_level_3.txt





In [7]:
acc, preds, refs, outputs = run_pipe_eval(n=50, prob_level='Level 4', show_debug=False, save=True, save_path="baseline_qwen1.7b_level_4.txt")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [20:05<00:00, 24.12s/it]


üìå PIPE Exact Match Accuracy on 50 problems = 0.6600 [33/50]

üîç Debug saved to: baseline_qwen1.7b_level_4.txt





In [8]:
acc, preds, refs, outputs = run_pipe_eval(n=50, prob_level='Level 5', show_debug=False, save=True, save_path="baseline_qwen1.7b_level_5.txt")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [20:20<00:00, 24.40s/it]


üìå PIPE Exact Match Accuracy on 50 problems = 0.4800 [24/50]

üîç Debug saved to: baseline_qwen1.7b_level_5.txt



