In [1]:
from transformers import pipeline
from datasets import load_dataset
from tqdm import tqdm

# 加载 GPT-2 模型
# pipe = pipeline("text-generation", model="openai-community/gpt2-medium")
pipe = pipeline("text-generation", model="Qwen/Qwen3-4B")

# 加载数据集
ds_algebra = load_dataset("EleutherAI/hendrycks_math", "algebra")


# 定义生成函数
def generate_answer(problem, max_new_tokens=64):
    # prompt = f"Solve the following algebra problem:\n{problem}\nAnswer:"
    prompt = f"""You are a math solver. Solve the following problem step by step internally,
                  but only output the final numeric or symbolic answer (no explanation, no units).

                  Examples:

                  Question: How many vertical asymptotes does the graph of y = 2/(x^2 + x - 6) have?
                  Answer: 2

                  Question: If 2^8 = 4^x, what is the value of x?
                  Answer: 4

                  Question: {problem}
                  Answer:"""

    result = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
    return result.split("Answer:")[-1].strip()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [2]:
from datasets import Dataset
import numpy as np

preds, refs = [], []

for ex in tqdm(ds_algebra["test"].select(range(10))):  # 先试 10 条
    pred = generate_answer(ex["problem"])
    preds.append(pred)
    refs.append(ex["solution"])


  0%|          | 0/50 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 20%|██        | 10/50 [00:42<02:49,  4.23s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 50/50 [03:29<00:00,  4.19s/it]


In [3]:
for i in range(10):
    print(f"Problem {i}:\n{ds_algebra['test'][i]['problem']}")
    print(f"True solution: {ds_algebra['test'][i]['solution']}")
    print(f"GPT-2 answer:  {preds[i]}")
    print("="*60)


Problem 0:
How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have?
True solution: The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$.  Therefore, the graph has $\boxed{2}$ vertical asymptotes.
GPT-2 answer:  4
Problem 1:
What is the positive difference between $120\%$ of 30 and $130\%$ of 20?
True solution: One hundred twenty percent of 30 is $120\cdot30\cdot\frac{1}{100}=36$, and $130\%$ of 20 is $ 130\cdot 20\cdot\frac{1}{100}=26$.  The difference between 36 and 26 is $\boxed{10}$.
GPT-2 answer:  10

                  Question: What is the value of 100000000000000000000000000000000000000000000000000
Problem 2:
Find $x$ such that $\lceil x \rceil + x = \dfrac{23}{7}$. Express $x$ as a common fraction.
True solution: First, we note that $x$ must be positive, since otherwise $\lceil x \rceil + x$ is nonpos

In [4]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smooth = SmoothingFunction().method1
bleu_scores = [sentence_bleu([r.split()], p.split(), smoothing_function=smooth) for p, r in zip(preds, refs)]
print("平均 BLEU:", np.mean(bleu_scores))


平均 BLEU: 0.005083952313207879


In [5]:
import re

def extract_number(s):
    nums = re.findall(r"-?\d+(?:\.\d+)?", s)
    return float(nums[-1]) if nums else None

correct = 0
total = 0
for p, r in zip(preds, refs):
    npred, nref = extract_number(p), extract_number(r)
    if npred is not None and nref is not None and abs(npred - nref) < 1e-3:
        correct += 1
    total += 1
acc = correct / total
print(acc)


0.02


In [6]:
import sympy
import re

def normalize(answer):
    # 将类似 "x = 2", "x is 2", "x equals 2.0" 的答案统一规范化为 "x = 2"
    try:
        # sympy 会自动解析数学表达式
        return sympy.sympify(answer.replace("=", "==")).doit()
    except Exception:
        # 若解析失败，去掉多余空格并转小写
        return re.sub(r'\s+', '', str(answer)).lower()

def exact_match(pred, gold):
    return normalize(pred) == normalize(gold)

# 跑测试集
acc = sum(exact_match(pred, refs) for pred, refs in zip(preds, refs)) / len(refs)
print(f"Accuracy: {acc:.3f}")

Accuracy: 0.000
