In [25]:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Check MPS availability
if torch.backends.mps.is_available():
    device = "mps"
    print("✓ Using Apple Silicon GPU (MPS)")
elif torch.cuda.is_available():
    device = "cuda"
    print("✓ Using NVIDIA GPU")
else:
    device = "cpu"
    print("Using CPU")

print(f"avaible device: {device}")

device = "cpu"
print(f"current device: {device}")

✓ Using Apple Silicon GPU (MPS)
avaible device: mps
current device: cpu


In [None]:
print('Downloading Qwen/Qwen2.5-0.5B to your Mac...')
print('This will take 2-5 minutes...')

model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen2.5-0.5B',
    dtype=torch.float16,
    low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B')

# Save to local disk
model.save_pretrained('../models/Qwen2.5-0.5B')
tokenizer.save_pretrained('../models/Qwen2.5-0.5B')

print('✓ Model downloaded and saved to ../models/Qwen2.5-0.5B')
print('✓ Ready for local development!')

Downloading Qwen/Qwen2.5-0.5B to your Mac...
This will take 2-5 minutes...
✓ Model downloaded and saved to ../models/Qwen2.5-0.5B
✓ Ready for local development!


In [26]:

# Load model on CPU with float32 for better stability
model_path = "../models/Qwen2.5-0.5B"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    dtype=torch.float16,  # float16 works on MPS!
    # dtype=torch.float32,  # Use float32 for CPU
    low_cpu_mem_usage=True,
)
model = model.to(device) 
tokenizer = AutoTokenizer.from_pretrained(model_path)


NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

In [3]:
# Read the r1_zero.prompt from the prompts folder
prompt_path = "prompts/r1_zero.prompt"
with open(prompt_path, "r", encoding="utf-8") as f:
    r1_zero_prompt = f.read()

print("Loaded r1_zero.prompt from prompts folder:")
print(r1_zero_prompt)


Loaded r1_zero.prompt from prompts folder:
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: {question}
Assistant: <think>


In [1]:
import json

gsm8k_file = "../data/gsm8k/test.jsonl"
gsm8k_data = []
with open(gsm8k_file, "r", encoding="utf-8") as f:
    for line in f:
        gsm8k_data.append(json.loads(line.strip()))

print(f"Loaded {len(gsm8k_data)} samples from {gsm8k_file}")


Loaded 1319 samples from ../data/gsm8k/test.jsonl


In [5]:
question = gsm8k_data[0]['question']
prompt = r1_zero_prompt.replace("{question}", question)

print('prompt:',prompt)
print('r1_zero_prompt:',r1_zero_prompt)

prompt: A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Assistant: <think>
r1_zero_prompt: A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think

In [6]:

# Test generation
prompt = "Q: What is 2+2?\nA:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

print("\nGenerating")
import time
start = time.time()

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id explicitly
        eos_token_id=tokenizer.eos_token_id   # Also set eos_token_id to be explicit
    )

elapsed = time.time() - start
response = tokenizer.decode(outputs[0])

print(f"Response: {response}")
print(f"Time: {elapsed:.2f}s")
print(f"Tokens/sec: {50/elapsed:.1f}")


Generating
Response: Q: What is 2+2?
A: 4<|endoftext|>
Time: 1.20s
Tokens/sec: 41.6


In [8]:
def evaluate_single_worker(model, tokenizer, r1_zero_prompt,  i, current_data):
    import re   
    
    question = current_data['question']
    raw_answer = current_data['answer']
    match = re.search(r'####\s*([-+]?\d*\.?\d+)', raw_answer)
    if match:
        answer = match.group(1)
    else:
        answer = raw_answer
 
    prompt = r1_zero_prompt.replace("{question}", question)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id  
    )

    response = tokenizer.decode(outputs[0])

    from drgrpo_grader import r1_zero_reward_fn

    reward = r1_zero_reward_fn(response, answer, fast=True)

    if "<answer>" in response and "</answer>" in response:
        answer_content = response.split("<answer>")[-1].split("</answer>")[0].strip()
        match2 = re.search(r'(\d{1,3}(?:,\d{3})*(?:\.\d+)?|\-?\d*\.?\d+)', answer_content)
        if match2:
            answer_extracted = match2.group(1)
        else:
            answer_extracted = answer_content
    else:
        answer_extracted = ""

    if answer == answer_extracted:
        reward['answer'] = 1
    
    return {'index': i, 'reward': reward}
       

In [None]:
from tqdm import tqdm


sample_num = len(gsm8k_data)

print(f"Running serial processing with {sample_num} samples...")

results = []
start_time = time.time()
for i in tqdm(range(sample_num)):
    result = evaluate_single_worker(model, tokenizer, r1_zero_prompt,  i, gsm8k_data[i])
    results.append(result)
end_time = time.time()

elapsed = end_time - start_time
print(f"Time: {elapsed:.2f}s, Avg: {elapsed/sample_num:.2f}s per sample")

Running serial processing with 1319 samples...


100%|██████████| 1319/1319 [6:29:39<00:00, 17.73s/it]  

Time: 23379.56s, Avg: 17.73s per sample
Results: All=0, Format=1319, Answer=0, None=0





In [21]:

eval_num = {"all_correct": [], "format_correct": [], "answer_correct": [], "nothing_correct": []}
for r in results:
    reward = r["reward"]
    # print(reward)
    if reward["format_reward"] == 1:
        eval_num["format_correct"].append(r["index"])
        if reward["reward"] == 1:
            eval_num["all_correct"].append(r["index"])
        elif reward.get("answer") == 1:
            eval_num["answer_correct"].append(r["index"])
    else:
        eval_num["nothing_correct"].append(r["index"])
        print("reponse", r["response"])
        print("answer", r["answer"])
        print("answer_extracted", r["answer_extracted"])

print(f"""Results: All={len(eval_num["all_correct"])}, Format={len(eval_num["format_correct"])}, Answer={len(eval_num["answer_correct"])}, None={len(eval_num["nothing_correct"])}""")

Results: All=0, Format=1319, Answer=183, None=0


In [23]:
import json
with open("../data/gsm8k/test_eval_num.json", "w") as f:
    json.dump(eval_num, f, indent=4)