In [11]:
%load_ext autoreload
%autoreload 2

# Model Eval using vLLM

In [None]:
from datasets import load_dataset
from typing import Callable, List
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn
from vllm import LLM, SamplingParams
import json

OUTPUT_FILE = "results/math_1.5B_eval_results.jsonl"

eval_batch_size = 1000

sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["</answer>"],
    include_stop_str_in_output=True,
)

with open("prompts/r1_zero.prompt") as f:
    prompt_template = f.read()


def evaluate_vllm(
    vllm_model: LLM,
    reward_fn: Callable[[str, str], dict[str, float]],
    prompts: List[str],
    ground_truth_list: List[str],
    eval_sampling_params: SamplingParams,
):
    res = []
    model_outputs = vllm_model.generate(prompts, eval_sampling_params)
    for (
        prompt,
        output,
        ground_truth,
    ) in zip(prompts, model_outputs, ground_truth_list):
        model_answer = output.outputs[0].text
        res.append(
            {
                "prompt": prompt,
                "generated_text": model_answer,
                "ground_truth": ground_truth,
                "score": reward_fn(model_answer, ground_truth),
            }
        )
    return res


def main():
    # Load Eval Data
    dataset = load_dataset("jeggers/competition_math", "original")
    # Load Model
    llm = LLM(model="Qwen/Qwen2.5-Math-1.5B")
    for i in range(0, len(dataset["test"]), eval_batch_size):
        examples = dataset["test"][i : i + eval_batch_size]
        prompts = [
            prompt_template.format(question=problem) for problem in examples["problem"]
        ]
        extracted_solutions = [solution for solution in examples["extracted_solution"]]
        batch_results = evaluate_vllm(
            llm, r1_zero_reward_fn, prompts, extracted_solutions, sampling_params
        )
        with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
            for item in batch_results:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        print(f"Batch {i // eval_batch_size + 1} saved. ({len(batch_results)} items)")


main()

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

file_path = "results/math_1.5B_eval_results.jsonl"
eval_df = pd.read_json(file_path, lines=True)
eval_df

Unnamed: 0,prompt,generated_text,ground_truth,score
0,A conversation between User and Assistant. The...,We must first factor $x^2+x-6$.\nLet's use th...,2,"{'format_reward': 0.0, 'answer_reward': 0.0, '..."
1,A conversation between User and Assistant. The...,"To solve this problem, we need to calculate th...",10,"{'format_reward': 0.0, 'answer_reward': 0.0, '..."
2,A conversation between User and Assistant. The...,we have to find t </think> <answer> $41/49$ <...,\dfrac{9}{7},"{'format_reward': 1.0, 'answer_reward': 0.0, '..."
3,A conversation between User and Assistant. The...,To evaluate the expression $i^5+i^{-25}+i^{45...,i,"{'format_reward': 0.0, 'answer_reward': 0.0, '..."
4,A conversation between User and Assistant. The...,"Since $4$ is $2^2$, we can rewrite the equati...",4,"{'format_reward': 0.0, 'answer_reward': 0.0, '..."
...,...,...,...,...
4995,A conversation between User and Assistant. The...,"First, let's square the equation $\sin x + \co...",-\frac{4}{3},"{'format_reward': 0.0, 'answer_reward': 0.0, '..."
4996,A conversation between User and Assistant. The...,Since the projection matrix \( P \) projects o...,\begin{pmatrix} 1 \\ 2 \\ -3 \end{pmatrix},"{'format_reward': 0.0, 'answer_reward': 0.0, '..."
4997,A conversation between User and Assistant. The...,Since $\mathbf{a} + \mathbf{b} + \mathbf{c} = ...,-\frac{155}{2},"{'format_reward': 0.0, 'answer_reward': 0.0, '..."
4998,A conversation between User and Assistant. The...,"okay, I think this problem might require the u...",159,"{'format_reward': 1.0, 'answer_reward': 0.0, '..."


# Generate training SFT data from AI-MO/NuminaMath-CoT

In [1]:
from datasets import load_dataset


def extract_last_boxed_content(text):
    if not isinstance(text, str):
        return None
    start_idx = text.rfind("\\boxed{")
    if start_idx == -1:
        return None
    content_start = start_idx + 7
    brace_depth = 0

    for i in range(content_start, len(text)):
        char = text[i]
        if char == "{":
            brace_depth += 1
        elif char == "}":
            if brace_depth == 0:
                return text[content_start:i]
            brace_depth -= 1

    return None


def format_r1_style(row):
    solution = row["solution"]
    extracted_answer = extract_last_boxed_content(solution)
    if extracted_answer is None:
        extracted_answer = "Error: No boxed answer found"

    formatted_text = f" {solution} </think> <answer> {extracted_answer} </answer>"

    return formatted_text


def format_prompt(row):
    return prompt_template.format(question=row["problem"])


with open("prompts/r1_zero.prompt") as f:
    prompt_template = f.read()

dataset = load_dataset("AI-MO/NuminaMath-CoT")
df = dataset["train"].to_pandas()
df = df[df["source"] == "math"].copy()
df["generated_text"] = df.apply(format_r1_style, axis=1)
df["prompt"] = df.apply(format_prompt, axis=1)
train_df = df[["prompt", "generated_text"]]
train_df

Unnamed: 0,prompt,generated_text
89,A conversation between User and Assistant. The...,To find all real numbers $x$ such that the pr...
279,A conversation between User and Assistant. The...,Let's denote the second integer as $n$. Given...
309,A conversation between User and Assistant. The...,"To solve this problem, we start by dividing t..."
551,A conversation between User and Assistant. The...,To simplify the expression $\frac{1}{1+\sqrt{...
935,A conversation between User and Assistant. The...,To find the real roots of the given polynomia...
...,...,...
859191,A conversation between User and Assistant. The...,To solve for $n$ given that the average value...
859227,A conversation between User and Assistant. The...,To find the distance between the foci of the ...
859401,A conversation between User and Assistant. The...,"To solve the given problem, we first convert ..."
859414,A conversation between User and Assistant. The...,To determine the number of different triangle...


## decontamination

In [4]:
def normalize_text(text):
    return "".join(text.split())


def get_overlap(train_df, eval_df):
    train_fingerprints = set(train_df["prompt"].apply(normalize_text))
    test_fingerprints = set(eval_df["prompt"].apply(normalize_text))
    return train_fingerprints.intersection(test_fingerprints)


overlap = get_overlap(train_df, eval_df)
if len(overlap) > 0:
    train_df["norm_prompt"] = train_df["prompt"].apply(normalize_text)
    train_df = train_df[~train_df["norm_prompt"].isin(overlap)].copy()
    train_df.drop(columns=["norm_prompt"], inplace=True)
assert len(get_overlap(train_df, eval_df)) == 0
train_df.to_json(
    "results/math_1.5B_train.jsonl", orient="records", lines=True, force_ascii=False
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["norm_prompt"] = train_df["prompt"].apply(normalize_text)


# SFT

## Load base model

In [1]:
from vllm.model_executor import set_random_seed as vllm_set_random_seed
from unittest.mock import patch
from datasets import load_dataset
from typing import Callable, List
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn
from vllm import LLM, SamplingParams
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
import torch
import pandas as pd
from cs336_alignment.sft import *
import wandb


tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B")
train_df = pd.read_json("results/math_1.5B_train.jsonl", lines=True)
train_df_batch = train_df[:1]
prompt_strs = list(train_df_batch["prompt"])
output_strs = list(train_df_batch["prompt"])
tokenized_res = tokenize_prompt_and_output(prompt_strs, output_strs, tokenizer)
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_batch_size = 1000
gradient_accumulation_steps = 4

sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["</answer>"],
    include_stop_str_in_output=True,
)

with open("prompts/r1_zero.prompt") as f:
    prompt_template = f.read()


def init_policy_model(
    model_id: str,
    device: str,
):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    model.to(device)
    return model


def init_vllm(
    model_id: str, device: str, seed: int, gpu_memory_utilization: float = 0.85
):
    """
    Start the inference process, here we use vLLM to hold a model on a GPU separate from the policy.
    """
    vllm_set_random_seed(seed)

    world_size_patch = patch("torch.distributed.get_world_size", return_value=1)
    profiling_patch = patch(
        "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling",
        return_value=None,
    )
    with world_size_patch, profiling_patch:
        return LLM(
            model=model_id,
            device=device,
            dtype=torch.bfloat16,
            enable_prefix_caching=True,
            gpu_memory_utilization=gpu_memory_utilization,
        )


def load_policy_into_vllm_instance(policy: PreTrainedModel, llm: LLM):
    state_dict = policy.state_dict()
    llm_model = llm.llm_engine.model_executor.driver_worker.model_runner.model
    llm_model.load_weights(state_dict.items())


# # Setup wandb metrics
# wandb.init()
# wandb.define_metric("train_step")  # the x‐axis for training
# wandb.define_metric("eval_step")  # the x‐axis for evaluation
# # everything that starts with train/ is tied to train_step
# wandb.define_metric("train/*", step_metric="train_step")
# # everything that starts with eval/ is tied to eval_step
# wandb.define_metric("eval/*", step_metric="eval_step")

INFO 11-22 19:03:18 __init__.py:190] Automatically detected platform cuda.


In [2]:
model = init_policy_model("Qwen/Qwen2.5-Math-1.5B", device)
# eval_model = init_vllm(
#     "Qwen/Qwen2.5-Math-1.5B", device, seed=42, gpu_memory_utilization=0.3
# )

In [6]:
train_df = pd.read_json("results/math_1.5B_train.jsonl", lines=True)
train_df_batch = train_df[:1]
prompt_strs = list(train_df_batch["prompt"])
output_strs = list(train_df_batch["generated_text"])

model_input = tokenize_prompt_and_output(prompt_strs, output_strs, tokenizer, device)
model_output = get_response_log_probs(
    model, model_input["input_ids"], model_input["labels"], True
)
loss, metadata = sft_microbatch_train_step(
    model_output["log_probs"],
    model_input["response_mask"],
    gradient_accumulation_steps,
    1.0,
)

In [None]:
from datasets import load_dataset
import torch

dataset = load_dataset(
    "json", data_files="results/math_1.5B_train.jsonl", split="train"
)


def process_fn(examples):
    prompt_strs = examples["prompt"]
    output_strs = examples["generated_text"]
    model_input = tokenize_prompt_and_output(
        prompt_strs, output_strs, tokenizer, device
    )
    return model_input


import torch

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01,
    betas=(0.9, 0.999),
    eps=1e-8,
    fused=True,
)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=1)
for idx, examples in enumerate(train_loader):
    prompt_strs = examples["prompt"]
    output_strs = examples["generated_text"]
    model_input = tokenize_prompt_and_output(
        prompt_strs, output_strs, tokenizer, device
    )
    model_output = get_response_log_probs(
        model, model_input["input_ids"], model_input["labels"], True
    )
    loss, metadata = sft_microbatch_train_step(
        model_output["log_probs"],
        model_input["response_mask"],
        gradient_accumulation_steps,
        1.0,
    )

tensor(35.7500, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(15., device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(76., device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(18.7500, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(31., device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(54.5000, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(103., device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(62.2500, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(79.5000, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(37.5000, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(35.2500, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(21.8750, device='cuda:0', dtype=torch.bfloat16, grad_fn=<DivBackward0>)
tensor(25.2500, device='cuda:0', dtype=torch.bfloat16, grad_fn=<Div

KeyboardInterrupt: 

In [None]:
import gc

gc.collect()

3672

In [None]:
gradient_accumulation_steps = 4

In [None]:
gc.collect()
torch.cuda.empty_cache()

# 4. 开启省显存模式 (防止下次 OOM)
model.gradient_checkpointing_enable()

print("显存大扫除完成。现在显存应该降回 3GB 左右了。")
print(f"Current Alloc: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

显存大扫除完成。现在显存应该降回 3GB 左右了。
Current Alloc: 6.14 GB
