# How to load the calibration dataset from Hugging Face

In [2]:
from datasets import load_dataset

# Benchmark name and policy-model name that together select the data file.
dataset = "math500"
policy_model_id_safe = "Llama-3.2-1B-Instruct"

# The file path inside the dataset repo is built from the two names above.
calibration_data_file = f"{dataset}/{policy_model_id_safe}/data.json"
ds = load_dataset(
    "young-j-park/prm_calibration",
    data_files=calibration_data_file,
    split="train",
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pick one calibration example to use throughout the rest of the notebook.
sample_idx = 38
x = ds[sample_idx]

# Extract the three fields the later cells rely on in one unpacking step.
question, reasoning_prefix, success_prob = (
    x["question"],
    x["reasoning_prefix"],
    x["success_prob"],
)

# Show the problem statement and the partial solution that will be scored.
print(f"Question: {question}", f"Prefix: {reasoning_prefix}", sep="\n")

Question: A reflection takes $\begin{pmatrix} 5 \\ 0 \end{pmatrix}$ to $\begin{pmatrix} 4 \\ 3 \end{pmatrix}.$  Which vector does the reflection take $\begin{pmatrix} -2 \\ 3 \end{pmatrix}$ to?
Prefix: ## Step 1: Find the midpoint of the vector $\begin{pmatrix} 5 \\ 0 \end{pmatrix}$ and $\begin{pmatrix} 4 \\ 3 \end{pmatrix}$.
The midpoint formula is given by $\left( \frac{x_1+x_2}{2}, \frac{y_1+y_2}{2} \right)$.

## Step 2: Calculate the midpoint using the given vectors.
Midpoint = $\left( \frac{5+4}{2}, \frac{0+3}{2} \right) = \left( \frac{9}{2}, \frac{3}{2} \right)$.

## Step 3: Determine the vector perpendicular to the line of reflection.
This can be found by calculating the vector $\begin{pmatrix} 5 \\ 0 \end{pmatrix} - \left( \frac{9}{2}, \frac{3}{2} \right) = \begin{pmatrix} 5-\frac{9}{2} \\ 0-\frac{3}{2} \end{pmatrix} = \begin{pmatrix} \frac{1}{2} \\ -\frac{3}{2} \end{pmatrix}$.

## Step 4: Calculate the projection of $\begin{pmatrix} -2 \\ 3 \end{pmatrix}$ onto the perpendicula

# How to load a PRM from Hugging Face

In [None]:
from prm import load_prm

# Model id of the process reward model (PRM) checkpoint to load.
prm_model_id = "Qwen/Qwen2.5-Math-PRM-7B"
# `load_prm` is imported from the `prm` module; the object it returns is used
# by later cells through `.model`, `.score(...)` and `.quantile_regression`.
prm = load_prm(prm_model_id)

Loading checkpoint shards: 100%|█████████████████| 4/4 [00:02<00:00,  1.50it/s]
Some weights of the model checkpoint at Qwen/Qwen2.5-Math-PRM-7B were not used when initializing Qwen2ForProcessRewardModel: {'lm_head.weight'}
- This IS expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Qwen2ForProcessRewardModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Uncalibrated PRM Scores

In [5]:
# Original (uncalibrated) PRM scoring head: the final layer maps 3584 hidden
# features to 2 outputs, [good, bad].
prm.model.score

Sequential(
  (0): Linear(in_features=3584, out_features=3584, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3584, out_features=2, bias=True)
)

In [6]:
# Display the PRM's quantile-regression flag; it reads False here, before the
# quantile head is attached further down.
prm.quantile_regression

False

In [7]:
# Score a single question with a single reasoning prefix; both arguments are
# wrapped in lists because `prm.score` takes batched inputs.
uncalibrated_scores = prm.score([question], [[reasoning_prefix]])

# NOTE(review): the nesting is presumably [question][prefix][step] - confirm
# against `prm.score`. Only the last score of the prefix is needed.
prefix_step_scores = uncalibrated_scores[0][0]
prefix_reward = prefix_step_scores[-1]

# Compare the dataset's success probability with the raw PRM reward.
print("Success Probability (Ground Truth):", success_prob)
print("Uncalibrated PRM Reward (Estimation):", prefix_reward)

0it [00:00, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
1it [00:00,  1.82it/s]

Success Probability (Ground Truth): 0.0
Uncalibrated PRM Reward (Estimation): 0.375





## Attach quantile heads

In [8]:
# Calibrated PRM: convert the scoring head so the final layer maps
# 3584 -> 3 outputs, one per quantile level [0.1, 0.5, 0.9].
prm.convert_to_quantile_regression_head(M=3)
# Display the scoring head again to confirm the new output width.
prm.model.score

Sequential(
  (0): Linear(in_features=3584, out_features=3584, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3584, out_features=3, bias=True)
)

In [9]:
# Display the quantile-regression flag again; it now reads True after the
# head conversion.
prm.quantile_regression

True

## Load the calibrated adapter weights from Hugging Face

In [10]:
from peft import PeftModel

# Keep only the part of the PRM id after the last "/" so it can be embedded
# in the adapter repo name.
prm_model_id_safe = prm_model_id.rpartition("/")[-1]

# The adapter repo id combines the PRM name and the policy-model name.
peft_model_id = f"young-j-park/{prm_model_id_safe}-calibrated-{policy_model_id_safe}"

# NOTE(review): later cells keep calling `prm.score` rather than `peft_model`,
# so this presumably relies on PEFT injecting the adapter into `prm.model`
# in place - confirm.
peft_model = PeftModel.from_pretrained(
    prm.model,
    peft_model_id,
)

### Calibrated Quantile Scores

In [12]:
# Score the same question/prefix pair again, now that the quantile head and
# the calibrated adapter have been loaded.
calibrated_scores = prm.score([question], [[reasoning_prefix]])
prefix_reward_quantiles = calibrated_scores[0][0][-1]  # only the last score is needed

print("Success Probability (Ground Truth):", success_prob)

# One label per quantile output, in the head's output order.
quantile_labels = (
    "10% Quantile (Estimation):",
    "50% Quantile (Estimation):",
    "90% Quantile (Estimation):",
)
for position, label in enumerate(quantile_labels):
    print(label, prefix_reward_quantiles[position])

1it [00:00, 27.34it/s]

Success Probability (Ground Truth): 0.0
10% Quantile (Estimation): 7.486343383789062e-05
50% Quantile (Estimation): 0.0025482177734375
90% Quantile (Estimation): 0.064453125



