In [1]:
import os, psutil 
import gc
import time
import copy
import pprint
import json
from collections import defaultdict

import re
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from dataclasses import dataclass

In [2]:

import torch 
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)
from vllm import LLM, SamplingParams


In [3]:
# from sal.models.reward_models import RLHFFlow

from sal.search.utils import Beam, build_conv, generate_k_steps, last
from sal.config import Config
# from sal.models.reward_models import PRM
from sal.utils.score import aggregate_scores

from reward_models import RLHFFlow

In [4]:
# Root directory that holds the datasets and the model checkpoints.
base_path = '/groups/kjun/tnn/datasets/'

# Dataset directory. os.path.join is used so that the trailing slash on
# base_path does not produce a doubled '//' separator in the final paths.
dataset_path = os.path.join(base_path, "prm800k", "math_splits")

# GGUF-quantized weights for the generator LLM and the process reward model.
llm_path = os.path.join(
    base_path, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf"
)
prm_path = os.path.join(
    base_path,
    "Llama3.1-8B-PRM-Deepseek-Data-GGUF",
    "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf",
)

# Original checkpoint directories (these also provide the tokenizers).
llm_tokenizer_path = os.path.join(base_path, "Llama-3.2-1B-Instruct")
prm_tokenizer_path = os.path.join(base_path, "Llama3.1-8B-PRM-Deepseek-Data")

In [34]:
# Build the generator LLM with vLLM from the unquantized checkpoint directory.
# (A previous setting used gpu_memory_utilization=0.2.)
llm_vllm = LLM(
    model = llm_tokenizer_path,
    gpu_memory_utilization = 0.4,  # fraction of total GPU memory vLLM may use (40%)
    max_model_len = 20000,
    dtype = "float16",
    seed = 123)

# Alternative: load the GGUF-quantized weights and take the tokenizer from the
# original checkpoint directory.
# llm_vllm = LLM(
#     model = llm_path,
#     tokenizer = llm_tokenizer_path,
#     tensor_parallel_size=1,
#     gpu_memory_utilization = 0.2,  # fraction of total GPU memory vLLM may use (20%)
#     max_model_len = 20000,
#     dtype = "float16",
#     seed = 123)

# Run the Python garbage collector, release PyTorch's cached CUDA blocks, then
# print torch.cuda.memory_allocated for device 0 converted to GiB.
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

INFO 03-21 21:53:39 config.py:549] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 03-21 21:53:39 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=20000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=123, serv

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-21 21:53:41 model_runner.py:1115] Loading model weights took 2.3029 GB
INFO 03-21 21:53:42 worker.py:267] Memory profiling takes 0.82 seconds
INFO 03-21 21:53:42 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.40) = 12.69GiB
INFO 03-21 21:53:42 worker.py:267] model weights take 2.30GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.24GiB; the rest of the memory reserved for KV Cache is 9.15GiB.
INFO 03-21 21:53:42 executor_base.py:111] # cuda blocks: 18741, # CPU blocks: 8192
INFO 03-21 21:53:42 executor_base.py:116] Maximum concurrency for 20000 tokens per request: 14.99x
INFO 03-21 21:53:43 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_ut

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:19<00:00,  1.84it/s]

INFO 03-21 21:54:02 model_runner.py:1562] Graph capturing finished in 19 secs, took 0.04 GiB
INFO 03-21 21:54:02 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 20.65 seconds





#--- memory: 11.538497924804688


In [33]:
# Drop the notebook's reference to the vLLM engine, then collect garbage and
# release PyTorch's cached CUDA blocks before reporting device-0 usage in GiB.
# NOTE(review): later cells pass `llm_vllm` to best_of_n, so the engine
# construction cell has to be re-run after this one before continuing.
del llm_vllm
gc.collect()
torch.cuda.empty_cache()
print('#--- memory:', torch.cuda.memory_allocated(0) / (1024 ** 3))

#--- memory: 0.0709228515625


In [11]:
# Instantiate the process reward model from its checkpoint directory, asking
# for placement on 'cuda:1', then report device-1 memory usage in GiB.
prm = RLHFFlow(
    model_path=prm_tokenizer_path,
    device_map='cuda:1',
)

gc.collect()
torch.cuda.empty_cache()
print('#--- memory:', torch.cuda.memory_allocated(1) / (1024 ** 3))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#--- memory: 14.95752763748169


In [12]:
# Read the test split (one JSON object per line) and bucket the records by
# their 'level' field, using the level rendered as a string for the key.
data_by_levels = defaultdict(list)
with open(f"{dataset_path}/test.jsonl", 'r', encoding='utf-8') as filein:
    for line in filein:
        if not line.strip():
            continue  # ignore blank lines
        data = json.loads(line)
        data_by_levels[str(data['level'])].append(data)

# Report how many problems fall into each of the levels '1' through '5'.
for key in map(str, range(1, 6)):
    print(f"{key}: {len(data_by_levels[key])}")

# Debugging aids:
# print(data_by_levels.keys())
# pprint.pprint(data_by_levels['2'], compact=True)

1: 43
2: 90
3: 105
4: 128
5: 134


In [25]:
def best_of_n(batch_of_prompts, config: Config, llm_vllm: LLM, prm):
    """Sample ``config.n`` completions per prompt and keep the best-scoring one.

    Each prompt is wrapped in a system/user conversation, rendered through the
    generator's chat template and repeated ``config.n`` times so that a single
    ``llm_vllm.generate`` call produces every sample. The completions are then
    scored with ``prm.score``, each completion's scores are reduced with
    ``aggregate_scores(..., config.agg_strategy)``, and the arg-max completion
    of every prompt is selected.

    Side effect: when ``config.custom_chat_template`` is not None it is
    assigned to the tokenizer returned by ``llm_vllm.get_tokenizer()``.

    Args:
        batch_of_prompts: sequence of prompt strings.
        config: settings object; this function reads ``system_prompt``,
            ``custom_chat_template``, ``n``, ``temperature``, ``max_tokens``,
            ``top_p`` and ``agg_strategy``.
        llm_vllm: vLLM engine used for generation.
        prm: reward model exposing ``score(prompts, completions)``. It is
            assumed to return, per prompt, one score sequence per completion
            -- TODO confirm against the RLHFFlow implementation.

    Returns:
        dict with one entry per prompt under each key:
        ``completions`` (list of ``config.n`` texts), ``completion_tokens``
        (token count of each completion), ``all_scores`` (aggregated score of
        each completion), ``best_scores`` and ``best_completions`` (score and
        text of the highest-scoring completion).

    Raises:
        ValueError: if ``config.n`` is smaller than 1, or if the number of
            generated responses/completions does not match what was requested.
    """
    if config.n < 1:
        raise ValueError(f"config.n must be at least 1, got {config.n}")

    num_prompts = len(batch_of_prompts)
    if num_prompts == 0:
        # Nothing to generate or score; return the usual structure, empty.
        return {
            "completions": [],
            "best_completions": [],
            "completion_tokens": [],
            "all_scores": [],
            "best_scores": [],
        }
    expected_responses = num_prompts * config.n

    convs = [
        [
            {"role": "system", "content": config.system_prompt},
            {"role": "user", "content": prompt},
        ]
        for prompt in batch_of_prompts
    ]

    tokenizer = llm_vllm.get_tokenizer()
    # TODO: set the augmented template from a file
    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template

    templated_convs = tokenizer.apply_chat_template(
        convs, add_generation_prompt=True, tokenize=False,
    )

    # Duplicate each prompt config.n times so all samples go through one
    # batched generate call: [p1, p2] becomes [p1, p1, p2, p2] for config.n=2.
    templated_convs = [c for conv in templated_convs for c in [conv] * config.n]

    sampling_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        n=1,  # prompts are already duplicated, so one completion per request
    )

    responses = llm_vllm.generate(
        templated_convs,
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    if len(responses) != expected_responses:
        raise ValueError(
            f"Generated {len(responses)} responses instead of {expected_responses}"
        )

    # Regroup the flat response list into one group of config.n per prompt.
    completions = []
    completion_tokens = []
    for i in range(num_prompts):
        outputs = [
            output
            for r in responses[i * config.n : (i + 1) * config.n]
            for output in r.outputs
        ]
        if len(outputs) != config.n:
            raise ValueError(
                f"Generated {len(outputs)} completions instead of {config.n}"
            )
        completions.append([output.text for output in outputs])
        completion_tokens.append([len(output.token_ids) for output in outputs])

    # Score every completion and collapse each score sequence to one number.
    scores = prm.score(batch_of_prompts, completions)
    agg_scores = [
        [aggregate_scores(s, config.agg_strategy) for s in score] for score in scores
    ]

    best_scores = []
    best_completions = []
    for pidx in range(num_prompts):
        best_idx = int(np.argmax(agg_scores[pidx]))
        best_scores.append(agg_scores[pidx][best_idx])
        best_completions.append(completions[pidx][best_idx])

    return {
        "completions": completions,
        "best_completions": best_completions,
        "completion_tokens": completion_tokens,
        "all_scores": agg_scores,
        "best_scores": best_scores,
    }

In [14]:
# General search parameters. Start from the project defaults and override a
# few fields. Of the fields set in this cell, best_of_n as defined in this
# notebook reads only `agg_strategy` and `n`; the others are not used by it.
config = Config()
config.agg_strategy = 'last'  # strategy name handed to aggregate_scores (presumably keeps the final score; verify in sal.utils.score)
config.n = 4                  # number of completions sampled per prompt in best-of-n
config.lookahead = 0
config.num_iterations = 10
config.sort_completed = False

# diverse_select parameters (not read by best_of_n in this notebook)
config.lam = 10
config.normalize_embeddings = True

In [35]:
# Quick sanity run: a single best-of-n pass over the first few level-4
# problems. Use num_questions = len(data_by_levels['4']) to cover the level.
num_questions = 10
num_trials = 50  # not used by this single-pass cell
print(num_questions)

batch_of_prompts = [data_by_levels['4'][q_idx]['problem'] for q_idx in range(num_questions)]

start_time = time.time()
results = best_of_n(batch_of_prompts, config, llm_vllm, prm)
all_best_scores = results["best_scores"]
total_time = time.time() - start_time

# Mean of the selected scores and its standard error (sample std / sqrt(n)).
score_mean = np.mean(all_best_scores)
score_error = np.std(all_best_scores, ddof=1)/np.sqrt(num_questions)

# Both numbers are printed with four decimals.
print(f"result: {score_mean:0.4f} (\u00B1{score_error:0.4f})")
print(f"it takes {total_time:0.4f}s")

10
result: 0.782007 (±0.1040)
it takes 17.0415s


In [None]:
# Full experiment: repeat best-of-n over every problem of one difficulty level
# and store the per-trial result dicts as JSON.
level = '4'
num_questions = len(data_by_levels[level])
# num_questions = 10
num_trials = 50
print(f"num_trials = {num_trials}")
print(f"num_questions = {num_questions}")

batch_of_prompts = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]

start_time = time.time()
all_results = []
for trial_idx in range(num_trials):
    print(f"trial {trial_idx}")
    # `results` keeps the last trial's output after the loop finishes.
    results = best_of_n(batch_of_prompts, config, llm_vllm, prm)
    all_results.append(results)
    # all_best_scores = results["best_scores"]
total_time = time.time() - start_time

print(f"it takes {total_time:0.4f}s")

# f-string so that the level is substituted into the file name.
result_filename = f"results/run_best_of_n_prm800k_level{level}_v21.json"
# Make sure the output directory exists before writing.
os.makedirs(os.path.dirname(result_filename), exist_ok=True)
with open(result_filename, 'w', encoding='utf-8') as fout:
    json.dump(all_results, fout, ensure_ascii=True, indent=4)

# score_mean = np.mean(all_best_scores)
# score_error = np.std(all_best_scores, ddof=1)/np.sqrt(num_questions)

# print(f"result: {score_mean:4f} (\u00B1{score_error:0.4f})")


num_trials = 50
num_questions = 128


In [12]:
# Show which fields the current `results` dict (as returned by best_of_n) has.
print(results.keys())

dict_keys(['completions', 'best_completions', 'completion_tokens', 'all_scores', 'best_scores'])


In [37]:
# Grade the selected completion of each question against the ground-truth
# answer and report the accuracy over the first `num_questions` questions.
# NOTE(review): this cell calls extract_last_boxed_answer and grader, which are
# defined/imported in cells further down the notebook; on a fresh kernel those
# cells have to be run first.
# NOTE(review): the level '4' is hard-coded here and must match the level that
# produced `results`.
total_score = 0  # never updated in this cell
correct_idxes = []  # indices of the questions graded as correct
for q_idx in range(num_questions):
    print(f"question {q_idx}")
    # print(f"question: {data_by_levels['4'][q_idx]['problem']}")
    best_completion = results['best_completions'][q_idx]
    print(f"best completion: {best_completion}")
    # pred_answer is None when the completion has no complete \boxed{...};
    # how grader.grade_answer treats None is not visible here -- verify.
    pred_answer = extract_last_boxed_answer(best_completion)
    gt_answer = data_by_levels['4'][q_idx]['answer']
    is_correct = grader.grade_answer(pred_answer, gt_answer)
    print(f"pred answer: {pred_answer}")
    print(f"gt answer: {gt_answer}")
    print(f"is correct: {is_correct}")
    print(f"all scores = {results['all_scores'][q_idx]}")
    print(f"best score = {results['best_scores'][q_idx]}")
    if is_correct:
        correct_idxes.append(q_idx)

# Accuracy = fraction of questions whose selected completion was graded correct.
num_corrects = len(correct_idxes)
acc = num_corrects/num_questions
print(f"num correct answers = {num_corrects}")
print(f"acc = {acc:0.4f}")

question 0
best completion: ## Step 1: Identify the direction vectors of the two lines
To find the angle between the two lines, we first need to identify their direction vectors. The direction vectors corresponding to the given lines are (2,3,-1) and (3,-1,-4) respectively.

## Step 2: Calculate the dot product of the direction vectors
The dot product of two vectors (a,b,c) and (d,e,f) is given by the formula ad + be + cf. We calculate the dot product of the direction vectors:
(2,3,-1) · (3,-1,-4) = 2(3) + 3(-1) + (-1)(-4)

## Step 3: Simplify the dot product
Simplifying the expression:
6 - 3 + 4 = 7

## Step 4: Calculate the magnitudes of the direction vectors
Now, we need to calculate the magnitudes (lengths) of the direction vectors:
Magnitude of (2,3,-1) = √(2² + 3² + (-1)²) = √(4 + 9 + 1) = √14
Magnitude of (3,-1,-4) = √(3² + (-1)² + (-4)²) = √(9 + 1 + 16) = √26

## Step 5: Calculate the cosine of the angle between the two lines
We use the formula for the cosine of the angle betwe

In [36]:
def extract_last_boxed_answer(text):
    r"""Return the content of the last ``\boxed{...}`` in ``text``.

    Nested braces are matched by depth, so ``\boxed{\frac{1}{2}}`` yields
    ``\frac{1}{2}``. A backslash escapes the character after it, so ``\{``,
    ``\}`` and ``\\`` never change the brace depth.

    Args:
        text: string to search (e.g. a model completion).

    Returns:
        The content between the braces with surrounding whitespace stripped,
        or None when ``text`` is not a string, contains no ``\boxed{``, or the
        last ``\boxed{`` has no matching closing brace.
    """
    if not isinstance(text, str):
        return None

    marker = '\\boxed{'
    boxed_start = text.rfind(marker)
    if boxed_start == -1:
        return None  # no \boxed{ found

    # Scan from just after the opening brace, tracking the nesting depth.
    start_index = boxed_start + len(marker)
    depth = 1
    i = start_index
    end = len(text)
    while i < end:
        char = text[i]
        if char == '\\':
            # Skip the escaped character so '\{' / '\}' are not counted.
            i += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # Slice once instead of accumulating character by character.
                return text[start_index:i].strip()
        i += 1

    return None  # no matching closing brace found

In [27]:
# Remove a stale `extract_answer` definition from the namespace if one exists.
# pop() with a default makes this a no-op on a fresh kernel, where the name
# was never defined.
globals().pop('extract_answer', None)

In [16]:
import grader