In [1]:
import copy
import pprint
import json
import os, psutil
from collections import defaultdict

import gc
import time
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from dataclasses import dataclass

In [2]:

import torch 
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)
from vllm import LLM, SamplingParams


In [7]:
# from sal.models.reward_models import RLHFFlow

from sal.search.utils import Beam, build_conv, generate_k_steps, last
from sal.config import Config
# from sal.models.reward_models import PRM
from sal.utils.score import aggregate_scores

from reward_models import RLHFFlow

In [8]:
# Root directory holding the datasets and model checkpoints. The trailing slash
# is kept so the value of `base_path` itself is unchanged.
base_path = '/groups/kjun/tnn/datasets/'

# Dataset directory. os.path.join is used instead of `base_path + "/..."`, which
# produced paths containing a doubled slash ("datasets//...").
dataset_path = os.path.join(base_path, "prm800k", "math_splits")

# GGUF weight files for the generator LLM and the reward model (PRM).
llm_path = os.path.join(
    base_path, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf"
)
prm_path = os.path.join(
    base_path,
    "Llama3.1-8B-PRM-Deepseek-Data-GGUF",
    "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf",
)

# Model directories; these are the paths the notebook actually loads the LLM
# and the PRM from.
llm_tokenizer_path = os.path.join(base_path, "Llama-3.2-1B-Instruct")
prm_tokenizer_path = os.path.join(base_path, "Llama3.1-8B-PRM-Deepseek-Data")

In [5]:
# Load the generator LLM into vLLM, capping it at 20% of the GPU's total memory.
llm = LLM(
    model=llm_tokenizer_path,
    gpu_memory_utilization=0.2,  # fraction of total GPU memory vLLM may claim
    max_model_len=10000,
    dtype="float16",
    seed=123,
)

# Collect unreferenced Python objects and release cached CUDA blocks, then
# report the memory PyTorch has allocated on device 0 (in GiB).
gc.collect()
torch.cuda.empty_cache()
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

INFO 03-21 14:09:22 __init__.py:207] Automatically detected platform cuda.
INFO 03-21 14:09:33 config.py:549] This model supports multiple tasks: {'reward', 'classify', 'generate', 'embed', 'score'}. Defaulting to 'generate'.
INFO 03-21 14:09:33 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='/groups/kjun/tnn/datasets//Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-21 14:09:40 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 03-21 14:09:41 worker.py:267] Memory profiling takes 0.79 seconds
INFO 03-21 14:09:41 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.20) = 6.35GiB
INFO 03-21 14:09:41 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.21GiB; the rest of the memory reserved for KV Cache is 2.73GiB.
INFO 03-21 14:09:41 executor_base.py:111] # cuda blocks: 5592, # CPU blocks: 8192
INFO 03-21 14:09:41 executor_base.py:116] Maximum concurrency for 10000 tokens per request: 8.95x
INFO 03-21 14:09:42 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.20it/s]

INFO 03-21 14:09:58 model_runner.py:1562] Graph capturing finished in 16 secs, took 0.13 GiB
INFO 03-21 14:09:58 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 18.68 seconds





#--- memory: 5.0653533935546875


In [13]:
# Collect unreferenced Python objects and release cached CUDA blocks, then
# re-check the memory PyTorch has allocated on device 0 (in GiB).
# To drop the reward model first, run `del prm` before this cell.
gc.collect()
torch.cuda.empty_cache()
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 5.0653533935546875


In [10]:
# Load the reward model (PRM) on the second GPU ('cuda:1').
prm = RLHFFlow(
    model_path=prm_tokenizer_path,
    device_map='cuda:1',
)

# Collect garbage and release cached CUDA blocks, then report the memory
# PyTorch has allocated on device 1 (in GiB).
gc.collect()
torch.cuda.empty_cache()
print('#--- memory:', torch.cuda.memory_allocated(1)/(1024**3))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#--- memory: 14.95752763748169


In [12]:
# Group the records of test.jsonl by their 'level' field, keyed by the level
# rendered as a string (so both 4 and "4" end up under "4").
data_by_levels = defaultdict(list)
with open(f"{dataset_path}/test.jsonl", 'r', encoding='utf-8') as filein:
    for line in filein:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        data = json.loads(line)
        data_by_levels[str(data['level'])].append(data)

# Report how many problems levels 1-5 hold. `.get` is used instead of indexing
# so that a level missing from the file is reported as 0 without inserting an
# empty list into the defaultdict as a side effect.
for key in map(str, range(1, 6)):
    print(f"{key}: {len(data_by_levels.get(key, ()))}")

1: 43
2: 90
3: 105
4: 128
5: 134


In [16]:
def best_of_n(batch_of_prompts, config: Config, llm: LLM, prm):
    """Sample ``config.n`` completions per prompt and keep the best-scoring one.

    Each prompt is wrapped in a system/user conversation, rendered with the
    chat template of the LLM's tokenizer, and sampled ``config.n`` times.
    ``prm.score`` is then called on the prompts and their completions, every
    per-completion score is reduced with
    ``aggregate_scores(score, config.agg_strategy)``, and for each prompt the
    completion with the highest aggregated score is selected.

    Args:
        batch_of_prompts: Sized sequence of prompt strings (the user turns).
        config: Supplies ``system_prompt``, ``custom_chat_template``, ``n``,
            ``temperature``, ``max_tokens``, ``top_p`` and ``agg_strategy``.
        llm: Engine providing ``get_tokenizer()`` and ``generate()``.
        prm: Reward model providing ``score(prompts, completions)``; its result
            is indexed as ``scores[prompt_index][completion_index]``.

    Returns:
        A dict with four keys, each holding one entry per prompt:
        ``"completions"`` (list of the ``config.n`` generated texts),
        ``"pred"`` (the selected text), ``"completion_tokens"`` (list of
        token counts of the generated texts) and ``"best_scores"`` (aggregated
        score of the selected text). For an empty batch every list is empty.

    Raises:
        ValueError: If ``config.n`` is smaller than 1, or if the number of
            responses/completions returned by the engine does not match the
            number requested.
    """
    num_prompts = len(batch_of_prompts)
    if num_prompts == 0:
        # Nothing to generate or score; skip the tokenizer/engine entirely.
        return {"completions": [], "pred": [], "completion_tokens": [], "best_scores": []}
    if config.n < 1:
        # Fail early with a clear message instead of a late argmax-on-empty error.
        raise ValueError(f"config.n must be at least 1, got {config.n}")

    convs = [
        [
            {"role": "system", "content": config.system_prompt},
            {"role": "user", "content": prompt},
        ]
        for prompt in batch_of_prompts
    ]

    tokenizer = llm.get_tokenizer()
    # TODO: set the augmented template from a file
    # NOTE(review): this assigns onto the tokenizer object returned by the
    # engine, so the override presumably persists after this call - confirm.
    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template

    templated_convs = tokenizer.apply_chat_template(
        convs, add_generation_prompt=True, tokenize=False,
    )

    # Duplicate each rendered prompt config.n times so the engine can batch all
    # samples continuously: [p1, p2] becomes [p1, p1, p2, p2] for config.n=2.
    templated_convs = [c for conv in templated_convs for c in [conv] * config.n]

    sampling_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        n=1,  # prompts are already duplicated, so one completion per entry
    )

    responses = llm.generate(
        templated_convs,
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    expected_responses = num_prompts * config.n
    if len(responses) != expected_responses:
        raise ValueError(
            f"Generated {len(responses)} responses instead of {expected_responses}"
        )

    # Regroup the flat response list into one group of config.n outputs per prompt.
    completions = []
    completion_tokens = []
    for start in range(0, expected_responses, config.n):
        outputs = [
            output
            for r in responses[start : start + config.n]
            for output in r.outputs
        ]
        # Check we generated the correct number of completions for this prompt
        if len(outputs) != config.n:
            raise ValueError(
                f"Generated {len(outputs)} completions instead of {config.n}"
            )
        completions.append([output.text for output in outputs])
        completion_tokens.append([len(output.token_ids) for output in outputs])

    scores = prm.score(batch_of_prompts, completions)
    agg_scores = [
        [aggregate_scores(s, config.agg_strategy) for s in score] for score in scores
    ]

    results = {
        "completions": completions,
        "pred": [],
        "completion_tokens": completion_tokens,
        "best_scores": [],
    }
    for pidx in range(num_prompts):
        best_idx = int(np.argmax(agg_scores[pidx]))
        results["best_scores"].append(agg_scores[pidx][best_idx])
        results["pred"].append(completions[pidx][best_idx])

    return results

In [13]:
# General search settings.
config = Config()
config.agg_strategy = "last"   # strategy name handed to aggregate_scores
config.n = 4                   # completions sampled per prompt in best-of-N
config.lookahead = 0
config.num_iterations = 10
config.sort_completed = False

# Settings for diverse selection.
config.lam = 10
config.normalize_embeddings = True

In [21]:
# Run best-of-N on the first few problems of one difficulty level and report
# the mean of the best aggregated scores plus the wall-clock time.
eval_level = '4'
max_questions = 10  # cap for a quick run; raise it to cover the whole level
level_questions = data_by_levels[eval_level]

# Clamp to what is available so a level with fewer problems than the cap does
# not raise IndexError (previously the computed length was overwritten by 10).
num_questions = min(max_questions, len(level_questions))
print(num_questions)
all_scores = []
start_time = time.time()
for q_idx in range(num_questions):
    print(f"question {q_idx}")
    batch_of_prompts = [level_questions[q_idx]['problem']]
    results = best_of_n(batch_of_prompts, config, llm, prm)
    all_scores.append(results['best_scores'])

total_time = time.time() - start_time
# np.mean of an empty list warns and yields nan; make that case explicit.
avg_scores = np.mean(all_scores) if all_scores else float('nan')
print(f"avg_scores: {avg_scores}")
print(f"it takes {total_time:0.4f}s")

10
question 0
question 1
question 2
question 3
question 4
question 5
question 6
question 7
question 8
question 9
avg_scores: 0.8376220703125
it takes 42.2492s
