In [1]:
from vllm import LLM, SamplingParams
import os
import gzip
import json
from typing import Any, Callable, Dict, List, Optional, Tuple
from collections import defaultdict
import random
from transformers import AutoTokenizer
from functools import reduce
import torch
import pickle

import blobfile as bf
import numpy as np
import orjson

In [2]:
# model_name_or_path = "/data/users/zhangjunlei/tyx/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9"
model_name_or_path = "/data/users/zhangjunlei/tyx/reward-by-prm800k/models/direct-prediction/meta-llama/Llama-2-7b-hf"
tokenizer_name_or_path = "/data/users/zhangjunlei/tyx/.cache/huggingface/hub/models--hf-internal-testing--llama-tokenizer/snapshots/99eceeba6e8289bee767f0771166b5917e70e470"
# gpu_ids = [1,3,5,7]
gpu_ids = [0]
tensor_parallel_size=len(gpu_ids)
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids])
samples_to_rate_path = "/data/users/zhangjunlei/tyx/reward-by-prm800k/datasets/prm800k-scored-test-samples.jsonl"
input_ids_path = "/data/users/zhangjunlei/tyx/reward-by-prm800k/datasets/gpt4-generated-math-solutions-till-each-step-input-ids-list.pkl"
rated_samples_path = "/data/users/zhangjunlei/tyx/reward-by-prm800k/eval/rated-samples/gpt-4-generatations/llama-2-7b-2023-08-15-2-step-2040-ratings.jsonl"

__DEBUG__ = False
# __DEBUG__ = True
__DEBUG_FOR__ = {
    # "inference_sample_num": 100
    "inference_sample_num": None
}

In [3]:
def json_loads(s: str) -> Dict:
    try:
        return orjson.loads(s)
    except Exception:
        return json.loads(s)  # fallback


def open_jsonl(file: str):
    if file.endswith(".gz"):
        return gzip.open(bf.BlobFile(file, "rb"))
    return bf.BlobFile(file, "r")


def _read_jsonl(file: str) -> List[Dict]:
    assert bf.exists(file), file
    with open_jsonl(file) as f:
        return [json_loads(l) for l in f.readlines() if l]

In [5]:
generated_samples = _read_jsonl(samples_to_rate_path)
# samples_by_problem = _key_by_problem(samples)  # group samples by problem
# num_problems = len(samples_by_problem)  # num of problmes

In [6]:
print(len(generated_samples))

815631


In [7]:
# sample = random.choice(all_samples)
sample = generated_samples[0]
print(sample)


{'problem': 'A $90^\\circ$ rotation around the origin in the counter-clockwise direction is applied to $7 + 2i.$  What is the resulting complex number?', 'answer': '7i - 2', 'is_correct': True, 'subject': 'Precalculus', 'level': 2, 'unique_id': 'test/precalculus/779.json', 'steps': ['I know that a $90^\\circ$ rotation around the origin in the complex plane can be achieved by multiplying the original complex number by $i.$', 'So, I can write $7 + 2i$ as $7i - 2$ after applying the rotation.', 'To check, I can draw the original and rotated complex numbers on the complex plane and see if they form a right angle at the origin.', 'The original complex number corresponds to the point $(7, 2)$ and the rotated one to the point $(-2, 7).$', 'I can use the dot product formula to see if they are perpendicular.', 'The dot product of two perpendicular vectors is zero, so I want to see if $(7, 2) \\cdot (-2, 7) = 0.$', 'I can calculate this by multiplying the corresponding components and adding them

In [8]:
for rating2prob_step in sample["rating_probs"]:
    print(rating2prob_step)
    print(sum(list(rating2prob_step.values())))

{'-1': 0.4955421880432429, '1': 0.4850565065558231, '0': 0.019401305400933915}
0.9999999999999999
{'-1': 0.8115352052850885, '1': 0.16596184275124684, '0': 0.02250295196366458}
0.9999999999999999
{'0': 0.6546451880933941, '1': 0.31537486890985433, '-1': 0.02997994299675163}
1.0
{'1': 0.8262738884082665, '0': 0.1682199724210449, '-1': 0.005506139170688614}
1.0
{'1': 0.49459962721441486, '0': 0.49145594479362104, '-1': 0.013944427991964166}
1.0000000000000002
{'1': 0.8565057970399176, '0': 0.13033172825087222, '-1': 0.013162474709210219}
1.0
{'1': 0.9610508839385529, '0': 0.03856784254309562, '-1': 0.00038127351835143063}
0.9999999999999999
{'1': 0.7603325694682361, '0': 0.23845109311131918, '-1': 0.0012163374204446772}
1.0
{'-1': 0.4995217602188542, '1': 0.2852564103239134, '0': 0.21522182945723228}
1.0


In [9]:
positive_probs = [rating_str2_prob['1'] for rating_str2_prob in sample["rating_probs"]]
positive_probs_product = reduce(lambda x, y: x * y, positive_probs)
positive_probs_minimum = min(positive_probs)
print(positive_probs_product, positive_probs_minimum)
print(sample["prm_score"])

0.0018523420058605313 0.16596184275124684
0.044593952640983744


In [10]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=4096)

rating2word = {1: "positive", -1: "negative", 0: "neutral"}
rating_words = list(rating2word.values())
rating_token_ids = tokenizer(rating_words, add_special_tokens=False).input_ids # [[6374], [8178], [21104]]
rating_token_ids = set([token_id[0] for token_id in rating_token_ids]) # [6374, 8178, 21104]
print(rating_token_ids)
token_id2rating_str = {tokenizer(word, add_special_tokens=False).input_ids[0]: str(rating) for rating, word in rating2word.items()}
print(token_id2rating_str)

{21104, 8178, 6374}
{6374: '1', 8178: '-1', 21104: '0'}


In [11]:
# no default pad token for llama!
# here we add all special tokens again, because the default ones are not in the special_tokens_map

num_added_tokens = tokenizer.add_special_tokens(
    {
        "bos_token": "<s>",
        "eos_token": "</s>",
        "unk_token": "<unk>",
        "pad_token": "<pad>",
    }
)
assert num_added_tokens in [
    0,
    1,
], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present."


In [None]:
# print(type(tokenizer))
print(tokenizer)

LlamaTokenizerFast(name_or_path='/data/users/zhangjunlei/tyx/.cache/huggingface/hub/models--hf-internal-testing--llama-tokenizer/snapshots/99eceeba6e8289bee767f0771166b5917e70e470', vocab_size=32000, model_max_length=4096, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False)


In [22]:
# input_ids_list = []
# for sample in all_samples[:10000]:
#     problem = sample["problem"]
#     steps = sample["steps"]
#     solution_text = problem + "\n"
#     solution_input_ids = tokenizer(
#         problem + "\n",
#         return_tensors="pt",
#         padding=False,
#         truncation=False,
#         add_special_tokens=True,
#         return_attention_mask=False,
#     )["input_ids"][0]

#     solution_input_ids_list = []
#     for step in steps:
#         step = step.strip()

#         step_input_ids = tokenizer(
#             "\n" + step + "\n",
#             return_tensors="pt",
#             padding=False,
#             truncation=False,
#             # add_special_tokens=True,
#             add_special_tokens=False,
#             return_attention_mask=False,
#         )["input_ids"][0]
#         step_input_ids = step_input_ids[2:]  # remove "\n" at beginning
#         solution_input_ids = torch.cat((solution_input_ids, step_input_ids), dim=-1)[:-1]
#         solution_input_ids_list.append(solution_input_ids)
#     input_ids_list += solution_input_ids_list
# print(input_ids_list[:10])

In [12]:
prompts = None

In [6]:

prompts = []
for sample in generated_samples:
    problem = sample["problem"]
    steps = sample["steps"]
    solution_so_far = problem
    solution_until_step_idx = []
    for step in steps:
        solution_so_far += "\n" + step
        solution_until_step_idx.append(solution_so_far)
    prompts += solution_until_step_idx
    if __DEBUG__: break


In [15]:
if prompts is not None:
    print(len(prompts))
    print(prompts[0])

9703694
A $90^\circ$ rotation around the origin in the counter-clockwise direction is applied to $7 + 2i.$  What is the resulting complex number?
I know that a $90^\circ$ rotation around the origin in the complex plane can be achieved by multiplying the original complex number by $i.$


In [4]:
prompts = None
if prompts is None:
    if os.path.exists(input_ids_path):
        with open(input_ids_path, "rb") as f:
            prompt_input_ids_list = pickle.load(f)
    else:
        # input_token_ids = tokenizer(all_prompts_to_rate[:1000], return_tensors="pt", padding="longest", add_special_tokens=True).input_ids
        prompt_input_ids_list = tokenizer(prompts, add_special_tokens=True).input_ids

        with open(input_ids_path, "wb") as f:
            pickle.dump(prompt_input_ids_list, f)
            
        print(prompt_input_ids_list[0])

In [5]:
if prompt_input_ids_list is not None:
    print("prompt_input_ids_list[0]:", prompt_input_ids_list[0])

prompt_input_ids_list[0]: [1, 319, 395, 29929, 29900, 3823, 6034, 29938, 13733, 2820, 278, 3978, 297, 278, 6795, 29899, 13058, 3538, 5305, 338, 7436, 304, 395, 29955, 718, 29871, 29906, 29875, 7449, 29871, 1724, 338, 278, 9819, 4280, 1353, 29973, 13, 29902, 1073, 393, 263, 395, 29929, 29900, 3823, 6034, 29938, 13733, 2820, 278, 3978, 297, 278, 4280, 10694, 508, 367, 14363, 491, 6674, 5890, 278, 2441, 4280, 1353, 491, 395, 29875, 7449]


In [18]:
llm = None

In [6]:
# _DATA_PARALLEL_GROUP = None
llm = LLM(
    model=model_name_or_path,
    tokenizer=tokenizer_name_or_path,
    tokenizer_mode="auto",
    trust_remote_code=True,
    tensor_parallel_size=len(gpu_ids),
    dtype="auto",
    seed=0,
)

INFO 08-17 06:09:18 llm_engine.py:70] Initializing an LLM engine with config: model='/data/users/zhangjunlei/tyx/reward-by-prm800k/models/direct-prediction/meta-llama/Llama-2-7b-hf', tokenizer='/data/users/zhangjunlei/tyx/.cache/huggingface/hub/models--hf-internal-testing--llama-tokenizer/snapshots/99eceeba6e8289bee767f0771166b5917e70e470', tokenizer_mode=auto, trust_remote_code=True, dtype=torch.bfloat16, use_dummy_weights=False, download_dir=None, use_np_weights=False, tensor_parallel_size=1, seed=0)
INFO 08-17 06:09:18 tokenizer.py:29] For some LLaMA-based models, initializing the fast tokenizer may take a long time. To eliminate the initialization time, consider using 'hf-internal-testing/llama-tokenizer' instead of the original tokenizer.
INFO 08-17 06:09:33 llm_engine.py:196] # GPU blocks: 7439, # CPU blocks: 512


In [7]:
top_k = 3
sampling_params = SamplingParams(
    temperature=1,
    top_p=1,
    top_k=top_k,
    max_tokens=1,
    logprobs=top_k,
)

# new_sample = sample.copy()

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.

if __DEBUG_FOR__["inference_sample_num"] is not None:
    outputs = llm.generate(prompts=prompts[:__DEBUG_FOR__["inference_sample_num"]], prompt_token_ids=prompt_input_ids_list[:__DEBUG_FOR__["inference_sample_num"]], sampling_params=sampling_params)
else:
    outputs = llm.generate(prompts=prompts, prompt_token_ids=prompt_input_ids_list, sampling_params=sampling_params)


In [15]:
rating2prob_list = []
for idx, output in enumerate(outputs):
    if __DEBUG__:
        print(output)
            
        token_id2logprob = output.outputs[0].logprobs[0]
        print("logprobs:", token_id2logprob)
        
        top_token_ids = token_id2logprob.keys()
        
        logprobs = list(token_id2logprob.values())
        probs = np.exp(logprobs)
        print("probs:", probs)
        sum_probs = sum(probs)
        print("sum(probs):", sum_probs)
        
        norm_probs = probs / sum_probs
        print("norm_probs:", norm_probs)
        
        if set(top_token_ids) != set(rating_token_ids):
            print("idx:", idx)
            print("top_ids:", top_token_ids)
            print("logprobs:", token_id2logprob)
            
        rating_strs = [token_id2rating_str.get(top_token_id, top_token_id) for top_token_id in top_token_ids ]
        rating_probs = {
            rating_str: norm_prob for rating_str, norm_prob in zip(rating_strs, norm_probs)
        }
        print("rating_probs:", rating_probs)
        rating2prob_list.append(rating_probs)
        # break
    else:

        token_id2logprob = output.outputs[0].logprobs[0]

        top_token_ids = token_id2logprob.keys()
        
        logprobs = list(token_id2logprob.values())
        probs = np.exp(logprobs)
        sum_probs = sum(probs)
        
        top_p = 0.95
        if sum_probs < top_p:
            print(f"{idx}: sum_probs < {top_p}",)
        
        norm_probs = probs / sum_probs
        
        if set(top_token_ids) != set(rating_token_ids):
            print(f"{idx}: set(top_token_ids) != set(rating_token_ids):")
            
        rating_strs = [token_id2rating_str.get(top_token_id, top_token_id) for top_token_id in top_token_ids ]
        rating_probs = {
            rating_str: norm_prob for rating_str, norm_prob in zip(rating_strs, norm_probs)
        }
        rating2prob_list.append(rating_probs)
print("rating_probs_list:", rating2prob_list)
print("len(rating_probs_list):", len(rating2prob_list))

rating_probs_list: [{'-1': 0.930783418447706, '1': 0.05950298609472064, '0': 0.00971359545757343}, {'1': 0.9869514676596802, '-1': 0.007535465430872128, '0': 0.005513066909447719}, {'1': 0.8956494787243626, '0': 0.09440075491006221, '-1': 0.009949766365575163}, {'1': 0.9626037721204714, '0': 0.03293848730756566, '-1': 0.004457740571963018}, {'1': 0.9479232720388627, '0': 0.03675497231548548, '-1': 0.015321755645651719}, {'1': 0.8938154222986121, '0': 0.09420744258510651, '-1': 0.01197713511628137}, {'1': 0.760529949783203, '0': 0.2178954787687271, '-1': 0.021574571448069933}, {'1': 0.6781062627227672, '0': 0.3203147177144465, '-1': 0.0015790195627863536}, {'1': 0.9688331775035378, '-1': 0.022784772954677318, '0': 0.008382049541784885}, {'-1': 0.930783418447706, '1': 0.05950298609472064, '0': 0.00971359545757343}, {'1': 0.8079490703640116, '-1': 0.1404004751429728, '0': 0.051650454493015426}, {'1': 0.9740897662796335, '0': 0.015744694753628954, '-1': 0.010165538966737497}, {'1': 0.99094

In [16]:
total_step_num_so_far = 0
for sample in generated_samples:
    sample_step_num_so_far = 0
    for step_idx, step in enumerate(sample["steps"]):
        rating2prob = rating2prob_list[total_step_num_so_far]
        
        rating2prob = {rating: float(prob) for rating, prob in rating2prob.items()}
        
        sample["rating_probs"][step_idx] = rating2prob
        
        total_step_num_so_far += 1
        
    sample["orm_score"] = None

    positive_probs = [rating2prob["1"] for rating2prob in sample["rating_probs"]]
    sample["prm_score"] = {
        "positive_probs_product": float(np.prod(positive_probs)),
        "positive_probs_minimum": float(np.min(positive_probs)),
    } 
    
    if __DEBUG__:
        print(sample)
        break

IndexError: list index out of range



In [None]:
# 打开文件以写入JSONL格式
with open(rated_samples_path, 'w') as f:
    # 迭代列表中的每个元素
    for item in generated_samples:
        # 将列表中的每个元素转换为JSON字符串
        json_str = orjson.dumps(item).decode()
        
        # 将JSON字符串写入文件，并添加换行符以分隔每个元素
        f.write(json_str + '\n')
        if __DEBUG__:
            break
f.close()
            