In [1]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random

In [2]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config
from sal.models.reward_models import PRM
from sal.utils.score import score, aggregate_scores

from datasets import Dataset, load_dataset

from core.reward_models import RLHFFlow

from core import best_of_n
from utils.load_data import load_data_prm800k

In [3]:
# Root directory holding the datasets and model checkpoints.
# NOTE: the value already ends with a slash, so derived paths are built with
# os.path.join instead of string concatenation to avoid "//" in the results.
base_dir = '/groups/kjun/tnn/datasets/'

# dataset path
data_dir = os.path.join(base_dir, "prm800k", "math_splits")

# llm and prm path (GGUF files)
llm_dir = os.path.join(
    base_dir, "Llama-3.2-1B-Instruct-GGUF", "Llama-3.2-1B-Instruct.Q4_K_M.gguf"
)
prm_dir = os.path.join(
    base_dir,
    "Llama3.1-8B-PRM-Deepseek-Data-GGUF",
    "Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf",
)

# directories used for the tokenizers (prm_tokenizer_dir is also the path the
# reward model is loaded from in a later cell)
llm_tokenizer_dir = os.path.join(base_dir, "Llama-3.2-1B-Instruct")
prm_tokenizer_dir = os.path.join(base_dir, "Llama3.1-8B-PRM-Deepseek-Data")

In [4]:
# Restrict this process to four GPUs. The variable only influences CUDA if it
# is set before the first CUDA call, so this cell must run before any model
# is placed on a device.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# Derive the device-id list from the environment variable exactly once so the
# list cannot drift from the string set above (previously it was computed in
# the branch below and then overwritten by a hard-coded copy).
GPUs = os.environ["CUDA_VISIBLE_DEVICES"].split(',')

if torch.cuda.is_available():
    print(f"GPUs = {GPUs}")
else:
    print("CUDA is not available.")


In [5]:
# Build the reward model from `prm_tokenizer_dir`, requesting placement on cuda:1.
prm = RLHFFlow(
    model_path=prm_tokenizer_dir,
    device_map='cuda:1',
)

# Release unreferenced Python objects and cached CUDA blocks before measuring.
gc.collect()
torch.cuda.empty_cache()

# Report the memory currently allocated by tensors on devices 0 and 1
# (bytes divided by 1024**3). Extend the tuple to inspect devices 2 and 3.
for device_idx in (0, 1):
    print('#--- memory:', torch.cuda.memory_allocated(device_idx)/(1024**3))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#--- memory: 0.0
#--- memory: 14.95752763748169


In [6]:
# `prm` is deliberately NOT deleted here (the `del(prm)` stays disabled):
# the scoring cell further down still calls prm.score(...).
gc.collect()
torch.cuda.empty_cache()

# Same per-device report as in the loading cell, for devices 0 and 1.
for device_idx in (0, 1):
    print('#--- memory:', torch.cuda.memory_allocated(device_idx)/(1024**3))

#--- memory: 0.0
#--- memory: 14.95752763748169


In [14]:
# Default settings object; the dataset name and split are read from it below.
config = Config()

# Problems from `data_dir`, looked up later by level key (e.g. '4') and then
# by question index — presumably grouped by difficulty level; verify against
# utils.load_data.load_data_prm800k.
data_by_levels = load_data_prm800k(data_dir)

# The dataset named by the config, cached under `data_dir`.
orig_dataset = load_dataset(
    config.dataset_name,
    split=config.dataset_split,
    cache_dir=data_dir,
)
print(orig_dataset)

1: 43
2: 90
3: 105
4: 128
5: 134
Dataset({
    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id'],
    num_rows: 500
})


In [59]:
# Score stored completions with the PRM, compute the prediction columns via
# `score(...)`, and publish one dataset configuration per trial.

# general params
config = Config()
config.agg_strategy = 'last'
config.n = 8
config.beam_width = 2
config.lookahead = 0
config.num_iterations = 2
config.sort_completed = False

# diverse_select params
config.lam = 10
config.normalize_embeds = True

config.num_proc = 12

dataset_id = "tnguyen9210/LLM-Reasoning-Math-500"
level = '4'
num_questions = len(data_by_levels[level])
# num_questions = 2
num_trials = 1
print(f"num_questions = {num_questions}")

# get batch of questions
batch_of_questions = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]

# Unscored rows for this level. This base dataset is never modified, so every
# trial starts from the same columns.
# NOTE(review): this assumes the filtered rows are in the same order as
# data_by_levels[level]; nothing in this cell checks that — confirm.
base_dataset_by_level = orig_dataset.filter(lambda example: example['level'] == int(level))
base_dataset_by_level = base_dataset_by_level.select(range(num_questions))
# Keep the historical name bound even if the completions file has no lines.
orig_dataset_by_level = base_dataset_by_level
print(orig_dataset_by_level)

# load completions
config_name  = f"sd_prm800k_level{level}_n{config.n}_bw{config.beam_width}_depth{config.num_iterations}_lam{config.lam}_{config.normalize_embeds}_v11"
completions_dir = f"results/generate_{config_name}.jsonl"
scores_dir = f"results/scores_{config_name}.jsonl"  # not used in this cell

# compute results
start_time = time.time()
with open(completions_dir, 'r', encoding = 'utf-8') as fin:
    # One JSON object per line, one line per trial; stop after `num_trials`.
    for trial_idx, line in enumerate(fin):
        if trial_idx >= num_trials:
            break

        trial_data = json.loads(line)
        trial_completions = trial_data["completions"][:num_questions]

        # Compute the scores of completions
        scores = prm.score(batch_of_questions, trial_completions)
        # Diagnostic only: aggregated score per completion, per question.
        # (Loop variables are named so they do not read like the imported
        # `score` function used below.)
        agg_scores = [
            [aggregate_scores(completion_scores, config.agg_strategy) for completion_scores in question_scores]
            for question_scores in scores
        ]
        print(agg_scores)
        print(len(trial_data))

        # Attach this trial's columns to the *base* dataset. Previously the
        # columns were added to the already-scored result of the prior
        # iteration, which would try to add "completions"/"scores" a second
        # time as soon as num_trials > 1.
        orig_dataset_by_level = base_dataset_by_level.add_column("completions", trial_completions)
        orig_dataset_by_level = orig_dataset_by_level.add_column("scores", scores)
        print(orig_dataset_by_level)

        orig_dataset_by_level = score(orig_dataset_by_level, config)

        # `config_name` is intentionally rebound to the per-trial output name:
        # the following cell reads both `config_name` and
        # `orig_dataset_by_level`. The input paths above were already built.
        config_name = f"sd--n-{config.n}--bw-{config.beam_width}--depth-{config.num_iterations}--lam-{config.lam}--{config.normalize_embeds}--level-{level}--trial-{trial_idx}--v11"
        # Write the local copy first so a failed upload does not lose the
        # scoring work.
        orig_dataset_by_level.to_json(f"results/{config_name}.jsonl")
        orig_dataset_by_level.push_to_hub(dataset_id, config_name=config_name, split='test')

total_time = time.time() - start_time
print(f"it takes {total_time:0.4f}s in total")

num_questions = 128
Dataset({
    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id'],
    num_rows: 128
})
[[0.0526123046875, 0.0618896484375, 0.042083740234375, 0.018829345703125, 0.175537109375, 0.9111328125, 0.034088134765625, 0.02801513671875], [0.57763671875, 0.57763671875, 0.5, 0.044677734375, 0.021942138671875, 0.021942138671875, 0.56591796875, 0.56591796875], [0.99951171875, 0.2120361328125, 0.9970703125, 0.96044921875, 0.658203125, 0.861328125, 0.97705078125, 0.98681640625], [0.99951171875, 0.99853515625, 0.89892578125, 0.93896484375, 0.26904296875, 0.9970703125, 0.978515625, 0.99755859375], [0.373779296875, 0.373779296875, 0.958984375, 0.89892578125, 0.57763671875, 0.861328125, 0.140380859375, 0.60009765625], [0.0263519287109375, 0.916015625, 0.9228515625, 0.974609375, 0.9072265625, 0.9970703125, 0.515625, 0.0838623046875], [0.955078125, 0.98583984375, 0.93798828125, 0.93798828125, 0.92919921875, 0.9990234375, 0.689453125, 0.689453125], [1.0, 1.0, 0.

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

[4, 8]


Computing majority & weighted predictions:   0%|          | 0/2 [00:00<?, ?it/s]

Subsample 4 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Extract answers 4 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Compute weighted pred 4 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Compute majority pred 4 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Compute naive pred 4 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Computing majority & weighted predictions:  50%|█████     | 1/2 [00:03<00:03,  3.42s/it]

Dataset({
    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id', 'completions', 'scores', 'agg_scores', 'completions@4', 'agg_scores@4', 'preds@4', 'pred_weighted@4', 'pred_maj@4', 'pred_naive@4'],
    num_rows: 128
})


Subsample 8 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Extract answers 8 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Compute weighted pred 8 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Compute majority pred 8 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Compute naive pred 8 (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

Computing majority & weighted predictions: 100%|██████████| 2/2 [00:08<00:00,  4.18s/it]

Dataset({
    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id', 'completions', 'scores', 'agg_scores', 'pred_weighted@4', 'pred_maj@4', 'pred_naive@4', 'completions@8', 'agg_scores@8', 'preds@8', 'pred_weighted@8', 'pred_maj@8', 'pred_naive@8'],
    num_rows: 128
})





Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

it takes 369.0821s in total


In [52]:
# Re-publish the dataset currently bound to `orig_dataset_by_level` under the
# current `config_name`: Hub upload first, then a local JSONL copy.
dataset_id = "tnguyen9210/LLM-Reasoning-Math-500"
orig_dataset_by_level.push_to_hub(
    repo_id=dataset_id,
    config_name=config_name,
    split='test',
)
# Kept as the final expression so the notebook displays its return value.
orig_dataset_by_level.to_json(f"results/{config_name}.jsonl")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

43293