In [1]:
# credits:
# https://www.kaggle.com/code/bsmit1659/aimo-vllm-accelerated-tot-sc-deepseekmath

In [2]:
# Uninstall the preinstalled torch (-y: no confirmation prompt, -q: quiet);
# presumably so the vllm install below can bring in the torch build it needs.
!pip uninstall -y torch -q
# Install/upgrade vllm offline: --no-index disables package-index lookups and
# --find-links makes pip resolve wheels from the local /kaggle/input/vllm-whl directory.
!pip install --no-index --find-links=/kaggle/input/vllm-whl -U vllm -q
# Keep logits in float16 to avoid OOM: rewrite the installed transformers Llama
# implementation without any line that contains the `logits.float()` up-cast.
file_path = '/opt/conda/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py'
with open(file_path, 'r') as source:
    file_contents = [
        source_line
        for source_line in source
        if "logits = logits.float()" not in source_line
    ]
with open(file_path, 'w') as target:
    target.writelines(file_contents)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2024.3.1 which is incompatible.
dask-cuda 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.1.4 which is incompatible.
dask-cuda 23.8.0 requires pynvml<11.5,>=11.0.0, but you have pynvml 11.5.0 which is incompatible.
raft-dask 23.8.0 requires dask==2023.7.1, but you have dask 2024.3.1 which is incompatible.[0m[31m
[0m

In [3]:
from vllm import LLM, SamplingParams
import numpy as np
from transformers import LlamaForSequenceClassification
import torch
# Turn off the memory-efficient scaled-dot-product-attention kernel.
torch.backends.cuda.enable_mem_efficient_sdp(False)

# Generator: the model under /kaggle/input/deepseek-math served through vLLM
# in half precision with an fp8 KV cache and a 2048-token context window.
llm = LLM(
    model="/kaggle/input/deepseek-math",
    dtype='half',
    enforce_eager=True,
    gpu_memory_utilization=0.99,
    swap_space=4,
    max_model_len=2048,
    kv_cache_dtype="fp8_e5m2",
    tensor_parallel_size=1,
)

tokenizer = llm.get_tokenizer()

# The reward model is fed text encoded with the generator's tokenizer.
prm_tokenizer = tokenizer

# Reward model (PRM): a Llama sequence classifier with a single output,
# placed on the GPU with index 1.
prm_model = LlamaForSequenceClassification.from_pretrained(
    '/kaggle/input/prm-shep',
    num_labels=1,
    device_map="cuda:1",
    torch_dtype="auto",
)
prm_model.eval()
base_model = prm_model.model

# The scoring head is loaded from its own file on top of the checkpoint.
score_head_weights = torch.load('/kaggle/input/prm-shep/model_score.pth')
prm_model.score.load_state_dict(score_head_weights)

2024-05-18 17:03:11,246	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 05-18 17:03:12 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 05-18 17:03:12 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='/kaggle/input/deepseek-math', tokenizer='/kaggle/input/deepseek-math', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-18 17:03:13 selector.py:40] Cannot use FlashAttention backend for Volta and Turing GPUs.
INFO 05-18 17:03:13 selector.py:25] Using XFormers backend.
INFO 05-18 17:05:03 model_runner.py:104] Loading model weights took 12.8725 GB
INFO 05-18 17:05:04 gpu_executor.py:94] # GPU blocks: 177, # CPU blocks: 1092


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/prm-shep and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
# Competition API handles: the solve loop further down iterates `iter_test`
# for (test, sample_submission) pairs and submits each answer via `env.predict`.
import aimo
env = aimo.make_env()
iter_test = env.iter_test()

In [5]:
def logit2prob(x):
    """Map a logit to a probability with the logistic sigmoid 1 / (1 + e^-x)."""
    return 1 / (1 + np.exp(-x))
def eval_prm(candidates):
    """Score each candidate text with the reward model.

    Every candidate is tokenized with ``prm_tokenizer`` and run through
    ``base_model`` on ``cuda:1``; the hidden state at the last position is
    passed to ``prm_model.score`` and the resulting logit is converted with
    ``logit2prob``.

    Returns:
        A list with one probability per candidate, in input order.
    """
    probabilities = []
    for text in candidates:
        token_ids = prm_tokenizer.encode(text, return_tensors="pt").to("cuda:1")
        with torch.no_grad():
            last_hidden = base_model(token_ids)[0][:, -1]  # 1,l,d -> 1,d
            score_logit = prm_model.score(last_hidden)[0]
        probabilities.append(logit2prob(score_logit.item()))
    return probabilities

In [6]:
# Stop strings for generation: the tokenizer's EOS token (falling back to
# '</s>' when it is unavailable) and a newline.
stop_words = [tokenizer.eos_token if tokenizer is not None and tokenizer.eos_token is not None else '</s>']
stop_words.append("\n")

# Sampling settings shared by every llm.generate call in the solve loop;
# min_tokens / max_tokens bound the length of each sampled continuation.
sampling_params = SamplingParams(temperature=1,
                                 max_tokens=256,
                                 min_tokens=32,
                                 stop=stop_words)

# Suffix appended to every problem statement before the chat template is applied.
# NOTE(review): "interger" is a typo for "integer". The string is part of the
# prompt sent to the model, so fixing it changes model input - re-validate first.
cot_instruction = "\nYou are an expert at mathematical reasoning. Please reason step by step, and put your final answer within \\boxed{}. The answer should be an interger between 0 and 999."


# Search hyper-parameters read by the solve loop.
n = 1 # beams: unfinished nodes kept per level
n_sol = 6 # the search stops once this many completed answers are collected
samples = 21 # continuations sampled per open node at every level
max_depth = 24 # the search stops when the level counter reaches this value
max_pct = 0.66 # a finished candidate must score above max_pct * best score of its level

# NOTE(review): these lists are never filled by the visible code; they are only
# mentioned in commented-out lines.
all_prompts = []
total_paths = []
total_answers = []

def is_integer(num):
    """Return True for ints and for floats with no fractional part, else False."""
    if isinstance(num, int):
        return True
    return isinstance(num, float) and num.is_integer()
    
def is_between_0_and_999(num):
    """Return True when ``num`` lies in the closed interval [0, 999]."""
    lower_bound, upper_bound = 0, 999
    return lower_bound <= num <= upper_bound

import re
def extract_number(text):
    """Extract the final answer string that follows "The answer is" in ``text``.

    The patterns are tried in order: a ``\\boxed{...}`` expression, a number
    wrapped in ``$...$``, then a bare run of digits. The captured group of the
    first pattern that matches is returned unchanged (it may be empty for an
    empty ``\\boxed{}``).

    Returns:
        The captured answer text, or the sentinel ``'parse err'`` when no
        pattern matches.
    """
    answer_patterns = (
        r'The answer is.*\\boxed\{(.*?)\}',
        r"The answer is[:\s]*\$([0-9]+)\$",
        r"The answer is[:\s]*([0-9]+)",
    )
    # Lazy generators: searching stops at the first pattern that matches.
    hits = (re.search(candidate, text) for candidate in answer_patterns)
    return next((hit.group(1) for hit in hits if hit), 'parse err')

def group_and_sum(A, B):
    '''
    Sum the values of B grouped by the matching labels of A.

    A = ['a','b','a']
    B = [1,2,3]
    -> {'a': 4, 'b': 2}

    Keys appear in first-seen order; pairs beyond the shorter input are ignored.
    '''
    totals = {}
    for label, amount in zip(A, B):
        try:
            totals[label] += amount
        except KeyError:
            # First time this label is seen: start its total with this value.
            totals[label] = amount
    return totals

def max_dict(d):
    """Return the key of ``d`` with the largest value (first such key on ties)."""
    return max(d, key=d.get)

def tot_agg(completed_paths, default=37):
    """Pick the final answer from the completed search paths.

    Args:
        completed_paths: iterable of ``(answer, score)`` pairs.
        default: value returned when there are no completed paths.

    Returns:
        The answer whose scores add up to the largest total (the first-seen
        answer wins ties), or ``default`` when ``completed_paths`` is empty.
    """
    completed_paths = list(completed_paths or ())
    # Check for emptiness BEFORE unpacking: ``zip(*[])`` yields nothing, so the
    # two-name unpacking below would raise ValueError instead of falling back.
    if not completed_paths:
        return default
    answers, scores = zip(*completed_paths)
    return max_dict(group_and_sum(answers, scores))


# Solve every test problem with a reward-guided, step-by-step tree search:
# each level samples continuations for the open nodes, scores the extended
# paths with the reward model, keeps the best `n` unfinished paths and
# collects finished paths whose score is close enough to the level's best.
for test, sample_submission in iter_test:
    problem = test['problem'].values[0]

    messages = [
        {
            "role": "user",
            "content": problem + cot_instruction
        }
    ]

    base_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False
    )
    current_level = 1

    current_level_nodes = [base_prompt]
    completed_paths = []  # (answer, score) pairs of finished reasoning paths
    try:
        while (len(completed_paths) < n_sol) and (current_level < max_depth) and (current_level_nodes):
            # Sample `samples` continuations for every open node.
            parents = current_level_nodes * samples
            batch_responses = llm.generate(parents, sampling_params)

            # Extended paths, de-duplicated in first-seen order (dict keys keep
            # insertion order), so identical texts are scored only once.
            prm_inputs = list(dict.fromkeys(
                parent + candidate.outputs[0].text
                for candidate, parent in zip(batch_responses, parents)
            ))

            # Reward model evaluation of every unique path.
            prm_scores = eval_prm(prm_inputs)
            next_level_nodes = []

            # Best-scored paths first.
            combined = list(zip(prm_inputs, prm_scores))
            combined.sort(key=lambda x: x[1], reverse=True)
            max_score = combined[0][1]
            for node, score in combined:
                answer = extract_number(node)
                if answer == 'parse err':  # not finished: keep the top `n` as next nodes
                    if len(next_level_nodes) < n:
                        next_level_nodes.append(node)
                elif score > max_score * max_pct:  # finished and scored well enough
                    try:
                        # NOTE(review): eval() on model-generated text. The empty
                        # namespaces stop names such as `n` or `samples` from
                        # resolving to notebook globals (and being accepted as an
                        # answer) and hide the builtins; this is NOT a real
                        # sandbox - replace with a dedicated parser if possible.
                        answer = eval(answer, {"__builtins__": {}}, {})
                    except Exception:  # not a plain numeric expression
                        continue
                    if is_integer(answer) and is_between_0_and_999(answer):  # correct format
                        # int() so that e.g. 5.0 is stored and submitted as 5.
                        completed_paths.append((int(answer), score))
            # If next_level_nodes is empty, every path finished or failed: the loop exits.
            current_level_nodes = next_level_nodes
            current_level += 1
    except Exception as err:
        # Best effort: report the failure and fall through so that answers
        # collected before the error are still used for the submission.
        print(f"search aborted early: {err!r}")

    # Score-weighted vote over the finished paths; 37 is the fallback guess
    # when nothing was completed.
    sample_submission['answer'] = tot_agg(completed_paths) if completed_paths else 37
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Processed prompts: 100%|██████████| 21/21 [00:07<00:00,  2.82it/s]
Processed prompts: 100%|██████████| 21/21 [00:05<00:00,  3.86it/s]
Processed prompts: 100%|██████████| 21/21 [00:07<00:00,  2.88it/s]
Processed prompts: 100%|██████████| 21/21 [00:05<00:00,  3.85it/s]


In [7]:
# total_paths
# len(set(current_level_nodes)),len(current_level_nodes),len(set(prm_inputs)),len(prm_inputs)