In [1]:
# credits:
# https://www.kaggle.com/code/bsmit1659/aimo-vllm-accelerated-tot-sc-deepseekmath

In [2]:
# Environment setup (shell magics).
# Remove the preinstalled torch, then install vllm strictly from the local
# wheel directory: --no-index disables any package index lookup and
# --find-links points pip at the attached wheel dataset.
# NOTE(review): presumably the wheel directory also supplies the torch build
# that vllm depends on — confirm against the dataset contents.
!pip uninstall -y torch -q
!pip install --no-index --find-links=/kaggle/input/vllm-whl -U vllm -q

# keep data in float16 to avoid OOM
# The installed transformers Llama implementation is patched on disk: every
# source line containing "logits = logits.float()" is dropped, so the logits
# are no longer up-cast by that statement.
# The patch is idempotent (a second run finds nothing to remove), but it only
# affects a kernel that has not imported the module yet, so this cell has to
# run before transformers is imported.
file_path = '/opt/conda/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py'
# Read the whole module source as a list of lines.
with open(file_path, 'r') as file:
    file_contents = file.readlines()
# Keep every line except the float up-cast.
file_contents = [line for line in file_contents if "logits = logits.float()" not in line]
# Overwrite the installed file with the filtered source.
with open(file_path, 'w') as file:
    file.writelines(file_contents)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2024.3.1 which is incompatible.
dask-cuda 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.1.4 which is incompatible.
dask-cuda 23.8.0 requires pynvml<11.5,>=11.0.0, but you have pynvml 11.5.0 which is incompatible.
raft-dask 23.8.0 requires dask==2023.7.1, but you have dask 2024.3.1 which is incompatible.[0m[31m
[0m

In [3]:
from vllm import LLM, SamplingParams
import pandas as pd
from tqdm import tqdm
import gc
import re
import sys
import subprocess
from collections import defaultdict, Counter
import numpy as np
from transformers import (AutoModelForCausalLM,
    AutoTokenizer,
    set_seed)
import torch
import math

# Generator: the DeepSeekMath checkpoint served through vLLM.
# The engine log printed by this cell reports ~12.87 GB of weights and only
# 177 GPU KV-cache blocks, so the memory-related settings below are tight.
llm = LLM(model="/kaggle/input/deepseek-math",
          dtype='half',  # the engine log reports dtype=torch.float16
          enforce_eager=True,  # NOTE(review): per vLLM docs this disables CUDA graph capture — confirm
          gpu_memory_utilization=0.99,  # NOTE(review): fraction of GPU memory vLLM may claim — confirm
          swap_space=4,  # NOTE(review): CPU swap space in GiB per vLLM docs — confirm
          max_model_len=2048,  # the engine log reports max_seq_len=2048
          kv_cache_dtype="fp8_e5m2",  # fp8 KV cache: smaller footprint, possible slight accuracy drop (see engine log)
          tensor_parallel_size=1)

# Tokenizer of the generator; later cells use it for the chat template and EOS token.
tokenizer = llm.get_tokenizer()

# Process reward model (PRM) setup.
# The PRM is queried at every step-tag position; the logits of the "good" and
# "bad" tokens at those positions are turned into a step score by eval_prm.
good_token = '+'
bad_token = '-'
step_tag = 'ки'

prm_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/math-shepherd-mistral-7b-prm')
# Token ids of the good/bad tokens, in that order. The first encoded id is
# dropped — presumably the BOS token the tokenizer prepends; confirm.
prm_candidate_tokens = prm_tokenizer.encode(f"{good_token} {bad_token}")[1:] # [648, 387]
# Id of the last token produced when encoding the step tag.
step_tag_id = prm_tokenizer.encode(f"{step_tag}")[-1] # 12902
# PRM weights in float16, switched to eval mode.
# NOTE(review): device_map="balanced_low_0" presumably spreads the layers over
# the available GPUs while keeping GPU 0 (used by vLLM) lightly loaded —
# confirm against the Accelerate documentation.
prm_model = AutoModelForCausalLM.from_pretrained('/kaggle/input/math-shepherd-mistral-7b-prm',
                                                 torch_dtype=torch.float16,
                                                 device_map="balanced_low_0").eval()

2024-04-28 19:31:49,928	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 04-28 19:31:51 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 04-28 19:31:51 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='/kaggle/input/deepseek-math', tokenizer='/kaggle/input/deepseek-math', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-28 19:31:53 selector.py:40] Cannot use FlashAttention backend for Volta and Turing GPUs.
INFO 04-28 19:31:53 selector.py:25] Using XFormers backend.
INFO 04-28 19:33:19 model_runner.py:104] Loading model weights took 12.8725 GB
INFO 04-28 19:33:21 gpu_executor.py:94] # GPU blocks: 177, # CPU blocks: 1092


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [4]:
# Competition API: create the environment and obtain the test iterator.
# The inference loop further down unpacks each item of `iter_test` as a
# (test, sample_submission) pair and answers it with `env.predict`.
import aimo
env = aimo.make_env()
iter_test = env.iter_test()

In [5]:
def eval_prm(candidates):
    """Score candidate solutions with the process reward model (PRM).

    For each candidate the PRM logits are restricted to the two candidate
    tokens (``prm_candidate_tokens``: good token first, bad token second) and
    soft-maxed over that pair. The probability of the "good" token is read at
    every position holding the step tag, and the minimum over those positions
    is the candidate's score, i.e. a solution is as good as its weakest step.

    Args:
        candidates: sequence of strings already annotated with step tags
            (as produced by ``process_string``).

    Returns:
        list[float]: one score per candidate, in input order. A candidate in
        which no step tag is found is scored 0.0 instead of raising on the
        empty reduction.
    """
    step_scores = []
    # Inference only: no autograd bookkeeping for any of the forward passes.
    with torch.no_grad():
        for candidate in candidates:
            # NOTE(review): inputs are placed on cuda:1, presumably the device
            # holding the first PRM layers under device_map="balanced_low_0" — confirm.
            input_ids = prm_tokenizer.encode(candidate, return_tensors="pt").to("cuda:1")
            logits = prm_model(input_ids).logits[:, :, prm_candidate_tokens]  # b,l,C
            good_prob = logits.softmax(dim=-1)[:, :, 0]
            # Use the tokenizer-derived id rather than a hard-coded constant so
            # the mask stays in sync with `step_tag`.
            at_step_tags = good_prob[input_ids == step_tag_id]
            if at_step_tags.numel() == 0:
                # No step tag present: .min() on an empty tensor would raise.
                step_scores.append(0.0)
                continue
            step_scores.append(at_step_tags.min().item())
    return step_scores

In [6]:
# Generation stops at the generator's EOS token; '</s>' is the fallback when
# no tokenizer or no EOS token is available.
if tokenizer is not None and tokenizer.eos_token is not None:
    stop_words = [tokenizer.eos_token]
else:
    stop_words = ['</s>']
# stop_words.append("\n")

# Sampling settings used for every llm.generate call in this notebook.
sampling_params = SamplingParams(
    stop=stop_words,
    min_tokens=32,
    max_tokens=2048,
    temperature=1,
)

# Instruction appended to every problem statement before it is sent to the
# generator. parse_strict later relies on the model following it (final answer
# in \boxed{}, integer in [0, 999]).
cot_instruction = "\nYou are an expert at mathematical reasoning. Please reason step by step, and put your final answer within \\boxed{}. The answer should be an integer between 0 and 999."


# Search / sampling settings and bookkeeping lists.
# Only `samples` is read by the code visible in this notebook (it sets how many
# completions are requested per problem in the inference loop).
# NOTE(review): the remaining names are not referenced by any visible code and
# look like leftovers from a tree-search variant — confirm before removing.
n = 5 # beams
samples = 35  # completions sampled per problem
max_depth = 24
overlap_threshold = 0.6
all_prompts = []
total_paths = []
total_answers = []

def is_integer(num):
    """Return True for an ``int``, or for a ``float`` holding a whole value.

    Any other type (str, None, tuple, ...) yields False.
    """
    if isinstance(num, int):
        return True
    if isinstance(num, float):
        return num.is_integer()
    return False
    
def is_between_0_and_999(num):
    """Tell whether ``num`` lies in the closed interval [0, 999]."""
    if not 0 <= num:
        return False
    return num <= 999

def prm_prompt(text, current_level):
    """Format one reasoning line for the PRM.

    The line is prefixed with ``Step <current_level>:`` and suffixed with a
    space plus the step tag, the position at which the PRM is scored.
    """
    prefix = "Step " + str(current_level) + ":"
    return prefix + text + ' ки'

def process_string(long_string, threshold=10):
    """Annotate a completion with step markers for the PRM.

    The text is split on newlines. Every line longer than ``threshold``
    characters is treated as a reasoning step and rewritten by ``prm_prompt``
    with a running step number; shorter lines are kept unchanged.

    Returns:
        tuple: ``(annotated_text, next_step)`` where ``annotated_text`` is the
        lines re-joined with newlines and ``next_step`` is one more than the
        number of lines that were tagged (1 when nothing was tagged).
    """
    next_step = 1
    tagged_lines = []
    for line in long_string.split("\n"):
        if len(line) <= threshold:
            # Too short to count as a step: pass through untouched.
            tagged_lines.append(line)
            continue
        tagged_lines.append(prm_prompt(line, next_step))
        next_step += 1
    return "\n".join(tagged_lines), next_step
    
import re
def extract_number(text):
    """Pull the answer string that follows "The answer is" out of ``text``.

    Three formats are tried in order and the first one that matches wins:
    a ``\\boxed{...}`` expression, a number wrapped in ``$...$``, and a bare
    number. The captured group is returned as a string; when nothing matches
    the sentinel ``'parse err'`` is returned.
    """
    answer_regexes = (
        r'The answer is.*\\boxed\{(.*?)\}',
        r"The answer is[:\s]*\$([0-9]+)\$",
        r"The answer is[:\s]*([0-9]+)",
    )
    # Lazy generator: later patterns are only searched if earlier ones fail.
    hits = (re.search(regex, text) for regex in answer_regexes)
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return 'parse err'
    return first_hit.group(1)

def parse_strict(text):
    """Parse the final answer from the last line of a model completion.

    Only the last line of ``text`` is inspected. The answer string found by
    ``extract_number`` is evaluated and accepted only if it is an integer
    value in [0, 999].

    Args:
        text: full completion text produced by the generator.

    Returns:
        The answer as an ``int``, or the sentinel string ``'parse err'`` when
        nothing could be extracted, evaluation failed, or the value is not an
        integer in range.
    """
    try:
        extracted = extract_number(text.split('\n')[-1])
        if extracted == 'parse err':
            return 'parse err'
        # SECURITY NOTE(review): eval() executes model-generated text as Python.
        # It is kept so that arithmetic answers such as "2**3" or "3.0" still
        # parse, but it should be replaced by a restricted evaluator if this
        # code ever handles text from an untrusted source.
        answer = eval(extracted)
        if is_integer(answer) and is_between_0_and_999(answer):
            return int(answer)
        return 'parse err'
    except Exception:
        # Best-effort parsing: any ordinary failure means "no usable answer".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return 'parse err'

def tot_agg(completed_paths, level_weight=0.00108, default_answer=37):
    """Pick the final answer among scored completions.

    Each path is ranked by ``score + level**2 * level_weight``, so longer
    solutions (higher step count) get a quadratic bonus on top of their PRM
    score. The answer of the best-ranked path is returned; on ties the
    earliest path wins.

    Args:
        completed_paths: sequence of ``(answer, score, current_level)`` tuples.
        level_weight: multiplier of the squared level in the ranking value.
        default_answer: value returned when ``completed_paths`` is empty.

    Returns:
        The ``answer`` element of the best path, or ``default_answer``.
    """
    if not completed_paths:
        return default_answer  # nothing parsed for this problem

    def _path_value(path):
        # Ranking value of one (answer, score, level) tuple.
        return path[1] + path[2] ** 2 * level_weight

    return max(completed_paths, key=_path_value)[0]


# Main inference loop: one iteration per test problem served by the
# competition API. Each iteration samples `samples` completions, keeps those
# whose last line parses to a valid answer, scores them with the PRM and
# submits the answer chosen by tot_agg.
for test, sample_submission in iter_test:
    problem = test['problem']

    # Build the chat-formatted prompt: problem statement plus the CoT instruction.
    # NOTE(review): apply_chat_template is called without add_generation_prompt;
    # confirm the resulting prompt ends where the model is expected to start answering.
    messages = [{"role": "user","content": problem + cot_instruction}]
    base_prompt = tokenizer.apply_chat_template(messages,tokenize=False)
    # One batched call: the same prompt repeated `samples` times.
    batch_responses = llm.generate([base_prompt]*samples, sampling_params)
    # `outs` and `inputs` are appended together, so they stay index-aligned.
    outs = []
    inputs  = []
    for o in batch_responses:
        text = o.outputs[0].text
        answer = parse_strict(text)
        # Completions without a parsable answer are dropped before PRM scoring.
        if answer != 'parse err':
            text, level = process_string(text) # add special token for PRM
            outs.append((answer,level))
            inputs.append(text)
    # One PRM score per kept completion, in the same order as `outs`.
    scores = eval_prm(inputs)
    # When nothing was kept, `scores` is empty and tot_agg returns its fallback answer.
    sample_submission['answer'] = tot_agg([(answer,score,level) for score,(answer,level) in zip(scores,outs)])

    # Submit the prediction for this problem.
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Processed prompts: 100%|██████████| 35/35 [02:40<00:00,  4.59s/it]
Processed prompts: 100%|██████████| 35/35 [00:27<00:00,  1.29it/s]
Processed prompts: 100%|██████████| 35/35 [00:45<00:00,  1.30s/it]


In [7]:
# total_paths
# len(set(current_level_nodes)),len(current_level_nodes),len(set(prm_inputs)),len(prm_inputs)