In [1]:
%%capture
%load_ext autoreload
%autoreload 2
%cd /home/ubuntu/projects/hyper-sloth

In [2]:
from fastcore.all import *
from speedy_utils.all import *
from llm_utils import *

#### Create a LLM model

In [8]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "01"  # Choose any GPU you want to use
from vllm.lora.request import LoRARequest
from vllm import LLM, SamplingParams
import torch

if "llm" in locals():
    del llm  #
    torch.cuda.empty_cache()
    
llm = LLM(
    # model="outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH-merge",
    model="Qwen/Qwen2.5-1.5B-Instruct",
    tensor_parallel_size=1,
    task="generate",
    enforce_eager=True,
    dtype=torch.bfloat16,
    max_model_len=16384,
    enable_lora=True,
    # quantization="bitsandbytes", load_format="bitsandbytes",gpu_memory_utilization=0.95
)

tokenizer = llm.get_tokenizer()

from datasets import load_dataset

# Load the GSM8K dataset
gsm8k = load_dataset("gsm8k", "main")
test = gsm8k["test"]


INFO 03-16 17:00:46 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocesso

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-16 17:00:49 model_runner.py:1115] Loading model weights took 2.8787 GB
INFO 03-16 17:00:49 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-16 17:00:51 worker.py:267] Memory profiling takes 1.59 seconds
INFO 03-16 17:00:51 worker.py:267] the current vLLM instance can use total_gpu_memory (23.64GiB) x gpu_memory_utilization (0.90) = 21.28GiB
INFO 03-16 17:00:51 worker.py:267] model weights take 2.88GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.42GiB; the rest of the memory reserved for KV Cache is 16.98GiB.
INFO 03-16 17:00:51 executor_base.py:111] # cuda blocks: 39731, # CPU blocks: 9362
INFO 03-16 17:00:51 executor_base.py:116] Maximum concurrency for 16384 tokens per request: 38.80x
INFO 03-16 17:00:51 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 2.61 seconds


In [None]:
# Prepare prompts for GSM8K evaluation
all_questions = [item["question"] for item in test][:100]
standardized_prompts = [
    tokenizer.apply_chat_template(
        [
            {
                "role": "user",
                "content": f"{question}\nSolve step by step and put your final numerical answer inside \\boxed{{}}",
            }
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    for question in all_questions
]

# Set sampling parameters for deterministic generation
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    top_k=64,
    max_tokens=10000,
)
# Generate responses for all questions
outputs = llm.generate(
    standardized_prompts,
    sampling_params,
    lora_request=LoRARequest(
        "math", 1, "./outputs/loras/qwen1.5-openr1/checkpoint-732/"
    ),
)
all_outputs = [output.outputs[0].text for output in outputs]

Processed prompts:  60%|██████    | 60/100 [00:59<00:54,  1.35s/it, est. speed input: 97.95 toks/s, output: 831.39 toks/s] 

In [None]:
def get_final_output(response):
    try:
        return int(response.split("\\boxed{")[1].split("}")[0])
    except:
        return None

In [6]:
final_outputs = [get_final_output(response) for response in all_outputs]
accs = []
num_error = 0
for i, gt in enumerate(test):
    if i >= len(final_outputs):
        break
    pred = final_outputs[i]
    try:
        num = gt['answer'].split('####')[1]
        num = int(num)
        pred = int(pred)
        accs.append(num == pred)
    except:
        num_error += 1
        accs.append(0)
        pass

In [7]:
np.mean(accs), num_error/len(final_outputs)

(0.65, 0.12)