In [2]:
%%capture
%load_ext autoreload
%autoreload 2
%cd /home/ubuntu/projects/hyper-sloth

In [3]:
from speedy_utils.all import *
from llm_utils import *
from fastcore.all import *



#### Create a LLM model

In [4]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Choose any GPU you want to use
from vllm.lora.request import LoRARequest
from vllm import LLM, SamplingParams
import torch
# setup logger
import logging
from os import pipe

if "llm" in locals():
    del llm  #
    torch.cuda.empty_cache()
    
llm = LLM(
    model="unsloth/Qwen2.5-1.5B-Instruct",
    tensor_parallel_size=1,
    task="generate",
    enforce_eager=True,
    dtype=torch.bfloat16,
    max_model_len=16384,
    enable_lora=True,
    # quantization="bitsandbytes", load_format="bitsandbytes",gpu_memory_utilization=0.95
)

tokenizer = llm.get_tokenizer()

from datasets import load_dataset

# Load the GSM8K dataset
gsm8k = load_dataset("gsm8k", "main")
test = gsm8k["test"]


INFO 03-15 09:29:17 [__init__.py:256] Automatically detected platform cuda.
INFO 03-15 09:29:27 [arg_utils.py:1770] LORA is experimental on VLLM_USE_V1=1. Falling back to V0 Engine.
INFO 03-15 09:29:27 [llm_engine.py:241] Initializing a V0 LLM engine (v0.7.4.dev473+g9ed6ee92) with config: model='unsloth/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='unsloth/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-15 09:29:32 [loader.py:429] Loading weights took 0.63 seconds
INFO 03-15 09:29:32 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-15 09:29:33 [model_runner.py:1146] Model loading took 2.9227 GB and 2.148398 seconds
INFO 03-15 09:29:39 [worker.py:267] Memory profiling takes 6.17 seconds
INFO 03-15 09:29:39 [worker.py:267] the current vLLM instance can use total_gpu_memory (23.64GiB) x gpu_memory_utilization (0.90) = 21.28GiB
INFO 03-15 09:29:39 [worker.py:267] model weights take 2.92GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 1.43GiB; the rest of the memory reserved for KV Cache is 16.85GiB.
INFO 03-15 09:29:39 [executor_base.py:111] # cuda blocks: 39431, # CPU blocks: 9362
INFO 03-15 09:29:39 [executor_base.py:116] Maximum concurrency for 16384 tokens per request: 38.51x
INFO 03-15 09:29:43 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 10.90 seconds


In [None]:
# Prepare prompts for GSM8K evaluation
all_questions = [item["question"] for item in test]
standardized_prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": f"{question}\nSolve step by step and put your final numerical answer inside \\boxed{{}}"}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for question in all_questions
]

# Set sampling parameters for deterministic generation
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, top_k=64, max_tokens=5000)

# Generate responses for all questions
outputs = llm.generate(standardized_prompts, sampling_params, lora_request=LoRARequest("think_math", 1, 'outputs/lora/Qwen2.5-1.5B-Instruct-LORA-MATH/'))
all_outputs = [output.outputs[0].text for output in outputs]


Processed prompts:   0%|          | 1/1319 [00:05<1:56:43,  5.31s/it, est. speed input: 16.75 toks/s, output: 15.62 toks/s]

In [8]:
print(all_outputs[0])

<think>
Janet's ducks: 16 eggs/day. Breakfast: 3, muffins: 4, sells: 16-3-4=9 eggs/day.

Eggs sold: 9. Price: $2/egg. Total: 9 * $2 = $18.

**Final Answer**
\boxed{18}
</think>


Janet's ducks lay 16 eggs per day. She eats 3 eggs for breakfast and bakes muffins for her friends with 4 eggs. The remaining eggs are sold at the farmers' market.

First, we calculate the number of eggs left after Janet eats and bakes:
\[
16 - 3 - 4 = 9 \text{ eggs}
\]

Next, we determine the revenue from selling these 9 eggs at $2 each:
\[
9 \times 2 = 18 \text{ dollars}
\]

Thus, Janet makes \(\boxed{18}\) dollars every day at the farmers' market.


In [46]:
def get_final_output(response):
    try:
        return int(response.split("\\boxed{")[1].split("}")[0])
    except:
        return None

In [51]:
final_outputs = [get_final_output(response) for response in all_outputs]
accs = []
num_error = 0
for i, gt in enumerate(test):
    pred = final_outputs[i]
    try:
        num = gt['answer'].split('####')[1]
        # num = all_answers.append(int(num))
        # all_answers.append(num)
        num = int(num)
        pred = int(pred)
        accs.append(num == pred)
    except:
        num_error += 1
        accs.append(0)
        pass
        # all_answers.append(None)


In [52]:
np.mean(accs), num_error/len(final_outputs)

(0.689158453373768, 0.11902956785443518)