In [1]:

import argparse
import json
import logging
import os
from pathlib import Path
from typing import Dict, List, Optional

from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from tqdm import tqdm
from transformers import AutoTokenizer
from vllm import LLM, RequestOutput, SamplingParams

# Root logger: INFO and above, every record prefixed with a timestamp and its level.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Logger shared by the helper functions defined in this notebook.
logger = logging.getLogger(__name__)

# python-dotenv: read a .env file (if one is found) into the process environment.
load_dotenv()

# Few-shot prompt: 8 worked Q/A exemplars, each answer ending in "The answer is N.",
# followed by a `{question}` placeholder that is filled in with str.format().
# The text ends on "A:" so the model continues with the answer to the new question.
FEW_SHOT_PROMPT = """Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
A: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39.

Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
A: Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8.

Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
A: Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9.

Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
A: There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29.

Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?
A: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.

Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
A: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.

Q: {question}
A:"""

# Zero-shot prompt: the bare question followed by an instruction to box the final answer.
# The doubled braces `{{}}` are str.format() escapes and come out as a literal `{}`,
# so the rendered instruction reads "... within \boxed{}."
ZERO_SHOT_PROMPT = "{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}."

def extract_ground_truth(text):
    """Pull the final answer out of a GSM8K reference solution.

    GSM8K reference answers have the form "<explanation> #### <answer>".

    Args:
        text: Reference answer text, normally containing a '####' marker.

    Returns:
        The text after the last '####', with surrounding whitespace removed.
        If no marker is present, the whole (stripped) text is returned.
    """
    # rpartition splits on the LAST marker; without a marker the full text
    # ends up in the third slot, matching split('####')[-1].
    _, _, answer = text.rpartition('####')
    return answer.strip()


def load_gsm8k_data(split: str = 'test') -> "Dataset":
    """Load one split of the locally cached GSM8K dataset.

    Args:
        split: Dataset split to load ('train' or 'test')

    Returns:
        The dataset object produced by ``datasets.load_dataset`` for the
        requested split. It is returned unchanged (not converted to a list of
        dicts) because ``run_inference`` calls ``.select()`` on it; iterating
        it yields the per-example records from which ``item["question"]`` is
        read.
    """
    # NOTE: this path is relative to the current working directory of the
    # kernel, not to the notebook file.
    cache_dir = os.path.join("../", '.hf_cache')

    logger.info(f"Loading GSM8K dataset (split: {split})...")
    dataset = load_dataset('gsm8k-local', 'main', split=split, cache_dir=cache_dir)
    logger.info(f"Loaded {len(dataset)} examples from GSM8K {split} set")
    return dataset


def format_prompts(
    questions: List[str],
    use_few_shot: bool = False,
    use_chat_template: bool = True,
    tokenizer: Optional[AutoTokenizer] = None,
    system_message: Optional[str] = None,
    enable_thinking: bool = False
) -> List[str]:
    """Format GSM8K questions into prompts.

    Each question is inserted into FEW_SHOT_PROMPT or ZERO_SHOT_PROMPT. When
    ``use_chat_template`` is True and a tokenizer is given, the result is
    wrapped as a user message (preceded by an optional system message) and
    rendered with ``tokenizer.apply_chat_template``. If rendering raises, the
    raw prompt is used for that question instead.

    Args:
        questions: List of GSM8K questions
        use_few_shot: Whether to use few-shot prompting (8 examples)
        use_chat_template: Whether to apply chat template
        tokenizer: HuggingFace tokenizer for chat template
        system_message: Optional system message for chat template
        enable_thinking: Whether to enable thinking mode for Qwen3 models (default: False)

    Returns:
        List of formatted prompts ready for inference, in the same order as
        ``questions``.
    """
    # The template depends only on the flag, so choose it once.
    prompt_template = FEW_SHOT_PROMPT if use_few_shot else ZERO_SHOT_PROMPT

    apply_template = use_chat_template and tokenizer is not None
    if use_chat_template and tokenizer is None:
        # Previously a silent fallback; make it visible to the caller.
        logger.warning("use_chat_template=True but no tokenizer was provided; using raw prompts")

    formatted_prompts = []
    template_failures = 0

    for question in questions:
        formatted_prompt = prompt_template.format(question=question)

        if apply_template:
            messages = []
            if system_message is not None:
                messages.append({"role": "system", "content": system_message})
            messages.append({"role": "user", "content": formatted_prompt})

            try:
                formatted_prompt = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True,
                    enable_thinking=enable_thinking,
                )
            except Exception as e:
                # Deliberate best-effort: keep the raw prompt for this question.
                # Only the first failure is logged in full so that a tokenizer
                # without a chat template does not emit one warning per question.
                if template_failures == 0:
                    logger.warning(f"Error applying chat template: {e}, use raw prompt instead")
                template_failures += 1

        formatted_prompts.append(formatted_prompt)

    if template_failures > 1:
        logger.warning(
            f"Chat template could not be applied to {template_failures} of "
            f"{len(formatted_prompts)} prompts; raw prompts were used for those"
        )

    return formatted_prompts

INFO 11-13 20:30:33 [__init__.py:216] Automatically detected platform cuda.


In [2]:
def run_inference(
    model_path: str,
    tensor_parallel_size: int = 1,
    max_tokens: int = 512,
    temperature: float = 0.0,
    top_p: float = 1.0,
    top_k: int = -1,
    system_message: Optional[str] = None,
    gpu_memory_utilization: float = 0.9,
    use_few_shot: bool = False,
    use_chat_template: bool = True,
    enable_thinking: bool = False,
    n_rollouts: int = 1,
    split: str = 'test',
    n_queries: int = -1,
) -> List["RequestOutput"]:
    """Run VLLM inference on GSM8K dataset.

    Loads the requested split, optionally truncates it to the first
    ``n_queries`` examples, formats the questions into prompts and generates
    ``n_rollouts`` completions per prompt with vLLM.

    Args:
        model_path: Path to the trained model
        tensor_parallel_size: Number of GPUs for tensor parallelism
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature (0.0 for greedy decoding). If it is
            0.0 while ``n_rollouts > 1``, 0.6 is used instead (with a warning)
            so that the rollouts are not all identical.
        top_p: Nucleus sampling parameter
        top_k: Top-k sampling parameter (-1 to disable)
        system_message: Optional system message for chat template
        gpu_memory_utilization: GPU memory utilization fraction
        use_few_shot: Whether to use few-shot prompting (8 examples)
        use_chat_template: Whether to apply chat template
        enable_thinking: Whether to enable thinking mode for Qwen3 models (default: False)
        n_rollouts: Number of rollouts (generations) per question (default: 1)
        split: Dataset split to use ('train' or 'test', default: 'test')
        n_queries: Number of queries to use from dataset (-1 for all, default: -1)

    Returns:
        The list returned by ``LLM.generate``: one entry per prompt, whose
        ``.outputs`` attribute holds the completions generated for it.
    """

    # Load GSM8K dataset
    dataset = load_gsm8k_data(split=split)
    total_queries = len(dataset)

    # Limit to n_queries if specified; any non-positive or too-large value
    # means "use everything".
    if 0 < n_queries < total_queries:
        dataset = dataset.select(range(n_queries))
        logger.info(f"Using subset of {n_queries} queries (from {total_queries} total)")
    else:
        logger.info(f"Using full dataset ({total_queries} queries)")

    questions = [item["question"] for item in dataset]

    # Load tokenizer if using chat template
    tokenizer = None
    if use_chat_template:
        logger.info(f"Loading tokenizer from {model_path}")
        # NOTE(review): trust_remote_code=True lets the checkpoint run its own
        # Python code while loading; only point model_path at trusted models.
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Format prompts
    logger.info("Formatting prompts...")
    formatted_prompts = format_prompts(
        questions,
        use_few_shot=use_few_shot,
        use_chat_template=use_chat_template,
        tokenizer=tokenizer,
        system_message=system_message,
        enable_thinking=enable_thinking
    )

    # Greedy decoding would return n_rollouts identical completions, so switch
    # to sampling in that case, and say so instead of overriding silently.
    DEFAULT_TEMPERATURE = 0.6
    effective_temperature = temperature
    if n_rollouts > 1 and temperature == 0.0:
        logger.warning(
            f"temperature=0.0 with n_rollouts={n_rollouts} would give identical rollouts; "
            f"using temperature={DEFAULT_TEMPERATURE} instead"
        )
        effective_temperature = DEFAULT_TEMPERATURE

    # Initialize VLLM
    logger.info("Initializing VLLM engine")
    llm = LLM(
        model=model_path,
        tensor_parallel_size=tensor_parallel_size,
        gpu_memory_utilization=gpu_memory_utilization,
    )
    sampling_params = SamplingParams(
        n=n_rollouts,
        temperature=effective_temperature,
        top_p=top_p,
        top_k=top_k,
        max_tokens=max_tokens,
    )

    # Generate outputs
    outputs = llm.generate(formatted_prompts, sampling_params=sampling_params)

    return outputs


In [11]:
# Sample 8 rollouts for each of the first 20 GSM8K questions (default split)
# using the local Qwen3-0.6B checkpoint.
outputs = run_inference(
    "../Qwen3-0.6B",
    n_queries=20,
    n_rollouts=8,
    temperature=0.6,
    max_tokens=512,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
)

2025-11-13 20:42:10,042 - INFO - Loading GSM8K dataset (split: test)...
Using the latest cached version of the dataset since gsm8k-local couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at ../.hf_cache/gsm8k-local/main/0.0.0/5f13071cc147f4f3 (last modified on Thu Nov 13 00:19:00 2025).
2025-11-13 20:42:10,334 - INFO - Loaded 1319 examples from GSM8K test set
2025-11-13 20:42:10,336 - INFO - Using subset of 20 queries (from 1319 total)
2025-11-13 20:42:10,337 - INFO - Loading tokenizer from ../Qwen3-0.6B
2025-11-13 20:42:10,600 - INFO - Formatting prompts...
2025-11-13 20:42:10,602 - INFO - Initializing VLLM engine


INFO 11-13 20:42:10 [utils.py:328] non-default args: {'disable_log_stats': True, 'model': '../Qwen3-0.6B'}
INFO 11-13 20:42:10 [__init__.py:742] Resolved architecture: Qwen3ForCausalLM
INFO 11-13 20:42:10 [__init__.py:1815] Using max model len 40960
INFO 11-13 20:42:10 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:11 [core.py:654] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:11 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='../Qwen3-0.6B', speculative_config=None, tokenizer='../Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;36m(EngineCore_DP0 pid=397066)[0;0m ERROR 11-13 20:42:12 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8


[W1113 20:42:23.637198514 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W1113 20:42:33.650658973 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:33 [parallel_state.py:1165] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:33 [gpu_model_runner.py:2338] Starting to load model ../Qwen3-0.6B...
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:34 [gpu_model_runner.py:2370] Loading model from scratch...
[1;36m(EngineCore_DP0 pid=397066)[0;0m 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:34 [default_loader.py:268] Loading weights took 0.51 seconds
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:35 [gpu_model_runner.py:2392] Model loading took 1.1201 GiB and 0.768070 seconds
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:39 [backends.py:539] Using cache directory: /home/timho/.cache/vllm/torch_compile_cache/0bb49a1cc0/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:39 [backends.py:550] Dynamo bytecode transform time: 4.13 s
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:41 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.797 s
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:42 [monitor.py:34] torch.compile takes 4.13 s in total
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:43 [gpu_worker.py:298] Available KV cache memory: 17.26 GiB
[1;36m(EngineCore_DP0 pid=3970

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:02<00:00, 24.86it/s]


[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:47 [gpu_model_runner.py:3118] Graph capturing finished in 4 secs, took 0.77 GiB
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:47 [gpu_worker.py:391] Free memory on device (20.79/22.0 GiB) on startup. Desired GPU memory utilization is (0.9, 19.8 GiB). Actual usage is 1.12 GiB for weight, 1.4 GiB for peak activation, 0.01 GiB for non-torch memory, and 0.77 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=17555316531` to fit into requested memory, or `--kv-cache-memory=18615512064` to fully utilize gpu memory. Current kv cache memory in use is 18537107251 bytes.
[1;36m(EngineCore_DP0 pid=397066)[0;0m INFO 11-13 20:42:47 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.63 seconds
INFO 11-13 20:42:48 [llm.py:295] Supported_tasks: ['generate']
INFO 11-13 20:42:48 [__init__.py:36] No IOProcessor plugins requested by the model


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/160 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



In [17]:
# Show every rollout generated for the first question, with a divider between rollouts.
print(
    "\n============\n".join(
        completion.text for completion in outputs[0].outputs
    )
)

We are given the following information about Janet and her daily activities:

- **Janet’s ducks lay 16 eggs per day.**
- **She eats 3 eggs for breakfast each morning.**
- **She bakes muffins for her friends every day, using 4 eggs per muffin.**
- **She sells the remainder at the farmers' market for $2 per fresh duck egg.**

---

### Step-by-step calculation:

#### **1. Total eggs laid per day:**
- 16 eggs/day

#### **2. Eggs eaten for breakfast:**
- 3 eggs

#### **3. Eggs eaten for baking muffins:**
- 4 eggs/muffin × 1 muffin = 4 eggs

#### **4. Total eggs eaten daily:**
- 3 (breakfast) + 4 (baking) = **7 eggs/day**

#### **5. Eggs remaining for market:**
- Total eggs - eggs eaten = $16 - 7 = 9$ eggs/day

#### **6. Money made from eggs at the market:**
- 9 eggs × $2 per egg = **$18**

---

### ✅ Final Answer:

$$
\boxed{18} \text{ dollars}
$$
We are given the following information about Janet:

- **Janet’s ducks lay 16 eggs per day.**
- **She eats 3 eggs for breakfast every morning.**
