In [1]:
! kill -9 $(nvidia-smi --query-compute-apps=pid --format=csv,noheader)

In [1]:
! nvidia-smi

Thu Apr 24 08:19:57 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:47:00.0 Off |                    0 |
| N/A   27C    P0              57W / 400W |      0MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import re
import os
import wandb
from datetime import datetime
from typing import Optional, Union
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import DataLoader

from datasets import Dataset
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM, SamplingParams

INFO 04-24 08:20:46 [__init__.py:239] Automatically detected platform cuda.


2025-04-24 08:20:52.066617: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745497252.495211  263942 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745497252.596952  263942 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745497253.689265  263942 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745497253.689295  263942 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745497253.689297  263942 computation_placer.cc:177] computation placer alr

In [3]:
baseline = "Qwen/Qwen2.5-1.5B-Instruct"
finetune = "/network/rit/lab/wang_lab_cs/ptian/output/rft/qwen2.5-1.5b-instruct-grpo-20250423-080507"
# finetune = "grpo_finetuned_model"

In [4]:
class FewShotEvaluator:
    """
    Few-shot evaluator for math reasoning tasks.
    """
    def __init__(self, dataset: Dataset, n_shots: int = 3, device: str = "cuda", batch_size: int = 16) -> None:
        self.dataset = dataset
        self.n_shots = n_shots
        self.device = device
        self.batch_size = batch_size
        self.fewshot_prompt = self.get_fewshot_prompt()

    def get_fewshot_prompt(self) -> str:
        prompt = "Solve these math problems:\n\n"
        for i in range(self.n_shots):
            example = self.dataset[i]
            prompt += f"Question: {example['question']}\nAnswer: {example['answer']}" + "\n\n"
        return prompt

    def preprocess_eval(self, examples: dict) -> dict:
        # Preprocess the example to include the few-shot prompt
        return {
            "prompt": [self.fewshot_prompt + f"Question: {question}\nAnswer:\n" for question in examples["question"]]
        }

    def parse_answer(self, answer: str) -> Optional[str]:
        # Extract the answer from the generated text
        try:
            predicted_answer = re.search(r"#### (-?\d+\.?\d*)", answer).group(1)
        except:
            predicted_answer = None
        return predicted_answer

    # def eval(self, model_path: str, tokenizer: AutoTokenizer, device: str = "cuda", temperature: float = 0.7, top_p: float = 0.95, max_tokens: int = 256) -> float:
    def eval(self, model_path: str, dtype: str = "auto", device: str = "cuda", temperature: float = 0.7, top_p: float = 0.95, max_tokens: int = 256) -> float:
        """
        Evaluate exact match accuracy
        """
        # Load dataset
        eval_dataset = self.dataset.select(range(self.n_shots, len(self.dataset)))
        eval_dataset = eval_dataset.map(self.preprocess_eval, batched=True)
        eval_dataloader = DataLoader(eval_dataset, batch_size=self.batch_size, shuffle=False)

        # Load model
        llm = LLM(model=model_path, dtype=dtype)
        # Shared or individual sampling settings
        sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens)

        correct = 0
        num_questions = 0

        answers = []

        # batch inference
        for _, batch in tqdm(enumerate(eval_dataloader), desc="Eval Inference: ", total=len(eval_dataloader)):
            # inputs = tokenizer(batch["prompt"], return_tensors="pt", max_length=256, padding="max_length", truncation=True).to(device)
            # outputs = model.generate(**inputs, max_new_tokens=256)
            # batch_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            # answers.extend(batch_answers)
            prompts = batch["prompt"]
            outputs = llm.generate(prompts, sampling_params)
            batch_answers = [output.outputs[0].text.strip() for output in outputs]
            answers.extend(batch_answers)
            torch.cuda.empty_cache()

        # text parse for exact match
        for i, (correct_answer, generated_answer) in tqdm(enumerate(zip(eval_dataset['answer'], answers)), desc="Evaluating Exact Match Accuracy: ", total=len(eval_dataset)):
            # # Remove the input tokens from the output for transformers inference
            # generated_answer = generated_answer[len(eval_dataset['prompt'][i]):]

            # Extract final answer
            predicted_answer = self.parse_answer(generated_answer)
            ground_truth = self.parse_answer(correct_answer)

            # Check if the predicted answer matches the ground truth
            if ground_truth:
                num_questions += 1
                if predicted_answer and predicted_answer == ground_truth:
                    correct += 1

        return correct / num_questions if num_questions > 0 else 0

In [5]:
eval_ds = load_dataset("openai/gsm8k", "main", split="test", num_proc=4)
evaluator = FewShotEvaluator(eval_ds, n_shots=3, device="cuda", batch_size=128)

In [10]:
# pretrain evaluation
qem1 = evaluator.eval(baseline, device="cuda")
qem1

INFO 04-22 12:45:10 [config.py:689] This model supports multiple tasks: {'generate', 'embed', 'classify', 'reward', 'score'}. Defaulting to 'generate'.
INFO 04-22 12:45:11 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-22 12:45:11 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_e

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025-04-22 12:45:13,718 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 04-22 12:45:14 [parallel_state.py:959] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 04-22 12:45:14 [cuda.py:221] Using Flash Attention backend on V1 engine.
INFO 04-22 12:45:15 [gpu_model_runner.py:1276] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...
INFO 04-22 12:45:15 [topk_topp_sampler.py:44] Currently, FlashInfer top-p & top-k sampling sampler is disabled because FlashInfer>=v0.2.3 is not backward compatible. Falling back to the PyTorch-native implementation of top-p & top-k sampling.
INFO 04-22 12:45:15 [weight_utils.py:265] Using model weights format ['*.safetensors']
INFO 04-22 12:45:15 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-22 12:45:16 [loader.py:458] Loading weights took 0.71 seconds
INFO 04-22 12:45:16 [gpu_model_runner.py:1291] Model loading took 2.8876 GiB and 1.129856 seconds
INFO 04-22 12:45:23 [backends.py:416] Using cache directory: /network/rit/home/ptian_wang_lab_cs/.cache/vllm/torch_compile_cache/d3bcff9216/rank_0_0 for vLLM's torch.compile
INFO 04-22 12:45:23 [backends.py:426] Dynamo bytecode transform time: 7.41 s
INFO 04-22 12:45:27 [backends.py:132] Cache the graph of shape None for later use
INFO 04-22 12:45:47 [backends.py:144] Compiling a graph for general shape takes 23.06 s
INFO 04-22 12:45:58 [monitor.py:33] torch.compile takes 30.48 s in total
INFO 04-22 12:45:59 [kv_cache_utils.py:634] GPU KV cache size: 2,482,944 tokens
INFO 04-22 12:45:59 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 75.77x
INFO 04-22 12:46:29 [gpu_model_runner.py:1626] Graph capturing finished in 30 secs, took 1.47 GiB
INFO 04-22 12:46:29 [core.py:163] init engine (profile, cr

Eval Inference:   0%|          | 0/11 [00:00<?, ?it/s]

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                            | 0/128 [00:00<?, ?it/s, e…

Processed prompts:   0%|                                                             | 0/36 [00:00<?, ?it/s, e…

Evaluating Exact Match Accuracy:   0%|          | 0/1316 [00:00<?, ?it/s]

0.5851063829787234

In [7]:
torch.cuda.empty_cache()

In [6]:
# finetune evaluation
qem2 = evaluator.eval(finetune, device="cuda")
qem2

INFO 04-24 08:21:33 [config.py:689] This model supports multiple tasks: {'generate', 'classify', 'embed', 'reward', 'score'}. Defaulting to 'generate'.
INFO 04-24 08:21:33 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-24 08:21:36 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='/network/rit/lab/wang_lab_cs/ptian/output/rft/qwen2.5-1.5b-instruct-grpo-20250423-080507', speculative_config=None, tokenizer='/network/rit/lab/wang_lab_cs/ptian/output/rft/qwen2.5-1.5b-instruct-grpo-20250423-080507', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decodin

2025-04-24 08:21:38,844 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 04-24 08:21:43 [parallel_state.py:959] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 04-24 08:21:43 [cuda.py:221] Using Flash Attention backend on V1 engine.
INFO 04-24 08:21:43 [gpu_model_runner.py:1276] Starting to load model /network/rit/lab/wang_lab_cs/ptian/output/rft/qwen2.5-1.5b-instruct-grpo-20250423-080507...
INFO 04-24 08:21:44 [topk_topp_sampler.py:44] Currently, FlashInfer top-p & top-k sampling sampler is disabled because FlashInfer>=v0.2.3 is not backward compatible. Falling back to the PyTorch-native implementation of top-p & top-k sampling.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-24 08:21:54 [loader.py:458] Loading weights took 10.62 seconds
INFO 04-24 08:21:55 [gpu_model_runner.py:1291] Model loading took 2.8876 GiB and 11.600211 seconds
INFO 04-24 08:22:13 [backends.py:416] Using cache directory: /network/rit/home/ptian_wang_lab_cs/.cache/vllm/torch_compile_cache/5bef00c68c/rank_0_0 for vLLM's torch.compile
INFO 04-24 08:22:13 [backends.py:426] Dynamo bytecode transform time: 17.81 s
INFO 04-24 08:22:17 [backends.py:132] Cache the graph of shape None for later use
INFO 04-24 08:22:38 [backends.py:144] Compiling a graph for general shape takes 24.81 s
INFO 04-24 08:22:50 [monitor.py:33] torch.compile takes 42.62 s in total
INFO 04-24 08:22:52 [kv_cache_utils.py:634] GPU KV cache size: 2,482,944 tokens
INFO 04-24 08:22:52 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 75.77x
INFO 04-24 08:23:22 [gpu_model_runner.py:1626] Graph capturing finished in 30 secs, took 1.47 GiB
INFO 04-24 08:23:22 [core.py:163] init engine (profile,

Eval Inference:   0%|          | 0/11 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|           | 0/36 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Evaluating Exact Match Accuracy:   0%|          | 0/1316 [00:00<?, ?it/s]

0.5965045592705167