In [1]:
import argparse
import json
import os
import re

import torch
import transformers
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from qwen_vl_utils import process_vision_info
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams

INFO 04-29 15:51:14 __init__.py:190] Automatically detected platform cuda.


In [2]:
# Run configuration for this evaluation notebook.
# BSZ: number of samples sent to the model per generate() call.
BSZ = 64
# MODEL_PATH: local checkpoint directory; used for the vLLM engine, the
# processor and the tokenizer.
MODEL_PATH = "/data/hf_cache/Qwen2.5-VL-7B-COT-SFT/"
# file_name: free-form run tag that is embedded in the output file name.
file_name = "trial_jupyter_notebook"
# dataset_name: selects the eval_<dataset_name>.json prompt file and is also
# embedded in the output file name.
dataset_name = 'vsibench'

In [3]:
# Build the vLLM engine from the local checkpoint.
# - tensor_parallel_size uses every CUDA device visible to this process.
# - max_model_len is the per-request sequence budget reported by the engine
#   log as max_seq_len.
# - gpu_memory_utilization=0.8 lets vLLM use 80% of total GPU memory (see the
#   worker log emitted below this cell).
# - limit_mm_per_prompt: presumably caps each prompt at one image and one
#   video -- confirm against the vLLM documentation for the installed version.
llm = LLM(
    model=MODEL_PATH,
    tensor_parallel_size=torch.cuda.device_count(),
    max_model_len = 8192,
    gpu_memory_utilization=0.8,
    limit_mm_per_prompt={"image": 1, "video": 1},
)

INFO 04-29 15:51:25 config.py:542] This model supports multiple tasks: {'score', 'classify', 'generate', 'embed', 'reward'}. Defaulting to 'generate'.
INFO 04-29 15:51:25 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/data/hf_cache/Qwen2.5-VL-7B-COT-SFT/', speculative_config=None, tokenizer='/data/hf_cache/Qwen2.5-VL-7B-COT-SFT/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/data/hf_c

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 04-29 15:51:33 model_runner.py:1115] Loading model weights took 15.6270 GB


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


INFO 04-29 15:51:37 worker.py:267] Memory profiling takes 4.46 seconds
INFO 04-29 15:51:37 worker.py:267] the current vLLM instance can use total_gpu_memory (39.50GiB) x gpu_memory_utilization (0.80) = 31.60GiB
INFO 04-29 15:51:37 worker.py:267] model weights take 15.63GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.25GiB; the rest of the memory reserved for KV Cache is 14.63GiB.
INFO 04-29 15:51:38 executor_base.py:110] # CUDA blocks: 17119, # CPU blocks: 4681
INFO 04-29 15:51:38 executor_base.py:115] Maximum concurrency for 8192 tokens per request: 33.44x
INFO 04-29 15:51:43 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` 

Capturing CUDA graph shapes: 100%|████████████████████████████████████████████████████████████████████████████| 35/35 [00:20<00:00,  1.71it/s]

INFO 04-29 15:52:03 model_runner.py:1562] Graph capturing finished in 21 secs, took 0.37 GiB
INFO 04-29 15:52:03 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 30.72 seconds





In [4]:
# Decoding settings shared by every batch: low temperature, a very small
# nucleus (top_p) and at most 1024 generated tokens per sample. No extra stop
# token ids are configured.
# NOTE(review): the output file name says "greedy", but temperature is 0.1
# rather than 0 -- confirm that near-greedy sampling is what is intended.
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.001,
    max_tokens=1024,
    stop_token_ids=[],
)

In [5]:
# Load the processor (used later only for apply_chat_template) and a separate
# tokenizer from the same checkpoint, then make the processor use that
# tokenizer with left-side padding.
# NOTE(review): prompts are handed to vLLM as plain text in this notebook, so
# the padding side presumably has no effect here -- confirm before relying on it.
processor = AutoProcessor.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.padding_side = "left"
processor.tokenizer = tokenizer

In [6]:
# OUTPUT_PATH: JSON file that receives per-sample results (and, at the end,
# the aggregate metrics). PROMPT_PATH: benchmark file with the samples to run.
OUTPUT_PATH = f"./src/r1-v/eval_results/eval_{dataset_name}_{file_name}_greedy_output.json"
PROMPT_PATH = f"./src/r1-v/Evaluation/eval_{dataset_name}.json"

# Load the evaluation samples into `data`. Later cells iterate and slice it,
# so it is expected to be a list of per-sample dicts.
if PROMPT_PATH.endswith('.jsonl'):
    # JSON Lines: one JSON object per line. The list has to be created here;
    # appending to an undefined `data` would raise NameError. Blank lines
    # (e.g. a trailing newline) are skipped instead of failing to parse.
    with open(PROMPT_PATH, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
elif PROMPT_PATH.endswith('.json'):
    with open(PROMPT_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
else:
    raise ValueError("Input file must be .json or .jsonl")

# Prompt wrapper applied to every sample: the question text is substituted for
# {Question}, followed by instructions to reason inside <think>...</think> and
# answer inside <answer>...</answer>. extract_think / extract_answer parse
# exactly these tags from the model output.
# NOTE(review): this text is sent to the model verbatim; it presumably mirrors
# the prompt the checkpoint was tuned with, so do not "fix" its wording without
# confirming.
QUESTION_TEMPLATE = (
    "{Question}\n"
    "Please think about this question as if you were a human pondering deeply. "
    "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
    "It's encouraged to include self-reflection or verification in the reasoning process. "
    "Provide your detailed reasoning between the <think> and </think> tags, and then give your final answer between the <answer> and </answer> tags."
)

# Answer-format suffix appended to the prompt, keyed by the sample's
# "problem_type". A problem type that is not a key here raises KeyError when
# the messages are built.
# Note: reward_fn in this notebook only scores "multiple choice", "numerical"
# and "regression"; "OCR" and "free-form" samples always receive 0.0.
TYPE_TEMPLATE = {
    "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
    "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
    "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
    "free-form": " Please provide your text answer within the <answer> </answer> tags.",
    "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags."
}

In [7]:
# Helper functions
class StopOnSequence(transformers.StoppingCriteria):
    """Stopping criterion that fires when the generated ids end with a target string.

    Every target string is tokenized once at construction time; each call then
    compares the tail of the generated sequence with every token pattern.

    Args:
        target_sequences: iterable of strings whose appearance at the end of
            the generation should stop it. May be empty, in which case the
            criterion never fires.
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``
            that returns a list of token ids.

    Note:
        Only the first row of ``input_ids`` is inspected, so the result is
        meaningful for single-sequence generation only.
    """

    def __init__(self, target_sequences, tokenizer):
        # Encode the strings so we have the exact token-ID patterns to look for.
        self.target_ids = [tokenizer.encode(target_sequence, add_special_tokens=False) for target_sequence in target_sequences]
        self.target_lengths = [len(target_id) for target_id in self.target_ids]
        self._tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs):
        """Return True when row 0 of ``input_ids`` ends with any target pattern.

        ``scores`` and ``kwargs`` are accepted for interface compatibility and
        are not used.
        """
        seq_len = input_ids.shape[1]
        for target_id, target_length in zip(self.target_ids, self.target_lengths):
            # An empty pattern can never be "generated", and a pattern longer
            # than the sequence cannot match; checking per target also avoids
            # calling min() on an empty list when no targets were given.
            if target_length == 0 or seq_len < target_length:
                continue
            # Build the pattern on the same device and with the same dtype as
            # the generated ids so the comparison is well defined.
            target = torch.as_tensor(target_id, device=input_ids.device, dtype=input_ids.dtype)
            if torch.equal(input_ids[0, -target_length:], target):
                return True
        return False

def extract_think(output_str):
    """Return the text inside the first <think>...</think> pair, stripped.

    Whitespace around the enclosed text is removed and the match may span
    multiple lines. An empty string is returned when no complete tag pair is
    present.
    """
    found = re.search(r'<think>\s*(.*?)\s*</think>', output_str, flags=re.DOTALL)
    return found.group(1).strip() if found else ""

def extract_answer(text):
    """Return the text inside the first <answer>...</answer> pair, stripped.

    Whitespace around the enclosed text is removed and the match may span
    multiple lines. An empty string is returned when no complete tag pair is
    present.
    """
    found = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL)
    return found.group(1).strip() if found else ""

def normalize_number(num_str):
    """Parse a numeric string into a float, ignoring comma separators.

    Returns None whenever the value cannot be converted (including when the
    argument is not a string).
    """
    try:
        without_commas = num_str.replace(',', '')
        parsed = float(without_commas)
    except Exception:
        return None
    return parsed
    
def mean_relative_accuracy(pred, target, start=0.5, end=0.95, interval=0.05):
    """Mean relative accuracy of `pred` against `target`.

    The relative error |pred - target| / (|target| + 1e-8) is compared with
    1 - c for every confidence level c in start, start + interval, ..., end.
    The returned float is the fraction of those comparisons in which the error
    is strictly smaller than 1 - c.

    Args:
        pred: prediction, a number or a tensor.
        target: reference value, a number or a tensor.
        start, end, interval: define the grid of confidence levels.
    """
    pred_t = pred if torch.is_tensor(pred) else torch.tensor(pred, dtype=torch.float32)
    target_t = target if torch.is_tensor(target) else torch.tensor(target, dtype=torch.float32)

    # Small constant keeps the division defined when the target is zero.
    relative_error = (pred_t - target_t).abs() / (target_t.abs() + 1e-8)

    # Adding half a step to the upper bound makes `end` itself part of the grid.
    confidence_levels = torch.arange(start, end + interval/2, interval, dtype=torch.float32)

    within_tolerance = relative_error < (1 - confidence_levels)
    return within_tolerance.float().mean().item()


def reward_fn(sample, model_output, question_type):
    """Score one model response against the sample's reference answer.

    The prediction is the content of the <answer> tags in `model_output`, or
    the whole output when no tagged answer is found. The reference is the
    <answer> content of ``sample["solution"]``.

    Scoring by `question_type`:
        "multiple choice": 1.0 on exact (stripped) string equality, else 0.0.
        "numerical": 0.0 unless both answers agree on containing a '.' or ','
            and both parse as numbers; then 1.0 when they are equal after
            rounding to two decimals, else 0.0.
        "regression": mean relative accuracy of the parsed numbers, or 0.0
            when either fails to parse.
        anything else: 0.0.

    Any exception raised while scoring results in 0.0.
    """
    try:
        predicted = extract_answer(model_output) or model_output
        reference = extract_answer(sample.get("solution", ""))

        if question_type == "multiple choice":
            return float(predicted.strip() == reference.strip())

        if question_type == "numerical":
            reference_marked = "." in reference or "," in reference
            predicted_marked = "." in predicted or "," in predicted
            if reference_marked != predicted_marked:
                return 0.0
            reference_value = normalize_number(reference)
            predicted_value = normalize_number(predicted)
            if reference_value is None or predicted_value is None:
                return 0.0
            same_value = round(reference_value, 2) == round(predicted_value, 2)
            return 1.0 if same_value else 0.0

        if question_type == "regression":
            reference_value = normalize_number(reference)
            predicted_value = normalize_number(predicted)
            if reference_value is None or predicted_value is None:
                return 0.0
            return mean_relative_accuracy(predicted_value, reference_value)

        return 0.0
    except Exception:
        return 0.0

In [8]:
# Build one single-turn chat message per sample: a media entry (image or
# video, as named by the sample's "data_type") followed by the text prompt.
messages = []
media_root = os.getcwd() + "/src/r1-v"
for x in data:
    # Multiple-choice questions get their options appended, one per line.
    if x["problem_type"] == 'multiple choice':
        question = x['problem'] + "Options:\n" + "".join([op + "\n" for op in x["options"]])
    else:
        question = x['problem']

    # The sample path has its first character dropped before being appended
    # to <cwd>/src/r1-v.
    media_part = {
        "type": x['data_type'],
        x['data_type']: media_root + x['path'][1:]
    }
    text_part = {
        "type": "text",
        "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[x['problem_type']]
    }
    msg = [{"role": "user", "content": [media_part, text_part]}]
    messages.append(msg)
    

# Resume support: when an output file from an earlier run exists, reuse its
# "results" list and continue from the first sample that is not in it yet.
# If the file cannot be read or parsed, report the problem and keep the
# defaults.
final_output, start_idx = [], 0
if os.path.exists(OUTPUT_PATH):
    try:
        with open(OUTPUT_PATH, "r", encoding="utf-8") as f:
            existing = json.load(f)
        final_output = existing.get("results", [])
        start_idx = len(final_output)
        print(f"Resuming from sample index {start_idx}")
    except Exception as e:
        print(f"Error reading existing output file: {e}")

In [9]:
def _write_json_atomic(payload, path):
    """Dump `payload` as JSON to `path` without exposing a half-written file.

    The data goes to a sibling ``.tmp`` file first and is then moved over
    `path` with ``os.replace``, so interrupting the notebook in the middle of a
    write cannot truncate the checkpoint that the resume logic reads back.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


# Main evaluation loop: generate answers batch by batch, score every sample,
# and checkpoint all results collected so far after each batch.
for i in tqdm(range(start_idx, len(messages), BSZ), desc="Processing batches"):
    batch_messages = messages[i:i + BSZ]
    batch_samples = data[i:i + BSZ]

    prompts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]

    try:
        image_inputs, video_inputs, video_kwargs = process_vision_info(batch_messages, return_video_kwargs=True)

        # image_inputs / video_inputs are consumed in order, one entry per
        # sample of the matching modality, so each modality keeps its own cursor.
        image_idx = 0
        video_idx = 0
        llm_inputs = []

        for idx, prompt in enumerate(prompts):
            mm_type = batch_messages[idx][0]['content'][0]['type']
            sample_mm_data = {}
            sample_video_kw = {}
            if mm_type == 'image':
                sample_mm_data["image"] = image_inputs[image_idx]
                image_idx += 1
            elif mm_type == 'video':
                sample_mm_data["video"] = video_inputs[video_idx]
                # video_kwargs holds per-video values; pick this video's entry.
                for key, value in video_kwargs.items():
                    sample_video_kw[key] = value[video_idx]
                video_idx += 1

            llm_inputs.append({
                "prompt": prompt,
                "multi_modal_data": sample_mm_data,
                "mm_processor_kwargs": sample_video_kw,
            })

        outputs = llm.generate(llm_inputs, sampling_params=sampling_params)
        batch_output_text = [out.outputs[0].text for out in outputs]

    except Exception as e:
        # Best effort: a failure anywhere in the batch marks every sample of
        # that batch as an error (reward 0.0). These entries are saved like
        # any other result, so a resumed run will not retry them. The
        # exception itself is reported so the cause is not lost.
        print(f"error: {data[i]['path']}: {e!r}")
        batch_output_text = ['<answer>error</answer>'] * len(batch_messages)

    for sample, model_output in zip(batch_samples, batch_output_text):
        think_chain = extract_think(model_output)
        final_ans = extract_answer(model_output)
        if final_ans == "":
            # No tagged answer: fall back to the raw model output.
            final_ans = model_output
        sample["output"] = model_output
        sample["prediction"] = final_ans
        q_type = sample.get("problem_type", "")
        sample["reward"] = reward_fn(sample, model_output, q_type)
        sample['correct'] = sample["reward"] == 1.0
        if think_chain:
            sample["process"] = f"<think>{think_chain}</think>"
        final_output.append(sample)

    try:
        _write_json_atomic({"results": final_output}, OUTPUT_PATH)
        print(f"Processed batch {(i - start_idx)//BSZ + 1}, saved {len(final_output)} samples.")
    except Exception as e:
        print(f"Error writing to output file: {e}")

# Aggregate over everything in final_output -- including results restored from
# an earlier run -- so the reported metrics always describe the saved results
# and not only the samples processed in this session.
mean_acc = []
mean_mra = []
for scored in final_output:
    if "reward" not in scored:
        continue
    if scored.get("problem_type") != 'regression':
        mean_acc.append(scored["reward"])
    else:
        mean_mra.append(scored["reward"])

# Empty groups keep the 0.0 default; the mean of an empty tensor would be NaN.
final_acc = {'mean_acc': 0.0, 'mean_mra': 0.0}
if mean_acc:
    final_acc['mean_acc'] = torch.tensor(mean_acc).mean().item()
if mean_mra:
    final_acc['mean_mra'] = torch.tensor(mean_mra).mean().item()

try:
    _write_json_atomic({"results": final_output, "final_acc": [final_acc]}, OUTPUT_PATH)
    print(f"Final accuracy saved to {OUTPUT_PATH}")
except Exception as e:
    print(f"Error writing final accuracy to output file: {e}")

print(f"Results saved to {OUTPUT_PATH}")

Processing batches:   0%|                                                                                              | 0/81 [00:00<?, ?it/s]qwen-vl-utils using decord to read video.

Processed prompts:   0%|                                           | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts:   2%|▌                                 | 1/64 [00:13<13:57, 13.29s/it, est. speed input: 90.57 toks/s, output: 8.80 toks/s][A
Processed prompts:   5%|█▌                              | 3/64 [00:13<03:32,  3.49s/it, est. speed input: 269.53 toks/s, output: 26.64 toks/s][A
Processed prompts:   8%|██▌                             | 5/64 [00:13<01:42,  1.74s/it, est. speed input: 443.10 toks/s, output: 45.12 toks/s][A
Processed prompts:  11%|███▌                            | 7/64 [00:13<00:58,  1.03s/it, est. speed input: 615.59 toks/s, output: 63.84 toks/s][A
Processed prompts:  17%|█████▏                        | 11/64 [00:13<00:25,  2.08it/s

Processed batch 1, saved 64 samples.



Processed prompts:   0%|                                           | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts:   2%|▌                                 | 1/64 [00:12<13:36, 12.96s/it, est. speed input: 92.90 toks/s, output: 8.26 toks/s][A
Processed prompts:   3%|█                               | 2/64 [00:13<05:36,  5.43s/it, est. speed input: 183.52 toks/s, output: 16.77 toks/s][A
Processed prompts:   5%|█▌                              | 3/64 [00:13<03:05,  3.04s/it, est. speed input: 271.43 toks/s, output: 25.55 toks/s][A
Processed prompts:   8%|██▌                             | 5/64 [00:13<01:19,  1.35s/it, est. speed input: 448.80 toks/s, output: 43.61 toks/s][A
Processed prompts:  11%|███▌                            | 7/64 [00:13<00:44,  1.27it/s, est. speed input: 619.84 toks/s, output: 62.22 toks/s][A
Processed prompts:  22%|██████▎                      | 14/64 [00:13<00:12,  3.96it/s, est. speed input: 1230.62 toks/s, out

Processed batch 2, saved 128 samples.


Processing batches:   2%|██                                                                                 | 2/81 [05:32<3:39:04, 166.39s/it]


KeyboardInterrupt: 