In [1]:
from datasets import load_dataset

# Alternative dataset repo, currently disabled:
# dataset = load_dataset("LLDDSS/Awesome_Spatial_VLMs")
# Load the spatial VQA benchmark collection; the result is keyed by benchmark
# name (the next cell prints the keys).
dataset =load_dataset("LLDDSS/Awesome_Spatial_VQA_Benchmarks")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Optional: persist a local copy of the loaded dataset (disabled).
# dataset.save_to_disk("dataset")
# Show which benchmarks are available in the loaded collection.
dataset.keys()

dict_keys(['EgoOrientBench', 'GeoMeter_Real', 'SEED_Bench_Spatial', 'MM_Vet_Spatial', 'CV_Bench', 'Whats_Up', 'SRBench', 'MindCube', 'realworldqa', 'OmniSpatial'])

In [9]:

from PIL import Image
def resize_max_800(image: Image.Image, max_size: int = 800) -> Image.Image:
    """Downscale ``image`` so that its longest side is at most ``max_size`` pixels.

    The aspect ratio is preserved; the shorter side is truncated to an integer
    and clamped to at least 1 pixel so that very elongated images never
    produce an invalid zero-sized target.

    Args:
        image: PIL image to shrink.
        max_size: Upper bound for the longest side in pixels. Defaults to 800,
            which is the limit the function name refers to.

    Returns:
        The very same ``image`` object when its longest side already fits,
        otherwise a new image resampled with ``Image.LANCZOS``.

    Raises:
        ValueError: If ``max_size`` is smaller than 1.
    """
    if max_size < 1:
        raise ValueError(f"max_size must be >= 1, got {max_size}")

    width, height = image.size
    # Longest side already within the limit: return the original untouched.
    if max(width, height) <= max_size:
        return image

    # Pin the longer side to max_size and scale the other one proportionally.
    if width >= height:
        new_width = max_size
        new_height = max(1, int(height * max_size / width))
    else:
        new_height = max_size
        new_width = max(1, int(width * max_size / height))

    return image.resize((new_width, new_height), Image.LANCZOS)

def make_message_prompt(item, image_keys=('image_0', 'image_1', 'image_2', 'image_3')):
    """Build a single-turn chat message (images first, then text) from a dataset row.

    Args:
        item: Mapping for one sample. Must contain ``"prompt"``; may contain
            any of ``image_keys`` holding an image object (or ``None``).
        image_keys: Names of the image columns to look at, in the order the
            images should appear in the message. Defaults to the four columns
            ``image_0`` .. ``image_3``.

    Returns:
        A one-element list ``[{"role": "user", "content": [...]}]`` where
        ``content`` holds one ``{"type": "image", ...}`` entry per available
        image followed by a single ``{"type": "text", ...}`` entry.
    """
    content = []
    for image_key in image_keys:
        # Rows may carry fewer images than image_keys lists: treat a missing
        # column the same way as an explicit None instead of raising KeyError.
        image = item.get(image_key)
        if image is None:
            continue
        # Normalise the colour mode, then cap the longest side to keep the
        # visual input small.
        image = resize_max_800(image.convert("RGB"))
        content.append({
            "type": "image",
            "image": image
        })

    content.append({
        "type": "text",
        "text": item["prompt"]
    })
    return [{
        "role": "user",
        "content": content
    }]

# Smoke test: build a chat message from the first MindCube sample and print it.
item= dataset['MindCube'][0]
message = make_message_prompt(item)
print(message)

from qwen_vl_utils import process_vision_info

# process_vision_info returns a tuple; the printed output below shows
# (list of PIL images, None).
# NOTE(review): the printed images are 476x644 while the message holds 480x640
# images, so the helper apparently resizes them again - confirm.
image_input=process_vision_info(message)
print(image_input )


[{'role': 'user', 'content': [{'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=480x640 at 0x7770B8729150>}, {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=480x640 at 0x7770993EDB50>}, {'type': 'text', 'text': 'Based on these two views showing the same scene: in which direction did I move from the first view to the second view? A. Directly left B. Diagonally forward and right C. Diagonally forward and left D. Directly right'}]}]
([<PIL.Image.Image image mode=RGB size=476x644 at 0x776C7CCF0390>, <PIL.Image.Image image mode=RGB size=476x644 at 0x776C7CCF0590>], None)


In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from qwen_vl_utils import process_vision_info
import argparse
from tqdm import tqdm
import json
import os
from datasets import load_dataset

def make_question_prompt(image, question, options):
    """Compose the text prompt sent to the model for one question.

    Args:
        image: Accepted for call-site symmetry; it is not used when building
            the text.
        question: Question text inserted after ``"Question: "``.
        options: Answer options, formatted on a line of their own.

    Returns:
        A four-line prompt: instruction header, question, options, and the
        answer-format instruction.
    """
    prompt_lines = [
        "Answer the question based on the image:",
        f"Question: {question}",
        f"{options}",
        "Only return the answer (a word or a phrase).",
    ]
    return "\n".join(prompt_lines)

def parse_args(argv=None):
    """Parse the command-line options of the evaluation run.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) keeps the previous behaviour of reading ``sys.argv[1:]``.
            Passing an explicit list lets the function be called from a
            notebook, where ``sys.argv`` holds the kernel's own arguments.

    Returns:
        ``argparse.Namespace`` with ``model_path``, ``output_path`` and
        ``batch_size``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="qwen/Qwen2.5-VL-7B-Instruct",
                        help="Model name or local path passed to from_pretrained.")
    parser.add_argument("--output_path", type=str, required=True,
                        help="Directory that receives one <bench>_results.json per benchmark.")
    parser.add_argument("--batch_size", type=int, default=10,
                        help="Number of samples per generation batch.")
    return parser.parse_args(argv)

def _build_bench_requests(bench, bench_data):
    """Turn every row of one benchmark into a chat message plus its metadata record."""
    messages = []
    id_list = []
    for item in bench_data:
        image = item["image"].convert("RGB")
        prompt = make_question_prompt(
            image=image,
            question=item["question"],
            options=item["options"]
        )
        messages.append([{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ]
        }])
        id_list.append({
            "bench_name": bench,
            "id": item["id"],
            "question": item["question"],
            "options": item["options"],
            "answer": item["GT"],
        })
    return messages, id_list


def _generate_batch(model, processor, batch_messages, device):
    """Run greedy generation for one batch of chat messages and return the stripped answers."""
    text = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in batch_messages
    ]
    image_inputs, _ = process_vision_info(batch_messages)
    inputs = processor(
        text=text,
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move the inputs to wherever the model lives instead of assuming CUDA.
    inputs = inputs.to(device)

    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024, do_sample=False)

    # generate() returns prompt + continuation; drop the prompt part of each row.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    batch_output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return [output_text.strip() for output_text in batch_output_text]


def main():
    """Evaluate the model on every benchmark and write one JSON result file per benchmark.

    Reads ``--model_path``, ``--output_path`` and ``--batch_size`` via
    ``parse_args()``, generates an answer for each sample in batches of
    ``batch_size`` and saves ``<output_path>/<bench>_results.json`` containing
    only that benchmark's samples.
    """
    args = parse_args()
    # Load dataset
    dataset = load_dataset("LLDDSS/Awesome_Spatial_VLMs")
    # Load model
    processor = AutoProcessor.from_pretrained(args.model_path)
    # Batched generation with a decoder-only model needs left padding so every
    # prompt ends right where generation starts (the vLLM cell of this
    # notebook configures its tokenizer the same way).
    processor.tokenizer.padding_side = "left"
    model = AutoModelForVision2Seq.from_pretrained(args.model_path, torch_dtype=torch.float16, use_cache=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Make sure the target directory exists before the first result is written.
    os.makedirs(args.output_path, exist_ok=True)

    for bench in dataset.keys():
        print("Processing bench:", bench)

        # Built fresh for every benchmark: sharing these lists across
        # benchmarks would re-run and re-save all earlier benchmarks' samples.
        messages, id_list = _build_bench_requests(bench, dataset[bench])

        all_outputs = []
        for i in tqdm(range(0, len(messages), args.batch_size)):
            batch_messages = messages[i:i + args.batch_size]
            batch_id_list = id_list[i:i + args.batch_size]

            batch_output_text = _generate_batch(model, processor, batch_messages, device)

            for output_text, item_id in zip(batch_output_text, batch_id_list):
                all_outputs.append({
                    "bench_name": item_id["bench_name"],
                    "id": item_id["id"],
                    "question": item_id["question"],
                    "options": item_id["options"],
                    "GT": item_id["answer"],
                    "result": output_text
                })

        # Save this benchmark's results to its own file
        result_file = os.path.join(args.output_path, f"{bench}_results.json")
        with open(result_file, "w", encoding="utf-8") as f:
            json.dump(all_outputs, f, indent=4)




  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 5 files: 100%|██████████| 5/5 [01:18<00:00, 15.70s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


In [1]:
#
import torch
from vllm import LLM, SamplingParams
from transformers import AutoProcessor, AutoTokenizer
from qwen_vl_utils import process_vision_info, fetch_image
import os

# Restrict this process to GPU index 3.
# NOTE(review): this runs after torch and vllm were imported; it presumably
# still takes effect because CUDA is only queried afterwards (the engine log
# below reports tensor_parallel_size=1) - confirm, or move it above the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# ---------------------
# Basic configuration
# ---------------------
model_path = "inclusionAI/ViLaSR"   # the model to evaluate
device_count = torch.cuda.device_count()

# Initialize the vLLM engine
llm = LLM(
    model=model_path,
    dtype="bfloat16",
    tensor_parallel_size=device_count,
    limit_mm_per_prompt={"image": 10},   # maximum number of images per prompt
    gpu_memory_utilization=0.85,
)

# Processor & Tokenizer
# The tokenizer is loaded separately, switched to left padding, and then
# attached to the processor so both share the same instance.
processor = AutoProcessor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.padding_side = "left"
processor.tokenizer = tokenizer

# ---------------------
# Build the input
# ---------------------
image_path = "temp.png"  # replace with a local image of your own
question = "What is in this image?"

# One system turn plus one user turn carrying the image followed by the question.
messages = [
    {
        "role": "system",
        "content": "You are a helpful vision-language assistant."
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image_path,
                "max_pixels": 256*28*28   # optional, controls image scaling
            },
            {
                "type": "text",
                "text": question
            }
        ]
    }
]

# Convert to model inputs: render the chat template to text and extract the
# vision inputs from the messages.
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# video_inputs and video_kwargs are not used further in this cell.
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
print("Prompt:", prompt)

# A single request carrying both the rendered prompt text and its token ids.
llm_inputs = [{
    "prompt": prompt,
    "prompt_token_ids": tokenizer.encode(prompt, add_special_tokens=False),
    "multi_modal_data": {"image": image_inputs},   # pass the image(s)
}]

# ---------------------
# Inference
# ---------------------
# NOTE(review): temperature/top_p sampling without a seed - the answer may
# presumably differ between runs; confirm whether that is intended.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=512,
)

outputs = llm.generate(prompts=llm_inputs, sampling_params=sampling_params)
# First completion of the first (and only) request.
response = outputs[0].outputs[0].text

print("Model Answer:", response)



  from .autonotebook import tqdm as notebook_tqdm


INFO 09-11 13:55:14 [__init__.py:241] Automatically detected platform cuda.
INFO 09-11 13:55:15 [utils.py:326] non-default args: {'model': 'inclusionAI/ViLaSR', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.85, 'disable_log_stats': True, 'limit_mm_per_prompt': {'image': 10}}
INFO 09-11 13:55:22 [__init__.py:711] Resolved architecture: Qwen2_5_VLForConditionalGeneration


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 09-11 13:55:22 [__init__.py:2816] Downcasting torch.float32 to torch.bfloat16.
INFO 09-11 13:55:22 [__init__.py:1750] Using max model len 128000


2025-09-11 13:55:22,638	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 09-11 13:55:22 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:25 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:25 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='inclusionAI/ViLaSR', speculative_config=None, tokenizer='inclusionAI/ViLaSR', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=128000, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=Observab

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 13842.59it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 17772.47it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 16384.00it/s]


[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:30 [gpu_model_runner.py:1953] Starting to load model inclusionAI/ViLaSR...
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:30 [gpu_model_runner.py:1985] Loading model from scratch...
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:30 [cuda.py:328] Using Flash Attention backend on V1 engine.
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:30 [weight_utils.py:296] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.25it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.37it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.14it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.12it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.20it/s]
[1;36m(EngineCore_0 pid=1389041)[0;0m 


[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:34 [default_loader.py:262] Loading weights took 3.48 seconds
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:35 [gpu_model_runner.py:2007] Model loading took 15.6264 GiB and 3.852578 seconds
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:35 [gpu_model_runner.py:2591] Encoder cache will be initialized with a budget of 98304 tokens, and profiled with 1 video items of the maximum feature size.
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:43 [backends.py:548] Using cache directory: /home/tuo/.cache/vllm/torch_compile_cache/8bfd6d20d0/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:43 [backends.py:559] Dynamo bytecode transform time: 3.78 s
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:46 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 3.049 s
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:5

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:04<00:00, 15.79it/s]


[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:54 [gpu_model_runner.py:2708] Graph capturing finished in 5 secs, took 0.53 GiB
[1;36m(EngineCore_0 pid=1389041)[0;0m INFO 09-11 13:55:54 [core.py:214] init engine (profile, create kv cache, warmup model) took 19.17 seconds
INFO 09-11 13:55:55 [llm.py:298] Supported_tasks: ['generate']


Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 15252.01it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 2744.96it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10618.49it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 11949.58it/s]


Prompt: <|im_start|>system
You are a helpful vision-language assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>What is in this image?<|im_end|>
<|im_start|>assistant



Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 13573.80it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 12157.40it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 11184.81it/s]
Adding requests: 100%|██████████| 1/1 [00:04<00:00,  4.37s/it]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it, est. speed input: 158.27 toks/s, output: 38.42 toks/s]

Model Answer: The image shows an aerial view of a cityscape, prominently featuring a large, iconic monument that resembles the Arc de Triomphe in Paris. The surrounding area is filled with buildings, roads, and green spaces, typical of a well-planned urban environment. The architecture and layout suggest this is a significant historical and cultural landmark.



