In [1]:
"""
Compare OpenVLA model outputs between HuggingFace and vLLM implementations.

This script runs inference with both implementations and compares:
1. Generated token IDs (from direct generate() calls)
2. Decoded actions (if available from HuggingFace predict_action)

Note: For fair comparison, both implementations use generate() directly.
The HuggingFace predict_action() method inserts a special token (29871) after "Out:",
but we compare the raw generate() outputs to match vLLM's behavior.

Based on: https://huggingface.co/openvla/openvla-7b

2025.11.XX
"""

import os
from pathlib import Path
from PIL import Image
from typing import Optional, Dict, Any, Tuple
import numpy as np
import torch
import torch.nn as nn
import json

from vllm import LLM, SamplingParams

ImportError: cannot import name 'DeepseekV3Config' from 'transformers' (/home/yq/dev/miniconda/envs/vllm/lib/python3.12/site-packages/transformers/__init__.py)

In [None]:
# Configuration
hf_model_name = "openvla/openvla-7b"
image_folder = "/home/yq/ssd/vllm-dir/vllm/yq/wip/pnp-soft-toys-traj6/images0"
image_paths = [p for p in Path(image_folder).iterdir()]
print(f"found {len(image_paths)} images in the folder")

instruction = "pick up the toy bear"
prompt: str = "In: What action should the robot take to {instruction}?\nOut:"

llm = LLM(
    model=hf_model_name,
    model_impl="vllm",
    dtype="bfloat16",
    enforce_eager=True,
    trust_remote_code=True,
    gpu_memory_utilization=0.8,
)
    
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=7,
    stop=None,
)

vllm_generated_ids = []
for image_path in image_paths:
    image = Image.open(image_path).convert("RGB")
    outputs = llm.generate(
        [{
            "prompt": prompt,
            "multi_modal_data": {"image": image}
        }],
        sampling_params
    )
    
    output = outputs[0]
    generated_ids: list = output.outputs[0].token_ids
    vllm_generated_ids.append(generated_ids)

found 50 images in the folder
INFO 11-18 12:09:24 [utils.py:253] non-default args: {'trust_remote_code': True, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.8, 'disable_log_stats': True, 'enforce_eager': True, 'model_impl': 'vllm', 'model': 'openvla/openvla-7b'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-18 12:09:26 [model.py:631] Resolved architecture: OpenVLAForActionPrediction
INFO 11-18 12:09:26 [model.py:1745] Using max model len 2048
INFO 11-18 12:09:26 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-18 12:09:26 [vllm.py:487] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:26 [core.py:94] Initializing a V1 LLM engine (v0.11.1rc7.dev217+gbe263f764.d20251117) with config: model='openvla/openvla-7b', speculative_config=None, tokenizer='openvla/openvla-7b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', 

[1;36m(EngineCore_DP0 pid=46715)[0;0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:31 [gpu_model_runner.py:3047] Starting to load model openvla/openvla-7b...
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:31 [vllm.py:487] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:31 [cuda.py:418] Valid backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:31 [cuda.py:427] Using FLASH_ATTN backend.


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:33 [default_loader.py:314] Loading weights took 1.54 seconds
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:33 [gpu_model_runner.py:3126] Model loading took 14.0597 GiB memory and 2.127112 seconds
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:33 [gpu_model_runner.py:3876] Encoder cache will be initialized with a budget of 8192 tokens, and profiled with 32 image items of the maximum feature size.
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:36 [gpu_worker.py:353] Available KV cache memory: 2.01 GiB
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:36 [kv_cache_utils.py:1229] GPU KV cache size: 4,096 tokens
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:36 [kv_cache_utils.py:1234] Maximum concurrency for 2,048 tokens per request: 2.00x
[1;36m(EngineCore_DP0 pid=46715)[0;0m INFO 11-18 12:09:36 [core.py:247] init engine (profile, create kv cache, warmup model) took 2.57 seconds
[1;

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [None]:
import json
with open("result_vllm.json", "w") as f:
    json.dump([ids for ids in vllm_generated_ids], f)
print(f"Saved results to result_vllm.json")
    

Saved results to result_vllm.json


In [None]:
import json
def compare_results(hf_json_path: str, vllm_json_path: str):
    with open(hf_json_path, "r") as f:
        hf_results = json.load(f)
    with open(vllm_json_path, "r") as f:
        vllm_results = json.load(f)
    # Both are lists of lists of token ids.
    # Each element of vllm_results or hf_results is a list of token ids, possibly wrapped in another list.

    # Check length
    if len(vllm_results) != len(hf_results):
        print(f"Length mismatch: vllm_results has {len(vllm_results)}, hf_results has {len(hf_results)}")
    else:
        print(f"Both results have {len(vllm_results)} entries.")

    # Compare each pair
    all_match = True
    for i, (vllm_ids, hf_ids) in enumerate(zip(vllm_results, hf_results)):
        # Depending on the data structure, hf_ids may still be a list of lists (e.g. [[...]]), so flatten if needed
        if len(hf_ids) == 1 and isinstance(hf_ids[0], list):
            hf_ids = hf_ids[0]
        if vllm_ids == hf_ids:
            print(f"Entry {i}: Match")
        else:
            print(f"Entry {i}: MISMATCH")
            print(f"  vllm: {vllm_ids}")
            print(f"  hf:   {hf_ids}")
            all_match = False
    if all_match:
        print("All entries match.")
    else:
        print("Some entries do not match.")
        
compare_results("result_hf.json", "result_vllm.json")

Both results have 50 entries.
Entry 0: MISMATCH
  vllm: [31865, 31834, 31904, 31886, 31862, 31864, 31744]
  hf:   [31906, 31868, 31984, 31873, 31764, 31873, 31744]
Entry 1: MISMATCH
  vllm: [31874, 31873, 31887, 31877, 31908, 31882, 31872]
  hf:   [31928, 31847, 31935, 31894, 31924, 31846, 31744]
Entry 2: MISMATCH
  vllm: [31879, 31810, 31886, 31877, 31859, 31843, 31868]
  hf:   [31838, 31847, 31849, 31873, 31866, 31875, 31744]
Entry 3: MISMATCH
  vllm: [31865, 31889, 31883, 31881, 31890, 31882, 31872]
  hf:   [31882, 31865, 31911, 31860, 31855, 31863, 31872]
Entry 4: MISMATCH
  vllm: [31879, 31871, 31905, 31868, 31881, 31873, 31875]
  hf:   [31868, 31864, 31884, 31877, 31869, 31821, 31744]
Entry 5: MISMATCH
  vllm: [31865, 31864, 31876, 31871, 31857, 31874, 31872]
  hf:   [31823, 31831, 31946, 31935, 31984, 31843, 31872]
Entry 6: MISMATCH
  vllm: [31879, 31880, 31744, 31897, 31901, 31856, 31872]
  hf:   [31903, 31854, 31852, 31869, 31911, 31874, 31872]
Entry 7: MISMATCH
  vllm: [31865