In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import PIL

# Short name -> literal special-token string. The fallback path of
# calculate_visible_tokens looks each value up with
# tokenizer.convert_tokens_to_ids and excludes the resulting ids.
SPECIAL_TOKENLIST = {
    # Chat-turn delimiters (note: the "eos" key maps to the im_end marker).
    "im_start": "<|im_start|>",
    "eos": "<|im_end|>",
    # Object-reference / box / quad span markers.
    "object_ref_start": "<|object_ref_start|>",
    "object_ref_end": "<|object_ref_end|>",
    "box_start": "<|box_start|>",
    "box_end": "<|box_end|>",
    "quad_start": "<|quad_start|>",
    "quad_end": "<|quad_end|>",
    # Vision span markers and padding placeholders.
    "vision_start": "<|vision_start|>",
    "vision_end": "<|vision_end|>",
    "vision_pad": "<|vision_pad|>",
    "image_pad": "<|image_pad|>",
    "video_pad": "<|video_pad|>",
}

def _collect_special_token_ids(tokenizer):
    """Return the set of token ids that must not count as visible tokens.

    Ids are gathered from ``tokenizer.all_special_ids`` (when present), the
    usual ``*_token_id`` attributes, and the module-level
    ``SPECIAL_TOKENLIST`` mapping if one is defined in this module.
    """
    # ``or ()`` guards against the attribute existing but being None.
    special_ids = set(getattr(tokenizer, "all_special_ids", None) or ())

    for attr in (
        "pad_token_id",
        "eos_token_id",
        "bos_token_id",
        "unk_token_id",
        "sep_token_id",
        "cls_token_id",
    ):
        token_id = getattr(tokenizer, attr, None)
        if token_id is not None:
            special_ids.add(token_id)

    unk_id = getattr(tokenizer, "unk_token_id", None)
    # The mapping is optional: look it up dynamically so this function still
    # works when it is copied somewhere SPECIAL_TOKENLIST is not defined.
    for token in globals().get("SPECIAL_TOKENLIST", {}).values():
        try:
            token_id = tokenizer.convert_tokens_to_ids(token)
        except Exception:
            # Best effort: a token this tokenizer cannot map is skipped.
            # (Not a bare ``except`` so Ctrl-C / SystemExit still propagate.)
            continue
        # A token resolving to the unknown id is not in the vocabulary.
        if token_id is not None and token_id != unk_id:
            special_ids.add(token_id)

    return special_ids


def calculate_visible_tokens(tokenizer, text):
    """Count the tokens of ``text`` that are not special tokens.

    Primary strategy: encode ``text``, decode it again with
    ``skip_special_tokens=True`` and count the tokens of the re-encoded
    result. NOTE(review): because the stripped text is re-tokenized, the
    count may differ slightly from simply filtering special ids out of the
    first encoding -- confirm this is the intended metric.

    Fallback (used when the round trip raises): encode ``text`` once and
    count the ids that are not in the set built by
    ``_collect_special_token_ids``.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...)``; the fallback also
            calls ``convert_tokens_to_ids`` and reads optional
            ``all_special_ids`` / ``*_token_id`` attributes.
        text: The string to measure.

    Returns:
        The number of visible (non-special) tokens as an ``int``.
    """
    try:
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        stripped_text = tokenizer.decode(token_ids, skip_special_tokens=True)
        return len(tokenizer.encode(stripped_text, add_special_tokens=False))
    except Exception as e:
        # Deliberate best effort: report the failure and use the id filter.
        print(f"decode-encode failed: {e}")

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    special_ids = _collect_special_token_ids(tokenizer)
    return sum(1 for token_id in token_ids if token_id not in special_ids)
    
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
# Load the model; dtype and device placement are left to transformers.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto"
)

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(model_path)

# The image is passed to the chat message as a file path.
adv_image = "./adv_image/qwen_adv_img_0.bmp"
# adv_image = PIL.Image.open(adv_image)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": adv_image,
            },
            {"type": "text", "text": "Can you describe the image?"},
        ],
    }
]

# Preparation for inference: render the chat prompt and collect vision inputs.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the device chosen by device_map="auto" instead of hard-coding
# "cuda", so the cell also runs where the model was not placed on cuda:0.
inputs = inputs.to(model.device)

# Inference: greedy generation of the output.
generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)

# Drop the prompt tokens once; everything after them is newly generated.
prompt_length = inputs["input_ids"].shape[-1]
new_token_ids = generated_ids[0][prompt_length:]

generated_text_true = processor.decode(new_token_ids, skip_special_tokens=True)
generated_text_false = processor.decode(new_token_ids, skip_special_tokens=False)

token_count = len(new_token_ids)
visible_tokens_count = calculate_visible_tokens(
    processor.tokenizer, generated_text_false
)

print(f"visible_response: {generated_text_true}")
print(f"visible_tokens_count: {visible_tokens_count}")

print(f"actual_response: {generated_text_false}")
print(f"actual_tokens_count: {token_count}")


Loading checkpoint shards: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


visible_response: The image shows a well-lit bathroom with a modern and elegant design. The space features a walk-in shower on the left side, enclosed by a glass door. Adjacent to the shower is a bathtub with a beige tile surround and a red patterned curtain for privacy. A small step leads up to the tub area.

On the right side of the image, there is a vanity with two sinks, each equipped with a dark wooden cabinet underneath. Above the sinks, there are two mirrors framed in a dark, ornate style, reflecting part of the room's interior. The countertop appears to be white, providing a clean contrast to the darker wood tones. The walls are painted in a light beige color, complementing the overall warm and inviting atmosphere of the bathroom. The lighting includes a wall-mounted fixture above the mirror, casting a soft glow over the space. The floor has a patterned tile design that adds texture to the room. Overall, the bathroom exudes a cozy and stylish vibe.
/n
visible_tokens_count: 201/