In [1]:
import torch
import re
from PIL import Image
from transformers import AutoTokenizer
from modeling_qwen2_vl_vpt import VPT_Qwen2VLForConditionalGeneration, VPT_Qwen2VLProcessor


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to every `from_pretrained` call below.
model_path = "rp-yu/Qwen2-VL-7b-VPT-CLIP"

# Prefer CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor, tokenizer and model from the same checkpoint.
processor = VPT_Qwen2VLProcessor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = VPT_Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # weights are loaded in bfloat16
)
model = model.to(device)



`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 14.39it/s]


In [10]:
# Test sample: a prompt ending with the `<image>` tag and one image converted to RGB.
question = "Identify the region that can help you answer the question, and then answer the question: What is the left side of the green cup? <image>"
image = Image.open("test.jpg").convert("RGB")

# Build the model inputs as PyTorch tensors and move them to the model's device.
model_inputs = processor(images=image, text=question, return_tensors="pt")
model_inputs = model_inputs.to(device)

# Generate at most 256 new tokens, then decode every returned sequence to text.
output_ids = model.generate(**model_inputs, max_new_tokens=256)
answer = tokenizer.batch_decode(output_ids, skip_special_tokens=True)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [12]:
# Show the first decoded sequence.
print(f"Final Answer: {answer[0]}")

Final Answer: Identify the region that can help you answer the question, and then answer the question: What is the left side of the green cup? <image><|region_token_start|><|x_0|><|y_0|><|x_7|><|y_7|><|y_7|><|region_token_end|>


In [15]:
# Token strings used when parsing model output.
#   region_x / region_y: the eight x / y coordinate tokens, indices 0 through 7.
#   dino / clip / sam:   the action-start marker string stored for each key.
Action_tokens = {
    "region_x": [f"<|x_{index}|>" for index in range(8)],
    "region_y": [f"<|y_{index}|>" for index in range(8)],
    "dino": "<|detection_action_start|>",
    "clip": "<|clip_action_start|>",
    "sam": "<|seg_action_start|>",
}

def check_region_tokens(text):
    """Locate region-selection spans in ``text`` and list the coordinate tokens they contain.

    A span is ``<|region_token_start|>``, followed by one or more ``<|x_k|>`` or
    ``<|y_k|>`` tokens with ``k`` in 0..7, followed by ``<|region_token_end|>``.

    Args:
        text: String to scan (for example, a decoded model output).

    Returns:
        ``(spans, True)`` when at least one span is found, where ``spans`` holds one
        ``(x_tokens, y_tokens)`` pair per span, in order of appearance in ``text``.
        ``(None, False)`` when no span is found.

    Note:
        Tokens are collected by substring membership against
        ``Action_tokens["region_x"]`` / ``Action_tokens["region_y"]``. Each list
        therefore follows the order of those vocabularies rather than the order
        in ``text``, and a token repeated inside a span is listed only once.
    """
    region_re = re.compile(r'<\|region_token_start\|>(<\|[xy]_[01234567]\|>)+<\|region_token_end\|>')

    def _present(kind, span):
        # Vocabulary tokens of the given kind that occur anywhere in the span.
        return [tok for tok in Action_tokens[kind] if tok in span]

    spans = [
        (_present("region_x", hit.group()), _present("region_y", hit.group()))
        for hit in region_re.finditer(text)
    ]

    return (spans, True) if spans else (None, False)

# Parse the first decoded sequence for region-selection spans and show the result.
text = check_region_tokens(text=answer[0])
print(text)

([(['<|x_0|>', '<|x_7|>'], ['<|y_0|>', '<|y_7|>'])], True)
