In [5]:
from datasets import load_dataset
import json
from tqdm.auto import tqdm

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

### Load sugarcrepe

In [2]:
# Directory that holds the SugarCrepe JSON files (one file per perturbation type).
folder_path = '/home/drdo/vlm-compositionality/data/raw/sugarcrepe/'


def _load_sugarcrepe_json(filename):
    """Parse one JSON file located in ``folder_path``.

    Args:
        filename: File name relative to ``folder_path`` (e.g. ``'add_att.json'``).

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the locale default.
    with open(folder_path + filename, encoding='utf-8') as f:
        return json.load(f)


# add attribute
add_attribute = _load_sugarcrepe_json('add_att.json')

# add object
add_object = _load_sugarcrepe_json('add_obj.json')

# replace attribute
replace_attribute = _load_sugarcrepe_json('replace_att.json')

# replace object
replace_object = _load_sugarcrepe_json('replace_obj.json')

# replace relation
replace_relation = _load_sugarcrepe_json('replace_rel.json')

# swap attribute
swap_attribute = _load_sugarcrepe_json('swap_att.json')

# swap object
swap_object = _load_sugarcrepe_json('swap_obj.json')

### Load model

In [6]:
# Identifier of the checkpoint passed to both the model and processor loaders.
model_name = "Qwen/Qwen2-VL-2B-Instruct"

# Loader options gathered in one place so they are easy to scan and tweak:
#   - bfloat16 weights
#   - FlashAttention-2 attention implementation
#   - automatic device placement
model_load_options = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, **model_load_options)

# The processor is loaded from the same checkpoint; later cells use it for
# chat templating, input preparation and decoding.
processor = AutoProcessor.from_pretrained(model_name)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Testing prompts

In [44]:
# Folder containing the images that the examples' 'filename' fields refer to.
img_folder = "/home/drdo/vlm-compositionality/data/raw/coco_val_2017/"

# Pick a single swap-object entry (key '2') to experiment with prompts.
example = swap_object["2"]

In [45]:
# Full path of the example image (img_folder already ends with a separator).
img_file = f"{img_folder}{example['filename']}"

# Positive caption and its perturbed (negative) counterpart.
caption, negative_caption = example["caption"], example["negative_caption"]

In [46]:
# Display the selected entry to inspect its fields (filename, caption, negative_caption).
example

{'filename': '000000287347.jpg',
 'caption': 'A woman prepares a pizza while a man watches.',
 'negative_caption': 'A man prepares a pizza while a woman watches.'}

In [None]:
# Reference only (not executed): example of the text-only chat message format.
#chat = [

  #{"role": "user", "content": "Hello, how are you?"},

  #{"role": "assistant", "content": "I'm doing great. How can I help you today?"},

  #{"role": "user", "content": "I'd like to show off how chat templating works!"},

#]

In [37]:
# Zero-shot prompt: show the image and ask the model to pick one of two captions.
# The negative caption is listed first, so option 2 is the positive caption.
prompt = (
    "Choose and return the more appropriate caption for the image from the following 2 captions : "
    f"1) {negative_caption} 2) {caption} Generate no other text."
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_file},
            {"type": "text", "text": prompt},
        ],
    }
]

# Preparation for inference: render the chat template as text (not tokenized yet),
# ending with the generation prompt for the assistant turn.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's actual device instead of hard-coding "cuda": the model is
# loaded with device_map="auto", so its placement is decided at load time.
inputs = inputs.to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the prompt tokens from each sequence so only newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

['1) A man prepares a pizza while a woman watches.']


In [None]:
# observations

# shuffle the ordering of the captions
# few-shot (even 1-shot) helps -> need to make sure the example does not contain the answer
# how to ensure consistent output?

In [49]:
# One-shot prompt: a worked example (user question + assistant answer) followed by
# the actual query.
#
# Each turn MUST be its own dict. Writing all three turns inside a single dict
# literal repeats the "role"/"content" keys, and Python keeps only the last value
# for a repeated key -- which silently drops the example turns and leaves just
# the final user message.
instruction = (
    "Choose and return the more appropriate caption for the image from the following 2 captions. "
    "Generate no other text."
)

messages = [
    # Few-shot example: question ...
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_folder + '000000480021.jpg'},
            {
                "type": "text",
                "text": f"{instruction} Caption 1: A man on a motorcycle is waving at two men. Caption 2: Two men on a motorcycle are waving at a man.",
            },
        ],
    },
    # ... and the answer in the format we want the model to imitate.
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Caption 1: A man on a motorcycle is waving at two men."}
        ],
    },
    # Actual query about the selected example image.
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_file},
            {
                "type": "text",
                "text": f"{instruction} Caption 1: A woman prepares a pizza while a man watches. Caption 2: A man prepares a pizza while a woman watches.",
            },
        ],
    },
]

# Preparation for inference: render the whole conversation as text (not tokenized
# yet), ending with the generation prompt for the next assistant turn.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Vision inputs are extracted from the same messages list that produced `text`.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's actual device instead of hard-coding "cuda": the model is
# loaded with device_map="auto", so its placement is decided at load time.
inputs = inputs.to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the prompt tokens from each sequence so only newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

['Caption 1']
