In [1]:
import json
from tqdm.auto import tqdm
import random
random.seed(42)

from datasets import load_dataset
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

### Load sugarcrepe

In [2]:
folder_path = '/home/drdo/vlm-compositionality/data/raw/sugarcrepe/'

# add attribute
with open (folder_path+'add_att.json') as f:
    add_attribute = json.load(f)

# add object
with open (folder_path+'add_obj.json') as f:
    add_object = json.load(f)

# replace attribute
with open (folder_path+'replace_att.json') as f:
    replace_attribute = json.load(f)

# replace object
with open (folder_path+'replace_obj.json') as f:
    replace_object = json.load(f)

# replace relation
with open (folder_path+'replace_rel.json') as f:
    replace_relation = json.load(f)

# swap attribute
with open (folder_path+'swap_att.json') as f:
    swap_attribute = json.load(f)

# swap object
with open (folder_path+'swap_obj.json') as f:
    swap_object = json.load(f)

# collate together
dataset = {
    'add_attribute': add_attribute, 'add_object': add_object, 'replace_attribute': replace_attribute,
    'replace_object': replace_object, 'replace_relation': replace_relation,
    'swap_attribute': swap_attribute, 'swap_object': swap_object,
}

In [3]:
add_attribute['0']

{'filename': '000000085329.jpg',
 'caption': 'A drawing of a young woman with many facial piercings.',
 'negative_caption': 'A drawing of a tattooed young woman with many facial piercings.'}

### Load model

In [4]:
# model_name
model_name = "Qwen/Qwen2-VL-2B-Instruct"

# load model
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
     device_map="auto",
)

# load processor
processor = AutoProcessor.from_pretrained(model_name)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Helper functions

In [None]:
#chat = [

  #{"role": "user", "content": "Hello, how are you?"},

  #{"role": "assistant", "content": "I'm doing great. How can I help you today?"},

  #{"role": "user", "content": "I'd like to show off how chat templating works!"},

#]

In [37]:
img_folder = '/home/drdo/vlm-compositionality/data/raw/coco_val_2017/'

def get_random_sample(split_name, s_id):
    all_ids = list(dataset[split_name].keys())
    r_id = random.choice(all_ids)
    # make sure its not the same example
    while r_id == s_id:
        r_id = random.choice(all_ids)
    sample = dataset[split_name][r_id]
    return sample


# TODO: change prompt for few shot
def compose_prompt(split_name, s_id, sample, num_examples=0):
    
    # final prompt
    messages = []

    # prompt template
    prompt_instruct = "Choose and return the correct caption for the image from the following 2 captions. "
    add_instruct = "Generate no other text. "
    instruct_content = "Caption 1: {} Caption 2: {}"

    # first construct few shot examples
    for i in range(num_examples):
        
        # get a random example
        random_sample = get_random_sample(split_name, s_id)
        caption = random_sample['caption']
        negative_caption = random_sample['negative_caption']
        captions = [caption, negative_caption]

        # construct prompt with example
        # randomly choose ordering of the captions
        c_id = random.choice([1,0])
        prompt = (prompt_instruct + add_instruct + instruct_content).format(captions[c_id], captions[1-c_id])

        # add to content
        user_content = []
        user_content.append({"type": "image", "image": img_folder+random_sample['filename']}),
        user_content.append({"type": "text", "text": prompt}),

        # add to final message
        messages.append({"role": "user", "content": user_content})

        # construct response
        correct_caption = "Caption 1" if c_id == 0 else "Caption 2"
        bot_content = [{"type": "text", "text": correct_caption}]

        # add to final message
        messages.append({"role": "assistant", "content": bot_content})

    # append sample to messages
    caption = sample['caption']
    negative_caption = sample['negative_caption']
    captions = [caption, negative_caption]
    
    # randomly choose ordering of the captions
    c_id = random.choice([1,0])
    prompt = (prompt_instruct + add_instruct + instruct_content).format(captions[c_id], captions[1-c_id])

    # add to content
    user_content = []
    user_content.append({"type": "image", "image": img_folder+sample['filename']}),
    user_content.append({"type": "text", "text": prompt}),

    # add to final message
    messages.append({"role": "user", "content": user_content})

    label = "Caption 1" if c_id == 0 else "Caption 2"

    return messages, label


def prepare_inputs(messages):
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    return inputs


def generate_text(inputs, max_new_tokens=128):
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text

### Zero-Shot

In [56]:
# TODO: verify zero shot

total_len = sum([len(split) for name, split in dataset.items()])
bar = tqdm(range(total_len))
incorrect_outputs = []

for split_name, split in dataset.items():
    
    accuracy = 0
    for s_id, sample in split.items():
        
        # compose prompt with chat template
        messages, label = compose_prompt(split_name, s_id, sample, num_examples=0)

        # prepare for inference
        inputs = prepare_inputs(messages)

        # generate text
        output_text = generate_text(inputs)

        # eval
        if output_label == label: accuracy += 1
        elif output_label not in ['Caption 1', 'Caption 2']: incorrrect_outputs.append(output_label)

        bar.update(1)

    print("{} : {}".format(split_name, accuracy/len(split)))

print(len(incorrect_outputs))

  0%|          | 0/7511 [00:00<?, ?it/s]

add_attribute : 0.5158959537572254
add_object : 0.513094083414161
replace_attribute : 0.5
replace_object : 0.5024213075060533
replace_relation : 0.5064011379800853
swap_attribute : 0.5
swap_object : 0.5183673469387755
0


#### Testing prompts

In [30]:
img_folder = '/home/drdo/vlm-compositionality/data/raw/coco_val_2017/'
example = swap_object['2']

In [33]:
img_file = img_folder+example['filename']
caption = example['caption']
negative_caption = example['negative_caption']

In [34]:
example

{'filename': '000000287347.jpg',
 'caption': 'A woman prepares a pizza while a man watches.',
 'negative_caption': 'A man prepares a pizza while a woman watches.'}

In [25]:
swap_object['3']

{'filename': '000000177934.jpg',
 'caption': 'Girls wash a motorcycle while men look on.',
 'negative_caption': 'Men wash a motorcycle while girls look on.'}

In [None]:
#chat = [

  #{"role": "user", "content": "Hello, how are you?"},

  #{"role": "assistant", "content": "I'm doing great. How can I help you today?"},

  #{"role": "user", "content": "I'd like to show off how chat templating works!"},

#]

In [7]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_file,},
            {"type": "text", "text": "Choose and return the more appropriate caption for the image from the following 2 captions : 1) "+negative_caption+" 2) "+caption+" Generate no other text."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

['1) A man prepares a pizza while a woman watches.']


In [46]:
len(messages)

1

In [45]:
messages[0].keys()

dict_keys(['role', 'content'])

In [None]:
# observation

# shuffle ordering
# few(even 1) shot helps -> need to make sure example do not contain answer
# how to ensure consistent output?

In [36]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_folder+'000000051309.jpg',},
            {"type": "text", "text": "Choose and return the more appropriate caption for the image from the following 2 captions. Generate no other text. Caption 1: Three large horses eating hay while a small horse stands behind. Caption 2: A small horse eating hay while three large horses stand behind ."},
        ]
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Caption 1"}
        ]
    },
     {
        "role": "user",
        "content": [
            {"type": "image", "image": img_folder+'000000177934.jpg',},
            {"type": "text", "text": "Choose and return the more appropriate caption for the image from the following 2 captions. Generate no other text. Caption 1: Men wash a motorcycle while girls look on. Caption 2: Girls wash a motorcycle while men look on."},
        ]
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Caption 1"}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_file,},
            {"type": "text", "text": "Choose and return the more appropriate caption for the image from the following 2 captions. Generate no other text. Caption 1: A man prepares a pizza while a woman watches. Caption 2: A woman prepares a pizza while a man watches."},
        ]
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

['Caption 1']


In [57]:
messages

[{'role': 'user',
  'content': [{'type': 'image',
    'image': '/home/drdo/vlm-compositionality/data/raw/coco_val_2017/000000287347.jpg'},
   {'type': 'text',
    'text': 'Choose and return the more appropriate caption for the image from the following 2 captions. Generate no other text. Caption 1: A man prepares a pizza while a woman watches. Caption 2: A woman prepares a pizza while a man watches.'}]}]