In [46]:
import requests
from PIL import Image
import json

import torch
from transformers import AutoProcessor,  LlavaNextForConditionalGeneration

### Load model

In [47]:
# Hugging Face checkpoint id, shared by the model and its processor
model_name = "llava-hf/llava-v1.6-mistral-7b-hf"

# Load the checkpoint with bfloat16 weights and the FlashAttention-2 attention
# implementation, then put the module in eval mode
# (`.eval()` returns the module itself, so it can be chained onto the load).
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
).eval()

# Matching processor for the same checkpoint
processor = AutoProcessor.from_pretrained(model_name)

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   1%|1         | 62.9M/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



processor_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


### Load dataset

In [None]:
folder_path = '/home/drdo/vlm-compositionality/data/raw/sugarcrepe/'

# Split name -> JSON file inside `folder_path`.
# The insertion order here fixes both the read order and the key order of `dataset`.
split_files = {
    'add_attribute': 'add_att.json',
    'add_object': 'add_obj.json',
    'replace_attribute': 'replace_att.json',
    'replace_object': 'replace_obj.json',
    'replace_relation': 'replace_rel.json',
    'swap_attribute': 'swap_att.json',
    'swap_object': 'swap_obj.json',
}

# Read every split into one dict keyed by split name
dataset = {}
for split_name, file_name in split_files.items():
    with open (folder_path+file_name) as f:
        dataset[split_name] = json.load(f)

# Per-split aliases; each name refers to the same object stored in `dataset`
add_attribute = dataset['add_attribute']
add_object = dataset['add_object']
replace_attribute = dataset['replace_attribute']
replace_object = dataset['replace_object']
replace_relation = dataset['replace_relation']
swap_attribute = dataset['swap_attribute']
swap_object = dataset['swap_object']

### Helper functions

In [None]:
import random

# Folder whose path is prefixed to each example's 'filename' when images are opened
img_folder = '/home/drdo/vlm-compositionality/data/raw/coco_val_2017/'

# Seed the stdlib RNG so random draws repeat between runs.
# The original note lists 42, 33, 56 -- presumably the seeds used across runs.
random.seed(42)

def get_random_sample(split_name, s_id):
    """Return a randomly chosen example from ``dataset[split_name]`` other than ``s_id``.

    Args:
        split_name: Key into the module-level ``dataset`` dict.
        s_id: Id of the example that must not be returned.

    Returns:
        The entry stored under the sampled id in ``dataset[split_name]``.

    Raises:
        ValueError: If the split is non-empty but every id equals ``s_id``,
            so no other example can be drawn.
        IndexError: If the split is empty (raised by ``random.choice``).
    """
    all_ids = list(dataset[split_name].keys())

    # Guard against the rejection loop below spinning forever when there is
    # nothing to pick except the excluded id.
    if all_ids and all(candidate == s_id for candidate in all_ids):
        raise ValueError(
            "split {!r} has no example other than {!r}".format(split_name, s_id)
        )

    # Rejection sampling is kept on purpose: it consumes the RNG exactly as
    # before, so seeded runs keep drawing the same examples.
    r_id = random.choice(all_ids)
    while r_id == s_id:
        r_id = random.choice(all_ids)
    return dataset[split_name][r_id]


def prepare_inputs(messages, image_files=None):
    """Turn a chat conversation and its images into model-ready inputs.

    Args:
        messages: Conversation passed to ``processor.apply_chat_template``.
        image_files: Optional sequence of image file names (relative to
            ``img_folder``), in the order their image placeholders appear in
            the conversation. When omitted, the notebook-level globals
            ``img1``, ``img2`` and ``img`` are used, which is what this helper
            always did before the parameter existed.

    Returns:
        The processor output, moved to ``model.device``.
    """
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    if image_files is None:
        # Backward-compatible fallback on notebook globals. Prefer passing
        # `image_files` explicitly so the call does not depend on which cell
        # last assigned these names.
        image_files = [img1, img2, img]
    images = [Image.open(img_folder+file_name) for file_name in image_files]

    # We can simply feed images in the order they have to be used in the text prompt
    # Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
    inputs = processor(images=images, text=prompt, padding=True, return_tensors="pt").to(model.device)
    return inputs

 
def generate_text(inputs, max_new_tokens=128):
    """Generate from ``inputs`` and decode only the tokens after each prompt.

    Args:
        inputs: Processor output; unpacked into ``model.generate`` and read
            through its ``input_ids`` attribute.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        The result of ``processor.batch_decode`` on the trimmed sequences.
    """
    full_sequences = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Drop the leading len(prompt) tokens from every generated sequence
    # (assumes each output sequence starts with its own prompt tokens).
    continuations = []
    for prompt_ids, sequence in zip(inputs.input_ids, full_sequences):
        continuations.append(sequence[len(prompt_ids):])

    return processor.batch_decode(
        continuations,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

In [55]:
img_folder = '/home/drdo/vlm-compositionality/data/raw/coco_val_2017/'

# Instruction shared by every user turn; the two slots receive the candidate captions.
instruction = "Choose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: {} Caption 2: {}"


def _user_turn(first_caption, second_caption):
    """Build a user message: one image placeholder followed by the two-caption question."""
    return {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction.format(first_caption, second_caption)},
            ],
    }


def _assistant_turn(answer):
    """Build an assistant message whose only content is the text ``answer``."""
    return {
        "role": "assistant",
        "content": [
            {"type": "text", "text": answer},
            ],
    }


# Get 2 images for prompt (the in-context examples)
ids = ['4', '3']

img1 = swap_object[ids[0]]['filename']
caption1 = swap_object[ids[0]]['caption']
negative1 = swap_object[ids[0]]['negative_caption']
# A stray `swap_object[id]` used to sit here. It ran before `id` was assigned,
# so on a fresh kernel it indexed with the builtin `id` and raised KeyError.

img2 = swap_object[ids[1]]['filename']
caption2 = swap_object[ids[1]]['caption']
negative2 = swap_object[ids[1]]['negative_caption']

# Example the model is actually asked about.
# NOTE: `id` shadows the builtin; the name is kept in case other cells read it.
id = '56'
img = swap_object[id]['filename']
caption = swap_object[id]['caption']
negative = swap_object[id]['negative_caption']

# One multi-turn conversation: two answered examples followed by the open question.
# The correct caption alternates position (2, then 1) across the two examples.
conversation = [
    _user_turn(negative1, caption1),
    _assistant_turn("Caption 2"),
    _user_turn(caption2, negative2),
    _assistant_turn("Caption 1"),
    _user_turn(negative, caption),
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
prompt

'[INST] <image>\nChoose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: A sailboat is riding the waves as a surfer surfs in the background. Caption 2: A surfer is riding the waves as a sailboat sails in the background. [/INST] Caption 2<\\s> [INST] <image>\nChoose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: Girls wash a motorcycle while men look on. Caption 2: Men wash a motorcycle while girls look on. [/INST] Caption 1<\\s> [INST] <image>\nChoose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: A baby elephant leads an elephant towards a door. Caption 2: An elephant leads a baby elephant towards a door. [/INST]'

In [56]:
# Build the image paths in prompt order, then open them
image_paths = [img_folder+file_name for file_name in (img1, img2, img)]
images = [Image.open(path) for path in image_paths]

# Images are consumed in order: each "<image>" token in the prompt takes the
# next image from the list.
inputs = processor(images=images, text=prompt, padding=True, return_tensors="pt")
inputs = inputs.to(model.device)

# Generate, then decode the full sequences (the decoded text includes the prompt)
generate_ids = model.generate(**inputs, max_new_tokens=30)
processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['[INST]  \nChoose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: A sailboat is riding the waves as a surfer surfs in the background. Caption 2: A surfer is riding the waves as a sailboat sails in the background. [/INST] Caption 2<\\s> [INST]  \nChoose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: Girls wash a motorcycle while men look on. Caption 2: Men wash a motorcycle while girls look on. [/INST] Caption 1<\\s> [INST]  \nChoose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: A baby elephant leads an elephant towards a door. Caption 2: An elephant leads a baby elephant towards a door. [/INST] Caption 2<\\s> ']