In [1]:
import mlx.core as mx
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

import json
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model
model_path = "mlx-community/Qwen2-VL-7B-bf16"
model, mlx_processor = load(model_path)
config = load_config(model_path)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

Fetching 14 files:  29%|██▊       | 4/14 [00:03<00:07,  1.29it/s]Error while downloading from https://cdn-lfs-us-1.hf.co/repos/c0/70/c0703c964d73eab9f5b7a0709efae49a694794fb025e6562c9989e5515769296/b02a67d5d46d20cf51f10ed9961a17d8bb600c2aa34406afbd5cc4558555efc1?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00003-of-00004.safetensors%3B+filename%3D%22model-00003-of-00004.safetensors%22%3B&Expires=1733388049&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzM4ODA0OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2MwLzcwL2MwNzAzYzk2NGQ3M2VhYjlmNWI3YTA3MDllZmFlNDlhNjk0Nzk0ZmIwMjVlNjU2MmM5OTg5ZTU1MTU3NjkyOTYvYjAyYTY3ZDVkNDZkMjBjZjUxZjEwZWQ5OTYxYTE3ZDhiYjYwMGMyYWEzNDQwNmFmYmQ1Y2M0NTU4NTU1ZWZjMT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=Zq197FAocU273vtitJl0mDcLMCBCGt4O0qZf92bKGPZ6JHzot0etSp3CGovfgJampR0CMEtOMfHQtta212dioJcfHLkmO6lzrHK%7EviSPTF6qdj6sNSSHjq%7E-oARgdtGzxaf3eJfBOhkojopFuXw43sSITYvQomBwT

In [16]:
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image,
            },
            {"type": "text", "text": "Describe the image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

In [17]:
text

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n<|im_start|>assistant\n'

In [19]:
output = generate(model, mlx_processor, image, text, verbose=False)
print(output)

The image shows a close-up of two cats sleeping on a pink blanket. One cat is lying down and the other is curled up in a ball. Both cats have stripes on their fur, and they are both looking towards the camera with their eyes closed. The cat lying down is wearing a green collar, and there are two remote controls on the blanket next to it. The cat curled up in a ball is wearing a pink collar, and there are also two remote controls on the blanket next to it


In [23]:
images = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "http://images.cocodataset.org/val2017/000000082807.jpg"
]
prompt = "Compare these two images."

formatted_prompt = apply_chat_template(
    processor, config, prompt, num_images=len(images)
)

In [24]:
formatted_prompt

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nCompare these two images.<|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n'

In [26]:
output = generate(model, mlx_processor, images, formatted_prompt, verbose=False)
print(output)

The image shows a scene with three animals: two cats and one dog. Here's a detailed description:

1. **Cat on the left**: This cat is lying down and appears to be resting or sleeping. It has a black and white striped pattern, and it's lying on its back with its paws stretched out in front of it.

2. **Dog in the middle**: This dog is also lying down and appears to be resting or sleeping as well. It has a brown coat with black


In [3]:
folder_path = '/users/ujan/vlm-compositionality/data/raw/sugarcrepe/'

# add attribute
with open(folder_path+'add_att.json') as f:
    add_attribute = json.load(f)

# add object
with open(folder_path+'add_obj.json') as f:
    add_object = json.load(f)

# replace attribute
with open(folder_path+'replace_att.json') as f:
    replace_attribute = json.load(f)

# replace object
with open(folder_path+'replace_obj.json') as f:
    replace_object = json.load(f)

# replace relation
with open(folder_path+'replace_rel.json') as f:
    replace_relation = json.load(f)

# swap attribute
with open(folder_path+'swap_att.json') as f:
    swap_attribute = json.load(f)

# swap object
with open(folder_path+'swap_obj.json') as f:
    swap_object = json.load(f)

# collate together
dataset = {
    'add_attribute': add_attribute, 'add_object': add_object, 'replace_attribute': replace_attribute,
    'replace_object': replace_object, 'replace_relation': replace_relation,
    'swap_attribute': swap_attribute, 'swap_object': swap_object,
}

In [9]:
img_folder = '/users/ujan/vlm-compositionality/data/raw/coco_val_2017/'

id = '67'
img_file = swap_object[id]['filename']
caption = swap_object[id]['caption']
negative_caption = swap_object[id]['negative_caption']
swap_object[id]

{'filename': '000000021503.jpg',
 'caption': 'Crackers coated with spread, sitting on a plate, ready to eat.',
 'negative_caption': 'Spread coated with crackers, sitting on a plate, ready to eat.'}

In [None]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_folder+'000000177934.jpg', },
            {"type": "text", "text": "Choose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: Girls wash a motorcycle while men look on. Caption 2: Men wash a motorcycle while girls look on.'}"},
        ]
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Caption 1"}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img_folder+img_file, },
            {"type": "text", "text": ("Choose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: {} Caption 2: {}").format(caption, negative_caption)},
        ]
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
text

"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Choose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: Girls wash a motorcycle while men look on. Caption 2: Men wash a motorcycle while girls look on.'}<|im_end|>\n<|im_start|>assistant\nCaption 1<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Choose and return the correct caption for the image from the following 2 captions. Generate no other text. Caption 1: Crackers coated with spread, sitting on a plate, ready to eat. Caption 2: Spread coated with crackers, sitting on a plate, ready to eat.<|im_end|>\n<|im_start|>assistant\n"

In [11]:
images = [
    img_folder+'000000177934.jpg',
    img_folder+img_file
]

In [None]:
output = generate(model, mlx_processor, images, text, verbose=False)
print(output)