In [1]:
import numpy as np
import pandas as pd
import json
from PIL import Image
import matplotlib.pyplot as plt

### Inspect aokvqa dataset

In [2]:
# print all files from the data folder using os
import os
import glob

path = "/home/ubuntu/data/aokvqa/"
all_files = glob.glob(os.path.join(path, "*.json"))

In [3]:
all_files

['/home/ubuntu/data/aokvqa/aokvqa_v1p0_test.json',
 '/home/ubuntu/data/aokvqa/aokvqa_v1p0_train.json',
 '/home/ubuntu/data/aokvqa/aokvqa_v1p0_val.json']

In [4]:
# read all files into a list
data = []
for filename in all_files:
    with open(filename, 'r') as f:
        data.append(json.load(f))

val_aokvqa = data[2]

In [5]:
val_aokvqa = data[2]

In [6]:
sample = data[2][0]
sample

{'split': 'val',
 'image_id': 461751,
 'question_id': '22jbM6gDxdaMaunuzgrsBB',
 'question': "What is in the motorcyclist's mouth?",
 'choices': ['toothpick', 'food', 'popsicle stick', 'cigarette'],
 'correct_choice_idx': 3,
 'direct_answers': ['cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette'],
 'difficult_direct_answer': False,
 'rationales': ["He's smoking while riding.",
  'The motorcyclist has a lit cigarette in his mouth while he rides on the street.',
  'The man is smoking.']}

### Inspect coco

In [7]:
path = "/home/ubuntu/data/coco/annotations"
all_files = glob.glob(os.path.join(path, "*.json"))
all_files

['/home/ubuntu/data/coco/annotations/person_keypoints_train2017.json',
 '/home/ubuntu/data/coco/annotations/instances_val2017.json',
 '/home/ubuntu/data/coco/annotations/captions_train2017.json',
 '/home/ubuntu/data/coco/annotations/person_keypoints_val2017.json',
 '/home/ubuntu/data/coco/annotations/captions_val2017.json',
 '/home/ubuntu/data/coco/annotations/instances_train2017.json']

In [8]:
with open("/home/ubuntu/data/coco/annotations/captions_val2017.json", 'r') as f:
    coco_val_caption = json.load(f)

with open("/home/ubuntu/data/coco/annotations/instances_val2017.json", 'r') as f:
    coco_val_instance = json.load(f)

In [9]:
coco_val_caption.keys()

dict_keys(['info', 'licenses', 'images', 'annotations'])

In [10]:
coco_val_caption['images'][0]

{'license': 4,
 'file_name': '000000397133.jpg',
 'coco_url': 'http://images.cocodataset.org/val2017/000000397133.jpg',
 'height': 427,
 'width': 640,
 'date_captured': '2013-11-14 17:02:52',
 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg',
 'id': 397133}

In [11]:
coco_val_caption['annotations'][0]

{'image_id': 179765,
 'id': 38,
 'caption': 'A black Honda motorcycle parked in front of a garage.'}

In [12]:
coco_val_instance.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [13]:
coco_val_instance['images'][0]

{'license': 4,
 'file_name': '000000397133.jpg',
 'coco_url': 'http://images.cocodataset.org/val2017/000000397133.jpg',
 'height': 427,
 'width': 640,
 'date_captured': '2013-11-14 17:02:52',
 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg',
 'id': 397133}

In [14]:
coco_val_instance['categories'][0]

{'supercategory': 'person', 'id': 1, 'name': 'person'}

In [15]:
coco_val_instance['annotations'][0].keys()

dict_keys(['segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id'])

In [16]:
coco_val_instance['categories']

[{'supercategory': 'person', 'id': 1, 'name': 'person'},
 {'supercategory': 'vehicle', 'id': 2, 'name': 'bicycle'},
 {'supercategory': 'vehicle', 'id': 3, 'name': 'car'},
 {'supercategory': 'vehicle', 'id': 4, 'name': 'motorcycle'},
 {'supercategory': 'vehicle', 'id': 5, 'name': 'airplane'},
 {'supercategory': 'vehicle', 'id': 6, 'name': 'bus'},
 {'supercategory': 'vehicle', 'id': 7, 'name': 'train'},
 {'supercategory': 'vehicle', 'id': 8, 'name': 'truck'},
 {'supercategory': 'vehicle', 'id': 9, 'name': 'boat'},
 {'supercategory': 'outdoor', 'id': 10, 'name': 'traffic light'},
 {'supercategory': 'outdoor', 'id': 11, 'name': 'fire hydrant'},
 {'supercategory': 'outdoor', 'id': 13, 'name': 'stop sign'},
 {'supercategory': 'outdoor', 'id': 14, 'name': 'parking meter'},
 {'supercategory': 'outdoor', 'id': 15, 'name': 'bench'},
 {'supercategory': 'animal', 'id': 16, 'name': 'bird'},
 {'supercategory': 'animal', 'id': 17, 'name': 'cat'},
 {'supercategory': 'animal', 'id': 18, 'name': 'dog'},

In [17]:
coco_id_filename = {}

for d in coco_val_instance['images']:
    coco_id_filename[d['id']] = d['file_name']
    

In [10]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 5/5 [02:30<00:00, 30.04s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [12]:

total_num = 100
correct_num = 0


for i in trange(total_num):
    base_path = "/home/ubuntu/data/coco/val2017/"
    img_id = val_aokvqa[i]["image_id"]
    img_file = coco_id_filename[img_id]
    img_path = base_path + img_file  
    base_ans = val_aokvqa[i]["correct_choice_idx"] + 1
    question = val_aokvqa[i]["question"]
    choices = val_aokvqa[i]["choices"]
    prompt = f"Question: {question}\nChoices: 1. {choices[0]} 2. {choices[1]} 3. {choices[2]} 4. {choices[3]}\n please select the correct answer by typing the number of the correct choice."

    image = Image.open(img_path)
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]}
    ]
    print(model.device)
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=30)
    ans = processor.decode(output[0])

    ans_num =  int(ans.split("\n")[-1][0])
    if ans_num == int(base_ans):
        correct_num += 1
    
    if i % 10 == 0:
        print(f"Correct number: {correct_num}")
        print(f"Total number: {i+1}")
        print(f"Accuracy: {correct_num/(i+1)}")


  1%|          | 1/100 [00:13<22:47, 13.81s/it]

Correct number: 1
Total number: 1
Accuracy: 1.0


 11%|█         | 11/100 [00:27<02:09,  1.46s/it]

Correct number: 10
Total number: 11
Accuracy: 0.9090909090909091


 21%|██        | 21/100 [00:40<01:46,  1.35s/it]

Correct number: 19
Total number: 21
Accuracy: 0.9047619047619048


 31%|███       | 31/100 [00:54<01:33,  1.35s/it]

Correct number: 27
Total number: 31
Accuracy: 0.8709677419354839


 41%|████      | 41/100 [01:07<01:20,  1.36s/it]

Correct number: 36
Total number: 41
Accuracy: 0.8780487804878049


 44%|████▍     | 44/100 [01:18<01:40,  1.80s/it]


ValueError: invalid literal for int() with base 10: 'T'

In [100]:
print(f"Correct number: {correct_num}")
print(f"Total number: {i+1}")
print(f"Accuracy: {correct_num/(i+1)}")

Correct number: 9
Total number: 10
Accuracy: 0.9


In [68]:
# img = Image.open(img_path)

# # Show the image using PIL's built-in method
# # img.show()

# # Alternatively, you can use matplotlib to display the image
# plt.imshow(img)
# plt.axis('off')  # To hide axes
# plt.show()

In [69]:
image = Image.open(img_path)
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": prompt}
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

output = model.generate(**inputs, max_new_tokens=30)


In [80]:
ans = processor.decode(output[0])
print(ans)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>Question: What is in the motorcyclist's mouth?
Choices: 1. toothpick 2. food 3. popsicle stick 4. cigarette
 please select the correct answer by typing the number of the correct choice.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

4.<|eot_id|>
