# Basic Model Command

In [1]:
from huggingface_hub import login
# login("")

In [1]:
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# Load the model in half-precision on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Image
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Excepted output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
output_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

["The image depicts a serene beach scene with a woman and a dog. The woman is sitting on the sand, wearing a plaid shirt and black pants, and appears to be smiling. She is holding the dog's paw in a high-five gesture. The dog, which is a large breed, is wearing a harness and is sitting on the sand with its front paws raised. The background shows the ocean with gentle waves, and the sky is clear with a soft light, suggesting it might be either sunrise or sunset. The overall atmosphere is peaceful and joyful."]


# Load data

In [1]:
import os
import json
aokvqa_dir = "aokvqa/datasets/aokvqa"
coco_dir = "aokvqa/datasets/coco"

aokvqa_dataset = json.load(open(
        os.path.join(aokvqa_dir, f"aokvqa_v1p0_val.json")
))

def get_coco_path(split, image_id, coco_dir):
    return os.path.join(coco_dir, f"{split}2017", f"{image_id:012}.jpg")

In [2]:
dataset_example = aokvqa_dataset[0]

print(dataset_example['question_id'])
# 22MexNkBPpdZGX6sxbxVBH

image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
print(image_path)
# ./datasets/coco/train2017/000000299207.jpg

print(dataset_example['question'])
print(dataset_example['choices'])
# What is the man by the bags awaiting?
# ['skateboarder', 'train', 'delivery', 'cab']

correct_choice = dataset_example['choices'][dataset_example['correct_choice_idx'] ]
# Corrrect: cab

print(dataset_example['rationales'][0])
# A train would not be on the street, he would not have luggage waiting for a delivery, and the skateboarder is there and not paying attention to him so a cab is the only possible answer.

22jbM6gDxdaMaunuzgrsBB
aokvqa/datasets/coco/val2017/000000461751.jpg
What is in the motorcyclist's mouth?
['toothpick', 'food', 'popsicle stick', 'cigarette']
He's smoking while riding.


# Run model

In [3]:
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# Load the model in half-precision on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
correct_count = 0
logits_probs_data = []

for dataset_example in aokvqa_dataset:
    question = dataset_example['question']
    choices = dataset_example['choices']
    correct_choice = choices[dataset_example['correct_choice_idx']]
    correct_idx = dataset_example['correct_choice_idx']
    
    image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
    raw_image = Image.open(image_path)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "I will give you a question and choices, return only the index of the choice\n"+"Question: "+question+"\nChoice: "\
                 +"0."+choices[0]+" 1."+choices[1]+" 2."+choices[2]+" 3."+choices[3]+"\nAnswer: "},
                # {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

    with torch.no_grad():
        output = model(**inputs)
        logits = output.logits
        
    generated_ids = model.generate(**inputs, max_new_tokens=5, do_sample=False)
    model_response = processor.decode(generated_ids[0][2:], skip_special_tokens=True)

    token_probs = []
    sequence_length = min(generated_ids[0].size(0), logits.size(1))  # Adjust the loop length to avoid out-of-bounds
    for i, token_id in enumerate(generated_ids[0][:sequence_length]):
        token_logit = logits[0, i]  # logits for all tokens at position i
        token_prob = torch.nn.functional.softmax(token_logit, dim=-1)[token_id].item()
        token_probs.append(token_prob)
        
    logits_probs_data.append({
        "question_id": dataset_example['question_id'],
        "model_response": model_response,
        "logits": [logits[0, i, token_id].item() for i, token_id in enumerate(generated_ids[0][:sequence_length])],
        "probabilities": token_probs
    })

    # print(repr(correct_idx))
    # print(repr(model_response[-1]))
    # print(str(correct_idx).strip() == str(model_response[-1]).strip())
    if str(correct_idx).strip() == str(model_response[-1]).strip():
        correct_count += 1

print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")

# Print logits and probabilities for analysis
for data in logits_probs_data:
    print(f"Question ID: {data['question_id']}")
    print(f"Model Response: {data['model_response']}")
    print("Logits:", data["logits"])
    print("Probabilities:", data["probabilities"])
    print("\n")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [6]:
print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")


Number of accurate items: 710 out of 1145


In [7]:
mean_probability = sum(data["probabilities"]) / len(data["probabilities"])
print("Mean of probabilities:", mean_probability)


Mean of probabilities: 0.0005063655124073774


# V2

In [23]:
import torch

correct_count = 0
logits_probs_data = []

for dataset_example in aokvqa_dataset:
    question = dataset_example['question']
    choices = dataset_example['choices']
    correct_choice = choices[dataset_example['correct_choice_idx']]
    correct_idx = dataset_example['correct_choice_idx']
    
    image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
    raw_image = Image.open(image_path)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "I will give you a question and choices, return only the index of the choice\n"+"Question: "+question+"\nChoice: "\
                 +"0."+choices[0]+" 1."+choices[1]+" 2."+choices[2]+" 3."+choices[3]+"\nAnswer: "},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

    with torch.no_grad():
        output = model(**inputs)
        
    generated_ids = model.generate(**inputs, max_new_tokens=5, do_sample=False, output_scores=True, return_dict_in_generate=True)
    model_response = processor.decode(generated_ids.sequences[0][2:], skip_special_tokens=True)

    # token_probs = []

    # print(generated_ids.scores)
    # print(model_response)

    logits = generated_ids.scores
    import torch.nn.functional as F
    import numpy as np
    probabilities = [F.softmax(logit,dim=-1) for logit in logits]
    token_ids = generated_ids.sequences[0]
    local_confi = []
    for i in range(-len(probabilities),-1):
        prob_pos = token_ids[i]
        local_confi.append(probabilities[i].tolist()[0][prob_pos])
    print(local_confi)
    confi = np.mean(local_confi)

    # print(probabilities)
    # print(confi)
    
    logits_probs_data.append({
        "question_id": dataset_example['question_id'],
        "model_response": model_response,
        "logits": logits,
        "probabilities": confi
    })
    
    # Check if model's last token matches the correct index
    if str(correct_idx).strip() == str(model_response[-1]).strip():
        correct_count += 1

print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")

# Print logits and probabilities for analysis
for data in logits_probs_data:
    print(f"Question ID: {data['question_id']}")
    print(f"Model Response: {data['model_response']}")
    print("Logits:", data["logits"])
    print("Probabilities (Scores):", data["probabilities"])
    print("\n")


[0.759550154209137]
[0.4837842583656311]
[0.6420309543609619]
[0.9127040505409241]
[0.6132481098175049]
[0.36081188917160034]
[0.5127645134925842]
[0.34759756922721863]
[0.6350020170211792]
[0.7807750701904297]
[0.34483158588409424]
[0.7049771547317505]
[0.35798799991607666]
[0.8044421672821045]
[0.35116350650787354]
[0.3679564297199249]
[0.3432230055332184]
[0.48261380195617676]
[0.43415939807891846]
[0.82072514295578]
[0.6903148293495178]
[0.523057222366333]
[0.3814164400100708]
[0.7063387036323547]
[0.4583187401294708]
[0.3569020628929138]
[0.39619308710098267]
[0.5081065893173218]
[0.46444979310035706]
[0.3727923631668091]
[0.48385027050971985]
[0.4540097713470459]
[0.38984525203704834]
[0.33181634545326233]
[0.4344250559806824]
[0.4314092993736267]
[0.6494886875152588]
[0.7764911651611328]
[0.6959541440010071]
[0.8772019147872925]
[0.43081873655319214]
[0.8012680411338806]
[0.5817245841026306]
[0.5740807056427002]
[0.47582173347473145, 0.7548375129699707, 0.983513593673706, 0.9999

In [24]:
print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")


Number of accurate items: 710 out of 1145


In [25]:
np.mean(data["probabilities"])

0.4542112648487091