In [1]:
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-1.5-7b-hf")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Example

In [2]:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the model and processor
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True, 
).to(0)

processor = AutoProcessor.from_pretrained(model_id)

# Define a chat history and use `apply_chat_template` to get correctly formatted prompt
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What are these?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Load and preprocess the image
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

# Forward pass to get logits
with torch.no_grad():
    output = model(**inputs)
    logits = output.logits  # Shape: (batch_size, sequence_length, vocab_size)

# Generate the output tokens using model.generate
generated_ids = model.generate(**inputs, max_new_tokens=1, do_sample=False)

# Decode the generated tokens to get the actual output
output_text = processor.decode(generated_ids[0][2:], skip_special_tokens=True)
print("Model Output:", output_text)

# Extract logits for the generated tokens
# Here we take logits only for the generated tokens
token_logits = []
for i, token_id in enumerate(generated_ids[0]):
    token_logits.append(logits[0, i, token_id].item())

# Print logits for each generated token
print("\nLogits for each token in the output:")
for token, logit in zip(processor.tokenizer.convert_ids_to_tokens(generated_ids[0]), token_logits):
    print(f"Token: {token}, Logit: {logit}")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


Model Output: ER:  
What are these? ASSISTANT: These

Logits for each token in the output:
Token: <s>, Logit: 0.70751953125
Token: ▁US, Logit: -0.1290283203125
Token: ER, Logit: 4.140625
Token: :, Logit: 2.908203125
Token: ▁, Logit: 4.09375
Token: <image>, Logit: 0.2261962890625
Token: ▁, Logit: 7.23046875
Token: <0x0A>, Logit: 9.734375
Token: What, Logit: 5.03515625
Token: ▁are, Logit: 6.69921875
Token: ▁these, Logit: 6.8828125
Token: ?, Logit: 4.53125
Token: ▁A, Logit: 6.92578125
Token: SS, Logit: 0.43212890625
Token: IST, Logit: -0.95751953125
Token: ANT, Logit: -1.0126953125
Token: :, Logit: 8.71875
Token: ▁These, Logit: 4.5859375


# Testing

## Load AOKVQA

In [2]:
import os
import json
aokvqa_dir = "aokvqa/datasets/aokvqa"
coco_dir = "aokvqa/datasets/coco"

aokvqa_dataset = json.load(open(
        os.path.join(aokvqa_dir, f"aokvqa_v1p0_val.json")
))

def get_coco_path(split, image_id, coco_dir):
    return os.path.join(coco_dir, f"{split}2017", f"{image_id:012}.jpg")

In [3]:
dataset_example = aokvqa_dataset[0]

print(dataset_example['question_id'])
# 22MexNkBPpdZGX6sxbxVBH

image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
print(image_path)
# ./datasets/coco/train2017/000000299207.jpg

print(dataset_example['question'])
print(dataset_example['choices'])
# What is the man by the bags awaiting?
# ['skateboarder', 'train', 'delivery', 'cab']

correct_choice = dataset_example['choices'][dataset_example['correct_choice_idx'] ]
# Corrrect: cab

print(dataset_example['rationales'][0])
# A train would not be on the street, he would not have luggage waiting for a delivery, and the skateboarder is there and not paying attention to him so a cab is the only possible answer.

22jbM6gDxdaMaunuzgrsBB
aokvqa/datasets/coco/val2017/000000461751.jpg
What is in the motorcyclist's mouth?
['toothpick', 'food', 'popsicle stick', 'cigarette']
He's smoking while riding.


In [4]:
dataset_example

{'split': 'val',
 'image_id': 461751,
 'question_id': '22jbM6gDxdaMaunuzgrsBB',
 'question': "What is in the motorcyclist's mouth?",
 'choices': ['toothpick', 'food', 'popsicle stick', 'cigarette'],
 'correct_choice_idx': 3,
 'direct_answers': ['cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette',
  'cigarette'],
 'difficult_direct_answer': False,
 'rationales': ["He's smoking while riding.",
  'The motorcyclist has a lit cigarette in his mouth while he rides on the street.',
  'The man is smoking.']}

## Test loop

In [5]:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the model and processor
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True, 
).to(0)

processor = AutoProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
correct_count = 0

for dataset_example in aokvqa_dataset:
    question = dataset_example['question']
    choices = dataset_example['choices']
    correct_choice = choices[dataset_example['correct_choice_idx']]
    correct_idx = dataset_example['correct_choice_idx']
    
    image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
    raw_image = Image.open(image_path)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "I will give you a question and choices, return only the index of the choice\n"+"Question: "+question+"\nChoice: "\
                 +"0."+choices[0]+" 1."+choices[1]+" 2."+choices[2]+" 3."+choices[3]+"\nAnswer: "},
                # {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

    generated_ids = model.generate(**inputs, max_new_tokens=5, do_sample=False)
    model_response = processor.decode(generated_ids[0][2:], skip_special_tokens=True)

    print(model_response)
    print(model_response[-1])
    print(correct_idx)
    
    if correct_idx == model_response[-1]:
        correct_count += 1

print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")

ER:  
I will give you a question and choices, return only the index of the choice
Question: What is in the motorcyclist's mouth?
Choice: 0.toothpick 1.food 2.popsicle stick 3.cigarette
Answer:  ASSISTANT: 3
3
3
ER:  
I will give you a question and choices, return only the index of the choice
Question: Which number birthday is probably being celebrated?
Choice: 0.one 1.ten 2.nine 3.thirty
Answer:  ASSISTANT: 3
3
3
ER:  
I will give you a question and choices, return only the index of the choice
Question: What best describes the pool of water?
Choice: 0.frozen 1.fresh 2.dirty 3.boiling
Answer:  ASSISTANT: 3
3
2
ER:  
I will give you a question and choices, return only the index of the choice
Question: What is the white substance on top of the cupcakes?
Choice: 0.butter 1.mayo 2.ice cream 3.icing
Answer:  ASSISTANT: 3
3
3
ER:  
I will give you a question and choices, return only the index of the choice
Question: What type of device is sitting next to the laptop?
Choice: 0.mouse 1.mobile p

KeyboardInterrupt: 

In [25]:
correct_count = 0
logits_probs_data = []

for dataset_example in aokvqa_dataset:
    question = dataset_example['question']
    choices = dataset_example['choices']
    correct_choice = choices[dataset_example['correct_choice_idx']]
    correct_idx = dataset_example['correct_choice_idx']
    
    image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
    raw_image = Image.open(image_path)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "I will give you a question and choices, return only the index of the choice\n"+"Question: "+question+"\nChoice: "\
                 +"0."+choices[0]+" 1."+choices[1]+" 2."+choices[2]+" 3."+choices[3]+"\nAnswer: "},
                # {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

    with torch.no_grad():
        output = model(**inputs)
        logits = output.logits
        
    generated_ids = model.generate(**inputs, max_new_tokens=5, do_sample=False)
    model_response = processor.decode(generated_ids[0][2:], skip_special_tokens=True)

    token_probs = []
    for i, token_id in enumerate(generated_ids[0]):
        token_logit = logits[0, i]  # logits for all tokens at position i
        token_prob = torch.nn.functional.softmax(token_logit, dim=-1)[token_id].item()
        token_probs.append(token_prob)
        
    logits_probs_data.append({
        "question_id": dataset_example['question_id'],
        "model_response": model_response,
        "logits": [logits[0, i, token_id].item() for i, token_id in enumerate(generated_ids[0])],
        "probabilities": token_probs
    })

    # print(repr(correct_idx))
    # print(repr(model_response[-1]))
    # print(str(correct_idx).strip() == str(model_response[-1]).strip())
    if str(correct_idx).strip() == str(model_response[-1]).strip():
        correct_count += 1

print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")

# Print logits and probabilities for analysis
for data in logits_probs_data:
    print(f"Question ID: {data['question_id']}")
    print(f"Model Response: {data['model_response']}")
    print("Logits:", data["logits"])
    print("Probabilities:", data["probabilities"])
    print("\n")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [26]:
print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")


Number of accurate items: 740 out of 1145


In [29]:
mean_probability = sum(data["probabilities"]) / len(data["probabilities"])
print("Mean of probabilities:", mean_probability)


Mean of probabilities: 0.006042955737364919


In [6]:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the model and processor
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True, 
).to(0)

processor = AutoProcessor.from_pretrained(model_id)

# Define a chat history and use `apply_chat_template` to get correctly formatted prompt
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What are these?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Load and preprocess the image
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

# Forward pass to get logits
with torch.no_grad():
    output = model(**inputs)
    logits = output.logits  # Shape: (batch_size, sequence_length, vocab_size)

# Generate the output tokens using model.generate
generated_ids = model.generate(**inputs, max_new_tokens=1, do_sample=False)

# Decode the generated tokens to get the actual output
output_text = processor.decode(generated_ids[0][2:], skip_special_tokens=True)
print("Model Output:", output_text)

# Extract probabilities specifically for the generated tokens
token_probs = []
for i, token_id in enumerate(generated_ids[0]):
    # Apply softmax only for the logits of this specific token's position
    token_logit = logits[0, i]  # logits for all tokens at position i
    token_prob = torch.nn.functional.softmax(token_logit, dim=-1)[token_id].item()  # probability of the specific token
    token_probs.append(token_prob)

# Print each generated token with its probability
print("\nToken probabilities for each token in the output:")
for token, prob in zip(processor.tokenizer.convert_ids_to_tokens(generated_ids[0]), token_probs):
    print(f"Token: {token}, Probability: {prob}")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 21.98 GiB of which 6.44 MiB is free. Including non-PyTorch memory, this process has 21.96 GiB memory in use. Of the allocated memory 21.61 GiB is allocated by PyTorch, and 49.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)