# Resnet

In [1]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Use the model without requiring the timm dependency
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-101", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-101", revision="no_timm")

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)

# Set the target size for post-processing and the confidence threshold
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

# Collect detected objects in plain text format
plain_text_output = ""
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score.item() >= 0.9:
        box = [round(i, 2) for i in box.tolist()]
        object_name = model.config.id2label[label.item()]
        plain_text_output += f"Object: {object_name}, Location: {box}\n"

# Print the plain text output
print(plain_text_output)


Object: cat, Location: [344.06, 24.85, 640.34, 373.74]
Object: remote, Location: [328.13, 75.93, 372.81, 187.66]
Object: remote, Location: [39.34, 70.13, 175.56, 118.78]
Object: cat, Location: [15.36, 51.75, 316.89, 471.16]
Object: couch, Location: [-0.19, 0.71, 639.73, 474.17]



# Gemma2

In [2]:
# pip install accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

<bos>Write me a poem about Machine Learning.

In silicon valleys, where data flows,
A new intelligence, silently grows.
Machine Learning, a name whispered low,
Algorithms dance, where patterns


In [3]:
import os
import json
aokvqa_dir = "aokvqa/datasets/aokvqa"
coco_dir = "aokvqa/datasets/coco"

aokvqa_dataset = json.load(open(
        os.path.join(aokvqa_dir, f"aokvqa_v1p0_val.json")
))

def get_coco_path(split, image_id, coco_dir):
    return os.path.join(coco_dir, f"{split}2017", f"{image_id:012}.jpg")

In [4]:
annotation_file = coco_dir+'/annotations/captions_val2017.json'
with open(annotation_file, 'r') as f:
    coco_data = json.load(f)

image_id_to_captions = {}
for annotation in coco_data['annotations']:
    image_id = annotation['image_id']
    caption = annotation['caption']
    
    # Add the caption to the list of captions for each image ID
    if image_id not in image_id_to_captions:
        image_id_to_captions[image_id] = []
    image_id_to_captions[image_id].append(caption)

In [5]:
dataset_example = aokvqa_dataset[0]

print(dataset_example['question_id'])

image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
print(image_path)

print(dataset_example['question'])
print(dataset_example['choices'])

correct_choice = dataset_example['choices'][dataset_example['correct_choice_idx'] ]

print(dataset_example['rationales'][0])
image_id = dataset_example['image_id']
captions = image_id_to_captions.get(image_id, [])

# Print captions
print("Captions for this image:")
for caption in captions:
    print("Caption:", caption)

22jbM6gDxdaMaunuzgrsBB
aokvqa/datasets/coco/val2017/000000461751.jpg
What is in the motorcyclist's mouth?
['toothpick', 'food', 'popsicle stick', 'cigarette']
He's smoking while riding.
Captions for this image:
Caption: The man is riding his motorcycle while smoking a cigarette. 
Caption: A man sitting on a motorcycle smoking a cigarette.
Caption: A man on a motorcycle driving beside a van.
Caption: A man is riding a motorcycle on a city street. 
Caption: The man on the motorcycle pulled up beside the car.


# Now run model

In [6]:
print(1)

1


In [7]:
my_data = []

for i in range(200):
    dataset_example = aokvqa_dataset[i]
    
    image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
    image = Image.open(image_path)
    
    resnet_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-101", revision="no_timm")
    resnet_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-101", revision="no_timm")
    
    inputs = resnet_processor(images=image, return_tensors="pt")
    outputs = resnet_model(**inputs)
    
    # Set the target size for post-processing and the confidence threshold
    target_sizes = torch.tensor([image.size[::-1]])
    results = resnet_processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
    
    # Collect detected objects in plain text format
    plain_text_output = ""
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        if score.item() >= 0.9:
            box = [round(i, 2) for i in box.tolist()]
            object_name = resnet_model.config.id2label[label.item()]
            plain_text_output += f"Object: {object_name}, Location: {box}\n"
    my_data.append(plain_text_output)

In [8]:
del resnet_processor, resnet_model
torch.cuda.empty_cache()

In [9]:
import torch

correct_count = 0
logits_probs_data = []

for i in range(200):
    dataset_example = aokvqa_dataset[i]
    
    # image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
    # image = Image.open(image_path)
    
    # resnet_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-101", revision="no_timm")
    # resnet_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-101", revision="no_timm")
    
    # inputs = resnet_processor(images=image, return_tensors="pt")
    # outputs = resnet_model(**inputs)
    
    # # Set the target size for post-processing and the confidence threshold
    # target_sizes = torch.tensor([image.size[::-1]])
    # results = resnet_processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
    
    # # Collect detected objects in plain text format
    # plain_text_output = ""
    # for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    #     if score.item() >= 0.9:
    #         box = [round(i, 2) for i in box.tolist()]
    #         object_name = resnet_model.config.id2label[label.item()]
    #         plain_text_output += f"Object: {object_name}, Location: {box}\n"
    plain_text_output = my_data[i]
    question = dataset_example['question']
    choices = dataset_example['choices']
    correct_choice = choices[dataset_example['correct_choice_idx']]
    correct_idx = dataset_example['correct_choice_idx']
    
    image_id = dataset_example['image_id']
    captions = image_id_to_captions.get(image_id, [])
    
    prompt = "I will give you the object detected in a image, a caption of the image, a question and choices, answer the question based on the object and caption. \
Return only the index of the choice. Do not analyze the question. Do not include anything else."+\
"\nObject detected: \n"+plain_text_output+ "\nAnnotation of the Image: " + " ".join(captions) + "\nQuestion: "+question+"\nChoice: "\
             +"0."+choices[0]+" 1."+choices[1]+" 2."+choices[2]+" 3."+choices[3]+"\nAnswer: "
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model(**inputs)
        
    generated_ids = model.generate(**inputs, max_new_tokens=5, do_sample=False, output_scores=True, return_dict_in_generate=True)
    model_response = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)

    # token_probs = []

    # print(generated_ids.scores)
    print(model_response)

    logits = generated_ids.scores
    import torch.nn.functional as F
    import numpy as np
    probabilities = [F.softmax(logit,dim=-1) for logit in logits]
    token_ids = generated_ids.sequences[0]
    local_confi = []
    for i in range(-len(probabilities),-1):
        prob_pos = token_ids[i]
        local_confi.append(probabilities[i].tolist()[0][prob_pos])
    # print(local_confi)
    confi = np.mean(local_confi)

    # print(probabilities)
    # print(confi)

    
    logits_probs_data.append({
        "question_id": dataset_example['question_id'],
        "model_response": model_response,
        "logits": logits,
        "probabilities": confi
    })

    print(str(correct_idx).strip())
    print(str(model_response.strip()[-1]))
    # Check if model's last token matches the correct index
    if str(correct_idx).strip() == str(model_response.strip()[-1]):
        correct_count += 1

print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")

# Print logits and probabilities for analysis
for data in logits_probs_data:
    print(f"Question ID: {data['question_id']}")
    print(f"Model Response: {data['model_response']}")
    print("Logits:", data["logits"])
    print("Probabilities (Scores):", data["probabilities"])
    print("\n")


I will give you the object detected in a image, a caption of the image, a question and choices, answer the question based on the object and caption. Return only the index of the choice. Do not analyze the question. Do not include anything else.
Object detected: 
Object: person, Location: [60.43, 109.87, 339.84, 382.3]
Object: motorcycle, Location: [284.87, 253.13, 638.98, 566.69]
Object: person, Location: [478.57, 167.12, 640.35, 565.74]
Object: person, Location: [261.75, 85.21, 380.51, 224.3]

Annotation of the Image: The man is riding his motorcycle while smoking a cigarette.  A man sitting on a motorcycle smoking a cigarette. A man on a motorcycle driving beside a van. A man is riding a motorcycle on a city street.  The man on the motorcycle pulled up beside the car.
Question: What is in the motorcyclist's mouth?
Choice: 0.toothpick 1.food 2.popsicle stick 3.cigarette
Answer: 3

3
3
I will give you the object detected in a image, a caption of the image, a question and choices, answe

In [10]:
print(f"Number of accurate items: {correct_count} out of {len(aokvqa_dataset)}")


Number of accurate items: 116 out of 1145


In [11]:
np.mean(data["probabilities"])

0.6382903655370077

# One example

In [7]:
dataset_example = aokvqa_dataset[50]

image_path = get_coco_path('val', dataset_example['image_id'], coco_dir)
image = Image.open(image_path)

resnet_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-101", revision="no_timm")
resnet_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-101", revision="no_timm")

inputs = resnet_processor(images=image, return_tensors="pt")
outputs = resnet_model(**inputs)

# Set the target size for post-processing and the confidence threshold
target_sizes = torch.tensor([image.size[::-1]])
results = resnet_processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

# Collect detected objects in plain text format
plain_text_output = ""
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    if score.item() >= 0.9:
        box = [round(i, 2) for i in box.tolist()]
        object_name = resnet_model.config.id2label[label.item()]
        plain_text_output += f"Object: {object_name}, Location: {box}\n"

# Print the plain text output
# print(plain_text_output)


question = dataset_example['question']
choices = dataset_example['choices']
correct_choice = choices[dataset_example['correct_choice_idx']]
correct_idx = dataset_example['correct_choice_idx']

image_id = dataset_example['image_id']
captions = image_id_to_captions.get(image_id, [])

prompt = "I will give you the object detected in a image, a caption of the image, a question and choices. The detected objects and caption provide context. Use them directly to pick the most accurate answer from the choices without additional reasoning.\
If you cannot decide which choice, make your best guess. Only respond with the number of the correct choice (e.g., '0'). Do not analyze or explain."+\
"\nObject detected: \n"+plain_text_output+ "\nAnnotation of the Image: " + " ".join(captions) + "\nQuestion: "+question+"\nChoice: "\
             +"0."+choices[0]+" 1."+choices[1]+" 2."+choices[2]+" 3."+choices[3]+"\nAnswer: "
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    output = model(**inputs)
    
generated_ids = model.generate(**inputs, max_new_tokens=70, do_sample=False, output_scores=True, return_dict_in_generate=True)
model_response = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)

# token_probs = []

# print(generated_ids.scores)
print(model_response)

logits = generated_ids.scores
import torch.nn.functional as F
import numpy as np
probabilities = [F.softmax(logit,dim=-1) for logit in logits]
token_ids = generated_ids.sequences[0]
local_confi = []
for i in range(-len(probabilities),-1):
    prob_pos = token_ids[i]
    local_confi.append(probabilities[i].tolist()[0][prob_pos])
# print(local_confi)
confi = np.mean(local_confi)

if str(correct_idx).strip() == str(model_response.strip()[-1]):
        print("Correct!")
print("Correct idx: "+str(correct_idx))
print("Model idx: "+str(model_response.strip()[-1]))

I will give you the object detected in a image, a caption of the image, a question and choices. The detected objects and caption provide context. Use them directly to pick the most accurate answer from the choices without additional reasoning.If you cannot decide which choice, make your best guess. Only respond with the number of the correct choice (e.g., '0'). Do not analyze or explain.
Object detected: 
Object: bed, Location: [448.01, 226.77, 634.66, 337.68]
Object: remote, Location: [143.11, 236.77, 174.89, 246.26]
Object: sink, Location: [0.04, 266.09, 54.17, 288.94]
Object: tv, Location: [237.22, 123.93, 282.65, 163.61]

Annotation of the Image: A bathroom with a tub, sinks, lights and a television. A large white bathroom with two vanity sinks and a bathtub. A bathroom with a white tub and white cabinets has a black pattern on the floor. Large white bathroom showing sink, tub, TV, and countertops. Deep bathtub displayed in area of large tiled bathroom.
Question: What type of sinks