# Prompt Engineering

In [1]:
import sys, os

# sys path hack to allow importing the encoding functions and other modules
sys.path.insert(0, os.path.abspath('../src'))
sys.path.insert(0, os.path.abspath('../externals'))

In [2]:
import json

with open("../data/questions/val_sampled_questions_50000.json") as f:
   questions = list(json.load(f).items())

with open("../data/sceneGraphs/val_sceneGraphs.json") as f:
    scene_graphs = json.load(f)

## Data Extraction

In [3]:
import random 

def compute_object_size(scene_graph, object):
    image_size = scene_graph["width"] * scene_graph["height"]
    object_size = object["w"] * object["h"]
    return object_size / image_size

class_samples_positive = []
attr_samples_positive = []
rel_samples_positive = []

def object_within_image_bounds(scene_graph, object):
    return object["x"] >= 0 and object["y"] >= 0 and object["h"] > 0 and object["w"] > 0 and \
           object["x"] + object["w"] <= scene_graph["width"] and object["y"] + object["h"] <= scene_graph["height"]

for qid, question in questions:
    scene_graph = scene_graphs[question["imageId"]]
    for op in question["semantic"]:
        operation = op["operation"]
        argument = op["argument"].strip()
        objects = scene_graph["objects"]

        if operation == "select" and argument != "scene" and not argument.endswith("(-)"):
            matching_objects = [(oid, objects[oid]) for oid in argument.split("(")[1][:-1].split(",") if object_within_image_bounds(scene_graph, objects[oid])] 
            if len(matching_objects) > 0:
                oid, object = random.choice(matching_objects)
                object["object_id"] = oid
                class_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "class": argument.split("(")[0].strip(),
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })  

        elif operation.startswith("filter"):
            attr = ' '.join(operation.split(' ')[1:]) if operation != "filter" else "any"
            attr_value = argument[4:-1] if argument.startswith('not(') else argument
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(scene_graph,o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })

        elif operation.startswith("verify"):
            attr = ' '.join(operation.split(' ')[1:]) if operation != "verify" else "any"
            attr_value = argument
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(scene_graph, o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })
                
        elif operation.startswith("choose ") and argument != "":
            attr = " ".join(operation.split(" ")[1:])
            attr_value = random.choice([argument.split("|")[0], argument.split("|")[1]])
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(scene_graph, o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })
                
        elif operation == "relate":
            relation_type = argument.split(',')[1]
            position = 'subject' if argument.split(',')[2].startswith('s') else 'object'
            target_object = argument.split('(')[1][:-1]

            if target_object != "-":
                if position == 'object':
                    matching_objects = [(oid, o) for oid, o in objects.items() if any(r["object"] == target_object and r["name"] == relation_type for r in o["relations"]) and object_within_image_bounds(scene_graph, o)]
                    if len(matching_objects) > 0:
                        oid0, object0 = random.choice(matching_objects)
                        object0["object_id"] = oid0
                        object1 = objects[target_object]
                        object1["object_id"] = target_object

                        rel_samples_positive.append({
                                "question_id": qid,
                                "question": question,
                                "image_id": question["imageId"],
                                "rel": relation_type,
                                "object0": object0,
                                "object1": object1,
                                "object0_size": compute_object_size(scene_graph, object0),
                                "object1_size": compute_object_size(scene_graph, object1),
                                "y": True
                            })
                        
                else:
                    matching_oids = [r["object"] for r in objects[target_object]["relations"] if r["name"] == relation_type]
                    matching_objects = [(oid, objects[oid]) for oid in matching_oids if object_within_image_bounds(scene_graph, objects[oid])]
                    if len(matching_objects) > 0:
                        object0 = objects[target_object]
                        object0["object_id"] = target_object
                        oid1, object1 = random.choice(matching_objects)
                        object1["object_id"] = oid1

                        rel_samples_positive.append({
                            "question_id": qid,
                            "question": question,
                            "image_id": question["imageId"],
                            "rel": relation_type,
                            "object0": object0,
                            "object1": object1,
                            "object0_size": compute_object_size(scene_graph, object0),
                            "object1_size": compute_object_size(scene_graph, object1),
                            "y": True
                        })

In [4]:
with open('../data/metadata/gqa_all_class.json') as f:
    categories = json.load(f)
class_to_category = {}
for category, classes in categories.items():
    for c in classes:
        if c not in class_to_category:
            class_to_category[c] = [category]
        else:
            class_to_category[c].append(category)

class_samples_negative = []
for sample in class_samples_positive:
    candidate = random.choice(class_samples_positive)
    while sample["class"] in [*class_to_category.get(candidate["object"]["name"], []), candidate["object"]["name"]]:
        candidate = random.choice(class_samples_positive)
    class_samples_negative.append({
        **candidate,
        "class": sample["class"],
        "y": False
    })
class_samples = [*class_samples_positive, *class_samples_negative]
random.shuffle(class_samples)

attr_samples_negative = []
for sample in attr_samples_positive:
    candidate = random.choice(attr_samples_positive)
    while sample["attr_value"] in candidate["object"]["attributes"]:
        candidate = random.choice(attr_samples_positive)
    
    attr_samples_negative.append({
        **candidate,
        "attr_value": sample["attr_value"],
        "y": False
    })
attr_samples = [*attr_samples_positive, *attr_samples_negative]
random.shuffle(attr_samples)

rel_samples_negative = []
for sample in rel_samples_positive:
    candidate = random.choice(rel_samples_positive)
    while any(r for r in candidate["object0"]["relations"] if r["name"] == sample["rel"] and r["object"] == candidate["object1"]["object_id"]):
        candidate = random.choice(rel_samples_positive)
    
    rel_samples_negative.append({
        **candidate,
        "rel": sample["rel"],
        "y": False
    })
rel_samples = [*rel_samples_positive, *rel_samples_negative]
random.shuffle(rel_samples)

In [5]:
import pandas as pd

class_samples_flat = [{
    "question_id": s["question_id"],
    "image_id": s["image_id"],
    "bbox_x": s["object"]["x"],
    "bbox_y": s["object"]["y"],
    "bbox_w": s["object"]["w"],
    "bbox_h": s["object"]["h"],
    "bbox_size": s["object_size"],
    "class": s["class"],
    "y": s["y"]
} for s in class_samples]
class_samples_df = pd.DataFrame.from_dict(class_samples_flat)

attr_samples_flat = [{
    "question_id": s["question_id"],
    "image_id": s["image_id"],
    "bbox_x": s["object"]["x"],
    "bbox_y": s["object"]["y"],
    "bbox_w": s["object"]["w"],
    "bbox_h": s["object"]["h"],
    "bbox_size": s["object_size"],
    "object_name": s["object"]["name"],
    "attr_value": s["attr_value"],
    "y": s["y"]
} for s in attr_samples]
attr_samples_df = pd.DataFrame.from_dict(attr_samples_flat)

rel_samples_flat = []
for s in rel_samples:
    object0, object1 = s['object0'], s['object1']
    joined_bbox = {
        "y": min(object0['y'], object1['y']),
        "x": min(object0['x'], object1['x']),
        "h": max(object0['y'] + object0['h'], object1['y'] + object1['h']) - min(object0['y'], object1['y']),
        "w": max(object0['x'] + object0['w'], object1['x'] + object1['w']) - min(object0['x'], object1['x']),
    }
    rel_samples_flat.append({
        "question_id": s["question_id"],
        "image_id": s["image_id"],
        "bbox_x": joined_bbox["x"],
        "bbox_y": joined_bbox["y"],
        "bbox_w": joined_bbox["w"],
        "bbox_h": joined_bbox["h"],
        "bbox_size": compute_object_size(scene_graphs[s["question"]["imageId"]], joined_bbox),
        "object0_name": s["object0"]["name"],
        "object1_name": s["object1"]["name"],
        "rel": s["rel"],
        "y": s["y"]
    })
rel_samples_df = pd.DataFrame.from_dict(rel_samples_flat)


In [12]:
len(class_samples)

97246

In [21]:
attr_samples_df.to_pickle("../data/finetuning/val_attr_samples_10k.pkl")
rel_samples_df.to_pickle("../data/finetuning/val_rel_samples_10k.pkl")

## Testing

In [6]:
import torch 

if torch.cuda.is_available():
    gpu = torch.device("cuda")
elif torch.backends.mps.is_available():
    gpu = torch.device("mps")
else:
    print("Warning: no GPU detected, falling back to CPU")
    gpu = torch.device("cpu")

In [7]:
# Clean old model from cache
if 'model' in locals() or 'model' in globals():
    del model
torch.cuda.empty_cache()

from model.clip_model import CLIPModel
model = CLIPModel(gpu, snapshot="model_snapshots/clip_finetune/checkpoint-66000")

# from model.xvlm_itr_coco_model import XVLMModel
# model = XVLMModel(gpu)

  from .autonotebook import tqdm as notebook_tqdm


### Target vs. Neutral

#### Classes

In [10]:
from prompt_dataset import PromptDataset

class_dataset = PromptDataset(class_samples_df, prompt_transform=lambda e: [
    f"a photo of a {e['class']}", 
    "a photo of an object"
], img_size=model.img_size)

In [11]:
from evaluator import Evaluator

class_evaluator = Evaluator(model, class_dataset, batch_size=128)
class_evaluator.evaluate()

Accuracy: 88.193%
Precision: 0.883
Recall: 0.881


{'accuracy': tensor(88.1928, device='cuda:0'),
 'precision': tensor(0.8830, device='cuda:0'),
 'recall': tensor(0.8805, device='cuda:0')}

#### Attributes

In [8]:
from prompt_dataset import PromptDataset

def get_article(name):
    return "an" if any(name.startswith(v) for v in ["a", "e", "i", "o", "u"]) else "a"

attr_datasets = {
    "a bad photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a bad photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a bad photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a blurry photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a blurry photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a pixelated photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a pixelated photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a low resolution photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a low resolution photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"{get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"{get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "nothing": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"{e['attr_value']} {e['object_name']}", 
        f"{e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "itap of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"itap of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"itap of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a bad picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a bad picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a bad picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a blurry picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a blurry picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a pixelated picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a pixelated picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a low resolution picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a low resolution picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad")
}


attr_dataset = PromptDataset(attr_samples_df, prompt_transform=lambda e: [
    f"a bad photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
    f"a bad photo of {get_article(e['object_name'])} {e['object_name']}"
], img_size=model.img_size, mode="pad")

In [36]:
next(iter(attr_dataset))[1]

['a bad photo of an overcast sky', 'a bad photo of a sky']

In [9]:
import reload_recursive

%reload evaluator
from evaluator import Evaluator

attr_results = {}
for key, dataset in attr_datasets.items():
    print(f"Evaluating prompt format '{key}'")
    attr_evaluator = Evaluator(model, dataset, batch_size=128)
    results = attr_evaluator.evaluate()
    attr_results[key] = results

Evaluating prompt format 'a bad photo of a'
Accuracy: 67.929%
Precision: 0.727
Recall: 0.574
Evaluating prompt format 'a blurry photo of a'
Accuracy: 68.187%
Precision: 0.712
Recall: 0.611
Evaluating prompt format 'a pixelated photo of a'
Accuracy: 68.676%
Precision: 0.686
Recall: 0.690
Evaluating prompt format 'a low resolution photo of a'
Accuracy: 67.637%
Precision: 0.708
Recall: 0.600
Evaluating prompt format 'a photo of a'
Accuracy: 68.337%
Precision: 0.719
Recall: 0.603
Evaluating prompt format 'a'
Accuracy: 67.991%
Precision: 0.737
Recall: 0.560
Evaluating prompt format 'nothing'
Accuracy: 67.608%
Precision: 0.732
Recall: 0.555
Evaluating prompt format 'itap of a'
Accuracy: 67.725%
Precision: 0.702
Recall: 0.616
Evaluating prompt format 'a bad picture of a'
Accuracy: 67.830%
Precision: 0.723
Recall: 0.578
Evaluating prompt format 'a blurry picture of a'
Accuracy: 68.064%
Precision: 0.710
Recall: 0.611
Evaluating prompt format 'a pixelated picture of a'
Accuracy: 68.825%
Precisio

In [41]:
attr_results1 = [{"schema": k, **{m: mv.item() for m,mv in v.items()} }  for k,v in attr_results.items()]

In [52]:
import pandas as pd

attr_results_pd = pd.DataFrame(attr_results1)
attr_results_pd["accuracy"] = attr_results_pd["accuracy"]/100

attr_results_pd = attr_results_pd.style.format(precision=4).hide(axis="index")
print(attr_results_pd.to_latex())

#attr_results_pd.to_pickle("prompt_engineering_results/attributes.pkl")

\begin{tabular}{lrrr}
schema & accuracy & precision & recall \\
a bad photo of a & 0.6583 & 0.6728 & 0.6165 \\
a blurry photo of a & 0.6590 & 0.6620 & 0.6495 \\
a pixelated photo of a & 0.6512 & 0.6492 & 0.6577 \\
a low resolution photo of a & 0.6555 & 0.6572 & 0.6499 \\
a photo of a & 0.6522 & 0.6621 & 0.6215 \\
a & 0.6533 & 0.6650 & 0.6178 \\
nothing & 0.6465 & 0.6776 & 0.5591 \\
itap of a & 0.6529 & 0.6741 & 0.5918 \\
a bad picture of a & 0.6561 & 0.6673 & 0.6227 \\
a blurry picture of a & 0.6549 & 0.6564 & 0.6501 \\
a pixelated picture of a & 0.6551 & 0.6518 & 0.6660 \\
a low resolution picture of a & 0.6571 & 0.6593 & 0.6502 \\
a picture of a & 0.6560 & 0.6568 & 0.6534 \\
\end{tabular}



#### Relations

In [55]:
from prompt_dataset import PromptDataset

def get_article(name):
    return "an" if any(name.startswith(v) for v in ["a", "e", "i", "o", "u"]) else "a"

rel_datasets = {
    "a bad photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a blurry photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a blurry photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a pixelated photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a pixelated photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a low resolution photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a low resolution photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"{get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"{get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "nothing": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"{e['object0_name']} {e['rel']} {e['object1_name']}", 
        f"{e['object0_name']} and {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "itap of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"itap of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"itap of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a bad picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a bad picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a bad picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a blurry picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a blurry picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a pixelated picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a pixelated picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a low resolution picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a low resolution picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad")
}

rel_dataset = PromptDataset(rel_samples_df, prompt_transform=lambda e: [
    f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
    f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
], img_size=model.img_size, mode="pad")

In [56]:
%reload evaluator
from evaluator import Evaluator

rel_results = {}
for key, dataset in rel_datasets.items():
    print(f"Evaluating prompt format '{key}'")
    rel_evaluator = Evaluator(model, dataset, batch_size=128)
    results = rel_evaluator.evaluate()
    rel_results[key] = results

Evaluating prompt format 'a bad photo of a'
Accuracy: 54.688%
Precision: 0.538
Recall: 0.657
Evaluating prompt format 'a blurry photo of a'
Accuracy: 53.798%
Precision: 0.529
Recall: 0.694
Evaluating prompt format 'a pixelated photo of a'
Accuracy: 55.334%
Precision: 0.549
Recall: 0.592
Evaluating prompt format 'a low resolution photo of a'
Accuracy: 54.822%
Precision: 0.548
Recall: 0.554
Evaluating prompt format 'a photo of a'
Accuracy: 55.444%
Precision: 0.543
Recall: 0.688
Evaluating prompt format 'a'
Accuracy: 54.933%
Precision: 0.537
Recall: 0.721
Evaluating prompt format 'nothing'
Accuracy: 56.321%
Precision: 0.553
Recall: 0.659
Evaluating prompt format 'itap of a'
Accuracy: 54.148%
Precision: 0.540
Recall: 0.556
Evaluating prompt format 'a bad picture of a'
Accuracy: 54.310%
Precision: 0.536
Recall: 0.643
Evaluating prompt format 'a blurry picture of a'
Accuracy: 53.951%
Precision: 0.532
Recall: 0.666
Evaluating prompt format 'a pixelated picture of a'
Accuracy: 55.240%
Precisio

In [57]:
import pandas as pd

rel_results1 = [{"schema": k, **{m: mv.item() for m,mv in v.items()} }  for k,v in rel_results.items()]
rel_results_pd = pd.DataFrame(rel_results1)
rel_results_pd["accuracy"] = rel_results_pd["accuracy"]/100

rel_results_pd = rel_results_pd.style.format(precision=4).hide(axis="index")
print(rel_results_pd.to_latex())

\begin{tabular}{lrrr}
schema & accuracy & precision & recall \\
a bad photo of a & 0.5469 & 0.5384 & 0.6569 \\
a blurry photo of a & 0.5380 & 0.5290 & 0.6939 \\
a pixelated photo of a & 0.5533 & 0.5495 & 0.5922 \\
a low resolution photo of a & 0.5482 & 0.5476 & 0.5542 \\
a photo of a & 0.5544 & 0.5430 & 0.6880 \\
a & 0.5493 & 0.5367 & 0.7213 \\
nothing & 0.5632 & 0.5530 & 0.6590 \\
itap of a & 0.5415 & 0.5403 & 0.5562 \\
a bad picture of a & 0.5431 & 0.5359 & 0.6429 \\
a blurry picture of a & 0.5395 & 0.5316 & 0.6656 \\
a pixelated picture of a & 0.5524 & 0.5493 & 0.5834 \\
a low resolution picture of a & 0.5499 & 0.5495 & 0.5541 \\
a picture of a & 0.5583 & 0.5469 & 0.6799 \\
\end{tabular}



### Target vs. Contrastive Ensemble

In [8]:
import json

with open("../data/metadata/gqa_all_class.json") as f:
    classes = json.load(f)
    
with open("../data/metadata/gqa_all_attribute.json") as f:
    attributes = json.load(f)
    
with open("../data/metadata/gqa_relation.json") as f:
    relations = json.load(f)

In [9]:
import random 

classes_sample = random.sample([item.replace('_', ' ') for items in classes.values() for item in items], 100)
attributes_sample = random.sample([item.replace('_', ' ') for items in attributes.values() for item in items], 100)
rels_sample = random.sample([item.replace('_', ' ') for item in relations], 100)

In [10]:
classes_prompts = [f"a bad photo of a {clazz}" for clazz in classes_sample]
attr_prompts = [f"a bad photo of a {attr} object" for attr in attributes_sample]
rel_prompts = [f"a bad photo of an object {rel} an object" for rel in rels_sample]

#### Classes

In [8]:
from prompt_dataset import PromptDataset

class_dataset = PromptDataset(class_samples_df, prompt_transform=lambda e: [
    f"a bad photo of a {e['class']}", 
    "a bad photo of an object"
], img_size=model.img_size)

In [9]:
from evaluator import ContrastiveEvaluator

class_evaluator = ContrastiveEvaluator(model, class_dataset, classes_prompts, batch_size=64)
class_evaluator.evaluate()

NameError: name 'classes_prompts' is not defined

#### Attributes

In [13]:
from prompt_dataset import PromptDataset

attr_dataset = PromptDataset(attr_samples_df, prompt_transform=lambda e: [
    f"a bad photo of a {e['attr_value']} object", 
    "a bad photo of an regular object"
], img_size=model.img_size)

In [14]:
from evaluator import ContrastiveEvaluator

attr_evaluator = ContrastiveEvaluator(model, attr_dataset, attr_prompts, batch_size=64)
attr_evaluator.evaluate()

Accuracy: 56.668%
Precision: 0.567
Recall: 0.564


{'accuracy': 56.66811468288445,
 'precision': 0.5670110593713621,
 'recall': 0.5642195192586157}