# Prompt Engineering

In [2]:
import sys, os

# sys path hack to allow importing the encoding functions and other modules
sys.path.insert(0, os.path.abspath('../src'))
sys.path.insert(0, os.path.abspath('../externals'))

In [3]:
import json

with open("../data/questions/val_sampled_questions_50000.json") as f:
   questions = list(json.load(f).items())

with open("../data/sceneGraphs/val_sceneGraphs.json") as f:
    scene_graphs = json.load(f)

## Data Extraction

In [4]:
import random 

def compute_object_size(scene_graph, object):
    image_size = scene_graph["width"] * scene_graph["height"]
    object_size = object["w"] * object["h"]
    return object_size / image_size

class_samples_positive = []
attr_samples_positive = []
rel_samples_positive = []

def object_within_image_bounds(scene_graph, object):
    return object["x"] >= 0 and object["y"] >= 0 and object["h"] > 0 and object["w"] > 0 and \
           object["x"] + object["w"] <= scene_graph["width"] and object["y"] + object["h"] <= scene_graph["height"]

for qid, question in questions:
    scene_graph = scene_graphs[question["imageId"]]
    for op in question["semantic"]:
        operation = op["operation"]
        argument = op["argument"].strip()
        objects = scene_graph["objects"]

        if operation == "select" and argument != "scene" and not argument.endswith("(-)"):
            matching_objects = [(oid, objects[oid]) for oid in argument.split("(")[1][:-1].split(",") if object_within_image_bounds(scene_graph, objects[oid])] 
            if len(matching_objects) > 0:
                oid, object = random.choice(matching_objects)
                object["object_id"] = oid
                class_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "class": argument.split("(")[0].strip(),
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })  

        elif operation.startswith("filter"):
            attr = ' '.join(operation.split(' ')[1:]) if operation != "filter" else "any"
            attr_value = argument[4:-1] if argument.startswith('not(') else argument
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(scene_graph,o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })

        elif operation.startswith("verify"):
            attr = ' '.join(operation.split(' ')[1:]) if operation != "verify" else "any"
            attr_value = argument
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(scene_graph, o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })
                
        elif operation.startswith("choose ") and argument != "":
            attr = " ".join(operation.split(" ")[1:])
            attr_value = random.choice([argument.split("|")[0], argument.split("|")[1]])
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(scene_graph, o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(scene_graph, object),
                        "y": True
                    })
                
        elif operation == "relate":
            relation_type = argument.split(',')[1]
            position = 'subject' if argument.split(',')[2].startswith('s') else 'object'
            target_object = argument.split('(')[1][:-1]

            if target_object != "-":
                if position == 'object':
                    matching_objects = [(oid, o) for oid, o in objects.items() if any(r["object"] == target_object and r["name"] == relation_type for r in o["relations"]) and object_within_image_bounds(scene_graph, o)]
                    if len(matching_objects) > 0:
                        oid0, object0 = random.choice(matching_objects)
                        object0["object_id"] = oid0
                        object1 = objects[target_object]
                        object1["object_id"] = target_object

                        rel_samples_positive.append({
                                "question_id": qid,
                                "question": question,
                                "image_id": question["imageId"],
                                "rel": relation_type,
                                "object0": object0,
                                "object1": object1,
                                "object0_size": compute_object_size(scene_graph, object0),
                                "object1_size": compute_object_size(scene_graph, object1),
                                "y": True
                            })
                        
                else:
                    matching_oids = [r["object"] for r in objects[target_object]["relations"] if r["name"] == relation_type]
                    matching_objects = [(oid, objects[oid]) for oid in matching_oids if object_within_image_bounds(scene_graph, objects[oid])]
                    if len(matching_objects) > 0:
                        object0 = objects[target_object]
                        object0["object_id"] = target_object
                        oid1, object1 = random.choice(matching_objects)
                        object1["object_id"] = oid1

                        rel_samples_positive.append({
                            "question_id": qid,
                            "question": question,
                            "image_id": question["imageId"],
                            "rel": relation_type,
                            "object0": object0,
                            "object1": object1,
                            "object0_size": compute_object_size(scene_graph, object0),
                            "object1_size": compute_object_size(scene_graph, object1),
                            "y": True
                        })

In [5]:
with open('../data/metadata/gqa_all_class.json') as f:
    categories = json.load(f)
class_to_category = {}
for category, classes in categories.items():
    for c in classes:
        if c not in class_to_category:
            class_to_category[c] = [category]
        else:
            class_to_category[c].append(category)

class_samples_negative = []
for sample in class_samples_positive:
    candidate = random.choice(class_samples_positive)
    while sample["class"] in [*class_to_category.get(candidate["object"]["name"], []), candidate["object"]["name"]]:
        candidate = random.choice(class_samples_positive)
    class_samples_negative.append({
        **candidate,
        "class": sample["class"],
        "y": False
    })
class_samples = [*class_samples_positive, *class_samples_negative]
random.shuffle(class_samples)

attr_samples_negative = []
for sample in attr_samples_positive:
    candidate = random.choice(attr_samples_positive)
    while sample["attr_value"] in candidate["object"]["attributes"]:
        candidate = random.choice(attr_samples_positive)
    
    attr_samples_negative.append({
        **candidate,
        "attr_value": sample["attr_value"],
        "y": False
    })
attr_samples = [*attr_samples_positive, *attr_samples_negative]
random.shuffle(attr_samples)

rel_samples_negative = []
for sample in rel_samples_positive:
    candidate = random.choice(rel_samples_positive)
    while any(r for r in candidate["object0"]["relations"] if r["name"] == sample["rel"] and r["object"] == candidate["object1"]["object_id"]):
        candidate = random.choice(rel_samples_positive)
    
    rel_samples_negative.append({
        **candidate,
        "rel": sample["rel"],
        "y": False
    })
rel_samples = [*rel_samples_positive, *rel_samples_negative]
random.shuffle(rel_samples)

In [6]:
import pandas as pd

class_samples_flat = [{
    "question_id": s["question_id"],
    "image_id": s["image_id"],
    "bbox_x": s["object"]["x"],
    "bbox_y": s["object"]["y"],
    "bbox_w": s["object"]["w"],
    "bbox_h": s["object"]["h"],
    "bbox_size": s["object_size"],
    "class": s["class"],
    "y": s["y"]
} for s in class_samples]
class_samples_df = pd.DataFrame.from_dict(class_samples_flat)

attr_samples_flat = [{
    "question_id": s["question_id"],
    "image_id": s["image_id"],
    "bbox_x": s["object"]["x"],
    "bbox_y": s["object"]["y"],
    "bbox_w": s["object"]["w"],
    "bbox_h": s["object"]["h"],
    "bbox_size": s["object_size"],
    "object_name": s["object"]["name"],
    "attr_value": s["attr_value"],
    "y": s["y"]
} for s in attr_samples]
attr_samples_df = pd.DataFrame.from_dict(attr_samples_flat)

rel_samples_flat = []
for s in rel_samples:
    object0, object1 = s['object0'], s['object1']
    joined_bbox = {
        "y": min(object0['y'], object1['y']),
        "x": min(object0['x'], object1['x']),
        "h": max(object0['y'] + object0['h'], object1['y'] + object1['h']) - min(object0['y'], object1['y']),
        "w": max(object0['x'] + object0['w'], object1['x'] + object1['w']) - min(object0['x'], object1['x']),
    }
    rel_samples_flat.append({
        "question_id": s["question_id"],
        "image_id": s["image_id"],
        "bbox_x": joined_bbox["x"],
        "bbox_y": joined_bbox["y"],
        "bbox_w": joined_bbox["w"],
        "bbox_h": joined_bbox["h"],
        "bbox_size": compute_object_size(scene_graphs[s["question"]["imageId"]], joined_bbox),
        "object0_name": s["object0"]["name"],
        "object1_name": s["object1"]["name"],
        "rel": s["rel"],
        "y": s["y"]
    })
rel_samples_df = pd.DataFrame.from_dict(rel_samples_flat)


In [7]:
len(class_samples)

97246

In [8]:
attr_samples_df.to_pickle("../data/finetuning/val_attr_samples_10k.pkl")
rel_samples_df.to_pickle("../data/finetuning/val_rel_samples_10k.pkl")

KeyboardInterrupt: 

## Testing

In [9]:
import torch 

if torch.cuda.is_available():
    gpu = torch.device("cuda")
elif torch.backends.mps.is_available():
    gpu = torch.device("mps")
else:
    print("Warning: no GPU detected, falling back to CPU")
    gpu = torch.device("cpu")

In [10]:
# Clean old model from cache
if 'model' in locals() or 'model' in globals():
    del model
torch.cuda.empty_cache()

from model.clip_model import CLIPModel
model = CLIPModel(gpu, model="openai/clip-vit-base-patch32", snapshot="model_snapshots/clip_finetune/checkpoint-66000")

# from model.xvlm_itr_coco_model import XVLMModel
# model = XVLMModel(gpu)

### Target vs. Neutral

#### Classes

In [10]:
from prompt_dataset import PromptDataset

class_dataset = PromptDataset(class_samples_df, prompt_transform=lambda e: [
    f"a photo of a {e['class']}", 
    "a photo of an object"
], img_size=model.img_size)

In [11]:
from evaluator import Evaluator

class_evaluator = Evaluator(model, class_dataset, batch_size=128)
class_evaluator.evaluate()

Accuracy: 88.193%
Precision: 0.883
Recall: 0.881


{'accuracy': tensor(88.1928, device='cuda:0'),
 'precision': tensor(0.8830, device='cuda:0'),
 'recall': tensor(0.8805, device='cuda:0')}

#### Attributes

In [16]:
from prompt_dataset import PromptDataset

def get_article(name):
    return "an" if any(name.startswith(v) for v in ["a", "e", "i", "o", "u"]) else "a"

attr_datasets = {
    "a bad photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a bad photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a bad photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a blurry photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a blurry photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a pixelated photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a pixelated photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a low resolution photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a low resolution photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a photo of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a photo of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"{get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"{get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "nothing": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"{e['attr_value']} {e['object_name']}", 
        f"{e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "itap of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"itap of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"itap of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a bad picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a bad picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a bad picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a blurry picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a blurry picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a pixelated picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a pixelated picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a low resolution picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a low resolution picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a picture of a": PromptDataset(attr_samples_df, prompt_transform=lambda e: [
        f"a picture of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
        f"a picture of {get_article(e['object_name'])} {e['object_name']}"
    ], img_size=model.img_size, mode="pad")
}


attr_dataset = PromptDataset(attr_samples_df, prompt_transform=lambda e: [
    f"a bad photo of {get_article(e['attr_value'])} {e['attr_value']} {e['object_name']}", 
    f"a bad photo of {get_article(e['object_name'])} {e['object_name']}"
], img_size=model.img_size, mode="pad")

In [36]:
next(iter(attr_dataset))[1]

['a bad photo of an overcast sky', 'a bad photo of a sky']

In [17]:
import reload_recursive

%reload evaluator
from evaluator import Evaluator

attr_results = {}
for key, dataset in attr_datasets.items():
    print(f"Evaluating prompt format '{key}'")
    attr_evaluator = Evaluator(model, dataset, batch_size=256)
    results = attr_evaluator.evaluate()
    attr_results[key] = results

Evaluating prompt format 'a bad photo of a'
Accuracy: 67.863%
Precision: 0.723
Recall: 0.579
Evaluating prompt format 'a blurry photo of a'
Accuracy: 67.951%
Precision: 0.707
Recall: 0.614
Evaluating prompt format 'a pixelated photo of a'
Accuracy: 68.742%
Precision: 0.685
Recall: 0.693
Evaluating prompt format 'a low resolution photo of a'
Accuracy: 67.736%
Precision: 0.708
Recall: 0.604
Evaluating prompt format 'a photo of a'
Accuracy: 68.046%
Precision: 0.712
Recall: 0.606
Evaluating prompt format 'a'
Accuracy: 68.162%
Precision: 0.739
Recall: 0.562
Evaluating prompt format 'nothing'
Accuracy: 67.696%
Precision: 0.733
Recall: 0.557
Evaluating prompt format 'itap of a'
Accuracy: 67.871%
Precision: 0.703
Recall: 0.619
Evaluating prompt format 'a bad picture of a'
Accuracy: 67.732%
Precision: 0.720
Recall: 0.581
Evaluating prompt format 'a blurry picture of a'
Accuracy: 67.896%
Precision: 0.705
Recall: 0.615
Evaluating prompt format 'a pixelated picture of a'
Accuracy: 68.953%
Precisio

In [18]:
attr_results1 = [{"schema": k, **{m: mv.item() for m,mv in v.items()} }  for k,v in attr_results.items()]

In [19]:
import pandas as pd

attr_results_pd = pd.DataFrame(attr_results1)
# attr_results_pd = pd.read_pickle("prompt_engineering_results/attributes_base_patch32.pkl")
# attr_results_pd.to_pickle("prompt_engineering_results/attributes_large_patch14.pkl")
attr_results_pd["accuracy"] = attr_results_pd["accuracy"]

attr_results_pd = attr_results_pd.style.format(precision=4).hide(axis="index")
print(attr_results_pd.to_latex())



\begin{tabular}{lrrr}
schema & accuracy & precision & recall \\
a bad photo of a & 67.8634 & 0.7232 & 0.5787 \\
a blurry photo of a & 67.9509 & 0.7066 & 0.6139 \\
a pixelated photo of a & 68.7418 & 0.6855 & 0.6927 \\
a low resolution photo of a & 67.7358 & 0.7079 & 0.6040 \\
a photo of a & 68.0456 & 0.7120 & 0.6060 \\
a & 68.1623 & 0.7390 & 0.5615 \\
nothing & 67.6957 & 0.7326 & 0.5573 \\
itap of a & 67.8707 & 0.7031 & 0.6187 \\
a bad picture of a & 67.7322 & 0.7197 & 0.5809 \\
a blurry picture of a & 67.8962 & 0.7052 & 0.6150 \\
a pixelated picture of a & 68.9532 & 0.6839 & 0.7048 \\
a low resolution picture of a & 67.8124 & 0.7064 & 0.6096 \\
a picture of a & 68.6142 & 0.7074 & 0.6349 \\
\end{tabular}



#### Relations

In [11]:
from prompt_dataset import PromptDataset

def get_article(name):
    return "an" if any(name.startswith(v) for v in ["a", "e", "i", "o", "u"]) else "a"

rel_datasets = {
    "a bad photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a blurry photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a blurry photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a pixelated photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a pixelated photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a low resolution photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a low resolution photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a photo of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"{get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"{get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "nothing": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"{e['object0_name']} {e['rel']} {e['object1_name']}", 
        f"{e['object0_name']} and {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "itap of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"itap of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"itap of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a bad picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a bad picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a bad picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a blurry picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a blurry picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a blurry picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a pixelated picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a pixelated picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a pixelated picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a low resolution picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a low resolution picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a low resolution picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad"),
    "a picture of a": PromptDataset(rel_samples_df, prompt_transform=lambda e: [
        f"a picture of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
        f"a picture of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
    ], img_size=model.img_size, mode="pad")
}

rel_dataset = PromptDataset(rel_samples_df, prompt_transform=lambda e: [
    f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} {e['rel']} {get_article(e['object1_name'])} {e['object1_name']}", 
    f"a bad photo of {get_article(e['object0_name'])} {e['object0_name']} and {get_article(e['object1_name'])} {e['object1_name']}"
], img_size=model.img_size, mode="pad")

In [13]:
import reload_recursive

%reload evaluator
from evaluator import Evaluator

rel_results = {}
for key, dataset in rel_datasets.items():
    print(f"Evaluating prompt format '{key}'")
    rel_evaluator = Evaluator(model, dataset, batch_size=128)
    results = rel_evaluator.evaluate()
    rel_results[key] = results

Evaluating prompt format 'a bad photo of a'
Accuracy: 58.311%
Precision: 0.568
Recall: 0.695
Evaluating prompt format 'a blurry photo of a'
Accuracy: 57.640%
Precision: 0.559
Recall: 0.729
Evaluating prompt format 'a pixelated photo of a'
Accuracy: 59.287%
Precision: 0.588
Recall: 0.619
Evaluating prompt format 'a low resolution photo of a'
Accuracy: 57.749%
Precision: 0.577
Recall: 0.581
Evaluating prompt format 'a photo of a'
Accuracy: 58.462%
Precision: 0.570
Recall: 0.690
Evaluating prompt format 'a'
Accuracy: 59.840%
Precision: 0.578
Recall: 0.733
Evaluating prompt format 'nothing'
Accuracy: 59.250%
Precision: 0.578
Recall: 0.688
Evaluating prompt format 'itap of a'
Accuracy: 57.039%
Precision: 0.561
Recall: 0.648
Evaluating prompt format 'a bad picture of a'
Accuracy: 58.393%
Precision: 0.570
Recall: 0.683
Evaluating prompt format 'a blurry picture of a'
Accuracy: 58.085%
Precision: 0.566
Recall: 0.695
Evaluating prompt format 'a pixelated picture of a'
Accuracy: 59.596%
Precisio

In [20]:
import pandas as pd

rel_results1 = [{"schema": k, **{m: mv.item() for m,mv in v.items()} }  for k,v in rel_results.items()]
rel_results_pd = pd.DataFrame(rel_results1)
rel_results_pd["accuracy"] = rel_results_pd["accuracy"]/100

rel_results_pd = rel_results_pd.style.format(precision=4).hide(axis="index")
print(rel_results_pd.to_latex())

\begin{tabular}{lrrr}
schema & accuracy & precision & recall \\
a bad photo of a & 0.5831 & 0.5679 & 0.6950 \\
a blurry photo of a & 0.5764 & 0.5586 & 0.7286 \\
a pixelated photo of a & 0.5929 & 0.5882 & 0.6193 \\
a low resolution photo of a & 0.5775 & 0.5770 & 0.5805 \\
a photo of a & 0.5846 & 0.5699 & 0.6901 \\
a & 0.5984 & 0.5775 & 0.7331 \\
nothing & 0.5925 & 0.5777 & 0.6880 \\
itap of a & 0.5704 & 0.5610 & 0.6475 \\
a bad picture of a & 0.5839 & 0.5700 & 0.6834 \\
a blurry picture of a & 0.5808 & 0.5658 & 0.6954 \\
a pixelated picture of a & 0.5960 & 0.5926 & 0.6139 \\
a low resolution picture of a & 0.5800 & 0.5847 & 0.5524 \\
a picture of a & 0.5944 & 0.5824 & 0.6672 \\
\end{tabular}



### Target vs. Contrastive Ensemble

In [8]:
import json

with open("../data/metadata/gqa_all_class.json") as f:
    classes = json.load(f)
    
with open("../data/metadata/gqa_all_attribute.json") as f:
    attributes = json.load(f)
    
with open("../data/metadata/gqa_relation.json") as f:
    relations = json.load(f)

In [9]:
import random 

classes_sample = random.sample([item.replace('_', ' ') for items in classes.values() for item in items], 100)
attributes_sample = random.sample([item.replace('_', ' ') for items in attributes.values() for item in items], 100)
rels_sample = random.sample([item.replace('_', ' ') for item in relations], 100)

In [10]:
classes_prompts = [f"a bad photo of a {clazz}" for clazz in classes_sample]
attr_prompts = [f"a bad photo of a {attr} object" for attr in attributes_sample]
rel_prompts = [f"a bad photo of an object {rel} an object" for rel in rels_sample]

#### Classes

In [8]:
from prompt_dataset import PromptDataset

class_dataset = PromptDataset(class_samples_df, prompt_transform=lambda e: [
    f"a bad photo of a {e['class']}", 
    "a bad photo of an object"
], img_size=model.img_size)

In [9]:
from evaluator import ContrastiveEvaluator

class_evaluator = ContrastiveEvaluator(model, class_dataset, classes_prompts, batch_size=64)
class_evaluator.evaluate()

NameError: name 'classes_prompts' is not defined

#### Attributes

In [13]:
from prompt_dataset import PromptDataset

attr_dataset = PromptDataset(attr_samples_df, prompt_transform=lambda e: [
    f"a bad photo of a {e['attr_value']} object", 
    "a bad photo of an regular object"
], img_size=model.img_size)

In [14]:
from evaluator import ContrastiveEvaluator

attr_evaluator = ContrastiveEvaluator(model, attr_dataset, attr_prompts, batch_size=64)
attr_evaluator.evaluate()

Accuracy: 56.668%
Precision: 0.567
Recall: 0.564


{'accuracy': 56.66811468288445,
 'precision': 0.5670110593713621,
 'recall': 0.5642195192586157}