# Prompt Engineering

In [1]:
import sys, os

# sys path hack to allow importing the encoding functions and other modules
sys.path.insert(0, os.path.abspath('../encoding'))
sys.path.insert(0, os.path.abspath('../externals/x_vlm'))

In [2]:
import json

with open("../data/questions/train_sampled_questions_50000.json") as f:
   questions = list(json.load(f).items())

## Data Extraction

In [3]:
import random 

def compute_object_size(scene_graph, object):
    image_size = scene_graph["width"] * scene_graph["height"]
    object_size = object["w"] * object["h"]
    return object_size / image_size

class_samples_positive = []
attr_samples_positive = []
rel_samples_positive = []

def object_within_image_bounds(scene_graph, object):
    return object["x"] >= 0 and object["y"] >= 0 and object["h"] > 0 and object["w"] > 0 and \
           object["x"] + object["w"] <= scene_graph["width"] and object["y"] + object["h"] <= scene_graph["height"]

for qid, question in questions:
    for op in question["semantic"]:
        operation = op["operation"]
        argument = op["argument"].strip()
        objects = question["sceneGraph"]["objects"]

        if operation == "select" and argument != "scene" and not argument.endswith("(-)"):
            matching_objects = [(oid, objects[oid]) for oid in argument.split("(")[1][:-1].split(",") if object_within_image_bounds(question["sceneGraph"], objects[oid])] 
            if len(matching_objects) > 0:
                oid, object = random.choice(matching_objects)
                object["object_id"] = oid
                class_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "class": argument.split("(")[0].strip(),
                        "object": object,
                        "object_size": compute_object_size(question["sceneGraph"], object),
                        "y": True
                    })  

        elif operation.startswith("filter"):
            attr = ' '.join(operation.split(' ')[1:]) if operation != "filter" else "any"
            attr_value = argument[4:-1] if argument.startswith('not(') else argument
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(question["sceneGraph"],o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(question["sceneGraph"], object),
                        "y": True
                    })

        elif operation.startswith("verify"):
            attr = ' '.join(operation.split(' ')[1:]) if operation != "verify" else "any"
            attr_value = argument
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(question["sceneGraph"], o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(question["sceneGraph"], object),
                        "y": True
                    })
                
        elif operation.startswith("choose ") and argument != "":
            attr = " ".join(operation.split(" ")[1:])
            attr_value = random.choice([argument.split("|")[0], argument.split("|")[1]])
            objects_with_attr = [(oid, o) for oid, o in objects.items() if attr_value in o["attributes"] and object_within_image_bounds(question["sceneGraph"], o)]
            if len(objects_with_attr) > 0 and attr not in ["hposition", "vposition"]:
                oid, object = random.choice(objects_with_attr)
                object["object_id"] = oid
                attr_samples_positive.append({
                        "question_id": qid,
                        "question": question,
                        "image_id": question["imageId"],
                        "attr_value": attr_value,
                        "object": object,
                        "object_size": compute_object_size(question["sceneGraph"], object),
                        "y": True
                    })
                
        elif operation == "relate":
            relation_type = argument.split(',')[1]
            position = 'subject' if argument.split(',')[2].startswith('s') else 'object'
            target_object = argument.split('(')[1][:-1]

            if target_object != "-":
                if position == 'object':
                    matching_objects = [(oid, o) for oid, o in objects.items() if any(r["object"] == target_object and r["name"] == relation_type for r in o["relations"]) and object_within_image_bounds(question["sceneGraph"], o)]
                    if len(matching_objects) > 0:
                        oid0, object0 = random.choice(matching_objects)
                        object0["object_id"] = oid0
                        object1 = objects[target_object]
                        object1["object_id"] = target_object

                        rel_samples_positive.append({
                                "question_id": qid,
                                "question": question,
                                "image_id": question["imageId"],
                                "rel": relation_type,
                                "object0": object0,
                                "object1": object1,
                                "object0_size": compute_object_size(question["sceneGraph"], object0),
                                "object1_size": compute_object_size(question["sceneGraph"], object1),
                                "y": True
                            })
                        
                else:
                    matching_oids = [r["object"] for r in objects[target_object]["relations"] if r["name"] == relation_type]
                    matching_objects = [(oid, objects[oid]) for oid in matching_oids if object_within_image_bounds(question["sceneGraph"], objects[oid])]
                    if len(matching_objects) > 0:
                        object0 = objects[target_object]
                        object0["object_id"] = target_object
                        oid1, object1 = random.choice(matching_objects)
                        object1["object_id"] = oid1

                        rel_samples_positive.append({
                            "question_id": qid,
                            "question": question,
                            "image_id": question["imageId"],
                            "rel": relation_type,
                            "object0": object0,
                            "object1": object1,
                            "object0_size": compute_object_size(question["sceneGraph"], object0),
                            "object1_size": compute_object_size(question["sceneGraph"], object1),
                            "y": True
                        })

In [4]:
with open('../data/metadata/gqa_all_class.json') as f:
    categories = json.load(f)
class_to_category = {}
for category, classes in categories.items():
    for c in classes:
        if c not in class_to_category:
            class_to_category[c] = [category]
        else:
            class_to_category[c].append(category)

class_samples_negative = []
for sample in class_samples_positive:
    candidate = random.choice(class_samples_positive)
    while sample["class"] in [*class_to_category.get(candidate["object"]["name"], []), candidate["object"]["name"]]:
        candidate = random.choice(class_samples_positive)
    class_samples_negative.append({
        **candidate,
        "class": sample["class"],
        "y": False
    })
class_samples = [*class_samples_positive, *class_samples_negative]
random.shuffle(class_samples)

attr_samples_negative = []
for sample in attr_samples_positive:
    candidate = random.choice(attr_samples_positive)
    while sample["attr_value"] in candidate["object"]["attributes"]:
        candidate = random.choice(attr_samples_positive)
    
    attr_samples_negative.append({
        **candidate,
        "attr_value": sample["attr_value"],
        "y": False
    })
attr_samples = [*attr_samples_positive, *attr_samples_negative]
random.shuffle(attr_samples)

rel_samples_negative = []
for sample in rel_samples_positive:
    candidate = random.choice(rel_samples_positive)
    while any(r for r in candidate["object0"]["relations"] if r["name"] == sample["rel"] and r["object"] == candidate["object1"]["object_id"]):
        candidate = random.choice(rel_samples_positive)
    
    rel_samples_negative.append({
        **candidate,
        "rel": sample["rel"],
        "y": False
    })
rel_samples = [*rel_samples_positive, *rel_samples_negative]
random.shuffle(rel_samples)

In [5]:
import tensorflow as tf
import pandas as pd

class_samples_flat = [{
    "question_id": s["question_id"],
    "image_id": s["image_id"],
    "bbox_x": s["object"]["x"],
    "bbox_y": s["object"]["y"],
    "bbox_w": s["object"]["w"],
    "bbox_h": s["object"]["h"],
    "bbox_size": s["object_size"],
    "class": s["class"],
    "y": s["y"]
} for s in class_samples]
class_samples_df = pd.DataFrame.from_dict(class_samples_flat)

attr_samples_flat = [{
    "question_id": s["question_id"],
    "image_id": s["image_id"],
    "bbox_x": s["object"]["x"],
    "bbox_y": s["object"]["y"],
    "bbox_w": s["object"]["w"],
    "bbox_h": s["object"]["h"],
    "bbox_size": s["object_size"],
    "object_name": s["object"]["name"],
    "attr_value": s["attr_value"],
    "y": s["y"]
} for s in attr_samples]
attr_samples_df = pd.DataFrame.from_dict(attr_samples_flat)

rel_samples_flat = []
for s in rel_samples:
    object0, object1 = s['object0'], s['object1']
    joined_bbox = {
        "y": min(object0['y'], object1['y']),
        "x": min(object0['x'], object1['x']),
        "h": max(object0['y'] + object0['h'], object1['y'] + object1['h']) - min(object0['y'], object1['y']),
        "w": max(object0['x'] + object0['w'], object1['x'] + object1['w']) - min(object0['x'], object1['x']),
    }
    rel_samples_flat.append({
        "question_id": s["question_id"],
        "image_id": s["image_id"],
        "bbox_x": joined_bbox["x"],
        "bbox_y": joined_bbox["y"],
        "bbox_w": joined_bbox["w"],
        "bbox_h": joined_bbox["h"],
        "bbox_size": compute_object_size(s["question"]["sceneGraph"], joined_bbox),
        "object0_name": s["object0"]["name"],
        "object1_name": s["object1"]["name"],
        "rel": s["rel"],
        "y": s["y"]
    })
rel_samples_df = pd.DataFrame.from_dict(rel_samples_flat)


2023-05-06 18:53:40.264901: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-06 18:53:40.290279: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Testing

In [6]:
import torch 

if torch.cuda.is_available():
    gpu = torch.device("cuda")
elif torch.backends.mps.is_available():
    gpu = torch.device("mps")
else:
    print("Warning: no GPU detected, falling back to CPU")
    gpu = torch.device("cpu")

In [7]:
# Clean old model from cache
if 'model' in locals() or 'model' in globals():
    del model
torch.cuda.empty_cache()

# from models.clip_model import CLIPModel
# model = CLIPModel(gpu)

from model.xvlm_itr_coco_model import XVLMModel
model = XVLMModel(gpu)

  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


load checkpoint from ../data/models/4m_base_finetune/itr_coco/checkpoint_best.pth
missing_keys:  []
unexpected_keys:  []


### Target vs. Neutral

#### Classes

In [8]:
from prompt_dataset import PromptDataset

class_dataset = PromptDataset(class_samples_df, prompt_transform=lambda e: [
    f"a bad photo of a {e['class']}", 
    "a bad photo of an object"
], img_size=model.img_size)

In [9]:
from evaluator import Evaluator

class_evaluator = Evaluator(model, class_dataset, batch_size=8)
class_evaluator.evaluate()

Accuracy: 84.926%
Precision: 0.861
Recall: 0.833


{'accuracy': tensor(84.9264, device='cuda:0'),
 'precision': tensor(0.8610, device='cuda:0'),
 'recall': tensor(0.8330, device='cuda:0')}

#### Attributes

In [8]:
from prompt_dataset import PromptDataset

attr_dataset = PromptDataset(attr_samples_df, prompt_transform=lambda e: [
    f"{e['attr_value']} {e['object_name']}", 
    f"{e['object_name']}"
], img_size=model.img_size, mode="pad")

In [9]:
from evaluator import Evaluator

attr_evaluator = Evaluator(model, attr_dataset, batch_size=64)
attr_evaluator.evaluate()

Accuracy: 49.728%
Precision: 0.496
Recall: 0.300


{'accuracy': tensor(49.7283, device='cuda:0'),
 'precision': tensor(0.4955, device='cuda:0'),
 'recall': tensor(0.2999, device='cuda:0')}

#### Relations

In [8]:
from prompt_dataset import PromptDataset

rel_dataset = PromptDataset(rel_samples_df, prompt_transform=lambda e: [
    f"{e['object0_name']} {e['rel']} {e['object1_name']}", 
    f"{e['object0_name']} and {e['object1_name']}"
], img_size=model.img_size, mode="scale")

In [11]:
from evaluator import Evaluator

rel_evaluator = Evaluator(model, rel_dataset, batch_size=64)
rel_evaluator.evaluate()

Accuracy: 49.194%
Precision: 0.492
Recall: 0.477


{'accuracy': tensor(49.1941, device='cuda:0'),
 'precision': tensor(0.4917, device='cuda:0'),
 'recall': tensor(0.4765, device='cuda:0')}

### Target vs. Contrastive Ensemble

In [8]:
import json

with open("../data/metadata/gqa_all_class.json") as f:
    classes = json.load(f)
    
with open("../data/metadata/gqa_all_attribute.json") as f:
    attributes = json.load(f)
    
with open("../data/metadata/gqa_relation.json") as f:
    relations = json.load(f)

In [9]:
import random 

classes_sample = random.sample([item.replace('_', ' ') for items in classes.values() for item in items], 100)
attributes_sample = random.sample([item.replace('_', ' ') for items in attributes.values() for item in items], 100)
rels_sample = random.sample([item.replace('_', ' ') for item in relations], 100)

In [10]:
classes_prompts = [f"a bad photo of a {clazz}" for clazz in classes_sample]
attr_prompts = [f"a bad photo of a {attr} object" for attr in attributes_sample]
rel_prompts = [f"a bad photo of an object {rel} an object" for rel in rels_sample]

#### Classes

In [11]:
from prompt_dataset import PromptDataset

class_dataset = PromptDataset(class_samples_df, prompt_transform=lambda e: [
    f"a bad photo of a {e['class']}", 
    "a bad photo of an object"
], img_size=model.img_size)

In [12]:
from contrastive_evaluator import ContrastiveEvaluator

class_evaluator = ContrastiveEvaluator(model, class_dataset, classes_prompts, batch_size=64)
class_evaluator.evaluate()

Accuracy: 68.623%
Precision: 0.626
Recall: 0.924


{'accuracy': 68.62349428600845,
 'precision': 0.6262933742930951,
 'recall': 0.923545763409863}

#### Attributes

In [13]:
from prompt_dataset import PromptDataset

attr_dataset = PromptDataset(attr_samples_df, prompt_transform=lambda e: [
    f"a bad photo of a {e['attr_value']} object", 
    "a bad photo of an regular object"
], img_size=model.img_size)

In [14]:
from contrastive_evaluator import ContrastiveEvaluator

attr_evaluator = ContrastiveEvaluator(model, attr_dataset, attr_prompts, batch_size=64)
attr_evaluator.evaluate()

Accuracy: 56.668%
Precision: 0.567
Recall: 0.564


{'accuracy': 56.66811468288445,
 'precision': 0.5670110593713621,
 'recall': 0.5642195192586157}