# Visual Genome Preprocessing

This notebook preprocesses the Visual Genome (VG) dataset as described in the supplementary material of our paper. Please first make sure you have downloaded the dataset from https://visualgenome.org/api/v0/api_home.html at the latest revision.

Please also install the python driver from https://github.com/ranjaykrishna/visual_genome_python_driver

In [13]:
import visual_genome.local as vg
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from PIL import Image as PIL_Image
import json
import os
import cv2
from tqdm import tqdm

In [2]:
# Root directory of the Visual Genome download. Set the VG_DATA_DIR environment
# variable to point somewhere else; the fallback is the original local path.
# Joining with "" guarantees a trailing separator, which the rest of the
# notebook relies on when it builds paths as `data_dir + "file.json"`.
data_dir = os.path.join(os.environ.get("VG_DATA_DIR") or "/ssd/tobias/datasets/vg/", "")

In [3]:
# Per-image attribute annotations; `with` makes sure the file handle is closed.
with open(data_dir + "attributes.json") as attributes_file:
    attributes = json.load(attributes_file)

In [5]:
# Per-image object annotations; `with` makes sure the file handle is closed.
with open(data_dir + "objects.json") as objects_file:
    objects = json.load(objects_file)

In [6]:
# An example of the format of the annotations: the first entry of objects.json,
# a dict holding an "image_id" and the list of "objects" annotated on that image.
print(objects[0])

{'image_id': 1, 'objects': [{'synsets': ['tree.n.01'], 'h': 557, 'object_id': 1058549, 'merged_object_ids': [], 'names': ['trees'], 'w': 799, 'y': 0, 'x': 0}, {'synsets': ['sidewalk.n.01'], 'h': 290, 'object_id': 1058534, 'merged_object_ids': [5046], 'names': ['sidewalk'], 'w': 722, 'y': 308, 'x': 78}, {'synsets': ['building.n.01'], 'h': 538, 'object_id': 1058508, 'merged_object_ids': [], 'names': ['building'], 'w': 222, 'y': 0, 'x': 1}, {'synsets': ['street.n.01'], 'h': 258, 'object_id': 1058539, 'merged_object_ids': [3798578], 'names': ['street'], 'w': 359, 'y': 283, 'x': 439}, {'synsets': ['wall.n.01'], 'h': 535, 'object_id': 1058543, 'merged_object_ids': [], 'names': ['wall'], 'w': 135, 'y': 1, 'x': 0}, {'synsets': ['tree.n.01'], 'h': 360, 'object_id': 1058545, 'merged_object_ids': [], 'names': ['tree'], 'w': 476, 'y': 0, 'x': 178}, {'synsets': ['shade.n.01'], 'h': 189, 'object_id': 5045, 'merged_object_ids': [], 'names': ['shade'], 'w': 274, 'y': 344, 'x': 116}, {'synsets': ['van.

In [7]:
# Compare how many entries the two annotation files hold for the first image.
first_image_objects = objects[0]["objects"]
first_image_attributes = attributes[0]["attributes"]
print(len(first_image_objects))
print(len(first_image_attributes))

31
40


In [8]:
# Cross-check the first image: report every object id from attributes.json that
# is missing from objects.json, and every id that attributes.json lists twice.
# Sets give constant-time membership tests instead of a list scan per lookup.
known_object_ids = {obj["object_id"] for obj in objects[0]["objects"]}
seen_object_ids = set()
for obj in attributes[0]["attributes"]:
    object_id = obj["object_id"]
    if object_id not in known_object_ids:
        print(object_id, " not in objects file.")
    elif object_id in seen_object_ids:
        print(object_id, " seen twice.")
    else:
        seen_object_ids.add(object_id)
        

5046  not in objects file.
1058529  not in objects file.
5048  not in objects file.
1058532  not in objects file.
1058536  not in objects file.
3798575  not in objects file.
1058540  not in objects file.
1058544  not in objects file.
3798578  not in objects file.


In [9]:
# A dictionary that maps the attribute words to the corresponding synsets.
# `with` makes sure the file handle is closed after parsing.
with open(data_dir + "attribute_synsets.json") as synsets_file:
    mysynsets = json.load(synsets_file)

In [10]:
# Example lookup: the synset stored for the attribute word "sparse".
print(mysynsets["sparse"])

sparse.s.01


In [14]:
from detectron2.structures import BoxMode

def _vg_attribute_ids(spec_object, attribute_synsets, appearing_attributes, frozen):
    """Translate one object's raw attribute strings into integer attribute ids.

    Attribute strings are lower-cased and stripped, then looked up in
    ``attribute_synsets``; strings without a synset are skipped. Unknown
    synsets get the next free id in ``appearing_attributes`` unless ``frozen``
    is true, in which case they are skipped.
    """
    attribute_ids = []
    for att in spec_object.get("attributes", []):
        attmod = att.lower().strip()
        if attmod not in attribute_synsets:
            continue  # attribute word has no synset entry
        att_synset = attribute_synsets[attmod]
        if att_synset not in appearing_attributes:
            if frozen:
                continue
            # Ids are handed out consecutively, so the next free id is the map size.
            appearing_attributes[att_synset] = len(appearing_attributes)
        attribute_ids.append(appearing_attributes[att_synset])
    return attribute_ids


def get_vg_dicts(vg_dir, val=False, num_val=10000):
    """Return the detectron-style dicts for the VG dataset.

    See https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html
    for the record format.

    Args:
        vg_dir: Dataset root containing ``attributes.json``,
            ``attribute_synsets.json`` and the image folders.
        val: If False, build the training split and create fresh label maps.
            If True, build the validation split and read the label maps from
            ``appearing_objects.json`` / ``appearing_attributes.json`` in
            ``vg_dir``; synsets missing from those maps are skipped.
        num_val: Number of images at the end of the image list used as the
            validation split (everything before them is the training split).

    Returns:
        Tuple ``(dataset_dicts, appearing_objects, appearing_attributes)``:
        the list of records, and the synset -> int maps for objects and
        attributes.

    Raises:
        FileNotFoundError: If an image file cannot be read.
    """
    im_list = vg.get_all_image_data(data_dir=vg_dir)  # image metadata
    print(len(im_list))

    if val:
        # Reuse the maps of the training pass so ids agree across both splits.
        with open(os.path.join(vg_dir, "appearing_objects.json")) as fhandle:
            appearing_objects = json.load(fhandle)
        with open(os.path.join(vg_dir, "appearing_attributes.json")) as fhandle:
            appearing_attributes = json.load(fhandle)
    else:
        appearing_objects = {}  # maps object synset -> int
        appearing_attributes = {}  # maps attribute synset -> int

    with open(os.path.join(vg_dir, "attribute_synsets.json")) as fhandle:
        attribute_synsets = json.load(fhandle)
    with open(os.path.join(vg_dir, "attributes.json")) as fhandle:
        attributes = json.load(fhandle)
    # Make the per-image annotation lists searchable by image id.
    annotations_by_image = {entry["image_id"]: entry["attributes"] for entry in attributes}
    # Image urls are turned into local paths by cutting off this prefix.
    # NOTE(review): assumes every url starts with it; verify against image_data.json.
    striplen = len("https://cs.stanford.edu/people/rak248/")
    print("Preprocessing done.")

    # Train/val split: the last `num_val` images form the validation set.
    split_at = max(len(im_list) - num_val, 0)
    images = im_list[split_at:] if val else im_list[:split_at]

    dataset_dicts = []
    for image_meta in tqdm(images):
        filename = os.path.join(vg_dir, image_meta.url[striplen:])
        image = cv2.imread(filename)
        if image is None:
            # cv2.imread reports an unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image file: {filename}")
        height, width = image.shape[:2]

        record = {
            "file_name": filename,
            "image_id": image_meta.id,
            "height": height,
            "width": width,
        }

        objs = []
        for spec_object in annotations_by_image[image_meta.id]:
            # One annotation is emitted per synset attached to the object.
            for synsetname in spec_object["synsets"]:
                if synsetname not in appearing_objects:
                    if val:
                        continue
                    appearing_objects[synsetname] = len(appearing_objects)
                objs.append({
                    "bbox": [spec_object["x"], spec_object["y"], spec_object["w"], spec_object["h"]],
                    "bbox_mode": BoxMode.XYWH_ABS,
                    "category_id": appearing_objects[synsetname],
                    "attribute_ids": _vg_attribute_ids(
                        spec_object, attribute_synsets, appearing_attributes, val
                    ),
                })
        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts, appearing_objects, appearing_attributes

## Create the training set

In [15]:
# Build the training records. With val=False this pass also creates the
# synset -> id maps for objects and attributes from scratch.
dataset, appearing_objects, appearing_attributes = get_vg_dicts(data_dir, val=False)

108077
Preprocessing done.


98077it [04:58, 328.11it/s]


In [16]:
# Persist the training records and both label maps.
# `with` closes each file even if json.dump raises part-way through.
with open(data_dir + "detectron_train.json", "w") as fhandle:
    json.dump(dataset, fhandle)

print(f"Number of classes: {len(appearing_objects.keys())}")
with open(data_dir + "appearing_objects.json", "w") as fhandle:
    json.dump(appearing_objects, fhandle)

print(f"Number of attributes: {len(appearing_attributes.keys())}")
with open(data_dir + "appearing_attributes.json", "w") as fhandle:
    json.dump(appearing_attributes, fhandle)

Number of classes: 7842
Number of attributes: 6277


## Create the validation set

In [17]:
# Build the validation records. With val=True, get_vg_dicts reads the label maps
# from the appearing_*.json files in data_dir, so the returned maps are ignored.
# Note that this overwrites `dataset`, which held the training records before.
dataset, _, _ = get_vg_dicts(data_dir, val=True)

108077
Preprocessing done.


10000it [00:26, 374.04it/s]


In [19]:
# Persist the validation records; `with` closes the file even if json.dump raises.
with open(data_dir + "detectron_val.json", "w") as fhandle:
    json.dump(dataset, fhandle)

In [20]:
# Reload the label maps and the unfiltered training records from disk.
# Each file is opened in a `with` block so its handle is closed after parsing.
with open(data_dir + "appearing_objects.json") as fhandle:
    appearing_objects = json.load(fhandle)
with open(data_dir + "appearing_attributes.json") as fhandle:
    appearing_attributes = json.load(fhandle)
with open(data_dir + "detectron_train.json") as fhandle:
    dataset = json.load(fhandle)

In [21]:
# Lookup table: image_id -> position of that image's record in `dataset`.
id_to_idx = dict(
    (record["image_id"], position) for position, record in enumerate(dataset)
)

In [22]:
# Example lookup: position of the record for image 2385321.
id_to_idx[2385321]

35938

## Now filter all objects with 8 or fewer occurrences.
Because of the large variety of labels, we filter out classes and attributes that appear so seldom that the classifier has no chance to learn them. We didn't want to reduce the coverage too much, but classes with 8 or fewer instances were almost never predicted anyway.
Technical remark: ``appearing_objects`` and ``appearing_attributes`` should still be defined from the previous step.

In [33]:
def get_vg_dicts_cached(vg_dir, val=False):
    """Read previously written dataset dicts from disk.

    Args:
        vg_dir: Directory that holds ``detectron_train.json`` and
            ``detectron_val.json``; a trailing path separator is optional.
        val: Load the validation file if True, otherwise the training file.

    Returns:
        The parsed JSON content of the selected file.
    """
    split = "val" if val else "train"
    # os.path.join gives the same path as plain concatenation when vg_dir ends
    # with a separator, and a correct one when it does not.
    fname = os.path.join(vg_dir, "detectron_" + split + ".json")
    # `with` closes the handle even if the file is not valid JSON.
    with open(fname) as fhandle:
        return json.load(fhandle)

In [34]:
# Reload the unfiltered training records from detectron_train.json.
dataset = get_vg_dicts_cached(data_dir, val=False)

In [35]:
# Inverse lookups: integer id -> synset, for objects and for attributes.
rev_appearing_objects = dict(
    (object_id, synset) for synset, object_id in appearing_objects.items()
)
rev_appearing_attributes = dict(
    (attribute_id, synset) for synset, attribute_id in appearing_attributes.items()
)

In [36]:
# Tally how often every object category id and every attribute id is used in
# the training annotations. Keys are inserted in first-seen order.
occurance_count = {}  # category id -> number of annotations
occurance_attr = {}  # attribute id -> number of annotations
for record in dataset:
    for annotation in record["annotations"]:
        category = annotation["category_id"]
        occurance_count[category] = occurance_count.get(category, 0) + 1
        for attribute in annotation["attribute_ids"]:
            occurance_attr[attribute] = occurance_attr.get(attribute, 0) + 1

first_synsets = [rev_appearing_objects[k] for k in range(25)]
first_counts = list(occurance_count.values())[:25]
print("First object  ", first_synsets)
print("First objects occurance counts:", first_counts)

#print(list(occurance_attr.values())[:25])

First object   ['clock.n.01', 'street.n.01', 'shade.n.01', 'man.n.01', 'gym_shoe.n.01', 'headlight.n.01', 'car.n.01', 'bicycle.n.01', 'sign.n.02', 'building.n.01', 'trunk.n.01', 'sidewalk.n.01', 'shirt.n.01', 'back.n.01', 'spectacles.n.01', 'parking_meter.n.01', 'shoe.n.01', 'trouser.n.01', 'jacket.n.01', 'chin.n.01', 'guy.n.01', 'van.n.05', 'wall.n.01', 'tree.n.01', 'arm.n.01']
First objects occurance counts: [9394, 10717, 1723, 73410, 3574, 4802, 21942, 8489, 33450, 35032, 6918, 8950, 38316, 2993, 6043, 530, 16003, 14051, 11484, 564, 2800, 2172, 34511, 54270, 10094]


In [37]:
# Count the classes and attributes that occur more than n times.
n = 8
classes_remain = sum(1 for count in occurance_count.values() if count > n)
attributes_remain = sum(1 for count in occurance_attr.values() if count > n)
print(f"Remaining classes: {classes_remain}")
print(f"Remaining attributes: {attributes_remain}")

Remaining classes: 3434
Remaining attributes: 2979


In [38]:
# Old id -> synset lookups for objects and attributes.
id_to_set = dict((old_id, synset) for synset, old_id in appearing_objects.items())
att_id_to_set = dict((old_id, synset) for synset, old_id in appearing_attributes.items())

# Ids that occur more than n times survive; they are renumbered densely from 0
# in the iteration order of the occurrence dictionaries.
kept_object_ids = [old_id for old_id, count in occurance_count.items() if count > n]
kept_attribute_ids = [old_id for old_id, count in occurance_attr.items() if count > n]

new_ids = {old_id: new_id for new_id, old_id in enumerate(kept_object_ids)}
new_ids_attr = {old_id: new_id for new_id, old_id in enumerate(kept_attribute_ids)}

# Reverse direction: new id -> old id.
new_to_old = dict(enumerate(kept_object_ids))
new_to_old_att = dict(enumerate(kept_attribute_ids))

We update the item-ids to the filtered version. Here's an example:

In [41]:
# Example: the id of one attribute synset before and after filtering.
old_neck_id = appearing_attributes["neck.n.01"]
print("Old ID:", old_neck_id)
print("New ID:", new_ids_attr[old_neck_id])

Old ID: 1934
New ID: 1735


In [42]:
def update_index(dataset):
    """Remap object and attribute ids to the filtered id space, in place.

    Objects whose category id is not a key of ``new_ids`` are removed from
    their record. For every remaining object, attributes that are not keys of
    ``new_ids_attr`` are removed, duplicates are dropped, and the rest are
    ordered most common first according to ``occurance_attr`` (attributes with
    equal counts keep their original relative order).

    Relies on the notebook globals ``new_ids``, ``new_ids_attr`` and
    ``occurance_attr``.

    The records are modified in place, so calling this twice on the same data
    would remap ids that were already remapped; always start from freshly
    loaded, unfiltered records.

    Args:
        dataset: List of records, each with an "annotations" list of dicts
            that carry "category_id" and "attribute_ids".

    Returns:
        The same ``dataset`` list, after modification.
    """
    for item in dataset:
        kept_objects = []
        for obj in item["annotations"]:
            if obj["category_id"] not in new_ids:
                continue  # class did not make the cut
            obj["category_id"] = new_ids[obj["category_id"]]
            # dict.fromkeys removes duplicates while preserving first-seen order.
            surviving = [
                old_att for old_att in dict.fromkeys(obj["attribute_ids"])
                if old_att in new_ids_attr
            ]
            # Stable sort on the negated count: most common attribute first,
            # ties stay in their original order.
            surviving.sort(key=lambda old_att: -occurance_attr[old_att])
            obj["attribute_ids"] = [new_ids_attr[old_att] for old_att in surviving]
            kept_objects.append(obj)
        item["annotations"] = kept_objects
    return dataset

## Update the train set

In [43]:
# update_index rewrites the records in place. `dataset` must hold the unfiltered
# training records here; reload them before running this cell a second time.
dataset = update_index(dataset)

In [44]:
# Persist the filtered training records; `with` closes the file even if json.dump raises.
with open(data_dir + "detectron_train_filtered.json", "w") as fhandle:
    json.dump(dataset, fhandle)

In [45]:
# New class labels: filtered id -> synset, for objects and for attributes.
classesdict = dict(
    (new_id, id_to_set[old_id]) for new_id, old_id in new_to_old.items()
)
attrdict = dict(
    (new_id, att_id_to_set[old_id]) for new_id, old_id in new_to_old_att.items()
)

In [46]:
# Persist the filtered label maps; `with` closes each file even if json.dump raises.
with open(data_dir + "appearing_objects_filtered.json", "w") as fhandle:
    json.dump(classesdict, fhandle)

with open(data_dir + "appearing_attributes_filtered.json", "w") as fhandle:
    json.dump(attrdict, fhandle)

## Update validation dataset

In [48]:
# Load the unfiltered validation records, remap them with the id maps derived
# from the training set, and write the filtered result.
dataset = get_vg_dicts_cached(data_dir, val=True)

dataset = update_index(dataset)

# `with` closes the file even if json.dump raises.
with open(data_dir + "detectron_val_filtered.json", "w") as fhandle:
    json.dump(dataset, fhandle)

Sanity check: Read files and print max ids

In [49]:
# Sanity check: read the filtered training file back and print the largest
# category id and the largest attribute id it contains.
with open(data_dir + "detectron_train_filtered.json", "r") as fhandle:
    data = json.load(fhandle)
maxid = 0
maxattid = 0
for item in data:
    for objectann in item["annotations"]:
        maxid = max(maxid, objectann["category_id"])
        # Look at every attribute id of the object, not only the first one;
        # default=0 covers objects without attributes.
        maxattid = max(maxattid, max(objectann["attribute_ids"], default=0))
print(maxid, maxattid)

3433 2978
