# VisualGenome

Version: newest (v1.2 for the region graphs)

### Setting imports and paths

In [24]:
import json
import pandas as pd
import networkx as nx
from collections import defaultdict

In [22]:
# Location of the scene-graph JSON file, relative to the working directory.
vg_path = 'raw/scene_graphs.json'

In [23]:
# Parse the whole scene-graph file into memory.
# JSON text is UTF-8 by specification, so the encoding is pinned here rather
# than inherited from the platform default, which is not UTF-8 everywhere.
with open(vg_path, 'r', encoding='utf-8') as f:
    images_data = json.load(f)

### Load the data into two lists: nodes (from objects with attributes) and edges (from relationships) without deduplication

In [14]:
# Keep the first loaded record at hand so its structure can be inspected.
single_image = images_data[0]
# print(single_image)

In [15]:
# Number of top-level records parsed from the scene-graph file.
len(images_data)

108077

In [11]:
# Empty accumulators: one list for relationship rows, one for object rows.
all_rels, all_objs = [], []

In [37]:
# Flatten every record's relationships and objects into the two global lists.
# Nothing is deduplicated here: one row is appended per item encountered.
for image in images_data:
    for relation in image['relationships']:
        relation_synsets = relation['synsets']
        # Report relationships that carry more than one synset.
        if len(relation_synsets) > 1:
            print('rel with >1 synsets', relation_synsets)
        # Edge row layout: [relationship id, subject id, predicate, object id, synsets]
        all_rels.append([
            relation['relationship_id'],
            relation['subject_id'],
            relation['predicate'],
            relation['object_id'],
            relation_synsets,
        ])

    for entity in image['objects']:
        # Node row layout: [object id, names, synsets, attributes].
        # Objects without an 'attributes' key get an empty attribute list.
        all_objs.append([
            entity['object_id'],
            entity['names'],
            entity['synsets'],
            entity.get('attributes', []),
        ])

In [17]:
# Number of relationship rows collected (no deduplication applied).
len(all_rels)

2316104

In [21]:
# Number of object rows collected (no deduplication applied).
len(all_objs)

3802378

### Load the data into two tables: nodes (from objects with attributes) and edges (from relationships) WITH deduplication

In [29]:
# Collapse relationships by relationship_id. Each id maps to a dict of sets
# holding every predicate, subject id, object id and synset seen under that
# id, so repeated ids merge their values instead of producing extra rows.
all_rels_ded = {}
for an_image in images_data:
    rels = an_image['relationships']
    for rel in rels:
        rel_id = rel['relationship_id']
        pred = rel['predicate']
        sub_id = rel['subject_id']
        obj_id = rel['object_id']
        synsets = rel['synsets']

        # First sighting of this id: start from empty value sets.
        if rel_id not in all_rels_ded:
            all_rels_ded[rel_id] = {
                'pred': set(),
                'sub_id': set(),
                'obj_id': set(),
                'synsets': set(),
            }

        rel_data = all_rels_ded[rel_id]
        rel_data['pred'].add(pred)
        rel_data['obj_id'].add(obj_id)
        rel_data['sub_id'].add(sub_id)
        # synsets is iterated element by element, so each one is added.
        rel_data['synsets'].update(synsets)

In [30]:
# Number of distinct relationship ids after merging.
len(all_rels_ded)

2316104

In [34]:
# Collapse objects by object_id. Each id maps to a dict of sets holding every
# name, synset and attribute seen under that id. An id that shows up again is
# reported on stdout and its values are merged into the existing entry.
all_objs_ded = {}
for an_image in images_data:
    objects = an_image['objects']
    for obj in objects:
        obj_id = obj['object_id']
        names = obj['names']
        synsets = obj['synsets']
        # Objects without an 'attributes' key are treated as having none.
        attrs = obj.get('attributes', [])

        if obj_id not in all_objs_ded:
            all_objs_ded[obj_id] = {
                'names': set(),
                'synsets': set(),
                'attrs': set(),
            }
        else:
            print('object reused:', obj_id, names, synsets, attrs)

        obj_data = all_objs_ded[obj_id]
        obj_data['names'].update(names)
        obj_data['synsets'].update(synsets)
        obj_data['attrs'].update(attrs)

In [35]:
# Number of distinct object ids after merging.
len(all_objs_ded)

3802374