# Visual Genome question generation
This notebook filters the Visual Genome data to keep only the data of a specific domain. The Visual Genome dataset contains object annotations, which we use to filter the data. The result is the set of images containing objects listed in the keywords text file.

## imports

In [0]:
#import drive, you can work without drive

from google.colab import drive
#from visual_genome import api
import json
import seaborn as sbs
import pandas as pd
from itertools import groupby
from collections import Counter
import os
import sys
import csv
#from gensim.models import Word2Vec
#import gensim





## variables

In [0]:
#TO Implement   
    #mounting drive path, this is optional
drive_path = ''
# root path as defined in install.ipynb 
data_root = ''

save_root = os.path.join(data_root, 'results/VGQA')
#file containing keywords for filtering images
keywords_file = os.path.join(data_root, 'keywords_files/indoor.txt')
#file containing negative keywords for filtering images
neg_keywords_file = os.path.join(data_root, 'keywords_files/Nindoor.txt')

VGobjects_path = os.path.join(data_root, 'VG/objects.json')
#file where to save image ids
writingPath = os.path.join(save_root,'VisualGenomeImageIds.txt')

#file where to save Questions Answers TODO
save_qas_file = os.path.join(save_root,'QAs.json')

#TODO :
save_GQA = os.path.join(save_root, 'GQA.csv')
filtred_GQA = os.path.join(save_root, 'GQA_filtered.csv')

VG_GQA = os.path.join(save_root, 'VG_GQA.csv')
VG_QAs_path = os.path.join(data_root, 'VG/question_answers.json')
train_questions_GQA = os.path.join(data_root, 'GQA/train_all_questions')
val_questions_GQA = os.path.join(data_root, 'GQA/val_all_questions.json')



## Mounting Drive (Don't do it if you don't use drive)

In [0]:
# Mount Google Drive only when a mount point has been configured. With the
# default empty drive_path the notebook is meant to run without Drive, so the
# mount is skipped instead of being attempted on an empty path.
if drive_path:
  drive.mount(drive_path)

## Visual Genome API
Install a Python Visual Genome API that lets us interact with the Visual Genome dataset, for example to get the objects in an image.


### Download all image Ids
Download them via API and count them

In [0]:
# NOTE(review): plt, Image, requests and BytesIO are imported here but are not
# used by any other cell of this notebook.
import matplotlib.pyplot as plt

from PIL import Image
import requests
from io import BytesIO

# NOTE(review): `api` is only provided by the commented-out
# `from visual_genome import api` line in the imports cell, so on a fresh
# kernel this call raises NameError unless that import is restored.
# Presumably fetches the question/answer entries of image 4 — confirm against
# the visual_genome package.
qas = api.get_QA_of_image(id=4)


## Loading and filtering keywords file
Remove extra characters, empty strings and repeated words from the keywords. Keywords are the names of objects that must be present in the image.

In [0]:
# Read the positive keywords, one object name per line. Names are lower-cased
# and stripped, blank lines are dropped, and duplicates are removed through a
# set (so the order of `lines` is arbitrary).
# The context manager closes the file even when reading fails.
with open(keywords_file, 'r') as f:
  lines = list({x.lower().strip() for x in f if x.strip()})

In [0]:
# Read the negative keywords with the same normalisation as the positive ones:
# lower-cased, stripped, blank lines dropped, duplicates removed through a set.
# The context manager closes the file even when reading fails.
with open(neg_keywords_file, 'r') as g:
  Nlines = list({x.lower().strip() for x in g if x.strip()})
Nlines

## Download objects 
Load the downloaded objects file of the Visual Genome dataset. The file contains the scene graph of each image in Visual Genome. Each scene graph contains a list of objects and other information.
`idGraph` will contain tuples of (image id, objects present in this image).

In [0]:
# objects.json must be available at VGobjects_path (e.g. put it in Drive).
# The context manager closes the file even when parsing fails.
with open(VGobjects_path, 'r') as f:
  idGraphs = json.load(f)

# One (image id, [object names]) tuple per image. An object can carry several
# names; all of them are kept, in file order.
idGraph = [
  (g['image_id'], [name for obj in g['objects'] for name in obj['names']])
  for g in idGraphs
]
 

### copying idGraph in idObjs (this could be avoided but i don't want to refactor code)

In [0]:
# Plain copy of idGraph as a fresh list of (image id, object names) tuples.
idObjs = [(image_id, names) for image_id, names in idGraph]


## Filter objects
Here we filter the images of Visual Genome by their objects: an image is kept when it contains at least one object that is in `lines` (keywords) and no object that is in `Nlines` (negative keywords).
`idObjsLen` is a list of tuples (image id, list of objects, number of distinct keywords matched). The tuples represent the images that have at least one object that passed the filter.

In [0]:
# Keep the images that contain at least one keyword object and no negative
# keyword object. Each kept entry is
# (image id, original object names, number of distinct keywords matched).
keyword_set = set(lines)        # built once instead of once per image
neg_keyword_set = set(Nlines)

idObjsLen = []
# `image_id` rather than `id`: a module-level loop variable named `id` would
# shadow the builtin for every later cell.
for image_id, objs in idObjs:
  names = {obj.lower().strip() for obj in objs}
  matched = names & keyword_set
  if matched and names.isdisjoint(neg_keyword_set):
    idObjsLen.append((image_id, objs, len(matched)))
 

In [0]:
# Number of images that passed the keyword filter.
len(idObjsLen)


In [0]:
import pandas as pd
# Flatten the object names of all filtered images and count each name.
objects = [x for y in idObjsLen for x in y[1]]
df = pd.Series(objects).value_counts()
# Keep the 30 most frequent names.
df = df.sort_values(ascending=False).head(30)
# The separator is passed by keyword: positional arguments after the path are
# deprecated in pandas' to_csv.
df.to_csv('/content/df.csv', sep=";")

### Occurrences of objects in filtered images
`idObjsLen2` is `idObjsLen` sorted by the number of matched keywords, in descending order. In my case this was used to add new keywords to the keywords file whenever I saw frequent words that were needed for better filtering.

In [0]:
# Order the filtered images by their third tuple field (the matched-keyword
# count), highest first.
idObjsLen2 = list(idObjsLen)
idObjsLen2.sort(key=lambda entry: entry[2], reverse=True)
# Alternative selection: only the images with exactly one match.
#idObjsLen2 = [ i  for i in idObjsLen if i[2] == 1]
idObjsLen2

In [0]:

# Pie chart of how often each matched-keyword count occurs among the filtered
# images.
occurences = [n_matched for (_image_id, _names, n_matched) in idObjsLen if n_matched >= 0]
d = pd.Series(occurences)
d.value_counts().plot(kind='pie')
#sbs.distplot(occurences)

### Occurrences of remaining words
This was used to fill negative keywords

In [0]:

# Count the object names of the filtered images that are not keywords, most
# frequent first.
# Names are normalised (lower-cased, stripped) the same way the filter step
# normalises them, so a keyword written with different case or padding is not
# reported as a remaining word.
keyword_lookup = set(lines)  # set membership instead of scanning the list for every name
ObjsP = [
  name
  for entry in idObjsLen
  for name in (raw.lower().strip() for raw in entry[1])
  if name not in keyword_lookup
]
FObjsP = Counter(ObjsP).most_common()
FObjsP

## Save filtered images in file
We save the IDs of the filtered images to a file.

In [0]:

# Image ids in the order of idObjsLen2.
imageIDs = [x[0] for x in idObjsLen2]
# Make sure the output directory exists before opening the file.
os.makedirs(os.path.dirname(writingPath) or '.', exist_ok=True)
# One id per line, no trailing newline. The context manager guarantees the
# file is flushed and closed before a later cell reads it back.
with open(writingPath, 'w') as ids_file:
  ids_file.write('\n'.join(str(x) for x in imageIDs))

## Get QA and save them.



### Filter questions answers
We take the question-answer pairs that correspond to the filtered images and save them in a JSON file.

In [0]:
# Reload the saved image ids. Blank lines are skipped by testing the stripped
# text: a raw line still carries its newline, so comparing it with '' would
# never filter anything and int('') would raise.
with open(writingPath) as p:
  imageIDs = [int(x.strip()) for x in p if x.strip()]

Int_imageIDs = [int(x) for x in imageIDs]
# Set used for the membership test below, so every image is looked up in
# constant time instead of scanning the whole list.
image_id_lookup = set(Int_imageIDs)

with open(VG_QAs_path, 'r') as g:
  QAs = json.load(g)

# Preview the first 10 entries of the question/answer file.
for x in QAs[:10]:
  print(x)

# Keep only the question and answer text of the filtered images.
objs = []
for im in QAs:
  image_id = im['id']
  if image_id not in image_id_lookup:
    continue
  objs.append({
    'image_id': image_id,
    'QAs': [{'question': x['question'], 'answer': x['answer']} for x in im['qas']],
  })

# Number of images for which questions/answers were kept.
count = len(objs)
print(count)
with open(save_qas_file, 'w') as f:
  json.dump(objs, f)

In [0]:
import pandas as pd

# Read the Visual Genome question/answer file again.
with open(VG_QAs_path, 'r') as g:
  QAs = json.load(g)
i = 0

# Collect the (question, answer) pairs of the image whose id is 4 and stop at
# the first entry with that id.
objs = []
for im in QAs:
  if im['id'] != 4:
    continue
  qas = im['qas']
  objs.extend((qa['question'], qa['answer']) for qa in qas)
  break

 

In [0]:
# Two-column table built from the (question, answer) pairs held in `objs`.
df = pd.DataFrame({'questions': [x[0] for x in objs], 'reponses': [x[1] for x in objs]})
# NOTE(review): this is the same path the object-frequency table is written
# to earlier in the notebook, so that file gets overwritten.
# The separator is passed by keyword: positional arguments after the path are
# deprecated in pandas' to_csv.
df.to_csv('/content/df.csv', sep=';')

In [0]:
# Disabled experiment: the whole cell is one string literal, so none of the
# code inside it runs. It loads word2vec vectors with gensim and collects the
# similarity between every keyword in `lines` and every object name in `idObjs`.
# NOTE(review): if this is ever re-enabled, `sim = sim.sort(...)` sets `sim`
# to None because list.sort sorts in place and returns None.
'''
#!pip install gensim
#os.chdir('/content/gdrive/My Drive/Colab')
#!gunzip -f -k GoogleNews-vectors-negative300.bin.gz

#os.chdir('/content')
model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/My Drive/Colab/GoogleNews-vectors-negative300.bin', binary = True)
sim = []
out = []
for word in lines:
  for objs in idObjs:
    for obj in objs[1]:
      if (obj in model.wv.vocab) and (word in model.wv.vocab):
        sim.append((obj, model.similarity(obj, word)))
      else:
        if not obj in model.wv.vocab : 
          out.append((objs[0],obj))
sim = sim.sort(key=lambda tup: tup[1])    
'''

# QAs from GQA
GQA is a new discipline that aims to enhance visual reasoning in VQA. A GQA dataset built on Visual Genome is available at https://cs.stanford.edu/people/dorarad/gqa/download.html

## Download Questions from GQA dataset
Download the questions and load them.

In [0]:
def readGQA(path_file):
  """Load one GQA questions JSON file.

  The file is expected to hold a JSON object whose values are dicts with the
  keys 'imageId', 'question' and 'answer'.

  Returns a list of (imageId, question, answer) tuples, one per value, in the
  order of the JSON object.
  """
  with open(path_file, 'r') as handle:
    questions_by_key = json.load(handle)
  triples = []
  for entry in questions_by_key.values():
    triples.append((entry['imageId'], entry['question'], entry['answer']))
  return triples
 
# Every file of the GQA training-questions directory. The listing is sorted
# because os.listdir returns entries in arbitrary order, which would make the
# row order of the CSV differ between runs.
ques_path = [os.path.join(train_questions_GQA, x) for x in sorted(os.listdir(train_questions_GQA))]

# 'w' keeps the cell idempotent: in append mode every re-run would add all
# rows to the CSV a second time.
# newline='' is what the csv module expects for the files it writes.
with open(save_GQA, 'w', newline='') as f:
  writer = csv.writer(f)
  for file in ques_path:
    # One CSV row per (imageId, question, answer) tuple.
    writer.writerows(readGQA(file))



In [0]:
# Read the GQA CSV back; newline='' is what the csv module expects.
with open(save_GQA, 'r', newline='') as f:
  data = list(csv.reader(f))
# Print the row count: a bare `len(data)` in the middle of a cell is evaluated
# and then discarded, so it never shows up in the output.
print(len(data))
# Preview the first row, unless the file is empty.
if data:
  print(data[0])

## Filter questions and save them
We keep only the questions whose image id is among the image ids filtered before.


In [0]:
def get_imageIDs():
  """Return the set of image ids found in the JSON file at save_qas_file.

  Each entry's 'image_id' value is converted with int().
  """
  with open(save_qas_file) as handle:
    entries = json.load(handle)
  image_ids = set()
  for entry in entries:
    image_ids.add(int(entry['image_id']))
  return image_ids

def read_csv(file):
  """Read a CSV file and return its rows as a list of lists of strings."""
  # newline='' lets the csv module handle line endings itself.
  with open(file, 'r', newline='') as f:
    return list(csv.reader(f))

# Disabled alternative: take the ids from the image-id text file instead.
# with open(writingPath) as p :
#   int_imagesIDs = set([int(x.strip()) for x in p.readlines() if x != ''])

def get_filtered_questions_answers(qas_file, int_imagesIDs):
  """Return the rows of a questions CSV whose image id is in int_imagesIDs.

  qas_file: path of a CSV whose rows start with (image id, question, answer).
  int_imagesIDs: collection of integer image ids to keep.

  Returns a list of (image id, question, answer) tuples of strings, in file
  order. Blank rows and rows whose image id is not an integer are skipped: a
  non-numeric id can never belong to a collection of integer ids, so it is
  filtered out instead of aborting the whole pass with a ValueError.
  """
  kept = []
  for row in read_csv(qas_file):
    if not row:
      continue  # blank line in the CSV
    try:
      image_id = int(row[0])
    except ValueError:
      continue  # non-numeric image id
    if image_id in int_imagesIDs:
      kept.append((row[0], row[1], row[2]))
  return kept
# Image ids read back from the saved question/answer JSON file.
ids =  get_imageIDs()
print('got ids')
# Rows of the GQA CSV whose image id is one of those ids.
GQA_qas = get_filtered_questions_answers(save_GQA, ids)
    

In [0]:
# Save the filtered GQA rows, one (image id, question, answer) row per line.
# newline='' is what the csv module expects for the files it writes; without
# it, platforms that translate line endings get an extra '\r' per row.
with open(filtred_GQA, 'w', newline='') as f:
  csv.writer(f).writerows(GQA_qas)
