# Filtering questions from VQA dataset
This notebook filters the questions of the VQA dataset that belong to a specific domain. The filtering is a keyword search on the question text, using domain-related keywords provided in a text file.

## Variables
This section contains the variables used in this notebook. They must be adapted from one environment to another so that the notebook can run.

In [0]:
# --- Google Drive ----------------------------------------------------------
# Mount point of the drive inside the notebook; ignore it if no drive is mounted.
drive_home = '/content/gdrive'

# --- Inputs ----------------------------------------------------------------
# Text file holding the keywords used to filter the questions (one per line).
key_words_path = '/content/gdrive/My Drive/Colab/treated_indoors.txt'
# Directories holding the VQA question files (train / val).
train_questions_dir = '/content/gdrive/My Drive/Colab'
val_questions_dir = '/content/gdrive/My Drive/Colab'
# Directories holding the VQA annotation (answer) files (train / val).
train_answers_dir = '/content/gdrive/My Drive/Colab'
val_answers_dir =  '/content/gdrive/My Drive/Colab'

# --- Outputs ---------------------------------------------------------------
# Filtered tuples (question id, question, image id), one per line.
train_save_path = '/content/gdrive/My Drive/Colab/results/VQA/train_VQA_filteredImages.txt'
val_save_path = '/content/gdrive/My Drive/Colab/results/VQA/val_VQA_filteredImages.txt'
# Filtered annotations, dumped as JSON.
train_annotation_save_path = '/content/gdrive/My Drive/Colab/results/VQA/annotations.json'
val_annotation_save_path = '/content/gdrive/My Drive/Colab/results/VQA/val_annotations.json'
# Final CSV datasets produced at the end of the notebook.
final_train_file = '/content/gdrive/My Drive/Colab/results/VQA/final_train.csv'
final_val_file = '/content/gdrive/My Drive/Colab/results/VQA/final_val.csv'

## Imports 

In [0]:
import os
#remove if you don't use drive
from google.colab import drive
import shutil
import json
from itertools import groupby
import csv

## Mount drive (remove line if you don't) and download train questions

### mount drive and import train questions

In [0]:
#remove if you don't use drive
# Mount Google Drive at drive_home so that the Drive-based paths resolve.
drive.mount(drive_home)
# Optional one-time download of the train questions archive (disabled).
#! wget -P '$train_questions_dir' https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip
# Make the train questions directory the working directory for the next cells.
os.chdir(train_questions_dir)

### unzip questions

In [0]:
#!unzip -d '$train_questions_dir' v2_Questions_Train_mscoco.zip

## Load train questions
The questions file is a JSON file whose `questions` entries each contain the question id, the question text and the id of the image the question refers to in the VQA dataset.

In [0]:
train_questions_path = os.path.join(train_questions_dir, 'v2_OpenEnded_mscoco_train2014_questions.json')
# The context manager guarantees the file is closed; the previous bare
# `f.close` only referenced the method and never called it.
with open(train_questions_path) as f:
  q = json.load(f)
# One (question_id, question, image_id) tuple per question.
questionsIds = [(question['question_id'], question['question'], question['image_id']) for question in q['questions']]
# Preview of the first tuples.
questionsIds[:10]


In [0]:
# Keep only the question text (second field of each tuple).
questions = [question_text for _, question_text, _ in questionsIds]
questions[:10]

## filter questions
Filter the questions by keywords: this method keeps only the questions containing at least one of the keywords.

### filterQuestions function to filter questions by keywords

In [0]:
# filter questions and get the ones containing at least one keyword from key_words
def filterQuestions(questions, key_words, ignore_case=False):
  """Return the question tuples whose text contains at least one keyword.

  Args:
    questions: iterable of tuples whose second element (index 1) is the
      question text.
    key_words: iterable of keyword strings; matching is a plain substring
      test, so a keyword also matches inside longer words.
    ignore_case: when True, both the question and the keywords are
      lowercased before comparing. Defaults to False, which keeps the
      original case-sensitive behaviour.

  Returns:
    A list with the tuples of `questions` that matched, in their original
    order. An empty keyword collection yields an empty list.
  """
  # Materialise once: the keywords are re-scanned for every question.
  key_words = list(key_words)
  if ignore_case:
    key_words = [key_word.lower() for key_word in key_words]

  def contains(question):
    # Iterative scan instead of recursion over list slices: no recursion
    # limit on long keyword lists and no list copy per keyword.
    text = question.lower() if ignore_case else question
    return any(key_word in text for key_word in key_words)

  return [x for x in questions if contains(x[1])]



### get all the questions of images that have at least one filtered question
The goal is to catch every question related to indoor scenes. With filterQuestions we get all the questions containing keywords, which means the images they refer to are (mostly) indoor images, so the other questions about those images, even without any keyword, could also be indoor questions.

In [0]:
def get_filtered_images(questions_ids, questions_image):
  """Return the tuples of `questions_image` whose first element is a wanted id.

  Args:
    questions_ids: iterable of hashable ids to keep.
    questions_image: iterable of tuples whose first element (index 0) is
      compared against `questions_ids`.

  Returns:
    A list of the matching tuples, in the order of `questions_image`.
  """
  # A set gives constant-time membership; scanning a tuple for every
  # question is quadratic on the full dataset.
  wanted_ids = set(questions_ids)
  return [x for x in questions_image if x[0] in wanted_ids]


### treat keywords
Load the keywords and treat them: make them lowercase, remove surrounding whitespace and drop empty lines.





In [0]:
# Read the raw keyword lines; the context manager closes the file
# (the previous bare `f.close` never actually called close()).
with open(key_words_path, 'r') as f:
  key_words = f.readlines()
# Normalise: strip surrounding whitespace, lowercase, and drop blank lines.
treated_key_words = [x.strip().lower() for x in key_words if x.strip() != '']

treated_key_words

### filter questions 
The result of applying the filter to the questions is a list of tuples (question_id, question, image_id) where the question contains at least one of the keywords in key_words.

In [0]:
# Keep the train questions whose text contains at least one keyword.
train_filtred_questions = filterQuestions(questionsIds, treated_key_words)
if not train_filtred_questions:
  # zip(*[]) yields nothing, so the unpacking below would otherwise fail with
  # an obscure "not enough values to unpack" message.
  raise ValueError('no train question contains any of the keywords')
t = train_filtred_questions
# Transpose the tuples into parallel sequences of ids, questions and image ids.
(q_id, q, im) = zip(*train_filtred_questions)
train_filtred_questions =get_filtered_images(q_id,questionsIds)
# Preview only, to avoid printing the whole list as cell output.
train_filtred_questions[:10]

In [0]:
# Ids of the images referenced by at least one filtered question.
ques = {entry[2] for entry in train_filtred_questions}
train_questions_path = os.path.join(train_questions_dir, 'v2_OpenEnded_mscoco_train2014_questions.json')
with open(train_questions_path) as f:
  q = json.load(f)
# Every question (keyword or not) that refers to one of those images.
# Note: this rebinds questionsIds to the image-restricted list.
questionsIds = [
  (question['question_id'], question['question'], question['image_id'])
  for question in q['questions']
  if question['image_id'] in ques
]
len(questionsIds)

In [0]:
# Number of distinct images referenced by the filtered train questions.
len({entry[2] for entry in train_filtred_questions})

### download annotations

In [0]:
# Download the VQA v2 train annotations archive into train_answers_dir.
!wget -P '$train_answers_dir' "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip"
# Move to that directory so the relative archive name below resolves.
os.chdir(train_answers_dir)
# Extract the archive into the current directory.
!unzip v2_Annotations_Train_mscoco.zip

In [0]:
# Path of the extracted train annotations file.
train_answers_path = os.path.join(train_answers_dir, 'v2_mscoco_train2014_annotations.json')
# Parse the whole annotations JSON into `anno`.
with open(train_answers_path) as f :
  anno = json.load(f)


In [0]:
# Disabled alternative, kept as an inert string: reload the filtered tuples
# from train_save_path instead of reusing the in-memory list.
'''with open(train_save_path, 'r') as f :
  lines= [x.strip() for x in f.readlines() if x.strip() != '']
train_filtred_questions = []
for l in lines : 
  y = l.split(';')
  t = (y[0], y[1], y[2])
  train_filtred_questions.append(t)'''

# Shallow copy of the annotation records.
annotations = list(anno['annotations'])
# Transpose the filtered tuples into parallel sequences:
# question ids, question texts, image ids.
fq_id, fq, f_im_id = zip(*train_filtred_questions)


In [0]:

# Normalise both sides to str: f_im_id holds ints when the tuples come from the
# JSON file and strings when they are reloaded from the saved text file, so
# comparing str(image_id) against the raw values could never match ints.
# The set also replaces a tuple scan per annotation with a constant-time lookup.
filtered_image_ids = {str(image_id) for image_id in f_im_id}
filtered_annotaitons = [x for x in annotations if str(x['image_id']) in filtered_image_ids]

In [0]:
# Dump the filtered train annotations as JSON to train_annotation_save_path.
with open(train_annotation_save_path, 'w') as f : 
  json.dump(filtered_annotaitons,f)


## save tuples of filtered questions

One line of the saved file is in the format :  question_id;question;image_id





In [0]:
def save(path, filtred_questions):
  """Write the question tuples to `path`, one 'id;question;image_id' line each.

  Args:
    path: destination file; it is created or overwritten.
    filtred_questions: iterable of tuples whose first three elements are the
      question id, the question text and the image id.

  Lines are separated by a newline and there is no trailing newline. An empty
  input produces an empty file.
  """
  # The context manager flushes and closes the file deterministically; the
  # previous bare `f.close` never called close().
  with open(path, 'w') as f:
    f.write('\n'.join(str(x[0]) + ';' + str(x[1]) + ';' + str(x[2]) for x in filtred_questions))
# Persist the filtered train tuples to train_save_path.
save(train_save_path,train_filtred_questions )

## Download val questions

### download the file

In [0]:
#!wget -P '$val_questions_dir' https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip 

### copy from download path to working path

In [0]:
# Work from the val questions directory.
os.chdir(val_questions_dir)
# Optional one-time extraction of the val questions archive (disabled).
#!unzip v2_Questions_Val_mscoco.zip 
# Path of the extracted val questions file.
val_questions_path = os.path.join(val_questions_dir, 'v2_OpenEnded_mscoco_val2014_questions.json')

## Load val questions (tuples)

In [0]:
# The context manager closes the file; the previous bare `f.close` only
# evaluated the method object and never called it.
with open(val_questions_path) as f:
  q = json.load(f)
# One (question_id, question, image_id) tuple per val question.
questionsIdsVal = [(question['question_id'], question['question'], question['image_id']) for question in q['questions']]

<function TextIOWrapper.close>

## Filter val questions

In [0]:
# Keep the val questions whose text contains at least one keyword.
val_filtred_questions = filterQuestions(questionsIdsVal, treated_key_words)
if not val_filtred_questions:
  # zip(*[]) yields nothing, so the unpacking below would otherwise fail with
  # an obscure "not enough values to unpack" message.
  raise ValueError('no val question contains any of the keywords')
# Transpose the tuples into parallel sequences of ids, questions and image ids.
(q_id, q, im) = zip(*val_filtred_questions)
val_filtred_questions =get_filtered_images(q_id,questionsIdsVal)
# Preview only, to avoid printing the whole list as cell output.
val_filtred_questions[:10]

In [0]:
# Image ids referenced by the filtered val / train questions.
ims = {x[2] for x in val_filtred_questions}
ques = {x[2] for x in train_filtred_questions}
# The context manager closes the file; the previous bare `f.close` never
# called close().
with open(val_questions_path) as f:
  q = json.load(f)
# Ids of every val question that refers to one of the filtered val images.
# Note: this rebinds questionsIdsVal from a list of tuples to a set of ids.
questionsIdsVal = {question['question_id'] for question in q['questions'] if question['image_id'] in ims}
# NOTE(review): this expression has no visible effect (it is not the last
# expression of the cell); possibly len(questionsIdsVal) was meant. Confirm.
len(questionsIds)
print(len(ques))

In [0]:
# Number of filtered val questions.
len(val_filtred_questions)

In [0]:
# Persist the filtered val tuples to val_save_path.
path = val_save_path
save(path, val_filtred_questions)

### Download annotations

In [0]:
# Optional one-time download of the val annotations archive (disabled).
# NOTE(review): it targets train_answers_dir while the archive is extracted from
# val_answers_dir below; both are set to the same folder in the variables cell.
#!wget -P '$train_answers_dir' "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip"
# Move to the val answers directory so the relative archive name resolves.
os.chdir(val_answers_dir)
# Extract the archive into the current directory.
!unzip v2_Annotations_Val_mscoco.zip

In [0]:
# Path of the extracted val annotations file. It gets its own name so that
# train_answers_path keeps pointing at the train annotations, and so that
# val_answers_path exists for the final dataset generation.
val_answers_path = os.path.join(val_answers_dir, 'v2_mscoco_val2014_annotations.json')


# Generate full dataset

In [0]:
def get_full_dataset(questions_file, annotation_file, save_file):
  """Join saved question tuples with their answers and write them as CSV.

  Args:
    questions_file: text file with one 'question_id;question;image_id' line
      per question; blank lines are ignored.
    annotation_file: JSON file with an 'annotations' list whose items carry a
      'question_id' and an 'answers' list of objects with an 'answer' field.
    save_file: destination CSV; it is created or overwritten.

  Each output row is: image id (int), question text, comma-joined answers.
  Questions without a matching annotation are skipped.

  Raises:
    ValueError: if a line of `questions_file` has fewer than three fields.
  """
  # Use the file passed by the caller; reading a notebook global here made
  # every call load the same annotations regardless of the argument.
  with open(annotation_file) as f:
    anno = json.load(f)
  with open(questions_file, 'r') as f:
    lines = [x.strip() for x in f if x.strip() != '']
  filtred_questions = []
  for l in lines:
    parts = l.split(';')
    if len(parts) < 3:
      raise ValueError('malformed line in {}: {!r}'.format(questions_file, l))
    # The question text may itself contain ';': the id is the first field,
    # the image id the last one, and everything in between is the question.
    filtred_questions.append((parts[0], ';'.join(parts[1:-1]), parts[-1]))
  print('filtered questions len {}'.format(len(filtred_questions)))
  annotations = anno['annotations']
  # Group the annotations by integer question id. Test and insertion now use
  # the same key (the old membership test used the un-cast id).
  annotations_map = {}
  for a in annotations:
    annotations_map.setdefault(int(a['question_id']), []).append(a)
  # Report question ids that have more than one annotation.
  for k, v in annotations_map.items():
    if not len(v) == 1:
      print('danger {}'.format(k))

  print('len group by {} {}'.format(len(annotations_map), len(annotations)))

  data = []
  for ques in filtred_questions:
    matches = annotations_map.get(int(ques[0]))
    if matches:
      # Only the first annotation of a question is used.
      answers = [x['answer'] for x in matches[0]['answers']]
      data.append([int(ques[2]), ques[1], ','.join(answers)])
  print('len data {}'.format(len(data)))
  # newline='' lets the csv module control line endings (avoids blank rows on
  # platforms that translate '\n').
  with open(save_file, 'w', newline='') as f:
    csv.writer(f).writerows(data)
      
                  

In [0]:
# Build both annotation paths explicitly: by this point train_answers_path has
# been reassigned to the val annotations file, and val_answers_path was never
# defined, which made the second call fail with a NameError.
train_annotations_file = os.path.join(train_answers_dir, 'v2_mscoco_train2014_annotations.json')
val_annotations_file = os.path.join(val_answers_dir, 'v2_mscoco_val2014_annotations.json')
get_full_dataset(train_save_path, train_annotations_file, final_train_file)
get_full_dataset(val_save_path, val_annotations_file, final_val_file)