# Filtering questions from VQA dataset
This notebook aims to filter questions from the VQA dataset corresponding to a specific domain. This is done by domain related keyword search on questions. The keywords are provided in a text file.

## Variables
This section contains the variables used in this notebook. Their values must be adapted from one environment to another so that the notebook can run.

In [0]:
# To implement: adapt `drive_home` and `root_directory` to your environment.
# `os` is imported here as well because this cell runs before the imports
# cell and every path below is built with os.path.join.
import os

# Path where Google Drive is mounted in the notebook; ignore it if you don't mount Drive.
drive_home = ''
# Root directory holding the VQA data, the keyword files and the results.
root_directory = ''


VQA_data_root = os.path.join(root_directory, 'VQA')
results_directory = os.path.join(root_directory, 'results/VQA')

# File of train questions from the VQA dataset.
train_questions_path = os.path.join(VQA_data_root, 'v2_OpenEnded_mscoco_train2014_questions.json')
# File containing the keywords used to filter the questions.
key_words_path = os.path.join(root_directory, 'keywords_files/indoor.txt')
# Where to save the filtered (question id, question, image id) tuples of the train questions.
train_save_path = os.path.join(results_directory, 'train_VQA_filteredImages.txt')
# No leading slash on the file name: os.path.join discards every previous
# component when a later one is absolute, which would drop results_directory.
train_annotation_save_path = os.path.join(results_directory, 'annotations.json')
# File where the VQA v2 train annotations are located.
train_answers_path = os.path.join(VQA_data_root, 'v2_mscoco_train2014_annotations.json')
# File of val questions from the VQA dataset.
val_questions_path = os.path.join(VQA_data_root, 'v2_mscoco_val2014_questions.json')
# Where to save the filtered (question id, question, image id) tuples of the val questions.
val_save_path = os.path.join(results_directory,'val_VQA_filteredImages.txt')
val_annotation_save_path = os.path.join(results_directory,'val_annotations.json')
# File where the VQA v2 val annotations are located.
val_answers_path = os.path.join(VQA_data_root, 'v2_mscoco_val2014_annotations.json')
# CSV files receiving the final (image id, question, answers) rows.
final_val_file = os.path.join(results_directory,'final_val.csv')
final_train_file = os.path.join(results_directory,'final_train.csv')

## Imports 

In [0]:
import os
#remove if you don't use drive
from google.colab import drive
import shutil
import json
from itertools import groupby
import csv

## Mount drive (remove this section if you don't use Drive) and download train questions

### mount drive

In [0]:
# Mount Google Drive at the path given by `drive_home`.
# Remove this cell if you don't use Drive.
drive.mount(drive_home)

## Load train questions
The questions file is a JSON file; each entry of its `questions` list holds the question id, the question text and the id of the image the question refers to in the VQA dataset.

In [0]:
# Load the train questions and keep one (question_id, question, image_id)
# tuple per entry. The context manager guarantees the file is closed as soon
# as the JSON has been parsed.
with open(train_questions_path) as f:
  q = json.load(f)
questionsIds = [
  (question['question_id'], question['question'], question['image_id'])
  for question in q['questions']
]
# Preview the first tuples only.
questionsIds[:10]


In [0]:
# Keep only the question text (second field) of every tuple.
questions = [entry[1] for entry in questionsIds]
# Preview the first ten questions.
questions[0:10]

## filter questions
Filter the questions by keywords: this method keeps only the questions containing at least one of the keywords.

### filterQuestions function to filter questions by keywords

In [0]:
# Filter questions and keep the ones containing at least one keyword from key_words.
def filterQuestions(questions, key_words):
  """Return the questions whose text contains at least one keyword.

  Args:
    questions: iterable of indexable entries whose element at index 1 is the
      question text, e.g. (question_id, question, image_id) tuples.
    key_words: iterable of keyword strings.

  Returns:
    A list with the entries of `questions`, in their original order, for which
    at least one keyword is a substring of the question text. The substring
    test is case-sensitive. An empty `key_words` yields an empty list.
  """
  # Materialise once so any iterable (not only a list) can be scanned for
  # every question.
  key_words = list(key_words)
  # any() stops at the first matching keyword and, unlike a recursive scan
  # over list slices, is not bounded by the interpreter's recursion limit.
  return [x for x in questions if any(word in x[1] for word in key_words)]



### Get the ids of images that have at least one question among the filtered questions
This step is meant to catch all the questions related to indoor scenes. With `filterQuestions` we get all the questions containing keywords, which means that the corresponding images are (for the most part) indoor images, and that the other questions about those images, even if they contain no keyword, could also be indoor questions.

In [0]:
def get_filtered_images(questions_ids, questions_image):
  """Return the entries of `questions_image` whose first element is in `questions_ids`.

  Args:
    questions_ids: iterable of hashable ids to keep.
    questions_image: iterable of indexable entries; element 0 of each entry is
      compared against `questions_ids`.

  Returns:
    A list with the matching entries, in their original order.

  NOTE(review): despite the function name, the match is made on element 0 of
  each entry, not on an image id -- confirm this is the intended key.
  """
  # Build the lookup set once: membership tests become O(1) instead of a
  # linear scan per entry, and a one-shot iterator is consumed only once.
  wanted = set(questions_ids)
  return [x for x in questions_image if x[0] in wanted]

def save_csv(save_file, data):
  """Write `data` to `save_file` as CSV, one row per element.

  Args:
    save_file: path of the file to create or overwrite.
    data: iterable of rows, each row being an iterable of values.
  """
  # newline='' lets the csv module control line endings itself; without it,
  # platforms that translate '\n' on write produce '\r\r\n' (blank rows).
  with open(save_file, 'w', newline='') as f:
    csv.writer(f).writerows(data)
      
      
def read_csv(file) :
  """Read the CSV file at `file` and return its rows.

  Args:
    file: path of the CSV file to read.

  Returns:
    A list of rows, each row being a list of strings.
  """
  # newline='' hands raw line endings to the csv module so that newlines
  # embedded in quoted fields are returned unchanged.
  with open(file, 'r', newline='') as f :
    return list(csv.reader(f))


### treat keywords
Load the keywords, then treat them: make them lowercase, remove extra whitespace and drop empty lines.





In [0]:
# Read the keyword file, one keyword per line. The context manager guarantees
# the file is closed once the lines have been read.
with open(key_words_path,'r') as f:
  key_words = f.readlines()
# Normalise the keywords: trim surrounding whitespace, lowercase them and
# drop the lines that are empty after trimming.
treated_key_words = [x.strip().lower() for x in key_words if x.strip() != '']

treated_key_words

### filter questions 
The result of applying the filter to the questions is a list of tuples (question_id, question, image_id) whose question contains at least one of the keywords in key_words.

In [0]:
# Keep the (question_id, question, image_id) tuples whose question contains a keyword.
train_keyword_matches = filterQuestions(questionsIds, treated_key_words)
# Collect the ids with a comprehension: unpacking zip(*...) into three names
# raises ValueError when no question matched, and would rebind `q`, which
# holds the parsed questions JSON.
q_id = tuple(x[0] for x in train_keyword_matches)
train_filtred_questions = get_filtered_images(q_id, questionsIds)
# Preview only; the full list can be very long.
train_filtred_questions[:10]

In [0]:
# Ids of the images that have at least one retained question.
ques = {entry[2] for entry in train_filtred_questions}
with open(train_questions_path) as f:
  q = json.load(f)
# Rebuild questionsIds with every question attached to one of those images.
questionsIds = [
  (item['question_id'], item['question'], item['image_id'])
  for item in q['questions']
  if item['image_id'] in ques
]
len(questionsIds)

In [0]:
# Number of distinct images among the retained train questions.
len({entry[2] for entry in train_filtred_questions})

In [0]:
# Parse the train annotations JSON into `anno`; the file is closed when the
# with-block exits.
with open(train_answers_path) as f :
    anno = json.load(f)


## Save train dataset

In [0]:
# Shallow copy of the annotation records.
annotations = list(anno['annotations'])
# Split the retained tuples into parallel columns: ids, questions, image ids.
fq_id, fq, f_im_id = zip(*train_filtred_questions)
# Map every question id to the list of its answer strings.
qid_ans = {
  record["question_id"]: [entry['answer'] for entry in record['answers']]
  for record in annotations
}


In [0]:
# Build (image id, question, answers) rows for the retained questions that
# have an annotation, then write them to the final train CSV.
final_train_dataset = []
for q_id, q, im in train_filtred_questions:
  if q_id not in qid_ans:
    # No annotation for this question: leave it out.
    continue
  row = (im, q, qid_ans[q_id])
  final_train_dataset.append(row)
save_csv(final_train_file, final_train_dataset)

## save tuples of filtred questions

Each line of the saved file has the format: question_id;question;image_id (the fields are separated by semicolons).





In [0]:
def save(path, filtred_questions):
  """Write the filtered questions to `path`, one per line.

  Each line has the form 'question_id;question;image_id', built from the
  first three elements of an entry. Lines are separated by '\\n' and there is
  no trailing newline; an empty input produces an empty file.

  Args:
    path: path of the text file to create or overwrite.
    filtred_questions: iterable of indexable entries with at least three
      elements.
  """
  lines = [str(x[0]) + ';' + str(x[1]) + ';' + str(x[2]) for x in filtred_questions]
  # The context manager closes (and therefore flushes) the file
  # deterministically, even if the write fails.
  with open(path, 'w') as f:
    f.write('\n'.join(lines))
# Write the retained train tuples to `train_save_path`.
save(train_save_path,train_filtred_questions )

## Load val questions (tuples)

In [0]:
# Load the val questions and keep one (question_id, question, image_id) tuple
# per entry. The context manager guarantees the file is closed as soon as the
# JSON has been parsed.
with open(val_questions_path) as f:
  q = json.load(f)
questionsIdsVal = [
  (question['question_id'], question['question'], question['image_id'])
  for question in q['questions']
]

<function TextIOWrapper.close>

## Filter val questions

In [0]:
# Keep the (question_id, question, image_id) val tuples whose question contains a keyword.
val_keyword_matches = filterQuestions(questionsIdsVal, treated_key_words)
# Collect the ids with a comprehension: unpacking zip(*...) into three names
# raises ValueError when no question matched.
q_id = tuple(x[0] for x in val_keyword_matches)
val_filtred_questions = get_filtered_images(q_id, questionsIdsVal)


## Save val dataset

In [0]:
# Parse the val annotations JSON.
with open(val_answers_path) as f:
  anno = json.load(f)

# Shallow copy of the annotation records.
annotations = list(anno['annotations'])
# Split the retained tuples into parallel columns: ids, questions, image ids.
fq_id, fq, f_im_id = zip(*val_filtred_questions)
# Map every question id to the list of its answer strings.
qid_ans = {
  record["question_id"]: [entry['answer'] for entry in record['answers']]
  for record in annotations
}

In [0]:
# Build (image id, question, answers) rows for the retained val questions
# that have an annotation, then write them to the final val CSV.
final_val_dataset = []
for q_id, q, im in val_filtred_questions:
  if q_id not in qid_ans:
    # No annotation for this question: leave it out.
    continue
  row = (im, q, qid_ans[q_id])
  final_val_dataset.append(row)
save_csv(final_val_file, final_val_dataset)