# Preprocessing
This notebook preprocesses the VQA v2 dataset and serializes the result to binary (TFRecord) files.

In [0]:
# VQA V2 image directories (train / validation)
train_images_root = ''
val_images_root = ''
# Directories that receive the serialized output of the train and validation preprocessing
train_feats_root = ''
val_feats_root = ''
# VQA V2 question files (train / validation)
train_questions_file = ''
val_questions_file = ''
# VQA V2 answer (annotation) files (train / validation)
train_answers_file = ''
val_answers_file = ''
# VQA complementary pairs file: https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Complementary_Pairs_Train_mscoco.zip
pairs_file = ''
# Mount point handed to drive.mount()
drive_root = ''
# GloVe embeddings text file read by QuestionsPreprocessing
glove_file_path = ''
# Number of rows in each serialized file
FILE_LENGTH = 100 * 40
# Batch length used when generating the preprocessed data
RESNET_BATCH_SIZE = 40
# Occurrence threshold for question words (a word is kept when it occurs more often than this)
NUM_OCCURENCE = 5
# Initialised by AnswerPreprocessing
ANSWER_DIM = 0
# NOTE(review): MAX_LEN, FILL_TOKEN and ANSWER_OCCURENCE_MIN are read by later cells
# but are not assigned anywhere in the visible notebook - define them here before running.

import os
import importlib.util
from google.colab import drive
import tensorflow as tf
from tensorflow import keras
import re
import numpy as np
import json
from pathlib import Path
import pickle
from collections import Counter
import cv2
import time
from keras import regularizers
from keras.models import load_model

tf.enable_eager_execution()


# Mount drive

In [0]:
# Run this cell to mount your Google Drive at the mount point configured in drive_root.
from google.colab import drive
drive.mount(drive_root)

# Question Preprocessing

In [0]:
# Splits on runs of non-word characters; the capture group keeps the separators.
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')


def tokenize(sentence):
    """Lower-case a sentence and split it into non-empty, stripped tokens.

    Commas and question marks are removed and every "'s" is detached from
    the preceding word before the sentence is split on non-word characters.
    Separators that are not pure whitespace (for example an apostrophe) are
    kept as tokens of their own.
    """
    cleaned = sentence.strip().lower()
    for old, new in ((',', ''), ('?', ''), ('\'s', ' \'s')):
        cleaned = cleaned.replace(old, new)
    stripped_pieces = (piece.strip() for piece in SENTENCE_SPLIT_REGEX.split(cleaned))
    return [piece for piece in stripped_pieces if piece]

In [0]:
class QuestionsPreprocessing:
  """Builds a question vocabulary and turns questions into index sequences.

  The vocabulary is made of the tokens (see tokenize) that occur more than
  num_occurence times in the given VQA question files. Indexes start at 1;
  index 0 is reserved for padding.
  """

  def __init__(self, glove_file, questions_files, num_occurence):
    """Read the GloVe file and the question files and build the vocabulary.

    glove_file      -- path of a GloVe text file (one "token v1 v2 ..." per line)
    questions_files -- list of VQA question JSON files
    num_occurence   -- a word is kept only when it occurs more often than this
    """
    self.gloveFile = glove_file
    self.questionsFiles = questions_files
    # Both spellings were set by the original class; keep them for callers.
    self.questions_files = questions_files
    self.word2Glove = self.readGloveFile()
    questions = self.readQuestionsFiles()
    # Drop the words that do not occur often enough
    words = self.filter_words(questions, num_occurence)
    self.word2Index, self.index2Word = self.matchWordIndex(words)
    # +1 accounts for the 0 padding index
    self.vocab_length = len(self.word2Index) + 1

  def readGloveFile(self):
    """Return a dict mapping every token of the GloVe file to its vector."""
    wordToGlove = {}
    # Explicit encoding: the result must not depend on the platform default.
    with open(self.gloveFile, 'r', encoding='utf-8') as f:
      for line in f:
        record = line.strip().split()
        if not record:
          # Blank line (e.g. at the end of the file): nothing to parse
          continue
        wordToGlove[record[0]] = np.array(record[1:], dtype=np.float64)
    return wordToGlove

  # WARNING IMPORTANT : THIS FUNCTION IS ONLY FOR RETRIEVING QUESTIONS FROM VQA DATASET
  def readQuestionsFiles(self):
    """Return the 'question' text of every entry of every question file."""
    questions = []
    for file in self.questionsFiles:
      with open(file, 'r') as f:
        data = json.load(f)
      questions.extend(x['question'] for x in data['questions'])
    return questions

  def filter_words(self, questions, num_occurence):
    """Return the tokens occurring strictly more than num_occurence times.

    The words are returned in order of first appearance.
    """
    counts = Counter()
    for question in questions:
      counts.update(tokenize(question))
    return [word for word, count in counts.items() if count > num_occurence]

  def matchWordIndex(self, words):
    """Return (word -> index, index -> word) dicts; indexes start at 1."""
    word2Index = {}
    index2Word = {}
    for i, word in enumerate(words, start=1):
      word2Index[word] = i
      index2Word[i] = word
    return word2Index, index2Word

  # create embedding matrix
  def createPretrainedEmbeddingLayer(self):
    """Return a (vocabulary size + 1, embedding dim) float64 matrix.

    Row 0 (padding) stays at zero. A word found in GloVe gets its GloVe
    vector, any other word gets a vector drawn with np.random.rand.
    """
    wordToIndex = self.word2Index
    wordToGlove = self.word2Glove
    vocabLen = len(wordToIndex) + 1  # adding 1 to account for masking
    embDim = next(iter(wordToGlove.values())).shape[0]
    embeddingMatrix = np.zeros((vocabLen, embDim), 'float64')
    for word, index in wordToIndex.items():
      vector = wordToGlove.get(word)
      if vector is None:
        vector = np.random.rand(embDim)
      embeddingMatrix[index, :] = vector
    return embeddingMatrix

  def get_embedding_matrix(self):
    """Alias of createPretrainedEmbeddingLayer."""
    return self.createPretrainedEmbeddingLayer()

  # TODO : check if there is another method like removing most common words
  def preprocessBatch(self, questions, max_len):
    """Convert questions to index sequences padded/truncated to max_len."""
    # The original appended an undefined name here instead of the
    # per-question result, which raised NameError on the first question.
    batch = [self.preprocessElem(question) for question in questions]
    return self.postTruncate(batch, max_len)

  def preprocessElem(self, question):
    """Return the vocabulary indexes of the known tokens of one question.

    Tokens that are not part of the vocabulary are skipped.
    """
    num_words = []
    for word in tokenize(self.preTruncate(question)):
      # Indexes start at 1, so None unambiguously means "unknown word".
      index = self.word2Index.get(word)
      if index is not None:
        num_words.append(index)
    return num_words

  #private
  #TODO : implement
  def preTruncate(self, words):
    """Placeholder: returns its argument unchanged."""
    return words

  def postTruncate(self, batch, max_len):
    """Pad and truncate (both at the end) every sequence to max_len."""
    return keras.preprocessing.sequence.pad_sequences(batch, max_len, padding='post', truncating='post')
    

# Answer preprocessing

In [0]:
class AnswerPreprocessing:
  """Builds the answer vocabulary and encodes answers as score vectors.

  The vocabulary comes from get_answers (normalised, confident and frequent
  answers). Index 0 is always 'no idea'.
  """

  # Spelled-out numbers; the position in the list is the numeric value.
  _NUMBER_WORDS = ["zero", "one", "two", "three", "four",
                   "five", "six", "seven", "eight", "nine", "ten",
                   "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                   "sixteen", "seventeen", "eighteen", "nineteen"]
  # Lower-case because answers are lower-cased before the lookup.
  _TENS_WORDS = ["twenty", "thirty", "forty", "fifty",
                 "sixty", "seventy", "eighty", "ninety"]
  _NO_IDEA_PHRASES = ['no clue', "i dont know", "i don't know", "cannot know", "can't know",
                      "can't tell", "not sure", "don't know", "cannot tell", "unknown"]
  _LEADING_NO = re.compile(r'(\s|^)no ')
  _TRAILING_NO = re.compile(r' no(\s|$)')
  _DIGITS = re.compile(r'\d+')

  def __init__(self, questions_files, answers_files):
    """Build the answer vocabulary and publish its size in ANSWER_DIM.

    Only the first (questions file, answers file) pair is used to build the
    vocabulary; both lists must have the same length.
    """
    global ANSWER_DIM
    assert len(questions_files) == len(answers_files), 'not the same length'
    # readWords reads this attribute; the original never set it.
    self.answer_files = answers_files
    self.answers, _ = self.get_answers(questions_files[0], answers_files[0])
    self.word2Index, self.index2Word = self.matchWords2Indexes()
    ANSWER_DIM = len(self.word2Index)
    self.num_words = len(self.word2Index)

  def get_dim(self):
    """Return the size of the answer vocabulary."""
    return len(self.word2Index)

  #private
  def matchWords2Indexes(self):
    """Return (answer -> index, index -> answer); 'no idea' is index 0."""
    wordToIndex = {'no idea': 0}
    indexToWord = {0: 'no idea'}
    i = 1
    for w in self.filterWords(self.answers):
      if not w == 'no idea':
        wordToIndex[w] = i
        indexToWord[i] = w
        i += 1
    return (wordToIndex, indexToWord)

  #private
  def readWords(self):
    """Return every raw answer string found in self.answer_files."""
    words = []
    for file in self.answer_files:
      with open(file, 'r') as f:
        data = json.load(f)
      for annotation in data['annotations']:
        for answer in annotation['answers']:
          words.append(answer['answer'])
    return words

  #private
  def filterWords(self, words):
    """Placeholder: returns its argument unchanged."""
    return words

  def preprocessBatch(self, answers):
    """Map answers (a flat list or a list of lists) to vocabulary indexes."""
    batch = []
    ans = np.array(answers)
    if ans.ndim == 2:
      for answers2 in answers:
        batch.append([self.word2Index[x] for x in answers2])
    else:
      for answer in answers:
        batch.append(self.word2Index[answer])
    return batch

  def _preprocessElem(self, ans):
    """Encode answers as a vector where each known answer adds 1/len(answers).

    FILL_TOKEN entries are ignored. When nothing is left, or no answer is in
    the vocabulary, the vector has a single 1.0 at index 0.
    """
    # if the dataset contains multiple answers : Example VQA dataset
    answer = np.array([x for x in ans if not x == FILL_TOKEN])
    if answer.ndim == 1:
      arr = np.zeros((ANSWER_DIM,))
      if len(answer) == 0:
        arr[0] = 1.0
        return arr
      value = 1 / len(answer)
      found = False
      for a in answer:
        if a in self.word2Index:
          found = True
          arr[self.word2Index[a]] += value
      if not found:
        arr[0] = 1.0
      return arr
    # if the answer is unique in the dataset
    return self.word2Index[answer]

  def preprocessElem(self, ans):
    """Encode answers as a vector of length ANSWER_DIM.

    FILL_TOKEN entries are ignored. A known answer given n times scores 1
    when n >= 3 and n/3 otherwise. When nothing is left, or no answer is in
    the vocabulary, index 0 is set to 1.
    """
    answer = np.array([x for x in ans if not x == FILL_TOKEN], 'str')
    arr = np.zeros((ANSWER_DIM,))
    if len(answer) == 0:
      arr[0] = 1
    else:
      c = Counter(answer)
      known = [x for x in c if x in self.word2Index]
      if known == []:
        arr[0] = 1
      else:
        for elem in known:
          occur = c[elem]
          score = 1 if occur >= 3 else 1/3 * occur
          arr[self.word2Index[elem]] = score
    return arr

  def get_ques2(self, questions_file):
    """Return a dict question_id -> question text read from questions_file."""
    with open(questions_file, 'r') as f:
      data = json.load(f)
    return {x['question_id']: x['question'] for x in data['questions']}

  #private
  def _normalize_answer(self, word):
    """First normalisation pass, independent of the question."""
    if word == 'no 1' or 'no one' in word:
      return 'no one'
    if word in self._NO_IDEA_PHRASES:
      return 'no idea'
    if word == 'my best guess is no' or "it isn't" in word or 'it is not' in word:
      return 'no'
    if 'many' in word or 'several' in word or 'lot' in word or 'numerous' in word:
      return 'many'
    if word in self._NUMBER_WORDS:
      return str(self._NUMBER_WORDS.index(word))
    if word in self._TENS_WORDS:
      # position 0 is "twenty", hence the +2
      return str((self._TENS_WORDS.index(word) + 2) * 10)
    return word

  #private
  def _resolve_answer(self, question, word):
    """Second normalisation pass; the rules depend on how the question starts."""
    has_no = bool(self._LEADING_NO.search(word) or self._TRAILING_NO.search(word))

    if question.startswith('how many') or question.startswith('what is the number'):
      if has_no:
        return 'no idea' if word == 'no idea' else '0'
      if word == 'o':
        # A string like every other answer (the original appended the int 0)
        return '0'
      digits = self._DIGITS.findall(word)
      if digits:
        return digits[0]
      if word == 'no':
        return '0'
      if word == 'yes':
        return '1'
      return word

    if question.startswith('is') or question.startswith('are'):
      if has_no:
        return 'no'
      if word == 'it is' or 'yes' in word:
        return 'yes'
      if 'it is' in word:
        return word.replace('it is', '').strip()
      return word

    if word == 'it is' or 'yes' in word:
      return 'yes'
    if 'it is' in word:
      return word.replace('it is', '').strip()
    if ('there is no' in word) or ("there's no" in word) or ('there are no' in word):
      return 'not found'
    if word.strip().startswith('no '):
      # "no <x>" becomes 'not found' when every token of <x> (or its plural
      # with an added 's') appears in the question.
      ques_tokens = tokenize(question)
      for t in tokenize(word[2:]):
        if not (t in ques_tokens or t + 's' in ques_tokens):
          return word
      return 'not found'
    return word

  def get_answers(self, questions_file, answers_file):
    """Normalise the answers of answers_file and keep the frequent ones.

    Every answer is cleaned (commas, question marks, "'s", case), normalised
    in two passes, and kept only when its 'answer_confidence' is 'yes'.

    Returns (resset, m): resset is the set of answers occurring at least
    ANSWER_OCCURENCE_MIN times, m maps a question_id to the list of its
    answers that belong to resset.
    """
    id_qs = self.get_ques2(questions_file)

    with open(answers_file, 'r') as f:
      data = json.load(f)

    confident = []  # (question_id, normalised answer), confidence 'yes' only
    for x in data['annotations']:
      questionID = x['question_id']
      question = id_qs[questionID].strip().lower()
      for y in x['answers']:
        raw = y['answer'].replace(',', ' ').replace('?', '').replace('\'s', ' \'s').strip().lower()
        answer = self._resolve_answer(question, self._normalize_answer(raw))
        #TODO remove this:
        if y['answer_confidence'] == 'yes':
          confident.append((questionID, answer))

    c = Counter(answer for _, answer in confident)
    resset = set(k for k, v in c.items() if v >= ANSWER_OCCURENCE_MIN)
    m = {}
    for qid, response in confident:
      if response in resset:
        m.setdefault(qid, []).append(response)
    return resset, m


# Dataset generation
This section includes image preprocessing and generates serialized files containing the preprocessed train and validation datasets.

In [0]:
# When True, get_data reads the question order from a previously saved text
# file instead of rebuilding it from the complementary pairs file.
USED = False
class VQA_DatasetGenerator2:
  """Serializes (image features, question, answers) examples to TFRecord files.

  Images go through ResNet50 (without its top), questions through the
  question preprocessor and answers through the answer preprocessor.
  """

  def __init__(self, question_preprocessor, answer_preprocessor):
    self.question_preprocessor = question_preprocessor
    self.answer_preprocessor = answer_preprocessor

  def _bytes_feature(self, value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

  def _float_feature(self, value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

  def _int64_feature(self, value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

  def _image_example(self, images_paths, questions, answers, model, file_path):
    """Write one TFRecord file holding one example per image path.

    images_paths, questions and answers are parallel sequences. Each image is
    decoded, resized to 224x224 and passed through model in batches of
    RESNET_BATCH_SIZE; the flattened output is stored under 'image', the
    question under 'question' and the encoded answers under 'answers'.
    """

    def preprocess_image(image):
      image = tf.image.decode_jpeg(image, channels=3)
      image = tf.image.resize_images(image, [224, 224])
      return image

    def load_and_preprocess_image(path):
      image = tf.read_file(path)
      return preprocess_image(image)

    print('before dataset')
    data = tf.data.Dataset.from_tensor_slices(images_paths).apply(tf.data.experimental.map_and_batch(
      map_func=load_and_preprocess_image, batch_size=RESNET_BATCH_SIZE, num_parallel_batches=5
      )).prefetch(RESNET_BATCH_SIZE).make_one_shot_iterator()

    examples = []
    for ib, batch in enumerate(data):
      print('batch {}'.format(ib))

      imgs_feats = model(batch)
      for ife, feat in enumerate(imgs_feats):
        # Position of this image inside the chunk
        idx = ib * RESNET_BATCH_SIZE + ife
        answer = answers[idx]
        question = questions[idx]
        im = feat.numpy().flatten()

        answer = answer + self.fill_list(len(answer))
        answer = self.answer_preprocessor.preprocessElem(answer)
        feature = {
          'image': self._float_feature(im),
          'question': self._int64_feature(question),
          'answers': self._float_feature(answer)
        }
        examples.append(tf.train.Example(features=tf.train.Features(feature=feature)))

    print('open file  {}'.format(file_path))
    with tf.python_io.TFRecordWriter(file_path) as writer:
      for example in examples:
        writer.write(example.SerializeToString())
    print('file finished')

  #TODO :add reorder treatment
  def get_data(self, image_folder, questions_file, answers_file, pairs_file, dataSubType, ext, train, reorder):
    """Collect the image file names, questions and (when train) answers.

    Only questions whose image is present in image_folder are kept. Questions
    listed in pairs_file come first, in that order, followed by the remaining
    questions. reorder is not implemented and only prints a message.

    Returns zip(*rows): an iterable of parallel tuples
    (image file names, questions, answers) when train is true, and
    (image file names, questions) otherwise.
    """

    def get_image_id(file_name):
      # File names look like COCO_<subtype>_<12-digit id>.<3-letter ext>
      return int(file_name.split('_')[2][:-4])

    def get_path(image_id, dataSubType, ext):
      return 'COCO_' + dataSubType + '_' + str(image_id).zfill(12) + '.' + ext

    def get_im_ids(im_folder):
      return set(get_image_id(f) for f in os.listdir(im_folder))

    def get_ques(questions_file, pairs_file, image_ids, dataSubType, ext, reorder):
      if not USED:
        with open(pairs_file, 'r') as f:
          data = json.load(f)
        pairs = [y for x in data for y in x]

        with open(questions_file) as f:
          data = json.load(f)
        ques = data['questions']

        ques_map = {x['question_id']: x for x in ques}
        qids = [x['question_id'] for x in ques]
        # Questions that belong to no pair are appended after the paired ones
        pairs.extend(set(qids) - set(pairs))
        questions = [ques_map[x] for x in pairs]

        if reorder:
          #TODO : implement
          print('reorder not implemented')
      else:
        with open('/content/gdrive/My Drive/nabih/questions.txt', 'r') as f:
          pairs = [int(x) for x in f.readlines()]
        with open(questions_file) as f:
          data = json.load(f)
        ques = data['questions']
        ques_map = {x['question_id']: x for x in ques}
        questions = [ques_map[x] for x in pairs]

      qsid_iq = {x['question_id']: [get_path(x['image_id'], dataSubType, ext), x['question']]
                 for x in questions if x['image_id'] in image_ids}
      return qsid_iq

    ti = time.time()
    ids = get_im_ids(image_folder)
    print('get data : get image {}'.format(time.time() - ti))
    ti = time.time()
    ques = get_ques(questions_file, pairs_file, ids, dataSubType, ext, reorder)
    print('get data : questions {} len of ques {}'.format(time.time() - ti, len(ques)))
    ti = time.time()
    resset, ans = self.answer_preprocessor.get_answers(questions_file, answers_file)
    print('get data : ans {}'.format(time.time() - ti))
    ti = time.time()
    dico = []
    for qid, iq in ques.items():
      if train:
        # A question without any retained answer gets an empty answer list
        dico.append((iq[0], iq[1], ans.get(qid, [])))
      else:
        dico.append(tuple(iq))
    print('get data : dico {}'.format(time.time() - ti))

    return zip(*dico)

  def fill_list(self, leng):
    """Return the FILL_TOKEN padding needed to bring leng answers up to 10."""
    return [FILL_TOKEN for i in range(10 - leng)]

  #TODO : see if i must include the full image.:
  def generate(self, image_feats_file, questions_file, answers_file, pairs_file, dataSubType, ext, feats_root, chunk_size, start_file_index=0):
    """Preprocess a whole split and write it as TFREC_<i>.tfrecord files.

    image_feats_file -- directory holding the images of the split
    feats_root       -- directory receiving the TFRecord files
    chunk_size       -- number of examples per file
    start_file_index -- index of the first file to write; the chunks before
                        it are skipped, which lets an interrupted run resume.
                        The original code was stuck on a hard-coded 65.
    """
    images, questions, answers = self.get_data(image_feats_file, questions_file, answers_file, pairs_file, dataSubType, ext, True, False)
    print('get data done len of images {}'.format(len(images)))
    images_paths = [os.path.join(image_feats_file, d) for d in images]
    questions = [self.question_preprocessor.preprocessElem(x) for x in questions]
    questions = self.question_preprocessor.postTruncate(questions, MAX_LEN)
    print('questions done')
    model = keras.applications.ResNet50(include_top=False, weights='imagenet', input_tensor=None, input_shape=(224, 224, 3), pooling=None)
    t = time.time()
    last = len(questions)
    file_index = start_file_index
    for chunk_start in range(start_file_index * chunk_size, last, chunk_size):
      print('in chunk')
      print('total time {}'.format(time.time() - t))
      t = time.time()
      file_path = os.path.join(feats_root, 'TFREC_' + str(file_index) + '.tfrecord')
      chunk_end = min(chunk_start + chunk_size, last)
      chunk_images_paths = images_paths[chunk_start:chunk_end]
      chunk_questions = questions[chunk_start:chunk_end]
      chunk_answers = answers[chunk_start:chunk_end]
      self._image_example(chunk_images_paths, chunk_questions, chunk_answers, model, file_path)
      print('treated {}  time {}'.format(chunk_end, time.time() - t))
      file_index += 1

  #TODO
  def __order(self, items, key):
    """Placeholder: returns items unchanged."""
    return items
    
    


In [0]:
# The preprocessors must exist before the generator that wraps them is built,
# otherwise this cell raises NameError on a fresh kernel.
answer_preprocessor = AnswerPreprocessing([train_questions_file], [train_answers_file])
question_preprocessor = QuestionsPreprocessing(glove_file_path, [train_questions_file], NUM_OCCURENCE)
dataset_generator = VQA_DatasetGenerator2(question_preprocessor, answer_preprocessor)


# Generate train dataset

In [0]:
# Serialize the training split into files of FILE_LENGTH rows under train_feats_root.
dataset_generator.generate(
    train_images_root,
    train_questions_file,
    train_answers_file,
    pairs_file,
    'train2014',
    'jpg',
    train_feats_root,
    FILE_LENGTH,
)

# Generate validation dataset

In [0]:
# generate() takes the complementary pairs file as its 4th argument; the original
# call omitted it, which shifted every later argument and raised TypeError.
# The ids in the pairs file are looked up in the questions file, so this must be
# the validation pairs list (see the VQA v2 download page), not the train one.
val_pairs_file = ''
dataset_generator.generate(val_images_root, val_questions_file, val_answers_file, val_pairs_file, 'val2014', 'jpg', val_feats_root, FILE_LENGTH)