# VQA Model
In this notebook, we will construct our VQA model.

## imports

In [0]:
import os
import importlib.util
from google.colab import drive
import tensorflow as tf
from tensorflow import keras
import re
import numpy as np
import json
from pathlib import Path
import pickle
from collections import Counter
tf.enable_eager_execution()
import cv2
import time
from keras import regularizers
from keras.models import load_model



## Google drive

In [0]:
# Mount point of Google Drive inside the Colab VM.
# NOTE(review): the path is empty here although the recorded output below reads
# "Mounted at /content/gdrive" — presumably it was blanked before sharing; set it before running.
drive_root_path = ''
# force_remount re-mounts even when Drive is already mounted; timeout_ms is the
# largest signed 32-bit integer.
drive.mount(drive_root_path, force_remount= True, timeout_ms = 2147483647)


Mounted at /content/gdrive


## variables

In [0]:
# Root path of the data: same as in Install.ipynb.
# NOTE(review): empty here — every path below is derived from it, so set it before running.
root_path ='' 

# GloVe file path (the file name points at the 6B-token, 300-dimensional vectors)
glove_file_path =  os.path.join(root_path, 'glove/glove.6B.300d.txt')

# Directories of the train and val image features
# (presumably TFRecord files, judging by the variable names — confirm against Install.ipynb)
train_tfrec_dir = os.path.join(root_path, 'features/train_features')
val_tfrec_dir = os.path.join(root_path, 'features/val_features')

# Question embedding variables
BATCH_SIZE = 100
PREFETCH_SIZE = BATCH_SIZE
# presumably the max_len handed to QuestionsPreprocessing.preprocessBatch — TODO confirm at the call site
MAX_LEN = 14
# Input shape handed to the image extractor by ImageModel (non-mono mode)
IMAGE_FEAT_SHAPE = (100,120,3)
MONO_IMAGE_SIZE = (224, 224)
IMAGE_FEAT_NUM = 10
# Placeholder: overwritten by AnswerPreprocessing.__init__ with the answer vocabulary size
ANSWER_DIM = 0
IMAGE_FEATURES_LAST_DIM = 2048
# Selects MonoImageModel (True) or ImageModel (False) in get_image_module
MONO = True
# Placeholder answer that AnswerPreprocessing.preprocessElem filters out
FILL_TOKEN = 'FILL_TOKEN'
# Minimum number of occurrences for an answer to be kept by AnswerPreprocessing.get_answers
ANSWER_OCCURENCE_MIN = 15

#  Preprocessing

## Tokenizer

In [0]:
# Splits on runs of non-word characters; the capturing group keeps the separators
# in the result so that punctuation survives as tokens.
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')


def tokenize(sentence):
    """Lower-case `sentence` and split it into word and punctuation tokens.

    Commas and question marks are removed, and a possessive "'s" is detached
    from the preceding word before splitting. Whitespace-only pieces are dropped.

    Returns:
        list of str: the non-empty, stripped tokens in their original order.
    """
    cleaned = sentence.strip().lower()
    for old, new in ((',', ''), ('?', ''), ('\'s', ' \'s')):
        cleaned = cleaned.replace(old, new)
    pieces = (piece.strip() for piece in SENTENCE_SPLIT_REGEX.split(cleaned))
    return [piece for piece in pieces if piece]

## question preprocessor

    

In [0]:
class QuestionsPreprocessing :
    """Builds the question vocabulary and encodes questions as padded index sequences.

    The vocabulary contains every token that appears strictly more than
    `num_occurence` times in the given question files. Index 0 is reserved for
    padding, so word indices start at 1.

    Attributes:
        gloveFile: path of the GloVe text file.
        questionsFiles / questions_files: list of question JSON files (same object).
        word2Glove: dict token -> GloVe vector (np.ndarray).
        word2Index / index2Word: vocabulary mappings.
        vocab_length: vocabulary size including the padding index.
    """

    def __init__(self, glove_file, questions_files, num_occurence) :
        self.gloveFile  = glove_file
        self.questionsFiles = questions_files
        # second name for the same list, kept because both spellings are public attributes
        self.questions_files = questions_files
        self.word2Glove = self.readGloveFile()
        questions = self.readQuestionsFiles()
        # keep only the words that occur more than num_occurence times
        words = self.filter_words(questions,num_occurence)
        self.word2Index, self.index2Word = self.matchWordIndex(words)
        # + 1 to account for the 0 padding index
        self.vocab_length = len(self.word2Index) + 1

    def readGloveFile(self):
        """Read the GloVe file and return a dict mapping each token to its embedding vector."""
        wordToGlove = {}
        with open(self.gloveFile, 'r') as f:
            for line in f:
                record = line.strip().split()
                # first field is the token, the remaining fields are the vector components
                wordToGlove[record[0]] = np.array(record[1:], dtype=np.float64)
        return wordToGlove

    def readQuestionsFiles(self) :
        """Return the text of every question found in the question files.

        WARNING: this only understands the VQA dataset layout
        (`{'questions': [{'question': ...}, ...]}`).
        """
        questions = []
        for file in self.questionsFiles :
            with open(file, 'r') as f :
                data = json.load(f)
            questions.extend(x['question'] for x in data['questions'])
        return questions

    def filter_words(self,questions, num_occurence):
        """Return the tokens occurring strictly more than `num_occurence` times in `questions`."""
        words = {}
        for question in questions :
            for word in tokenize(question):
                words[word] = words.get(word, 0) + 1
        return [k for k,v in words.items() if v > num_occurence]

    def matchWordIndex(self, words) :
        """Assign consecutive indices starting at 1 (0 is the padding index) to `words`.

        Returns:
            (word2Index, index2Word) dictionaries.
        """
        word2Index = {}
        index2Word = {}
        for i, word in enumerate(words, start=1):
            word2Index[word] = i
            index2Word[i] = word
        return word2Index, index2Word

    def createPretrainedEmbeddingLayer(self):
        """Create the embedding matrix of shape (len(word2Index) + 1, embedding dim).

        Row 0 (padding) stays at zero. Words present in GloVe get their GloVe
        vector; the others get a uniform random vector in [0, 1).
        """
        wordToIndex = self.word2Index
        wordToGlove = self.word2Glove
        vocabLen = len(wordToIndex) + 1  # adding 1 to account for masking
        embDim = next(iter(wordToGlove.values())).shape[0]
        embeddingMatrix = np.zeros((vocabLen, embDim), 'float64')
        for word, index in wordToIndex.items():
            if word in wordToGlove:
                embeddingMatrix[index, :] = wordToGlove[word]
            else :
                embeddingMatrix[index, :] = np.random.rand(1, embDim)
        return embeddingMatrix

    def get_embedding_matrix(self) :
        """Public accessor for the embedding matrix (see createPretrainedEmbeddingLayer)."""
        return self.createPretrainedEmbeddingLayer()

    # TODO : check if there is another method like removing most common words
    def preprocessBatch(self,questions, max_len):
        """Encode a batch of questions as an index matrix padded/truncated to `max_len`."""
        # Fix: the original appended an undefined name (`num_words`) instead of the
        # encoded question, which raised NameError on the first question.
        batch = [self.preprocessElem(question) for question in questions]
        return self.postTruncate(batch, max_len)

    def preprocessElem(self,question):
        """Encode one question as the list of its vocabulary indices.

        Words outside the vocabulary are dropped (not replaced by a placeholder).
        """
        words = tokenize(self.preTruncate(question))
        num_words = []
        for word in words :
            # Fix: the original used a sentinel string compared with `is`, which
            # relies on string interning; a None check is the reliable form.
            index = self.word2Index.get(word)
            if index is not None :
                num_words.append(index)
        return num_words

    #private
    #TODO : implement
    def preTruncate(self, words):
        """Hook applied to the raw question before tokenization; currently the identity."""
        return words

    def postTruncate(self,batch, max_len) :
        """Pad with zeros / truncate every sequence at the end so that it has length `max_len`."""
        return  keras.preprocessing.sequence.pad_sequences(batch, max_len,padding ='post', truncating = 'post')
    

## Answer preprocessor

In [0]:
class AnswerPreprocessing :
    """Builds the answer vocabulary of a VQA split and encodes answers as score vectors.

    Answers are first normalized (see get_answers), then every normalized answer
    that was given with confidence 'yes' at least ANSWER_OCCURENCE_MIN times is
    kept. Index 0 is always the fallback answer 'no idea'.

    Side effect: the constructor sets the module-level ANSWER_DIM to the
    vocabulary size.

    Fixes over the previous version:
      * all methods use the same indentation (the class did not parse before);
      * get_dim takes `self`;
      * spelled-out tens ('twenty', ...) are now actually converted (the list
        was capitalized while answers are lower-cased, and the lookup used an
        undefined name);
      * the answer 'o' to a counting question maps to the string '0', not the int 0;
      * readWords can find the answer files (`self.answer_files` was never set);
      * "there's no ..." is also recognized in its rewritten form "there 's no ...";
      * vocabulary indices are assigned in sorted order, so they are identical
        across kernel restarts (iteration order of a set of strings is not).
    """

    # Spelled-out numbers replaced by their digits; the position is the value.
    _NUMBER_WORDS = ("zero","one","two","three","four",
                     "five","six","seven","eight","nine","ten",
                     "eleven","twelve","thirteen","fourteen","fifteen",
                     "sixteen","seventeen","eighteen","nineteen")
    # Position p stands for (p + 2) * 10.
    _TENS_WORDS = ("twenty","thirty","forty","fifty",
                   "sixty","seventy","eighty","ninety")
    # Answers that all mean "the annotator does not know".
    _UNKNOWN_ANSWERS = ('no clue', "i dont know", "i don't know", "cannot know", "can't know",
                        "can't tell", "not sure", "don't know", "cannot tell", "unknown")

    def __init__(self, questions_files, answers_files):
        global ANSWER_DIM
        assert len(questions_files) == len(answers_files), 'not the same length'
        self.answer_files = answers_files
        # Only the first (questions, answers) pair is used to build the vocabulary.
        self.answers, _ = self.get_answers(questions_files[0], answers_files[0])
        self.word2Index, self.index2Word = self.matchWords2Indexes()
        ANSWER_DIM = len(self.word2Index)
        self.num_words = len(self.word2Index)

    def get_dim(self):
        """Return the size of the answer vocabulary."""
        return len(self.word2Index)

    #private
    def matchWords2Indexes(self):
        """Map every kept answer to an index; 'no idea' is always index 0.

        Returns:
            (wordToIndex, indexToWord) dictionaries.
        """
        wordToIndex = {'no idea': 0}
        indexToWord = {0: 'no idea'}
        # sorted: deterministic indices, required to reload a checkpoint in a new session
        words = sorted(self.filterWords(self.answers))
        i = 1
        for w in words:
            if not w == 'no idea' :
                wordToIndex[w] = i
                indexToWord[i] = w
                i += 1
        return (wordToIndex, indexToWord)

    #private
    def readWords(self) :
        """Return every raw answer string found in the answer files."""
        words = []
        for file in self.answer_files :
            with open(file, 'r') as f :
                data = json.load(f)
            for annotation in data['annotations'] :
                for answer in annotation['answers']:
                    words.append(answer['answer'])
        return words

    #private
    def filterWords (self, words) :
        """Hook to filter the answer vocabulary; currently the identity."""
        return words

    def preprocessBatch(self, answers) :
        """Replace every answer of a 1-D or 2-D batch by its vocabulary index.

        Raises:
            KeyError: if an answer is not in the vocabulary.
        """
        batch  = []
        ans = np.array(answers)
        if ans.ndim == 2 :
            for answers2 in answers :
                batch.append([self.word2Index[x] for x in answers2])
        else :
            for answer in answers:
                batch.append(self.word2Index[answer])
        return batch

    def _preprocessElem(self,ans):
        """Older encoder: every known answer adds 1/len(answers) to its slot.

        FILL_TOKEN entries are ignored. When no answer is known, slot 0
        ('no idea') is set to 1.
        """
        answer = np.array([x for x in ans if not x == FILL_TOKEN])
        if answer.ndim != 1 :
            # if the answer is unique in the dataset
            # NOTE(review): `answer` is a NumPy array here, which cannot be used as
            # a dict key — confirm whether this branch is ever reached.
            return self.word2Index[answer]
        # if the dataset contains multiple answers : Example VQA dataset
        arr = np.zeros((ANSWER_DIM,))
        if len(answer) == 0 :
            arr[0] = 1.0
            return arr
        value = 1/len(answer)
        found = False
        for a in answer:
            if a in self.word2Index:
                found = True
                arr[self.word2Index[a]] += value
        if not found :
            arr[0] = 1.0
        return arr

    def preprocessElem(self,ans):
        """Encode the answers of one question as a score vector of length ANSWER_DIM.

        FILL_TOKEN entries are ignored. A known answer given n times scores
        n / 3, capped at 1 from three occurrences on. When no answer is known,
        slot 0 ('no idea') is set to 1.
        """
        answer = np.array([x for x in ans if not x == FILL_TOKEN], 'str')
        arr = np.zeros((ANSWER_DIM,))
        known = {elem: occur for elem, occur in Counter(answer).items()
                 if elem in self.word2Index}
        if not known :
            arr[0] = 1
            return arr
        for elem, occur in known.items() :
            arr[self.word2Index[elem]] = 1 if occur >=3 else 1/3* occur
        return arr

    def get_ques2(self,questions_file):
        """Return a dict question_id -> question text read from a VQA questions file."""
        with open(questions_file,'r') as f :
            data = json.load(f)
        return { x['question_id']:  x['question'] for x in data['questions'] }

    @staticmethod
    def _has_standalone_no(word):
        """True when `word` contains 'no' followed or preceded by a space (e.g. 'no dogs', 'i see no')."""
        return bool(re.search(r'(\s|^)no ', word) or re.search(r' no(\s|$)', word))

    @classmethod
    def _canonicalize_answer(cls, raw):
        """First normalization pass, independent of the question."""
        word = raw.replace(',', ' ').replace('?', '').replace('\'s', ' \'s').strip().lower()
        if word == 'no 1' or 'no one' in word :
            return 'no one'
        if word in cls._UNKNOWN_ANSWERS:
            return 'no idea'
        if word == 'my best guess is no' or "it isn't" in word or  'it is not' in word:
            return 'no'
        if 'many' in word or 'several' in word or 'lot' in word or 'numerous' in word:
            return 'many'
        if word in cls._NUMBER_WORDS :
            return str(cls._NUMBER_WORDS.index(word))
        if word in cls._TENS_WORDS:
            return str((cls._TENS_WORDS.index(word) + 2) * 10)
        return word

    @classmethod
    def _normalize_count_answer(cls, word):
        """Second pass for 'how many' / 'what is the number' questions: aim for a digit string."""
        if cls._has_standalone_no(word):
            return 'no idea' if word == 'no idea' else '0'
        if word == 'o' :
            return '0'
        digits = re.findall(r'\d+', word)
        if digits:
            return digits[0]
        if word == 'no' :
            return '0'
        if word =='yes' :
            return '1'
        return word

    @classmethod
    def _normalize_yes_no_answer(cls, word):
        """Second pass for questions starting with 'is' / 'are'."""
        if cls._has_standalone_no(word):
            return 'no'
        if word == 'it is' or 'yes' in word:
            return 'yes'
        if 'it is' in word :
            return word.replace('it is', '').strip()
        return word

    @staticmethod
    def _normalize_open_answer(word, question):
        """Second pass for every other question."""
        if word == 'it is' or 'yes' in word:
            return 'yes'
        if 'it is' in word :
            return word.replace('it is', '').strip()
        # "there 's no": form taken by "there's no" after the first pass detached the "'s"
        if ('there is no' in word) or ("there's no" in word) or ("there 's no" in word) or ('there are no' in word):
            return 'not found'
        if word.strip().startswith('no ') :
            # 'no <thing>' where <thing> is named by the question means the thing is absent
            ques_tokens = tokenize(question)
            if all(t in ques_tokens or t+'s' in ques_tokens for t in tokenize(word[2:])):
                return 'not found'
        return word

    def get_answers( self, questions_file,answers_file) :
        """Normalize the answers of a VQA split and select the frequent ones.

        Every answer goes through _canonicalize_answer and then through the pass
        matching its question type. Only answers whose 'answer_confidence' is
        'yes' are counted.

        Returns:
            (resset, m): `resset` is the set of normalized answers seen at least
            ANSWER_OCCURENCE_MIN times; `m` maps a question id to the list of
            its confident answers that belong to `resset` (questions without
            any such answer are absent).
        """
        id_qs = self.get_ques2(questions_file)
        with open(answers_file,'r') as f :
            data = json.load(f)

        confident = []  # (question id, normalized answer) pairs, in file order
        for annotation in data['annotations']:
            questionID = annotation['question_id']
            question = id_qs[questionID].strip().lower()
            is_count = question.startswith('how many') or question.startswith('what is the number')
            is_yes_no = question.startswith('is') or question.startswith('are')
            for given in annotation['answers']:
                word = self._canonicalize_answer(given['answer'])
                if is_count:
                    word = self._normalize_count_answer(word)
                elif is_yes_no:
                    word = self._normalize_yes_no_answer(word)
                else :
                    word = self._normalize_open_answer(word, question)
                #TODO remove this:
                if given['answer_confidence'] == 'yes' :
                    confident.append((questionID, word))

        c = Counter(word for _, word in confident)
        resset = set([k for k,v in c.items() if v >= ANSWER_OCCURENCE_MIN])
        m = {}
        for qid, response in confident:
            if response in resset :
                m.setdefault(qid, []).append(response)
        return resset, m


# Model 

## Model Variables 

## Image Feature Extraction


### Objects' bounding boxes

In [0]:
def get_image_model(model_type, input_shape) :
  """Return the pretrained image feature extractor registered under `model_type`.

  Only 'resnext50' is known; it builds an ImageNet ResNet50 without its
  classification head and with global average pooling.
  NOTE(review): the key says ResNeXt but the network built is ResNet50.

  Raises:
    NotImplementedError: for any other `model_type`.
  """
  if model_type != 'resnext50':
    raise NotImplementedError('Unknow extractor')
  return keras.applications.ResNet50(
      include_top=False,
      weights='imagenet',
      input_tensor=None,
      input_shape=input_shape,
      pooling='avg')

def get_image_module(model_type, mono) :
  """Build the image module: MonoImageModel when `mono` is truthy, ImageModel otherwise."""
  module_class = MonoImageModel if mono else ImageModel
  return module_class(model_type)
  
class ImageModel(keras.Model) :
  """Image module for the non-mono mode: runs the extractor on each element of the input.

  The extractor is built for inputs of shape IMAGE_FEAT_SHAPE.
  """
  def __init__(self, model_type):
    super(ImageModel, self).__init__()
    self.model = get_image_model(model_type,IMAGE_FEAT_SHAPE)
    self.flat = keras.layers.Flatten()

  def call(self,inp) :
    """Iterate over the first axis of `inp`, extract and flatten features, and stack the results."""
    def extract(element):
      # the extractor is fed float64 values
      return self.flat(self.model(tf.dtypes.cast(element,tf.float64)))
    return tf.convert_to_tensor([extract(element) for element in inp])
  
class MonoImageModel(keras.Model):
  """Image module for the mono mode: one flattened feature vector per input image.

  The extractor is built for inputs of shape (224, 224, 3).
  """
  def __init__(self, model_type):
    super(MonoImageModel, self).__init__()
    self.model = get_image_model(model_type, (224, 224,3))
    self.flat = keras.layers.Flatten()

  def call (self, inp):
    """Apply the extractor, then flatten its output."""
    return self.flat(self.model(inp))

## Text features extraction

###  Recurrent networks

In [0]:
def get_question_module(model_type, kwargs):
  """Build the question encoder registered under `model_type` (only 'GRU' is known).

  Raises:
    NotImplementedError: for any other `model_type`.
  """
  if model_type != 'GRU':
    raise NotImplementedError('Unknown question module')
  return MyGRU(kwargs)



class MyGRU(keras.Model) :
  """Question encoder: an embedding layer followed by `num_layers` stacked GRU layers.

  Expected keys of `kwargs`:
    embedding_weights: initial embedding matrix.
    embedding_size:    width of an embedding vector.
    hidden_size:       number of units of every GRU layer.
    num_layers:        number of stacked GRU layers.
    vocab_len:         number of rows of the embedding matrix.
    dropout:           dropout and recurrent dropout rate; None/0 disables it.

  The output of `call` is the output of the last GRU layer at the last step.
  """
  def __init__(self, kwargs) :
    super(MyGRU, self).__init__()
    self.embedding_weights = kwargs['embedding_weights']
    self.embedding_size =kwargs['embedding_size']
    self.hidden_size = kwargs['hidden_size']
    self.num_layers = kwargs['num_layers']
    self.vocab_size = kwargs['vocab_len']
    self.dropout = kwargs.get('dropout') or 0.

    #TODO Check trainable
    self.embedding = keras.layers.Embedding(input_dim = self.vocab_size,
                                            output_dim = self.embedding_size, weights = [self.embedding_weights], trainable = True)
    #TODO : see if we use args like : use_bias, activation, initilizers .... see also reset_after
    self.seq = keras.models.Sequential()
    for i in range(self.num_layers):
      # Fix: a GRU consumes a sequence, so every layer but the last must return
      # its full output sequence; otherwise num_layers > 1 cannot be stacked.
      is_last = i == self.num_layers - 1
      self.seq.add(keras.layers.GRU(units = self.hidden_size, dropout = self.dropout,
                                    recurrent_dropout= self.dropout,
                                    return_sequences = not is_last))

  def call(self, x):
    """Embed the word indices `x` and run them through the GRU stack."""
    x = self.embedding(x)
    x = self.seq(x)
    return x
    
    
    

## Merging Before Attention

In [0]:
#Done : check how to do if merger_type is hadamard and we introduce region coordinates : 
### image features then text features
def join_features(merger_type):
  """Build the merger registered under `merger_type` ('concat' or 'hadamard').

  Raises:
    NotImplementedError: for any other `merger_type`.
  """
  if merger_type == 'concat' :
    return ConcatMerger()
  if merger_type == 'hadamard':
    return HadamardMerger()
  raise NotImplementedError('Unknown Merger')



class ConcatMerger(keras.Model):
  """Merger that concatenates image and text features along the last axis."""
  def __init__(self):
    super(ConcatMerger,self).__init__()

  #TODO : borders is accepted but ignored by this merger; to be fixed after training
  def call (self, imgs,borders, texts) :
    """Return [imgs, texts] joined on the last axis; `borders` is unused."""
    return tf.concat([imgs,texts], -1)
    

  
class HadamardMerger(keras.Model) :
  """Merger that multiplies image and text features element-wise.

  When `borders` is given, it is prepended to the product along the last axis;
  when it is None (merging after attention), the product is returned as is.
  """
  def __init__(self):
    super(HadamardMerger,self).__init__()
    self.mul = keras.layers.Multiply()

  def call(self,imgs, borders, texts):
    """Return imgs * texts, preceded by `borders` on the last axis when provided."""
    x = self.mul([imgs,texts])
    if borders is None :
      #for after attention merging
      return x
    # Fix: tf.concat requires the axis; the last axis is the one used by the
    # other concatenations of this notebook (ConcatMerger, AttentionSystem).
    return tf.concat([borders, x], -1)


    

## Non linear function

In [0]:

def get_non_linear_function(func_type, kwargs) :
  """Build the non linear block registered under `func_type` (only 'activation' is known).

  Raises:
    NotImplementedError: for any other `func_type`.
  """
  if func_type != 'activation':
    raise NotImplementedError('Unknown non linear function')
  return Activation(kwargs)


class Activation(keras.Model):
  """Stack of Dense layers used as the generic non linear transformation.

  Every size in `dims` but the last builds a Dense layer with `activation`
  (followed by BatchNormalization when `normalization` is set); an optional
  Dropout comes next; the last size builds a final Dense layer without activation.

  Keys of `kwargs`:
    dims (required):       layer sizes, at least one.
    activation (required): activation of the hidden layers.
    input_shape:           input shape declared on the first layer, if any.
    dropout:               rate of the single Dropout placed before the last layer, if any.
    normalization:         add BatchNormalization after every hidden layer (default: off).
    regularisation:        L2 factor for the hidden kernels, if any.

  Raises:
    ValueError: if `dims` is empty.
  """
  def __init__(self, kwargs):
    super(Activation, self).__init__()
    dims = kwargs['dims']
    activation = kwargs['activation']
    # Optional keys are read with .get: some configurations omit them
    # (e.g. no 'normalization' entry), which used to raise KeyError.
    input_shape = kwargs.get('input_shape')
    dropout = kwargs.get('dropout')
    normalization = kwargs.get('normalization')
    regularisation = kwargs.get('regularisation')
    regularisation = regularizers.l2(regularisation) if regularisation else None

    if len(dims) == 0:
      raise ValueError('dims must contain at least one layer size')

    self.model = keras.models.Sequential()
    for i, units in enumerate(dims[:-1]):
      layer_args = dict(units=units, activation=activation, kernel_regularizer=regularisation)
      if input_shape and i == 0 :
        layer_args['input_shape'] = input_shape
      self.model.add(keras.layers.Dense(**layer_args))
      if normalization :
        self.model.add(keras.layers.BatchNormalization())
    if dropout:
      self.model.add(keras.layers.Dropout(dropout))
    # Fix: the output size was read through the leaked loop variable (dims[i+1]),
    # which is undefined when dims holds a single size.
    last_args = {}
    if input_shape and len(dims) == 1:
      # the output layer is also the first layer: it carries the input shape
      last_args['input_shape'] = input_shape
    self.model.add(keras.layers.Dense(dims[-1], **last_args))

  def call(self,x) :
    """Apply the layer stack to `x`."""
    return self.model(x)
      

## Attention mechanism

In [0]:

### Attention vector
#dims must end with the right dimention : img_feature dimention


class FixAttentionVector(keras.Model) :
  """Computes attention weights from the merged image/text features.

  `args` configures the non linear block (see Activation) and must contain
  'probability_function', either 'sigmoid' or 'softmax'. In non-mono mode the
  weights are repeated `image_features_last_dim` times along the last axis.

  Raises:
    NotImplementedError: if 'probability_function' is neither 'sigmoid' nor 'softmax'.
  """
  def __init__(self, args, image_features_last_dim, mono):
    super(FixAttentionVector, self).__init__()
    self.image_features_last_dim = image_features_last_dim
    self.f = get_non_linear_function('activation', args)
    self.mono = mono
    probs = args['probability_function']
    if probs not in ['sigmoid', 'softmax']:
      raise NotImplementedError('Unknown non normalization function')
    self.prob = probs

  def call(self,x) :
    """Return the attention weights computed from `x`."""
    scores = self.f(x)
    if self.prob == 'softmax' :
      #TODO : check the axis of the softmax. The output shape is normally
      # (batch, 10, 1), so the softmax should normally be applied over the 10;
      # it is not clear that this is what happens here.
      # NOTE(review): keras.activations.softmax is called without an axis — confirm which axis it normalizes.
      print('shap of res before softmax {}'.format(scores.shape))
      scores = keras.activations.softmax(scores)
    elif self.prob == 'sigmoid':
      scores = keras.activations.sigmoid(scores)
    if self.mono :
      return scores
    return tf.tile(scores , (1,1,self.image_features_last_dim))

  
  
  
### Image feature
class SumImageAttribute(keras.Model):
  """Applies attention weights to image features.

  The features are multiplied element-wise by the weights. In mono mode the
  product is returned; otherwise it is summed over axis 1.
  """
  def __init__(self, mono):
    super(SumImageAttribute,self).__init__()
    self.mul = keras.layers.Multiply()
    self.sum = keras.layers.Add()
    self.mono = mono

  def call(self,image_features, attention):
    """Return the attended image features."""
    weighted = self.mul([image_features, attention])
    if not self.mono :
      return keras.backend.sum(weighted, axis = 1)
    print('applyer mono')
    return weighted
  
  
  
# todo check last dimention after softmax
class AttentionSystem(keras.Model):
  """Chains a merger, an attention vector and an attention applier.

  Args:
    merger: callable (image_features, borders, text_features) -> merged features.
    attention_vector: callable merged features -> attention weights.
    attention_applyer: callable (image_features, weights) -> attended image features.
  """
  def __init__(self, merger, attention_vector, attention_applyer):
    super(AttentionSystem,self).__init__()
    self.merger = merger
    self.attention_vector = attention_vector
    self.attention_applyer = attention_applyer

  def call(self, image_features, borders, text_features):
    """Return the image features weighted by the attention computed from both modalities.

    `borders` may be None; when given, it is prepended to the image features
    (last axis) before the weights are applied.
    """
    common_features = self.merger(image_features, borders, text_features)
    vect = self.attention_vector(common_features)
    # Fix: `if borders:` evaluated the truth value of a tensor; the intent
    # (as in HadamardMerger) is to test whether borders were provided at all.
    if borders is not None :
      new_image_features =  tf.concat([borders, image_features], -1)
    else :
      new_image_features = image_features
    return self.attention_applyer(new_image_features, vect)
    
    



## Classifier

In [0]:
class Classifier(keras.Model):
  """Final classifier: a non linear block built from `kwargs` (see Activation)."""
  def __init__(self, kwargs):
    super(Classifier,self).__init__()
    # Fix: get_non_linear_function takes the block type first; it was called with
    # kwargs only, which raised TypeError. 'activation' is the only type available.
    self.f = get_non_linear_function('activation', kwargs)

  def call(self,x):
    """Return the classifier output (unnormalized scores) for `x`."""
    return self.f(x)
    

## model definition

In [0]:

ANSWER_TEST = []
class MyModel(keras.Model):
  """Complete VQA model: image branch, question branch, attention, merger and classifier.

  Args:
    image_model: image feature extractor; in mono mode it is applied to the raw
      images of every batch before the trainable part of the model.
    image_non_linear / text_non_linear: non linear blocks applied to each modality.
    text_model: question encoder.
    attention_system: callable (image features, borders, text features) -> attended image features.
    merger: callable (image features, borders, text features) -> joint features.
    classifier: callable joint features -> answer scores.
    optimizer: optimizer used by `train`.
    mono: True when a batch is (images, questions, answers); False when it is
      (image features, borders, questions, answers).

  `train` reads the module-level names `logdir`, `CHECKPOINT_PATH`.

  Fixes over the previous version:
    * non-mono mode: the number of image features used to tile the text
      features was never defined;
    * `train` applied gradients with a global `optimizer` instead of `self.optimizer`;
    * validation read the last training batch instead of the validation batch
      and, in mono mode, skipped the image extractor;
    * accuracy compared scores with the maximum of the whole batch instead of
      the maximum of each row, and validation accuracy was only computed when
      all answers were zero (i.e. it was always 0);
    * the global step is incremented after every training batch, so summaries
      are no longer all written at the same step;
    * `test` handles mono batches and returns the accuracy it computes;
    * the debug dumps of the first predictions to /content/file*.txt are gone.
  """
  def __init__(self, image_model, image_non_linear,
               text_model, text_non_linear, attention_system, merger, classifier, optimizer, mono) :
    super(MyModel,self).__init__()
    self.image_model = image_model
    self.image_non_linear = image_non_linear
    self.text_model = text_model
    self.text_non_linear = text_non_linear
    self.merger = merger
    self.attention_system = attention_system
    self.classifier = classifier
    self.optimizer = optimizer
    self.mono = mono
    self.norm  = keras.layers.BatchNormalization()

    print('model started')

  def call(self, img_features, borders, question, training = False):
    """Return the answer scores for a batch.

    `borders` is accepted but not used yet (None is passed to the attention
    system and to the merger). `training` is currently not forwarded to the
    sub-modules.
    """
    print('image model dim {} '.format(img_features.shape))
    imgs = self.image_non_linear(img_features)
    print('image model dim after non linear {} '.format(imgs.shape))

    text = self.text_model(question)
    print('text model dim {} '.format(text.shape))

    text = self.text_non_linear(text)
    print('text model dim after non linear {} '.format(text.shape))

    if not self.mono :
      # one copy of the question features per image feature (axis 1 of imgs)
      img_feat_num = int(imgs.shape[1])
      texts = tf.tile(tf.expand_dims(text,1), [1,img_feat_num,1])
    else :
      texts  = text
    print('befor attention system {} {}'.format(imgs.shape, texts.shape))
    #TODO : Border instead of None
    new_img_feature = self.attention_system(imgs,None, texts)
    print('new image featuresdim {} '.format(new_img_feature.shape))

    common = self.merger(new_img_feature,None, text)
    print('common shape {} '.format(common.shape))

    res = self.classifier(common)
    print('res  dim {} '.format(res.shape))
    return res

  def _unpack_batch(self, batch):
    """Split a dataset batch into (image features, borders, questions, answers)."""
    if not self.mono :
      return batch[0], batch[1], batch[2], batch[3]
    # mono: the batch carries raw images; extract their features here, outside
    # of any gradient tape
    return self.image_model(batch[0]), None, batch[1], batch[2]

  @staticmethod
  def _batch_accuracy(res, ans):
    """Mean, over the rows of the batch, of the answer score at the top prediction."""
    top = tf.cast(res >= tf.reduce_max(res, axis=-1, keepdims=True), tf.float64)
    per_example = tf.math.reduce_sum(tf.math.multiply(top, tf.cast(ans, tf.float64)), axis=-1)
    return tf.math.reduce_mean(per_example)

  def train(self, train, val, epochs) :
    """Train for `epochs` epochs, logging to TensorBoard and checkpointing.

    Args:
      train: training dataset (must provide make_one_shot_iterator()).
      val: validation dataset, or a falsy value to skip validation.
      epochs: number of passes over `train`.

    The latest checkpoint found under CHECKPOINT_PATH is restored first; a new
    checkpoint is saved whenever the checkpoint step counter is a multiple of 3.
    """
    #Tensorboard
    global_step = tf.train.get_or_create_global_step()
    writer = tf.contrib.summary.create_file_writer(logdir)
    writer.set_as_default()

    #Checkpoint
    ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=self.optimizer, net=self)
    manager = tf.train.CheckpointManager(ckpt, CHECKPOINT_PATH, max_to_keep=5)
    ckpt.restore(manager.latest_checkpoint)

    if manager.latest_checkpoint:
      print("Restored from {}".format(manager.latest_checkpoint))
    else:
      print("Initializing from scratch.")

    for ie in range(epochs) :
      print('epoch'+str(ie)+ '.............................................')
      train_losses = []
      train_accuracies = []
      val_losses = []
      val_accuracies = []

      #----------------TRAIN-------------------------#
      for ib, batch in enumerate(train.make_one_shot_iterator()) :
        t = time.time()
        imgs, borders, ques, ans = self._unpack_batch(batch)

        with tf.GradientTape() as tape :
          res = self.call(imgs, borders, ques, True)
          batch_loss = tf.losses.softmax_cross_entropy(ans,res)
        print('ans shape, res shape {} {}'.format(ans.shape, res.shape))
        batch_accuracy = self._batch_accuracy(res, ans)
        with tf.contrib.summary.record_summaries_every_n_global_steps(1):
          tf.contrib.summary.scalar('train_batch_loss', batch_loss)
          tf.contrib.summary.scalar('train_batch_accuracy', batch_accuracy)
        gradients = tape.gradient(batch_loss, self.variables)
        self.optimizer.apply_gradients(zip(gradients, self.variables))
        # summaries are indexed by the global step
        global_step.assign_add(1)

        train_losses.append(batch_loss)
        train_accuracies.append(batch_accuracy)
        print('train batch {} ............... loss : {} , accuracy : {} , time {} '.format( ib, batch_loss, batch_accuracy, time.time() -t))

      train_mean_epoch_loss = sum(train_losses) / len(train_losses)
      train_mean_epoch_accuracy = sum(train_accuracies) / len(train_accuracies)
      with tf.contrib.summary.record_summaries_every_n_global_steps(1):
        tf.contrib.summary.scalar('train_mean_epoch_loss', train_mean_epoch_loss)
        tf.contrib.summary.scalar('train_mean_epoch_accuracy', train_mean_epoch_accuracy)

      #----------------VAL-------------------------#
      if val :
        for ibv, val_batch in enumerate(val.make_one_shot_iterator()) :
          imgs, borders, ques, ans = self._unpack_batch(val_batch)
          res = self.call(imgs, borders, ques)
          # NOTE(review): this normalization is applied to the validation scores
          # only, not during training or test — confirm it is intended.
          res = self.norm(res)
          loss = tf.losses.softmax_cross_entropy(ans,res)
          accuracy = self._batch_accuracy(res, ans)
          with tf.contrib.summary.record_summaries_every_n_global_steps(1):
            tf.contrib.summary.scalar('val_batch_loss', loss)
            tf.contrib.summary.scalar('val_batch_accuracy', accuracy)
          val_losses.append(loss)
          val_accuracies.append(accuracy)
          print('val batch {} ............... loss : {} , accuracy : {} '.format(ibv, loss, accuracy))

        val_mean_epoch_accuracy = sum(val_accuracies) / len(val_accuracies)
        val_mean_epoch_loss = sum(val_losses) / len(val_losses)
        with tf.contrib.summary.record_summaries_every_n_global_steps(1):
          tf.contrib.summary.scalar('val_mean_epoch_loss', val_mean_epoch_loss)
          tf.contrib.summary.scalar('val_mean_epoch_accuracy', val_mean_epoch_accuracy)

      print('train epoch {} ............... mean loss : {} , mean accuracy : {} '.format(ie, train_mean_epoch_loss, train_mean_epoch_accuracy))
      if val :
        print('val epoch {} ............... mean loss : {} , mean accuracy : {} '.format(ie, val_mean_epoch_loss, val_mean_epoch_accuracy))

      ckpt.step.assign_add(1)
      if int(ckpt.step) % 3 == 0:
        save_path = manager.save()
        print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))

  # REMARK : how will the network behave with a batch size of 1 ??
  def test(self, x) :
    """Evaluate the model on dataset `x`.

    Returns:
      The mean of the batch accuracies, or None when `x` yields no batch.
    """
    accuracies = []
    for ib, batch in enumerate(x.make_one_shot_iterator()) :
      print('test batch' + str(ib)+ '..............................................')
      imgs, borders, ques, ans = self._unpack_batch(batch)
      res = self.call(imgs, borders, ques, False)
      accuracies.append(self._batch_accuracy(res, ans))
    if not accuracies :
      return None
    test_accuracy = sum(accuracies) / len(accuracies)
    return test_accuracy
  
    

# define the model

## models variables

In [None]:
# Train questions and answers files
train_questions_file = os.path.join(root_path, 'VQA_dataset/train/v2_Questions_Train_mscoco.json')
train_answers_file = os.path.join(root_path, 'VQA_dataset/train/v2_Annotations_Train_mscoco.json')

# Val questions and answers files
val_questions_file =  os.path.join(root_path, 'VQA_dataset/val/v2_Questions_Val_mscoco.json')
val_answers_file = os.path.join(root_path, 'VQA_dataset/val/v2_Annotations_Val_mscoco.json')

# Occurrence threshold for question words: a word is kept when it occurs more often than this
num_occurence = 5
# Fix: the closing quote of this path was missing (SyntaxError)
complementary_file = os.path.join(root_path, 'VQA_dataset/train/v2_Complementary_Pairs_Train_mscoco.zip')

# Checkpoints path: where the last models are saved during training
CHECKPOINT_PATH = os.path.join(root_path, 'results/checkpoint_files')
# Best model path
BEST_MODEL_PATH = os.path.join(root_path, 'results/best_model/best_model.h5')
# Best model loss path: lets a later session compare its val losses with the best one so far
# Fix: a stray quote after this path made the cell a SyntaxError
BEST_LOSS_MODEL_PATH = os.path.join(root_path, 'results/best_model/best_model_loss.txt')
# Path where the tensorboard files are saved
logdir = os.path.join(root_path, 'results/tensorboard_files')

# Initialise the answer and question preprocessors (both read the train split only)
answer_preprocessor = AnswerPreprocessing([train_questions_file],[train_answers_file])
question_preprocessor = QuestionsPreprocessing(glove_file_path, [train_questions_file], num_occurence)


In [0]:
#WARNING : This is just for glove_300d
WORD_EMBEDDING_DIM = 300
IMAGE_FEAT_NUM = 10
IMAGE_NON_LINEAR_OUTPUT_DIM = TEXT_NON_LINEAR_OUTPUT_DIM = 512

# Question encoder (see MyGRU)
question_args = {

    'embedding_size': WORD_EMBEDDING_DIM,
    'embedding_weights' : question_preprocessor.get_embedding_matrix(),
    'hidden_size' : 1024,
    'vocab_len' : question_preprocessor.vocab_length,
    'num_layers' : 1,
    #'dropout' : 0.3
    'dropout' : None
}

#Num regions, global variable and this one
image_args = {
    'num_regions' : 10
}

# Non linear block applied to the image features (see Activation)
image_feature_transformer = {
    'input_shape' : (2048,),
    'dims' : [2048,IMAGE_NON_LINEAR_OUTPUT_DIM],
    'activation' :'relu',
    'dropout' : None,
    'normalization' : False,
    'regularisation' : False
}

# Non linear block applied to the question features (see Activation)
text_feature_transformer = {
    'input_shape' : (1024,),
    'dims' : [1024, TEXT_NON_LINEAR_OUTPUT_DIM],
    'activation' : 'relu',
    'dropout' : None,
    'normalization' : False,
    'regularisation' : False
}

# Attention vector (see FixAttentionVector, which forwards these to Activation)
attention_vector_args = {
    #'dims' : [1024, 1],
    'input_shape' : (512,),
    'dims' : [1024, IMAGE_NON_LINEAR_OUTPUT_DIM],
    'activation' : 'relu',
    'dropout' : False,
    # Fix: Activation reads kwargs['normalization']; the key was missing here
    'normalization' : False,
    'probability_function' : 'softmax',
    'regularisation' : False
}


# Classifier (see Classifier); ANSWER_DIM is the value set by AnswerPreprocessing
classifier_args = {
    'input_shape' : (1024,),
    'dims' : [1024, 512, ANSWER_DIM],
    'activation' : 'relu',
    'dropout' : False,
    'normalization' : False,
    'regularisation' : False
}
attention_merger_type = 'hadamard'
merger_type = 'concat'
EPOCHS = 2000

In [0]:
# Sanity check: show the main dimensions used by the model, one per line.
print(
    ANSWER_DIM,
    WORD_EMBEDDING_DIM,
    question_preprocessor.get_embedding_matrix().shape,
    IMAGE_FEAT_NUM,
    sep='\n',
)

## Model definition

In [0]:


# Settings for the question encoder (passed to get_question_module by NewModel2).
question_args = dict(
    embedding_size=WORD_EMBEDDING_DIM,
    embedding_weights=question_preprocessor.get_embedding_matrix(),
    hidden_size=1024,
    vocab_len=question_preprocessor.vocab_length,
    num_layers=1,
    # disabled alternative: dropout=0.3
    dropout=None,
)

# Settings for the network applied to the encoded question.
text_args = dict(
    input_shape=(1024,),
    dims=[1024, 512],
    activation='relu',
    dropout=False,
    normalization=False,
    regularisation=0.01,
)

# Settings for the network applied to the attended image features.
image_args = dict(
    input_shape=(2048,),
    dims=[2048, 512],
    activation='relu',
    dropout=False,
    normalization=False,
    regularisation=False,
)

# Settings for the attention network.
att_args = dict(
    input_shape=(512,),
    dims=[2048, 1],
    activation='relu',
    dropout=False,
    normalization=False,
    probability_function='softmax',
    regularisation=0.01,
)

# Settings for the answer classifier; the last layer has ANSWER_DIM units.
class_args = dict(
    # disabled alternative: input_shape=(2560,)
    input_shape=(512,),
    dims=[2048, ANSWER_DIM],
    activation='relu',
    dropout=False,
    normalization=True,
    regularisation=0.01,
)

class NewModel2(keras.Model) :
  """VQA model that attends over a 7x7 grid of image features with the question.

  The sub-networks are built by `get_question_module` and
  `get_non_linear_function` (defined elsewhere in this notebook) from the
  argument dictionaries given to the constructor.
  """

  def __init__(self,question_args, text_args, image_args, att_args, class_args):
    """Create the sub-networks and the optimizer.

    Args:
      question_args: settings passed to get_question_module('GRU', ...).
      text_args: settings of the network applied to the encoded question.
      image_args: settings of the network applied to the attended image vector.
      att_args: settings of the attention network.
      class_args: settings of the answer classifier.
    """
    super(NewModel2, self).__init__()
    self.text_model =  get_question_module('GRU', question_args)
    # NOTE(review): this ResNet50 is created here but never used in call();
    # it is kept so the object (and its checkpoints) keep the same structure.
    self.resnet =  keras.applications.ResNet50(include_top=False, weights='imagenet', input_tensor=None, input_shape=(224, 224,3), pooling=None)
    self.text_net = get_non_linear_function('activation', text_args )
    self.image_net = get_non_linear_function('activation', image_args )
    self.DImage = keras.layers.Dense(512)
    self.merger1 = keras.layers.Multiply()
    self.reshape1 = keras.layers.Reshape((7,7,2048))
    self.attention_net = get_non_linear_function('activation', att_args)
    self.merger2 = keras.layers.Multiply()
    self.classifier = get_non_linear_function('activation', class_args)
    self.merger3 = keras.layers.Multiply()
    # NOTE(review): 0.1 is an unusually large learning rate for Adam - confirm.
    self.optimizer = tf.train.AdamOptimizer(learning_rate = 0.1)

  def call(self,images, questions):
    """Compute the classifier output for a batch of images and questions.

    Args:
      images: batch of image features, reshaped here to (7, 7, 2048) per example.
      questions: batch of questions in the format expected by the question
        module (presumably token ids - TODO confirm against the preprocessing).

    Returns:
      The output of the classifier network (used as logits by `loss`).
    """
    images =  self.reshape1(images)
    images = tf.math.l2_normalize(images, axis = -1)
    questions = self.text_model(questions)
    questions = self.text_net(questions)

    # Insert two axes after the batch axis and repeat the question vector
    # over the 7x7 grid so it lines up with the image feature map.
    questions_att = tf.keras.backend.expand_dims(questions, axis=1)
    questions_att = tf.keras.backend.expand_dims(questions_att, axis=1)
    questions_att = tf.cast(questions_att, tf.float32)
    questions_att = tf.keras.backend.tile( questions_att,(1,7,7,1))

    att_imgs = self.__func( images, self.DImage)
    # Fixed: use the tiled/cast question tensor that was computed above but
    # never used, instead of relying on implicit rank broadcasting.
    common1 = self.merger1([questions_att, att_imgs])
    att_vec = self.__func(common1, self.attention_net)
    # Repeat the attention weights along the last axis so they can be
    # multiplied element-wise with the 2048-channel image features.
    att_vec =  tf.keras.backend.tile( att_vec, (1,1,1,2048))
    new_imgs = self.merger2([att_vec, images])
    # Sum over both grid axes and divide by the 7*7 = 49 positions.
    new_imgs = tf.reduce_sum(new_imgs, 1)
    new_imgs = tf.reduce_sum(new_imgs, 1) / 49
    new_imgs = self.image_net(new_imgs)
    common2 = self.merger3([new_imgs, questions])
    res = self.classifier(common2)
    return res

  @tf.contrib.eager.defun
  def __func(self,tensor,fn):
    """Apply `fn` to every element of `tensor` along its first axis."""
    return tf.map_fn(fn, tensor, parallel_iterations=49)

  def loss(self, result, labels):
    """Return the mean sigmoid cross-entropy between logits and labels.

    Args:
      result: logits produced by `call`.
      labels: target values with the same shape as `result`.

    Returns:
      A scalar tensor: the cross-entropy averaged over all entries.
    """
    per_entry_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = tf.to_double(result), labels = tf.to_double(labels))
    return tf.reduce_mean(per_entry_loss)

  def train(self, trains, vals, epochs):
    """Train the model, validating every 5 epochs and checkpointing regularly.

    Reads the best validation loss from BEST_LOSS_MODEL_PATH when that file
    exists, restores the latest checkpoint from CHECKPOINT_PATH when there is
    one, and writes TensorBoard summaries to `logdir`.

    Args:
      trains: iterable of (images, questions, answers) training batches.
      vals: iterable of (images, questions, answers) validation batches.
      epochs: number of epochs to run.
    """
    global_step = tf.train.get_or_create_global_step()
    writer = tf.contrib.summary.create_file_writer(logdir)
    writer.set_as_default()

    # Best validation loss recorded by a previous session, if any.
    best_loss = None
    if os.path.isfile(BEST_LOSS_MODEL_PATH) :
      print('exists')
      with open(BEST_LOSS_MODEL_PATH, 'r') as f :
        # readline() returns '' for an empty file instead of raising IndexError.
        line = f.readline().strip()
      if line :
        best_loss = float(line)
        print('init best loss {}'.format(str(best_loss)))

    # Checkpoint: restore the latest one when available.
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), optimizer=self.optimizer, net=self)
    manager = tf.train.CheckpointManager(ckpt, CHECKPOINT_PATH, max_to_keep=5)
    ckpt.restore(manager.latest_checkpoint)

    if manager.latest_checkpoint:
      print("Restored from {}".format(manager.latest_checkpoint))
    else:
      print("Initializing from scratch.")

    for i in range(epochs) :
      epoch_start = time.time()
      train_losses = []
      val_losses = []

      # TRAIN
      for ib, (imgs, questions, answers) in enumerate(trains):
        if ib % 100 == 0 :
          print('batch {}  in {}'.format(ib, time.time() - epoch_start))
        with tf.GradientTape() as tape :
          res = self.call(imgs, questions)
          loss = self.loss(res, answers)

        # Only trainable variables are optimised; non-trainable state must
        # not be updated by the optimizer.
        train_vars = self.trainable_variables
        gradients = tape.gradient(loss, train_vars)
        with tf.contrib.summary.record_summaries_every_n_global_steps(1):
          tf.contrib.summary.scalar('train_batch_loss', loss)
        # Passing global_step makes the optimizer increment it, so summaries
        # are no longer all written at step 0.
        self.optimizer.apply_gradients(zip(gradients, train_vars), global_step=global_step)
        train_losses.append(loss)
      mean_train_epoch_loss =  sum(train_losses)/len(train_losses)
      print('epoch {} train loss {} in {} s'.format(i, mean_train_epoch_loss, time.time() - epoch_start))
      with tf.contrib.summary.record_summaries_every_n_global_steps(1):
        tf.contrib.summary.scalar('train_epoch_loss', mean_train_epoch_loss )

      # VAL (every 5 epochs)
      if i % 5 == 0 :
        val_start = time.time()
        for imgs, questions, answers in vals:
          # Fixed: call() takes only images and questions; passing the answers
          # as a third argument raised a TypeError.
          res = self.call(imgs, questions)
          val_loss = self.loss(res, answers)
          with tf.contrib.summary.record_summaries_every_n_global_steps(1):
            tf.contrib.summary.scalar('val_batch_loss', val_loss)
          val_losses.append(val_loss)
        mean_val_epoch_loss = sum(val_losses)/len(val_losses)
        with tf.contrib.summary.record_summaries_every_n_global_steps(1):
          tf.contrib.summary.scalar('val_epoch_loss', mean_val_epoch_loss)
        # Keep the weights with the lowest validation loss seen so far.
        if (best_loss is None) or (best_loss > mean_val_epoch_loss) :
          best_loss = mean_val_epoch_loss
          print('best loss {}'.format(best_loss))
          self.save_weights(BEST_MODEL_PATH)
          with open(BEST_LOSS_MODEL_PATH, 'w') as f :
            f.write(str(float(mean_val_epoch_loss)))
        print('epoch {} val loss {} in {} s'.format(i, mean_val_epoch_loss, time.time() - val_start))

      # Save a checkpoint whenever the checkpoint step counter is a multiple of 3.
      if int(ckpt.step) % 3 == 0:
        save_path = manager.save()
      ckpt.step.assign_add(1)
    
  
# Build the model instance from the argument dictionaries defined earlier in this cell.
model4 = NewModel2(question_args,text_args, image_args, att_args, class_args )

## load the dataset

In [0]:
# Layout of one serialized example: a fixed-length question, a flattened
# 2048*7*7 image feature vector and an answer vector of length ANSWER_DIM.
image_feature_description = dict(
    question=tf.FixedLenFeature([MAX_LEN], tf.int64),
    image=tf.FixedLenFeature([2048*7*7], tf.float32),
    answers=tf.FixedLenFeature([ANSWER_DIM], tf.float32),
)

def _parse_image_function(example_proto):
    """Decode one serialized example into an (image, question, answers) tuple.

    The example is parsed with the `image_feature_description` schema.
    """
    parsed = tf.parse_single_example(example_proto, image_feature_description)
    image = parsed['image']
    question = parsed['question']
    answers = parsed['answers']
    return (image, question, answers)



# Collect every .tfrecord file of the train and val feature directories.
train_recs = [os.path.join(train_tfrec_dir, name) for name in os.listdir(train_tfrec_dir) if name.endswith('.tfrecord')]
val_recs = [os.path.join(val_tfrec_dir, name) for name in os.listdir(val_tfrec_dir) if name.endswith('.tfrecord')]

# NOTE(review): prefetch() is applied after batch(), so PREFETCH_SIZE counts
# batches, not single examples - confirm that this buffer size is intended.
train_dataset  = tf.data.TFRecordDataset(filenames= train_recs, num_parallel_reads=11).map(
    _parse_image_function).batch(BATCH_SIZE).prefetch(PREFETCH_SIZE)
# Fixed: val_dataset was commented out although the training cell passes it to
# model4.train(), which raised a NameError on a fresh kernel.
val_dataset = tf.data.TFRecordDataset(filenames= val_recs, num_parallel_reads=6).map(
    _parse_image_function).batch(BATCH_SIZE).prefetch(PREFETCH_SIZE)


# train the model

In [0]:
# Disabled: load previously saved best weights before training.
# NOTE(review): the disabled line refers to `model2`, while this cell trains `model4` - confirm before re-enabling.
#model2.load_weights(BEST_MODEL_PATH)

# Run the training loop: train on train_dataset, validate on val_dataset, for EPOCHS epochs.
model4.train(train_dataset, val_dataset, EPOCHS)