In [2]:
import numpy as np
from keras.utils.np_utils import to_categorical
import json
import h5py
import os
from constants import *

from keras.models import model_from_json#load_model
from keras.callbacks import ModelCheckpoint
import argparse
import tensorflow
#from models import *
#from prepare_data import *

from keras.models import Sequential
from keras.models import Model
import spacy

from keras.layers import Dense, Activation, Dropout, LSTM, Flatten, Embedding, Merge
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D

from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input

Using TensorFlow backend.


In [3]:
def get_model(dropout_rate, model_weights_filename):
    """Build the VQA network and restore saved weights when they exist.

    Args:
        dropout_rate: forwarded unchanged to ``vqa_model``.
        model_weights_filename: path of a weights file; it is passed to
            ``load_weights`` only when the path exists on disk.

    Returns:
        The compiled model produced by ``vqa_model``.
    """
    print("Creating Model...")
    embedding_dim = 300  # embedding dimension handed to both helpers
    seq_length = 26      # sequence length handed to vqa_model

    meta = get_metadata()
    answer_count = len(meta['ix_to_ans'])
    vocab_size = len(meta['ix_to_word'])

    embeddings = prepare_embeddings(vocab_size, embedding_dim, meta)
    network = vqa_model(embeddings, vocab_size, embedding_dim, seq_length,
                        dropout_rate, answer_count)

    # Nothing to restore: hand back the freshly initialised network.
    if not os.path.exists(model_weights_filename):
        return network

    print("Loading Weights...")
    network.load_weights(model_weights_filename)
    return network


In [4]:
def train(data_limit,epoch,batch_size):
    """Fit the VQA model on the training data and persist its weights.

    Args:
        data_limit: forwarded to ``read_data``.
        epoch: number of epochs passed to ``model.fit``.
        batch_size: batch size passed to ``model.fit``.

    The model is built with a dropout rate of 0.5, checkpointed through a
    ``ModelCheckpoint`` callback during fitting, and its weights are written
    back to the same file they were loaded from.
    """
    weights_path = '/home/tushar/Desktop/Python/VQA/model_weights.h5'
    checkpoint_path = '/home/tushar/Desktop/Python/VQA/model_check.ckpt'

    features, labels = read_data(data_limit)
    network = get_model(0.5, weights_path)
    callbacks = [ModelCheckpoint(filepath=checkpoint_path, verbose=1)]
    network.fit(
        features,
        labels,
        epochs=epoch,
        batch_size=batch_size,
        callbacks=callbacks,
        shuffle="batch",
    )
    network.save_weights(weights_path, overwrite=True)


In [5]:
def val():
    """Evaluate the saved model on the validation set and print the results.

    Prints every metric reported by ``model.evaluate`` and then the fraction
    of samples whose arg-max predicted class appears in that sample's list
    of acceptable answer indices (``multi_val_y``). Returns nothing.
    """
    val_X, val_y, multi_val_y = get_val_data()
    model = get_model(0.0,'/home/tushar/Desktop/Python/VQA/model_weights.h5')  #Model_weight
    print("Evaluating Accuracy on validation set:")
    metric_vals = model.evaluate(val_X, val_y)
    print("")
    for metric_name, metric_val in zip(model.metrics_names, metric_vals):
        # Single formatted string: same text as the old multi-item print
        # statement, and valid with or without print-as-function.
        print("%s  is  %s" % (metric_name, metric_val))

    # Comparing prediction against multiple choice answers
    preds = model.predict(val_X)
    pred_classes = [np.argmax(row) for row in preds]
    true_positive = sum(
        1 for i, predicted in enumerate(pred_classes) if predicted in multi_val_y[i]
    )
    # Builtin float instead of the np.float alias, which newer NumPy removed.
    print("true positive rate:  %s" % (float(true_positive) / len(pred_classes)))


In [76]:
def pred(image_tensor,question_tensor):
    """Print the answer predicted for one image/question pair.

    Args:
        image_tensor: image features, passed as the first model input.
        question_tensor: encoded question, passed as the second model input.

    Only the prediction for the first sample in the batch is reported. The
    arg-max class index is looked up in the metadata's ``ix_to_ans`` table
    (whose keys are strings) and the matching answer is printed; nothing is
    printed when the index has no entry.
    """
    model = get_model(0.0,'/home/tushar/Desktop/Python/VQA/model_weights.h5')
    preds = model.predict([image_tensor, question_tensor])
    # The original read an undefined name here (`value`) and raised NameError.
    pred_class = int(np.argmax(preds[0]))

    answers = get_metadata()['ix_to_ans']
    key = str(pred_class)
    if key in answers:
        print(answers[key])

In [7]:
def Word2VecModel(embedding_matrix, num_words, embedding_dim, seq_length, dropout_rate):
    """Build the question branch: frozen embedding, two LSTMs, dense head.

    Args:
        embedding_matrix: initial weights for the embedding layer.
        num_words: vocabulary size of the embedding layer.
        embedding_dim: width of each embedding vector.
        seq_length: input length of the embedding layer.
        dropout_rate: rate used by the dropout layer after each LSTM.

    Returns:
        An uncompiled ``Sequential`` model ending in a 1024-unit tanh layer.
    """
    print("Creating text model...")
    text_branch = Sequential()
    layer_stack = [
        # Embedding weights are supplied by the caller and kept frozen.
        Embedding(num_words, embedding_dim, weights=[embedding_matrix],
                  input_length=seq_length, trainable=False),
        LSTM(units=512, return_sequences=True,
             input_shape=(seq_length, embedding_dim)),
        Dropout(dropout_rate),
        LSTM(units=512, return_sequences=False),
        Dropout(dropout_rate),
        Dense(1024, activation='tanh'),
    ]
    for layer in layer_stack:
        text_branch.add(layer)
    return text_branch


In [8]:
def img_model(dropout_rate):
    """Build the image branch: one 1024-unit tanh layer over 4096-d input.

    Args:
        dropout_rate: accepted for signature symmetry; not used here.

    Returns:
        An uncompiled ``Sequential`` model with a single dense layer.
    """
    print("Creating image model...")
    image_branch = Sequential()
    projection = Dense(1024, input_dim=4096, activation='tanh')
    image_branch.add(projection)
    return image_branch


In [9]:
def vqa_model(embedding_matrix, num_words, embedding_dim, seq_length, dropout_rate, num_classes):
    """Combine the image and question branches into the final classifier.

    The two branch outputs are merged with ``mode='mul'``, then passed
    through dropout, a 1000-unit tanh layer, dropout again, and a softmax
    layer with ``num_classes`` outputs.

    Args:
        embedding_matrix, num_words, embedding_dim, seq_length: forwarded to
            ``Word2VecModel``.
        dropout_rate: used by both branches and by the dropout layers here.
        num_classes: size of the softmax output.

    Returns:
        The merged model, compiled with rmsprop, categorical cross-entropy
        and an accuracy metric.
    """
    # The image branch is built first, then the text branch.
    image_branch = img_model(dropout_rate)
    text_branch = Word2VecModel(embedding_matrix, num_words, embedding_dim,
                                seq_length, dropout_rate)
    print("Merging final model...")
    classifier = Sequential()
    head_layers = [
        Merge([image_branch, text_branch], mode='mul'),
        Dropout(dropout_rate),
        Dense(1000, activation='tanh'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax'),
    ]
    for layer in head_layers:
        classifier.add(layer)
    classifier.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier


In [10]:
def right_align(seq,lengths):
    """Shift each row's leading ``lengths[i]`` entries to the row's right edge.

    Args:
        seq: 2-D sequence of rows.
        lengths: per-row count of leading entries to keep.

    Returns:
        A new zero-filled array with the shape of ``seq`` in which the kept
        entries of every row occupy the rightmost positions.
    """
    shape = np.shape(seq)
    row_count, width = shape[0], shape[1]
    aligned = np.zeros(shape)
    for row in range(row_count):
        keep = lengths[row]
        # Everything left of the copied span stays zero (padding).
        aligned[row][width - keep:width] = seq[row][0:keep]
    return aligned


In [11]:
def read_data(data_limit):
    """Load training inputs and one-hot answers from the HDF5 data files.

    Args:
        data_limit: upper bound used to slice the question-level datasets
            (``[:data_limit]``).

    Returns:
        ``(train_X, train_y)`` where ``train_X`` is
        ``[image_features, questions]``: image features are gathered per
        question through ``img_pos_train`` and L2-normalised row-wise, and
        questions are right-aligned with ``right_align``. ``train_y`` is the
        one-hot encoding of ``answers`` sliced to ``data_limit`` rows.
    """
    print("Reading Data...")
    img_path = '/home/tushar/Desktop/Python/VQA/data_train_val/data_img.h5' #data_img
    ques_path = '/home/tushar/Desktop/Python/VQA/data_train_val/data_prepro1.h5' #data_prepro

    # Open read-only and close the files as soon as the arrays are in memory;
    # everything needed is materialised inside the `with` blocks.
    with h5py.File(img_path, 'r') as img_file:
        img_data = np.array(img_file['images_train'])
    with h5py.File(ques_path, 'r') as ques_file:
        img_pos_train = ques_file['img_pos_train'][:data_limit]
        ques_train = np.array(ques_file['ques_train'])[:data_limit, :]
        ques_length_train = np.array(ques_file['ques_length_train'])[:data_limit]
        answers = np.array(ques_file['answers'])

    # img_pos_train is treated as 1-based, hence the -1 when indexing rows.
    train_img_data = np.array([img_data[pos - 1, :] for pos in img_pos_train])

    # Normalizing images: divide every row by its L2 norm. Broadcasting over
    # a column vector replaces the old tile hard-coded to 4096 columns.
    norms = np.sqrt(np.sum(np.multiply(train_img_data, train_img_data), axis=1))
    train_img_data = np.divide(train_img_data, norms[:, np.newaxis])

    # Shift padding to the left side (tokens end up right-aligned).
    ques_train = right_align(ques_train, ques_length_train)

    train_X = [train_img_data, ques_train]
    # NOTE: the one-hots should have been constructed from an exhaustive list
    # of answers, because some answers may not be in this dataset. As a
    # temporary workaround, all such answer indices are set to 1 in the
    # validation set. The encoding is built from the full `answers` array and
    # only then sliced, so the class count does not depend on data_limit.
    train_y = to_categorical(answers)[:data_limit, :]

    return train_X, train_y


In [12]:
def get_val_data():
    """Load validation inputs, one-hot labels and per-question answer sets.

    Returns:
        ``(val_X, abs_val_y, multi_val_y)``:

        * ``val_X`` -- ``[image_features, questions]``; image rows are
          gathered through ``img_pos_test`` and L2-normalised, questions are
          right-aligned with ``right_align``.
        * ``abs_val_y`` -- one-hot encoding of each question's
          ``multiple_choice_answer`` index.
        * ``multi_val_y`` -- for each question, the list of distinct answer
          indices found in its annotation's ``answers``.

        Answer indices that are missing from the metadata, or equal to 1000,
        are replaced by 1 in both label outputs.
    """
    def _known_or_one(idx):
        # Unknown answers (None) and index 1000 are both mapped to 1.
        # NOTE(review): 1000 is presumably the out-of-range class id -- confirm.
        return 1 if idx is None or idx == 1000 else idx

    # Open read-only and close the files once the arrays are in memory.
    with h5py.File('/home/tushar/Desktop/Python/VQA/data_train_val/data_img.h5', 'r') as img_file:
        img_data = np.array(img_file['images_test'])
    with h5py.File('/home/tushar/Desktop/Python/VQA/data_train_val/data_prepro.h5', 'r') as ques_file:
        img_pos_val = np.array(ques_file['img_pos_test'])
        ques_val = np.array(ques_file['ques_test'])
        ques_length_val = np.array(ques_file['ques_length_test'])
        # Read once; it is walked twice below.
        question_ids = [int(qid) for qid in np.array(ques_file['question_id_test'])]

    metadata = get_metadata()
    with open('/home/tushar/Desktop/Python/VQA/mscoco_val2014_annotations.json', 'r') as an_file:  #val_annotations_path
        annotations = json.load(an_file)

    # img_pos_test is treated as 1-based, hence the -1 when indexing rows.
    val_img_data = np.array([img_data[pos - 1, :] for pos in img_pos_val])
    norms = np.sqrt(np.sum(np.multiply(val_img_data, val_img_data), axis=1))
    val_img_data = np.divide(val_img_data, norms[:, np.newaxis])

    ques_val = right_align(ques_val, ques_length_val)

    # Convert every occurrence of index 12602 to 0, because the embeddings
    # were built that way.
    # NOTE(review): 12602 looks like the last vocabulary index -- confirm.
    ques_val[ques_val == 12602] = 0

    val_X = [val_img_data, ques_val]

    ans_to_ix = {str(ans):int(i) for i,ans in metadata['ix_to_ans'].items()}
    ques_annotations = {}
    for annotation in annotations['annotations']:
        idx = ans_to_ix.get(annotation['multiple_choice_answer'].lower())
        annotation['multiple_choice_answer_idx'] = _known_or_one(idx)
        ques_annotations[annotation['question_id']] = annotation

    abs_val_y = [ques_annotations[qid]['multiple_choice_answer_idx'] for qid in question_ids]
    abs_val_y = to_categorical(np.array(abs_val_y))

    multi_val_y = []
    for qid in question_ids:
        distinct = set(ans_to_ix.get(entry['answer'].lower())
                       for entry in ques_annotations[qid]['answers'])
        multi_val_y.append([_known_or_one(ans) for ans in list(distinct)])

    return val_X, abs_val_y, multi_val_y


In [13]:
def get_metadata(metadata_path='/home/tushar/Desktop/Python/VQA/data_train_val/data_prepro.json'):
    """Load the preprocessing metadata JSON.

    Args:
        metadata_path: location of the metadata file; defaults to the
            project's ``data_prepro.json``.

    Returns:
        The parsed JSON dict. Its ``'ix_to_word'`` entry is inverted on load,
        so despite the key name it maps word (str) -> index (int). Every
        other entry is returned as parsed.
    """
    # The context manager closes the file; the original leaked the handle.
    with open(metadata_path, 'r') as meta_file:  #data_prepo_meta
        meta_data = json.load(meta_file)
    meta_data['ix_to_word'] = {str(word):int(i) for i,word in meta_data['ix_to_word'].items()}
    return meta_data


In [14]:
def prepare_embeddings(num_words, embedding_dim, metadata):
    """Return the word-embedding matrix, building and caching it if needed.

    If the cache file exists, its ``embedding_matrix`` dataset is returned
    as-is. Otherwise GloVe vectors are read from the text file, copied into
    a zero matrix of shape ``(num_words, embedding_dim)`` at the row given by
    each word's index in ``metadata['ix_to_word']``, written to the cache
    file and returned. Words without a GloVe vector keep an all-zero row.

    Args:
        num_words: number of rows in the matrix.
        embedding_dim: number of columns in the matrix.
        metadata: dict whose ``'ix_to_word'`` entry maps word -> row index.

    Returns:
        The embedding matrix as a numpy array.
    """
    cache_path = '/home/tushar/Desktop/Python/VQA/data_train_val/embedding_matrix.h5' #embedding_matrix_filename
    glove_path = '/home/tushar/Desktop/Python/VQA/glove.6B/glove.6B.300d.txt'  #glove path

    if os.path.exists(cache_path):
        # Explicit read-only mode: this branch only reads the cache.
        # NOTE: the cached matrix is not checked against num_words or
        # embedding_dim; delete the cache file if either changes.
        with h5py.File(cache_path, 'r') as f:
            return np.array(f['embedding_matrix'])

    print("Embedding Data...")

    embeddings_index = {}
    with open(glove_path, 'r') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    embedding_matrix = np.zeros((num_words, embedding_dim))
    word_index = metadata['ix_to_word']

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    with h5py.File(cache_path, 'w') as f:
        f.create_dataset('embedding_matrix', data=embedding_matrix)

    return embedding_matrix


In [15]:
def question_to_encode(question, seq_length=26):
    """Encode a question string as a right-aligned row of word indices.

    The question is split on whitespace and lower-cased; every token found
    in the metadata vocabulary is replaced by its index, and tokens that are
    not in the vocabulary are dropped. At most the first ``seq_length``
    encoded tokens are kept. They are placed at the right edge of a
    zero-filled row so that padding sits on the left.

    Args:
        question: the question text.
        seq_length: width of the encoded row (default 26).

    Returns:
        A float numpy array of shape ``(1, seq_length)``.
    """
    word_index = get_metadata()['ix_to_word']

    tokens = (token.lower() for token in question.split())
    encoded = [word_index[token] for token in tokens if token in word_index]
    # Long questions previously indexed past the buffer; truncate instead.
    encoded = encoded[:seq_length]

    row = np.zeros((1, seq_length))
    if encoded:
        # Align on the number of tokens actually encoded, not the raw word
        # count, so skipped words no longer leave zeros on the right.
        row[0, seq_length - len(encoded):] = encoded
    return row

In [17]:
def image_extract(img_path):
    """Extract VGG19 ``fc2`` features for a single image file.

    Args:
        img_path: path of the image; it is loaded at 224x224.

    Returns:
        A numpy array of shape ``(-1, 4096)`` holding the ``fc2`` layer
        output for the image after ``preprocess_input``.

    Note:
        An ImageNet-weighted VGG19 is instantiated on every call.
    """
    model = VGG19(weights='imagenet', include_top=True)
    # Keras 2 keyword names; `input=`/`output=` are the deprecated Keras 1 spelling.
    feature_extractor = Model(inputs=model.input, outputs=model.get_layer('fc2').output)

    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch axis
    x = preprocess_input(x)

    fc2_features = feature_extractor.predict(x)
    return fc2_features.reshape((-1,4096))