In [1]:
import tensorflow as tf
import numpy as np
import json
import pickle
import collections
import time
from math import log10
import random
import string
import re
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import unicodedata

In [2]:
# Fixed seed for Python's `random` module so any `random.*` calls are repeatable.
# NOTE(review): only `random` is seeded here; numpy and TensorFlow are not.
SEED = 43
random.seed(SEED)

In [4]:
"""Convert GloVe file to a dictionary"""
def loadGloveModel(gloveFile):
    """Parse a GloVe-format text file into a token -> vector dictionary.

    Every non-empty line is split on whitespace; the first field is used as
    the token and all remaining fields are parsed as floats.

    Args:
        gloveFile: path of the text file to read.

    Returns:
        dict mapping each token to a 1-D numpy array of its float components.
        If a token occurs on several lines the last occurrence wins.

    Raises:
        ValueError: if a field after the token cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = dict()
    # `with` closes the handle even when a malformed line aborts the parse.
    with open(gloveFile, 'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline at EOF): nothing to load.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    # Formatted to match the historical "Done. N  words loaded!" message.
    print("Done. %d  words loaded!" % len(model))
    return model

In [5]:
%%time
"""Load word as well as character embedding"""
# Word-level GloVe vectors; the recorded run below loaded 2196016 entries and
# this cell took roughly four minutes of wall time.
model = loadGloveModel("GloVe/glove.840B.300d.txt")
# Character-level vectors in the same file format (94 entries in the recorded run).
model_char = loadGloveModel("glove.840B.300d-char.txt")

Loading Glove Model
Done. 2196016  words loaded!
Loading Glove Model
Done. 94  words loaded!
CPU times: user 3min 39s, sys: 6.61 s, total: 3min 45s
Wall time: 3min 48s


In [6]:
"""Reduce a sentence to its word embedding, character embedding, a boolean vector which 
tells if any word is in GloVe dictionary or not and an integer array of start positions of every word"""
def preprocess(sentence):
    """Turn raw text into the per-token features the network consumes.

    The sentence is tokenized and POS-tagged with NLTK. Each token is
    lemmatized, ASCII-folded and looked up in the global GloVe dictionary
    `model`; its characters are mapped to their positions in the global
    `vocab` list.

    Args:
        sentence: text to featurize. NOTE(review): `unicodedata.normalize`
            is applied to the lemmas, so this presumably expects a unicode
            string - confirm against callers.

    Returns:
        word_emb: numpy array with one embedding row per token (an all-zero
            300-vector when the token is not a key of `model`).
        char_emb: numpy array with one row of exactly `max_word_len`
            character ids per token.
        word_in_glove: list of 1/0 flags, 1 when the token was found in `model`.
        y: list holding, for every token, the character offset in `sentence`
            at which the token is taken to start.
    """
    wnl = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(sentence))
    y = []
    word_emb = []
    char_emb = []
    word_in_glove = []
    count = 0
    word_zeros = np.zeros((300), dtype=float)
    for token, tag in tagged_tokens:
        y.append(count)
        if token == u'``' or token == u"''":
            # The tokenizer rewrites a double quote as `` or ''; look it up as
            # the original character and advance by its one-character width.
            x = '"'
            count += 1
        else:
            # NOTE(review): NLTK's default tagger emits Penn Treebank tags,
            # where adjectives start with 'J', so the 'a' case presumably
            # never fires. Left untouched because changing the lemmas would
            # change the inputs seen by an already-trained checkpoint.
            pos = tag[0].lower()
            if pos in ['a', 'n', 'v']:
                lemma = wnl.lemmatize(token, pos)
            else:
                lemma = wnl.lemmatize(token)
            # Fold accented characters to ASCII and drop what cannot be folded.
            x = unicodedata.normalize('NFKD', lemma).encode('ascii', 'ignore')
            count += len(token)
        # Skip the spaces separating this token from the next one.
        while count < len(sentence) and sentence[count] == ' ':
            count += 1

        if x in model:
            word_emb.append(model[x])
            word_in_glove.append(1)
        else:
            word_emb.append(word_zeros)
            word_in_glove.append(0)

        temp_char = []
        for ch in x:
            try:
                temp_char.append(vocab.index(ch))
            except ValueError:
                # Character without a character embedding: deliberately dropped.
                continue
        # Clip over-long words so every row has exactly max_word_len ids;
        # without this the rows are ragged and cannot be stacked into a batch.
        temp_char = temp_char[:max_word_len]
        # NOTE(review): the pad id is len(vocab), whereas batch-level padding
        # elsewhere in this notebook uses no_of_chars-1 - confirm which id the
        # checkpoint was trained with before unifying them.
        temp_char += [len(vocab)] * (max_word_len - len(temp_char))
        char_emb.append(temp_char)
    char_emb = np.array(char_emb)
    word_emb = np.array(word_emb)
    return word_emb, char_emb, word_in_glove, y

In [7]:
"""Load vocabulary from character embedding GloVe dictionary"""
# One embedding row per character key (in the dictionary's iteration order),
# followed by a single all-zero row.
vocab_emb = np.array([model_char[key] for key in model_char.keys()] + [np.zeros((300))])
print(vocab_emb.shape)

(95, 300)


In [8]:
# Character vocabulary: the keys of model_char followed by a trailing 0 entry,
# giving it the same length as the row count of vocab_emb (keys + one zero row).
vocab=list(model_char.keys())+[0]

In [9]:
"""Hyperparameters"""
max_word_len=40  # character ids stored per word (rows are padded up to this length)
num_gpus=4  # in the visible code only used to derive batch_size_o
word_embedding_dim=300
char_embedding_dim=300
q_words=50  # question length (in tokens) every example is padded to
c_words=399  # context length (in tokens) every example is padded to
c_words_u=400  # context length once the extra "universal" node is concatenated in front
embedding_dim = word_embedding_dim+char_embedding_dim
batch_size=8  # examples per tower; also the fixed batch dimension of the placeholders
batch_size_o=(num_gpus)*batch_size
model_encoder_layers=3  # number of passes through the shared model-encoder stack
no_of_chars = len(vocab)
hidden_layer_size=96  # channel width used by the encoder blocks and char convolution

In [10]:
"""Initializers and regularizers"""

# Factories (call them to obtain a fresh initializer object).
# factor=1.0 / FAN_AVG / uniform: the Glorot-style setting per the TF docs;
# conv() uses it for kernels that have no activation.
initializer = lambda: tf.contrib.layers.variance_scaling_initializer(factor=1.0,
                                                             mode='FAN_AVG',
                                                             uniform=True,
                                                             dtype=tf.float32)
# factor=2.0 / FAN_IN / normal: the He-style setting per the TF docs;
# used for kernels that are followed by an activation.
initializer_relu = lambda: tf.contrib.layers.variance_scaling_initializer(factor=2.0,
                                                             mode='FAN_IN',
                                                             uniform=False,
                                                             dtype=tf.float32)
# Shared L2 weight penalty attached to the variables created via get_variable below.
regularizer = tf.contrib.layers.l2_regularizer(scale = 3e-7)

In [11]:
"""Stochastic Depth"""
def layer_dropout(inputs, residual, dropout):
    """Stochastic-depth residual connection.

    A single uniform sample decides, for the whole call, whether the layer is
    skipped: with probability `dropout` only `residual` is returned, otherwise
    the result is `residual` plus `inputs` after ordinary dropout with keep
    probability `1 - dropout`.
    """
    skip_layer = tf.cast(tf.random_uniform([]) < dropout, tf.float32)
    layer_kept = tf.nn.dropout(inputs, 1.0 - dropout) + residual
    return skip_layer*residual+(1-skip_layer)*layer_kept

In [12]:
"""Applies a 2-dimensional convolution over a 3-d or 3-d matrix with bias and activation if specified"""
def conv(inputs, output_size, bias = None, activation = None, kernel_size = 1, name = "conv", reuse=None):
    """Convolve the last axis of a 3-d or 4-d tensor with a [1, kernel_size] filter.

    A 3-d input is temporarily given an extra axis at position 1 and has it
    removed again before returning. The convolution uses stride 1 and VALID
    padding. The kernel uses the relu-oriented initializer when an activation
    is supplied and the default initializer otherwise.

    Args:
        inputs: 3-d or 4-d tensor; its last axis is the input channel axis.
        output_size: number of output channels.
        bias: when truthy, a zero-initialized bias is added.
        activation: optional callable applied to the result.
        kernel_size: filter width along the second-to-last axis.
        name: variable scope for the "kernel_" / "bias_" variables.
        reuse: forwarded to tf.variable_scope.
    """
    with tf.variable_scope(name,reuse=reuse):
        came_in_rank3 = len(inputs.shape)==3
        if came_in_rank3:
            inputs = tf.expand_dims(inputs,axis=1)
        in_channels = inputs.shape.as_list()[-1]

        if activation is not None:
            kernel_init = initializer_relu()
        else:
            kernel_init = initializer()
        kernel_ = tf.get_variable("kernel_",
                        [1,kernel_size,in_channels,output_size],
                        dtype = tf.float32,
                        regularizer=regularizer,
                        initializer = kernel_init)
        outputs = tf.nn.conv2d(inputs, kernel_, [1,1,1,1], "VALID")

        if bias:
            outputs += tf.get_variable("bias_",
                        [1,1,1,output_size],
                        regularizer= tf.contrib.layers.l2_regularizer(scale = 3e-7),
                        initializer = tf.zeros_initializer())
        if came_in_rank3:
            outputs = tf.squeeze(outputs,axis=1)
        if activation is None:
            return outputs
        return activation(outputs)

In [13]:
"""Depthwise Separable Convolutions"""
def depthconv(x, kernel_size, output_filters, scope_name,reuse=None):
    """Depthwise-separable convolution followed by a bias and ReLU.

    Args:
        x: input tensor; its last axis is the channel axis.
        kernel_size: pair giving the depthwise filter's height and width.
        output_filters: number of channels produced by the pointwise filter.
        scope_name: variable scope holding "depthwise_filter",
            "pointwise_filter" and "bias".
        reuse: forwarded to tf.variable_scope.

    Returns:
        relu(separable_conv2d(x) + bias), computed with stride 1 and SAME padding.
    """
    with tf.variable_scope(scope_name,reuse=reuse):
        in_channels = x.shape.as_list()[-1]
        filter_height, filter_width = kernel_size[0], kernel_size[1]
        depthwise_filter = tf.get_variable("depthwise_filter",
                                        (filter_height, filter_width, in_channels, 1),
                                        dtype = tf.float32,
                                        regularizer=regularizer,
                                        initializer = initializer_relu())
        pointwise_filter = tf.get_variable("pointwise_filter",
                                        (1,1,in_channels,output_filters),
                                        dtype = tf.float32,
                                        regularizer=regularizer,
                                        initializer = initializer_relu())
        convolved = tf.nn.separable_conv2d(x,
                                        depthwise_filter,
                                        pointwise_filter,
                                        strides = (1,1,1,1),
                                        padding = "SAME")
        convolved += tf.get_variable("bias",
                convolved.shape[-1],
                regularizer=tf.contrib.layers.l2_regularizer(scale = 3e-7),
                initializer = tf.zeros_initializer())
        return tf.nn.relu(convolved)

In [14]:
"""Applies a single head attention layer over V using softmax(QK/d**0.5)V"""
def dot_product_attention(q,k,v,dropout,scope="dot_product_attention"):
    """Scaled dot-product attention: dropout(softmax(q k^T / sqrt(depth_k))) v.

    q: a Tensor with shape [batch, heads, length_q, depth_k]
    k: a Tensor with shape [batch, heads, length_kv, depth_k]
    v: a Tensor with shape [batch, heads, length_kv, depth_v]
    dropout: drop probability applied to the attention weights.
    """
    with tf.variable_scope(scope):
        depth_k = k.shape.as_list()[-1]
        # [batch, num_heads, query_length, memory_length]
        scores = tf.matmul(q, k, transpose_b=True)
        scores = scores/(depth_k**0.5)
        attention = tf.nn.softmax(scores, name="attention_weights")
        # Randomly remove individual attention links in every head.
        attention = tf.nn.dropout(attention, 1.0 - dropout)
        return tf.matmul(attention, v)

In [15]:
"""Applies dot attention in parallel with multiplicity equal to the number of heads specified"""
def multihead_attention(queries, units, num_heads, dropout, memory = None, scope = "Multi_Head_Attention",reuse=None):
    """Multi-head scaled dot-product attention.

    Args:
        queries: rank-3 tensor [batch, length_q, channels] with a static shape.
        units: total projected width of the queries; keys and values are
            projected together to 2 * units and split afterwards.
        num_heads: number of attention heads the projections are split into.
        dropout: drop probability for the attention weights.
        memory: optional rank-3 tensor [batch, length_kv, channels] to attend
            over; defaults to `queries` (self-attention).
        scope: variable scope for the projection variables.
        reuse: forwarded to tf.variable_scope and the projections.

    Returns:
        Tensor [batch, length_q, units] with the heads concatenated.
    """
    with tf.variable_scope(scope,reuse=reuse):
        # Self attention
        if memory is None:
            memory = queries
        memory = conv(memory, 2 * units, name = "memory_projection",reuse=reuse)
        query = conv(queries, units, name = "query_projection",reuse=reuse)

        qshapes = query.shape.as_list()
        Q = tf.reshape(query, [qshapes[0],qshapes[1],num_heads,-1])
        Q = tf.transpose(Q,[0,2,1,3])

        # Reshape the memory with ITS OWN length: using the query length here
        # is only correct for self-attention and scrambles keys/values when
        # an external memory of a different length is supplied.
        mshapes = memory.shape.as_list()
        M = tf.reshape(memory, [mshapes[0],mshapes[1],num_heads*2,-1])
        M = tf.transpose(M,[0,2,1,3])
        # The first num_heads slices are the keys, the remaining ones the values.
        K, V = tf.split(M,2,axis=1)

        x = dot_product_attention(Q,K,V,dropout)

        # [batch, heads, length_q, depth] -> [batch, length_q, heads * depth]
        shapes = x.shape.as_list()
        return tf.reshape(tf.transpose(x,[0,2,1,3]),[shapes[0],shapes[2],-1])

In [16]:
""" Applies num_conv_layers convolutions after layer normalization in num_blocks, then a multihead attention
    and finally, a feed forward layer"""
def encoderblock(x, kernel_size, output_filters, num_conv_layers, scope_name, drop, num_blocks=1, reuse=None):
    """Stack of encoder blocks: [layer-norm + depthwise conv] x num_conv_layers,
    then layer-norm + self-attention, then layer-norm + two-layer feed-forward.

    Every sub-layer is wrapped in a residual connection through layer_dropout,
    whose drop rate grows linearly with the sub-layer's position:
    drop * l / L, where L = (num_conv_layers + 2) * num_blocks.

    Args:
        x: input tensor. An axis is inserted at position 1 for the
            convolutions and squeezed out before attention, so this
            presumably expects [batch, length, channels] - TODO confirm.
        kernel_size: depthwise kernel size forwarded to depthconv.
        output_filters: channel width used throughout the block.
        num_conv_layers: depthwise convolutions per block.
        scope_name: variable scope of the whole stack.
        drop: dropout probability (scalar tensor or float).
        num_blocks: number of blocks stacked inside this scope.
        reuse: forwarded to the variable scopes / layers to share weights.
    """
    with tf.variable_scope(scope_name, reuse=reuse):
        x = tf.expand_dims(x,axis=1)
        # Project the input to output_filters channels before the first block.
        x = conv(x,output_filters,name="conv0",reuse=reuse)
        # `_` is the block index here (it is used below, not a throwaway).
        for _ in range(num_blocks):
            with tf.variable_scope("Block"+str(_)):
                for i in range(num_conv_layers):
                    # Plain dropout before every second convolution.
                    if (i) % 2 == 0:
                        x = tf.nn.dropout(x, 1-drop)
                    # From the second block on x arrives rank-3; restore the extra axis.
                    if len(x.shape.as_list())==3:
                        x=tf.expand_dims(x,axis=1)
                    y = tf.contrib.layers.layer_norm(x,scope="layer_norm1_%d"%i,reuse=reuse)
                    y = depthconv(y, kernel_size, output_filters, 'dconv'+str(i),reuse=reuse)
                    x = layer_dropout(y,x,drop*float(_*(num_conv_layers+2)+i+1)/float((num_conv_layers + 2) * num_blocks))
                x_res1 = tf.squeeze(x, axis=1)
                x = tf.contrib.layers.layer_norm(x_res1,scope="layer_norm2",reuse=reuse)
                x = tf.nn.dropout(x,1-drop)
                # Self-attention with a fixed number of 6 heads.
                x = multihead_attention(x,output_filters,6,reuse=reuse,dropout=drop)
                x_res2 = layer_dropout(x,x_res1,drop*float(_*(num_conv_layers+2)+num_conv_layers+1)/float((num_conv_layers + 2) * num_blocks))

                x = tf.contrib.layers.layer_norm(x_res2,scope="layer_norm3",reuse=reuse)
                x = tf.nn.dropout(x,1-drop)
                # Position-wise feed-forward: relu projection followed by a linear one.
                x = conv(x,output_filters,True,activation=tf.nn.relu,name="FFN1",reuse=reuse)
                x = conv(x,output_filters,True,activation=None,name="FFN2",reuse=reuse)
                x = layer_dropout(x,x_res2,drop*float(_*(num_conv_layers+2)+num_conv_layers+2)/float((num_conv_layers + 2) * num_blocks))

        return x

In [17]:
# q=tf.placeholder(tf.float32,[batch_size,q_words,embedding_dim])
# print encoderblock(q,(7,1),128,4,"random")

In [18]:
"""An n-layer Highway network with each layer consisting of 2 convolutions"""
def highway(x, drop, size = None, activation = None, num_layers = 2, scope = "highway",reuse=None):
    """Highway network built from width-1 convolutions.

    Each of the `num_layers` layers computes a sigmoid gate T and a candidate
    H (with dropout applied) from the same input and mixes them as
    H * T + x * (1 - T).

    Args:
        x: input tensor; its last axis is the feature axis.
        drop: dropout probability applied to the candidate H.
        size: feature width. When given, x is first projected to this width;
            when None the width of x is kept.
        activation: optional activation for the candidate branch.
        num_layers: number of highway layers.
        scope: variable scope name.
        reuse: forwarded to tf.variable_scope and every convolution.
    """
    with tf.variable_scope(scope,reuse=reuse):
        if size is not None:
            x = conv(x, size, name = "input_projection",reuse=reuse)
        else:
            size = x.shape.as_list()[-1]
        for layer_idx in range(num_layers):
            gate = conv(x, size, bias = True, activation = tf.sigmoid,
                        name = "gate_%d"%layer_idx,reuse=reuse)
            candidate = conv(x, size, bias = True, activation = activation,
                             name = "activation_%d"%layer_idx,reuse=reuse)
            candidate = tf.nn.dropout(candidate, 1-drop)
            x = candidate * gate + x * (1.0 - gate)
        return x

In [19]:
def average_gradients(tower_grads):
    """Average every shared variable's gradient over all towers.

    This acts as a synchronization point across the towers.

    Args:
        tower_grads: list with one entry per device; each entry is a list of
            (gradient, variable) tuples, in the same variable order.

    Returns:
        List of (averaged_gradient, variable) pairs. A variable for which no
        tower produced a gradient is printed and left out of the result.
    """
    averaged = []
    # zip(*...) regroups the tuples per variable:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_variable in zip(*tower_grads):
        # The variables are shared between towers, so the first tower's
        # reference stands in for all of them.
        shared_var = per_variable[0][1]
        available = [g for g, _ in per_variable if g is not None]
        if not available:
            print(shared_var)
            continue
        mean_grad = tf.reduce_mean(tf.stack(available), 0)
        averaged.append((mean_grad, shared_var))
    return averaged

In [20]:
"""Starting Learning Rate"""
# Scalar fed at run time; it is the base value handed to the schedule below.
starter_learning_rate = tf.placeholder(tf.float32,shape=[])
global_step = tf.train.get_or_create_global_step()
# Tracks moving averages of the variables it is later applied to.
var_ema = tf.train.ExponentialMovingAverage(decay=0.9999)
# The decay rate is 1, so this schedule leaves the fed rate unchanged at every step.
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           4000, 1, staircase=True)
# Passing global_step to minimize() will increment it at each step.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

In [21]:
# def ema_getter(getter, name, *args, **kwargs):
#     var = getter(name, *args, **kwargs)
#     ema_var = ema.average(var)
#     return ema_var if ema_var else var

In [22]:
"""Placeholders"""
# Accumulators filled while the model graph is built.
tower_grads = []
losses = []
# Dropout probability; defaults to 0.0, i.e. no dropout unless a value is fed.
drop = tf.placeholder_with_default(0.0, shape=())
# *_bool: per-token flag. preprocess() emits 1 (found in GloVe) or 0 (not found)
# and the batching helpers pad with 2.
question_bool = tf.placeholder(tf.float32,[batch_size,q_words],name="question_bool")
question_word = tf.placeholder(tf.float32,[batch_size,q_words,word_embedding_dim],name="question_word")
# *_char: integer character ids per token; the per-word length is left unspecified.
question_char = tf.placeholder(tf.int32,[batch_size,q_words,None],name="question_char")
context_bool = tf.placeholder(tf.float32,[batch_size,c_words],name="context_bool")
context_word = tf.placeholder(tf.float32,[batch_size,c_words,word_embedding_dim],name="context_word")
context_char = tf.placeholder(tf.int32,[batch_size,c_words,None],name="context_char")
# Answer positions are later one-hot encoded over c_words_u slots (context + universal node).
start_ans = tf.placeholder(tf.int32,[batch_size],name="start_ans")
end_ans = tf.placeholder(tf.int32,[batch_size],name="end_ans")
# Per-example flag, cast to float in the loss; 1 presumably marks an unanswerable question.
impossible = tf.placeholder(tf.int32,[batch_size],name="impossible")
start_plausible = tf.placeholder(tf.int32,[batch_size],name="start_plausible")
end_plausible = tf.placeholder(tf.int32,[batch_size],name="end_plausible")

In [23]:
# Builds the whole network plus its loss and gradients. Only a single tower is
# constructed in this cell (there is no per-GPU loop), so the lists below end
# up holding exactly one entry each.
with tf.variable_scope("Model"):
    final_logits1_ans = []
    final_logits2_ans = []
    final_logits1_unans = []
    final_logits2_unans = []
    answer_v = []
    
    """Random initialization of some required variables"""

    # Learned embedding of the extra "universal" node placed in front of the context.
    universal = tf.get_variable("universal",(word_embedding_dim+hidden_layer_size),dtype = tf.float32,
                                            initializer = initializer())
    # Learned embedding substituted for words that have no GloVe vector.
    unk = tf.get_variable("unk",(word_embedding_dim),dtype = tf.float32,
                                            initializer = initializer())
    # Character embedding table, initialized from the GloVe-derived vocab_emb matrix.
    char_emb = tf.get_variable("char_emb",dtype = tf.float32,
                                            initializer =tf.constant(vocab_emb, dtype=tf.float32))
    zero_word = tf.constant(np.zeros((300)),dtype=tf.float32)
    # Guards the one-time creation of the EMA op further down.
    flag=0
    with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
        universal_tiled = tf.expand_dims(tf.expand_dims(universal,axis=0),axis=0)
        universal_tiled = tf.tile(universal_tiled,[batch_size,1,1])
        unk_tiled = tf.expand_dims(tf.expand_dims(unk,axis=0),axis=0)
        zero_tiled = tf.expand_dims(tf.expand_dims(zero_word,axis=0),axis=0)

        question_bool1 = tf.expand_dims(question_bool,axis=-1)
        context_bool1 = tf.expand_dims(context_bool,axis=-1)
#         print question_bool1
        # Polynomial selector over the flag b in {0, 1, 2}:
        #   b*(2-b)         is 1 only for b == 1 -> keep the fed word vector
        #   0.5*(1-b)*(2-b) is 1 only for b == 0 -> use the learned unk vector
        #   0.5*b*(b-1)     is 1 only for b == 2 -> use the zero vector (padding)
        question_word = (question_bool1*(2-question_bool1)*question_word)+(0.5*(1-question_bool1)*(2-question_bool1)*unk_tiled)+(0.5*question_bool1*(question_bool1-1)*zero_tiled)
        print question_word
        question_word = tf.nn.dropout(question_word,1-drop)

        # Same selector applied to the context words.
        context_word = (context_bool1*(2-context_bool1)*context_word)+(0.5*(1-context_bool1)*(2-context_bool1)*unk_tiled)+(0.5*context_bool1*(context_bool1-1)*zero_tiled)
        context_word = tf.nn.dropout(context_word,1-drop)


#         print char_emb
        # Character embeddings receive half the dropout rate of the word embeddings.
        question_char_new = tf.nn.embedding_lookup(char_emb,question_char)
        question_char_new = tf.nn.dropout(question_char_new,1-(drop*0.5))

#         print question_char_new
        context_char_new = tf.nn.embedding_lookup(char_emb,context_char)
        context_char_new = tf.nn.dropout(context_char_new,1-(drop*0.5))

#         print context_char_new
        with tf.variable_scope("Input_Embedding_Layer"):
            """Convolution over char embedding, max pooling and highway network"""
            question_char1 = conv(question_char_new,hidden_layer_size,kernel_size=5,name="q_char_conv0",activation=tf.nn.relu)
            # Max over the character axis gives one vector per word.
            question_char1 = tf.reduce_max(question_char1,axis=2)
            question_emb = tf.concat([question_word,question_char1],axis=-1)
            question_emb = highway(question_emb, drop, scope="highway")
#                     print question_emb

            """Convolution over char embedding, max pooling, universal node concatenation and highway network"""
            context_char1 = conv(context_char_new,hidden_layer_size,kernel_size=5,name="c_char_conv0",activation=tf.nn.relu)
            context_char1 = tf.reduce_max(context_char1,axis=2)
            context_emb = tf.concat([context_word,context_char1],axis=-1)
            # Universal node goes to position 0, so the context grows from c_words to c_words_u.
            context_emb = tf.concat([universal_tiled,context_emb],axis=1)
            print(context_emb)
            # The highway weights are shared with the question branch (reuse=True).
            context_emb = highway(context_emb, drop, scope="highway",reuse=True)
#             print context_emb


        with tf.variable_scope("Embedding_encoding_layer"):
            # One encoder block whose weights are shared between question and context.
            question = encoderblock(question_emb, (1,7), hidden_layer_size,scope_name="encoder_block",num_conv_layers=4,drop=drop)
#             print question

            context = encoderblock(context_emb, (1,7), hidden_layer_size,scope_name="encoder_block",num_conv_layers=4,reuse=True,drop=drop)
#             print context

        with tf.variable_scope("Context-query_attention"):
            """Similarity matrix computation"""
            question = tf.nn.dropout(question,1-drop)
            q = tf.tile(tf.expand_dims(question,axis=1),[1,c_words_u,1,1])
            context = tf.nn.dropout(context,1-drop)
            c = tf.tile(tf.expand_dims(context,axis=2),[1,1,q_words,1])
            # One score per (context position, question position) from [q, c, q*c].
            s = conv(tf.concat([q,c,tf.multiply(q,c)],axis=-1),1,name="similarity_matrix")
            s = tf.squeeze(s,axis=-1)
            print s
            # Softmax over the question axis -> context-to-query attention.
            s_ = tf.nn.softmax(s,axis=-1)
            a = tf.matmul(s_,question)
#             print a
            # Combined with a softmax over the context axis -> query-to-context attention.
            b = tf.matmul(s_, tf.matmul(tf.transpose(tf.nn.softmax(s,axis=1),[0,2,1]),context))
#             print b

        with tf.variable_scope("Model_encoder_layer"):
            """3 encoder blocks with shared parameters"""
            enc_input = tf.concat([context,a,context*a,context*b],axis=-1)
#             print enc_input
            enc_input = conv(enc_input,hidden_layer_size,name="conv0")
            enc_input = encoderblock(enc_input, kernel_size=(1,7), output_filters=hidden_layer_size,scope_name="encoder_layer", num_conv_layers=2, num_blocks=8,drop=drop)
#             print enc_input
            # output_list[k] is the result after k+1 passes through the shared stack.
            output_list = [enc_input]
            for i in range(model_encoder_layers-1):
                temp = encoderblock(output_list[i], kernel_size=(1,7), output_filters=hidden_layer_size,scope_name="encoder_layer", num_conv_layers=2, num_blocks=8,drop=drop,reuse=True)
                print temp
                output_list.append(temp)

        with tf.variable_scope("Output_Layer"):
            """Softmax followed by loss function calculation"""
            # Start pointers read passes 1 and 2, end pointers read passes 1 and 3.
            start_logits_ans = tf.squeeze(conv(tf.concat([output_list[0], output_list[1]],axis = -1),1, bias = False, name = "start_pointer_ans"),-1)
            end_logits_ans = tf.squeeze(conv(tf.concat([output_list[0], output_list[2]],axis = -1),1, bias = False, name = "end_pointer_ans"), -1)

            # Separate pointer heads trained on the plausible-answer positions.
            start_logits_unans = tf.squeeze(conv(tf.concat([output_list[0], output_list[1]],axis = -1),1, bias = False, name = "start_pointer_unans"),-1)
            end_logits_unans = tf.squeeze(conv(tf.concat([output_list[0], output_list[2]],axis = -1),1, bias = False, name = "end_pointer_unans"), -1)

            # Despite the names, logits1_* / logits2_* hold softmax probabilities.
            logits1_ans = tf.nn.softmax(start_logits_ans)
            logits2_ans = tf.nn.softmax(end_logits_ans)
            print logits1_ans
            final_logits1_ans.append(logits1_ans)
            final_logits2_ans.append(logits2_ans)


            logits1_unans = tf.nn.softmax(start_logits_unans)
            logits2_unans = tf.nn.softmax(end_logits_unans)
            print logits1_unans
            final_logits1_unans.append(logits1_unans)
            final_logits2_unans.append(logits2_unans)                    


            imp = tf.cast(impossible,tf.float32)
            start = tf.one_hot(start_ans,c_words_u)
            end = tf.one_hot(end_ans,c_words_u)
            start_plaus = tf.one_hot(start_plausible,c_words_u)
            end_plaus = tf.one_hot(end_plausible,c_words_u)
            print start
            # loss1/2: span cross-entropy of the answer heads;
            # loss3/4: span cross-entropy of the plausible-answer heads.
            loss1 = tf.nn.softmax_cross_entropy_with_logits_v2(logits=start_logits_ans,labels=start)
            loss2 = tf.nn.softmax_cross_entropy_with_logits_v2(logits=end_logits_ans,labels=end)
            loss3 = tf.nn.softmax_cross_entropy_with_logits_v2(logits=start_logits_unans,labels=start_plaus)
            loss4 = tf.nn.softmax_cross_entropy_with_logits_v2(logits=end_logits_unans,labels=end_plaus)

            # Position 0 is the universal node: loss5 pushes probability onto it when
            # imp == 1, loss6 pushes probability away from it when imp == 0.
            loss5 = -imp*(tf.math.log(logits1_ans[:,0]+1e-06)+tf.math.log(logits2_ans[:,0]+1e-06))
            loss6 = -(1-imp)*(tf.math.log((1-logits1_ans[:,0])+1e-06)+tf.math.log((1-logits2_ans[:,0])+1e-06))
            # Answer verifier: encoder outputs weighted by the start/end probabilities,
            # reduced to one sigmoid score per example.
            answer_verifier = tf.concat([output_list[0], output_list[1], output_list[2]],axis=-1)
            verifier_start = tf.expand_dims(logits1_ans,axis=-1)
            verifier_end = tf.expand_dims(logits2_ans,axis=-1)
            av = tf.concat([verifier_start*answer_verifier,verifier_end*answer_verifier],axis=-1)
            av = conv(av,1,bias=False,name="av1",activation=tf.nn.tanh)
            # Move the position axis into the channel slot so the next conv mixes positions.
            av = tf.transpose(av,[0,2,1])
            print av
            av = conv(av,1,bias=False,activation=tf.nn.sigmoid,name="av2")
            av = tf.squeeze(av)
            print av
            answer_v.append(av)
            # Binary cross-entropy: av is pushed towards 1 when imp == 0 and towards 0 when imp == 1.
            loss7 = (-imp*tf.math.log(1-av+1e-06)-(1-imp)*tf.math.log(av+1e-06))
            print loss7
            loss = (1-imp)*(loss1+loss2)+imp*(loss3+loss4)+loss5+loss6+loss7
            print loss
            if flag==0:
                """Apply exponential moving average for better training of variables"""
                print("Exponential Moving Average")
                ema_op = var_ema.apply(tf.trainable_variables())
                flag=1
            # Evaluating the loss also triggers the moving-average update.
            with tf.control_dependencies([ema_op]):
                loss = tf.identity(loss)

            # NOTE(review): gradients are taken of `loss` alone; no regularization
            # term has been added to it at this point.
            grads = optimizer.compute_gradients(loss)
            tower_grads.append(grads)



            losses.append(loss)


Tensor("Model/Model/add_1:0", shape=(8, 50, 300), dtype=float32)
Tensor("Model/Model/Input_Embedding_Layer/concat_2:0", shape=(8, 400, 396), dtype=float32)
Tensor("Model/Model/Context-query_attention/Squeeze:0", shape=(8, 400, 50), dtype=float32)
Tensor("Model/Model/Model_encoder_layer/encoder_layer_1/Block7/add_7:0", shape=(8, 400, 96), dtype=float32)
Tensor("Model/Model/Model_encoder_layer/encoder_layer_2/Block7/add_7:0", shape=(8, 400, 96), dtype=float32)
Tensor("Model/Model/Output_Layer/Softmax:0", shape=(8, 400), dtype=float32)
Tensor("Model/Model/Output_Layer/Softmax_2:0", shape=(8, 400), dtype=float32)
Tensor("Model/Model/Output_Layer/one_hot:0", shape=(8, 400), dtype=float32)
Tensor("Model/Model/Output_Layer/transpose:0", shape=(8, 1, 400), dtype=float32)
Tensor("Model/Model/Output_Layer/Squeeze_4:0", shape=(8,), dtype=float32)
Tensor("Model/Model/Output_Layer/sub_5:0", shape=(8,), dtype=float32)
Tensor("Model/Model/Output_Layer/add_13:0", shape=(8,), dtype=float32)
Exponential

In [24]:
"""Apply regularization to loss, and assign average values from ema to all the variables"""

# Combine the per-tower results into the ops used for training and inference.
gradients = average_gradients(tower_grads)
avg_loss = tf.reduce_mean(tf.stack(losses))
apply_gradient_op = optimizer.apply_gradients(gradients, global_step)
logits1_f = tf.concat(final_logits1_ans, axis=0)
logits2_f = tf.concat(final_logits2_ans, axis=0)
answer_v_f = tf.concat(answer_v, axis=0)
# params = tf.trainable_variables()
# The REGULARIZATION_LOSSES collection already holds the L2 penalties that
# get_variable(regularizer=...) produced, so they only need to be summed.
# Running an l2_regularizer over them again (as before) penalized the squared
# penalties instead of the weights.
# NOTE(review): the gradients above were computed before this term exists, so
# it affects the reported avg_loss only, not the parameter updates.
l2_loss = tf.losses.get_regularization_loss()
avg_loss+=l2_loss

# Ops that overwrite each variable with its exponential moving average.
assign_vars = []
for var in tf.global_variables():
    v = var_ema.average(var)
    # average() returns None for variables the EMA does not track; test for
    # that explicitly rather than relying on the truth value of a variable.
    if v is not None:
        assign_vars.append(tf.assign(var,v))

In [27]:
"""Random variables initialization"""
init_op = tf.global_variables_initializer()
# allow_soft_placement lets TensorFlow fall back to another device when an op
# cannot run on the requested one.
config=tf.ConfigProto(allow_soft_placement=True)
# Grow GPU memory on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(init_op)

In [28]:
# train_writer = tf.summary.FileWriter( './logs/1/train ', sess.graph)

In [29]:
# Created after the graph is complete, so it covers every variable defined above.
saver = tf.train.Saver()

In [30]:
# Replace the freshly initialized values with the trained checkpoint.
saver.restore(sess, "./saved_model_unans_final/model.ckpt")

INFO:tensorflow:Restoring parameters from ./saved_model_unans_final/model.ckpt


In [31]:
"""F1 score calculation from predicted answer and actual answer in list format"""
def compute_f1(ans_predicted, ans_actual):
    """Accumulate the token-overlap counts needed for an F1 score.

    Args:
        ans_predicted: list of predicted answers, each a list of tokens.
        ans_actual: list of gold answers, each a list of tokens, aligned
            with `ans_predicted`.

    Returns:
        Tuple (tp, fp, tn) summed over all answer pairs: tokens shared by
        prediction and gold (multiset intersection), predicted tokens absent
        from the gold answer, and gold tokens absent from the prediction.
        The F1 score itself is left to the caller.
    """
    shared_total = 0
    pred_only_total = 0
    gold_only_total = 0
    for gold_toks, pred_toks in zip(ans_actual, ans_predicted):
        overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
        shared = sum(overlap.values())
        shared_total += shared
        pred_only_total += len(pred_toks) - shared
        gold_only_total += len(gold_toks) - shared
    return shared_total, pred_only_total, gold_only_total

In [68]:
# Pickle streams are binary: open with "rb" so the load does not depend on the
# platform's text-mode newline handling (and keeps working under Python 3).
# NOTE(review): pickle.load can execute arbitrary code - only load files from
# a trusted source.
with open("test_data_unans_demo.pkl", "rb") as f:
    test_data=pickle.load(f)

In [33]:
# train_data=test_data

In [34]:
# no_of_batches = int(len(train_data)/batch_size_o)
# Training-loop bookkeeping; none of these values are read by the evaluation
# code visible in this notebook section.
epoch = 50
prev_f1=0
cur_f1=0
max_f1=0
# print no_of_batches

In [35]:
"""Converts a mini batch of training dataset, applies required padding and returns a feed dict"""
def train(x):
    """Pad a mini batch of labelled examples and build the model's feed dict.

    Args:
        x: iterable of example dicts with the keys 'questionword',
            'questionchar', 'questionbool', 'contextword', 'contextchar',
            'contextbool', 'answer_start', 'answer_end', 'plausible_start',
            'plausible_end' and 'is_impossible'.

    Returns:
        Dict mapping every input and label placeholder to a batch array.
        Questions are padded to q_words and contexts to c_words tokens: word
        vectors with zero rows, the *bool flags with the value 2 and the
        character ids with rows of no_of_chars-1.
    """
    pad_char_id = no_of_chars-1
    batch = dict((field, []) for field in
                 ('q_w', 'q_c', 'q_b', 'c_w', 'c_c', 'c_b', 'a_s', 'a_e', 's_p', 'e_p', 'imp'))
    for example in x:
        q_missing = q_words-len(example['questionword'])
        c_missing = c_words-len(example['contextword'])

        # Word embeddings: append all-zero rows.
        batch['q_w'].append(np.concatenate(
            (example['questionword'], np.zeros((q_missing,word_embedding_dim))),axis=0))
        batch['c_w'].append(np.concatenate(
            (example['contextword'], np.zeros((c_missing,word_embedding_dim))),axis=0))

        # In-GloVe flags: padding positions are marked with 2.
        batch['q_b'].append(np.pad(example['questionbool'],[0,q_missing],'constant', constant_values=(2)))
        batch['c_b'].append(np.pad(example['contextbool'],[0,c_missing],'constant', constant_values=(2)))

        # Character ids: append rows filled with the last vocabulary index.
        q_char_rows = pad_char_id*np.ones((q_words-len(example['questionchar']),max_word_len))
        batch['q_c'].append(np.concatenate((example['questionchar'],q_char_rows),axis=0))
        c_char_rows = pad_char_id*np.ones((c_words-len(example['contextchar']),max_word_len))
        batch['c_c'].append(np.concatenate((example['contextchar'],c_char_rows),axis=0))

        # Labels are passed through unchanged.
        batch['a_s'].append(example['answer_start'])
        batch['a_e'].append(example['answer_end'])
        batch['s_p'].append(example['plausible_start'])
        batch['e_p'].append(example['plausible_end'])
        batch['imp'].append(example['is_impossible'])

    return {
        question_word:np.array(batch['q_w']),
        context_word:np.array(batch['c_w']),
        question_char:np.array(batch['q_c']),
        context_char:np.array(batch['c_c']),
        start_ans:np.array(batch['a_s']),
        end_ans:np.array(batch['a_e']),
        question_bool:np.array(batch['q_b']),
        context_bool:np.array(batch['c_b']),
        impossible:np.array(batch['imp']),
        start_plausible:np.array(batch['s_p']),
        end_plausible:np.array(batch['e_p'])
    }

In [36]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    The steps run in this order: lowercase, strip every character found in
    string.punctuation, blank out the stand-alone words "a", "an" and "the",
    then collapse all whitespace runs into single spaces.
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punc = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)
    return ' '.join(without_articles.split())

In [64]:
"""Evaluate results"""
def _span_tokens(example, start_tok, end_tok):
    """Normalized tokens of the context text between two token positions.

    Slices example['context'] from the character offset of `start_tok` up to
    the offset of `end_tok` (offsets come from example['context_indices']).
    Falls back to "start to end of context" when that fails, and to [-1] when
    even the start position cannot be used.
    """
    context = example['context']
    offsets = example['context_indices']
    try:
        return normalize_answer(context[offsets[start_tok]:offsets[end_tok]]).split()
    except Exception:
        try:
            return normalize_answer(context[offsets[start_tok]:]).split()
        except Exception:
            return [-1]


def test(x_test,max_f1,testing=True,threshold=0.7):
    """Evaluate the restored model on `x_test` and report a token-level F1.

    Args:
        x_test: list of example dicts as accepted by train(), each also
            carrying 'context' (raw text) and 'context_indices' (character
            offset of every context token).
        max_f1: best F1 seen so far.
        testing: selects the progress label ("On testing data" versus
            "On training data") and enables the max_f1 bookkeeping branch.
        threshold: an example is predicted unanswerable when its verifier
            score is <= this value.

    Returns:
        max(max_f1, f1), or `max_f1` unchanged when F1 is undefined.
        At most the first 100 full batches are evaluated.
    """
    tp=0
    fp=0
    fn=0
    print(len(x_test))
    # Floor division: a trailing partial batch is ignored.
    no_of_test_batches=len(x_test)//(batch_size)
    print(no_of_test_batches)
    t_ptr=0
    for tbatch in range(min(no_of_test_batches,100)):
        test_feed = train(x_test[t_ptr:t_ptr+batch_size])
        # One forward pass yields all three outputs (previously three passes).
        np_ans_st, np_ans_e, np_av = sess.run([logits1_f, logits2_f, answer_v_f],test_feed)
        # Column 0 is the universal node, so it is excluded from the argmax and
        # the resulting indices address context tokens directly.
        ans_st = np.argmax(np_ans_st[:,1:],axis=-1)
        ans_e = np.argmax(np_ans_e[:,1:],axis=-1)
        ans_pred=[]
        ans_ac=[]
        for i in range(batch_size):
            example = x_test[t_ptr+i]
            if np_av[i]<=threshold:
                ans_pred.append([''])
            else:
                ans_pred.append(_span_tokens(example, ans_st[i], ans_e[i]))
            if example['is_impossible']==1:
                ans_ac.append([''])
            else:
                # Gold positions count the universal node, hence the -1.
                ans_ac.append(_span_tokens(example,
                                           test_feed[start_ans][i]-1,
                                           test_feed[end_ans][i]-1))
        x,y,z=compute_f1(ans_pred, ans_ac)
        tp+=x
        fp+=y
        fn+=z
        t_ptr+=batch_size

        if tbatch%10==0:
            split_label = "On testing data" if testing else "On training data"
            print("Testing %d %s" % (tbatch, split_label))
            print("Start:  %s End:  %s Start Actual:  %s End Actual:  %s" % (
                ans_st[0], ans_e[0],
                test_feed[start_ans][0]-1, test_feed[end_ans][0]-1))
            print("Answer verifier:  %s" % (np_av[0],))
            print("Answer predicted  %s" % (ans_pred[0],))
            print("Answer Actual:  %s" % (ans_ac[0],))

    print("%s %s %s" % (tp, fp, fn))
    try:
        precision = 1.0 * tp / (tp+fp)
        recall = 1.0 * tp / (tp+fn)
        f1 = (2 * precision * recall) / (precision + recall)
    except ZeroDivisionError:
        # No overlapping tokens (or nothing evaluated): F1 is undefined.
        print("F1 Score:N/A")
        return max_f1
    print("F1 Score:"+str(f1))
    if (f1>max_f1 or sess.run(global_step) <15000) and testing:
#         save_path = saver.save(sess, "./saved_model_unans_final/model.ckpt")
#         print("Model saved in path: %s" % save_path)
        max_f1 = max(f1,max_f1)
    else:
        print("Not Saved")
    return max(max_f1,f1)

In [69]:
# Evaluate the demo set with max_f1=0, testing=False (so progress lines read
# "On training data") and a verifier threshold of 0.6.
test(test_data,0,False,0.6)

800
100
Testing 0 On training data
Start:  0 End:  1 Start Actual:  0 End Actual:  1
Answer verifier:  0.28542066
Answer predicted  ['']
Answer Actual:  [u'fresno']
Testing 10 On training data
Start:  24 End:  30 Start Actual:  24 End Actual:  30
Answer verifier:  0.91820663
Answer predicted  [u'kings', u'canyon', u'avenue', u'and', u'clovis', u'avenue']
Answer Actual:  [u'kings', u'canyon', u'avenue', u'and', u'clovis', u'avenue']
Testing 20 On training data
Start:  66 End:  67 Start Actual:  89 End Actual:  91
Answer verifier:  0.97746086
Answer predicted  [u'235']
Answer Actual:  [u'300', u'acres']
Testing 30 On training data
Start:  0 End:  1 Start Actual:  0 End Actual:  1
Answer verifier:  0.5586515
Answer predicted  ['']
Answer Actual:  [u'fresno']
Testing 40 On training data
Start:  87 End:  89 Start Actual:  87 End Actual:  89
Answer verifier:  0.9957301
Answer predicted  [u'sweyn', u'forkbeard']
Answer Actual:  [u'sweyn', u'forkbeard']
Testing 50 On training data
Start:  5 En

0.5537435137138621

In [38]:
def learning(global_step):
    """Learning-rate schedule as a function of the training step.

    For steps up to and including 1000 the rate is
    (0.001/3)*log10(global_step+1): 0 at step 0, rising to roughly 0.001 at
    step 1000. For every later step a constant 0.0002 is returned.
    """
    if not global_step>1000:
        # logarithmic ramp used for the first 1000 steps
        ramp_scale = 0.001/3
        return ramp_scale*log10(global_step+1)
    # constant rate once the ramp is over
    return 0.0002

In [39]:
def testing_func(x):
    """Assemble the inference feed dict for a batch of preprocessed examples.

    ``x`` is an iterable of dicts carrying the keys 'questionword',
    'contextword', 'questionchar', 'contextchar', 'questionbool' and
    'contextbool'. Every sequence is extended to the fixed lengths
    ``q_words`` (question) / ``c_words`` (context):

    * word rows are extended with all-zero rows of width ``word_embedding_dim``
    * character rows are extended with rows filled with ``no_of_chars-1``
    * the boolean vectors are extended with the value 2

    Returns a dict keyed by the module-level graph inputs ``question_word``,
    ``context_word``, ``question_char``, ``context_char``, ``question_bool``
    and ``context_bool`` (presumably TF placeholders -- confirm where they are
    defined), each mapped to the stacked batch array.
    """
    def extend_words(rows, target_len):
        # append all-zero word vectors until there are target_len rows
        filler = np.zeros((target_len-len(rows),word_embedding_dim))
        return np.concatenate((rows,filler),axis=0)

    def extend_chars(rows, target_len):
        # append rows made of the last character id until there are target_len rows
        filler = (no_of_chars-1)*np.ones((target_len-len(rows),max_word_len))
        return np.concatenate((rows,filler),axis=0)

    def extend_flags(flags, word_rows, target_len):
        # the amount of padding is derived from the word sequence length
        return np.pad(flags,[0,target_len-len(word_rows)],'constant', constant_values=(2))

    question_words, context_words = [], []
    question_chars, context_chars = [], []
    question_flags, context_flags = [], []
    for example in x:
        question_words.append(extend_words(example['questionword'], q_words))
        context_words.append(extend_words(example['contextword'], c_words))
        question_flags.append(extend_flags(example['questionbool'], example['questionword'], q_words))
        context_flags.append(extend_flags(example['contextbool'], example['contextword'], c_words))
        question_chars.append(extend_chars(example['questionchar'], q_words))
        context_chars.append(extend_chars(example['contextchar'], c_words))

    return {
        question_word: np.array(question_words),
        context_word: np.array(context_words),
        question_char: np.array(question_chars),
        context_char: np.array(context_chars),
        question_bool: np.array(question_flags),
        context_bool: np.array(context_flags),
    }

In [40]:
"""Returns the answer to the question given context"""
def get_answer(context,question,threshold=0.6):
    """Returns the answer to the question given context.

    Both strings are run through ``preprocess``; the resulting example is
    repeated ``batch_size`` times to fill a batch and only row 0 of each
    network output is read.

    Args:
        context: passage to search for the answer.
        question: question about the passage.
        threshold: minimum value of the ``answer_v`` output for the question
            to be treated as answerable (default 0.6, the value that was
            previously hard-coded here).

    Returns:
        "Impossible question" when the ``answer_v`` score is below
        ``threshold``; otherwise the slice of ``context`` between the
        character offsets of the predicted start and end positions.
    """
    # c holds the start offset of every context token (4th value returned by preprocess)
    c_word,c_char,c_bool,c=preprocess(context)
    q_word,q_char,q_bool,_=preprocess(question)
    data = {'contextword':c_word,'contextchar':c_char,'contextbool':c_bool,'questionword':q_word,'questionchar':q_char,'questionbool':q_bool}
    # Build the feed once and fetch all three outputs in a single forward pass
    # instead of padding the batch and running the network three times.
    feed = testing_func([data for _ in range(batch_size)])
    start_logits,end_logits,verifier = sess.run([logits1_ans,logits2_ans,answer_v],feed)
    if verifier[0][0]<threshold:
        return "Impossible question"
    start = np.argmax(start_logits[0])
    end = np.argmax(end_logits[0])
    # NOTE(review): predicted positions are shifted by one relative to c --
    # presumably position 0 is reserved (e.g. for "no answer"); confirm against the model.
    try:
        return context[c[start-1]:c[end-1]]
    except IndexError:
        # end-1 lies past the last recorded token offset: take everything to the end
        return context[c[start-1]:]

In [85]:
# Demo passage passed to get_answer() as the context; the bracketed [2]/[3]
# citation markers are part of the text.
context=u"""Michael John Clarke (born 2 April 1981) is a former Australian international cricketer. He led Australia to their 5th Cricket World Cup triumph, when his team were victorious in the final of the ICC Cricket World Cup 2015 at the MCG.[2]. He is the first cap for Australia in Twenty20 Internationals. His ODI shirt number of 23 was passed on to him by Shane Warne after his international retirement. Nicknamed "Pup",[3] he is a right-handed middle-order batsman, an occasional left-arm orthodox spin bowler and also a slip catcher. He represented New South Wales at a domestic level. In January 2011, Clarke stood down as captain of the Australian Twenty20 cricket team to concentrate on his Test and ODI performance. After announcing he would retire from One Day cricket after the end of the 2015 Cricket World Cup, Clarke starred in the final against New Zealand top scoring with a score of 74 off 72 balls, as Australia won their fifth World Cup title. He was bowled when nine runs were required to win and received a standing ovation from the 93,013 strong MCG crowd after his dismissal. On 8 August 2015, Clarke announced that he would retire from all forms of cricket after the final Test of the 2015 Ashes series following a difficult series in terms of both his and the team's performance. Australia suffered a crushing defeat of an innings and 78 runs thus losing the Ashes. This was Clarke's fourth successive Ashes loss in England overall and his second as captain."""

In [88]:
# Demo question asked against the passage stored in `context`.
question = u'What is the nickname of Michael Clarke?'

In [89]:
# Run the demo query; get_answer returns either a slice of `context` or the
# literal string "Impossible question".
print(get_answer(context,question))

Impossible question
