In [None]:
import tensorflow as tf
import numpy as np
import json
import pickle

In [None]:
# Restore the vocabulary list and the maximum word length saved as one
# pickled pair.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('vocab&wordlen.pkl', 'rb') as pkl_file:
    vocab, max_word_len = pickle.load(pkl_file)

In [None]:
# Display the loaded maximum word length (character-index rows are padded to
# this width in get_char_embedding).
max_word_len

In [None]:
# --- Embedding widths -------------------------------------------------------
word_embedding_dim = 300
char_embedding_dim = 200
embedding_dim = word_embedding_dim + char_embedding_dim

# --- Fixed sequence lengths, in words, for question and context -------------
q_words = 50
c_words = 400

# --- Training / architecture settings ---------------------------------------
batch_size = 64
model_encoder_layers = 3

# Size of the loaded vocabulary at this point in the notebook; used as the
# row count of the trainable character-embedding table.
no_of_chars = len(vocab)

In [None]:
# Reserve index 0 of the vocabulary for padding (get_char_embedding pads with
# 0).  Guarded so that re-running this cell does not shift every character
# index a second time.
if vocab[:1] != [0]:
    vocab.insert(0, 0)

In [None]:
# Display the vocabulary list (index 0 is the padding entry inserted above).
vocab

In [None]:
def loadGloveModel(gloveFile, embedding_size=None):
    """Read a GloVe text file into a ``{word: numpy vector}`` dictionary.

    Each non-empty line is expected to be ``<word> <v1> <v2> ... <vN>``,
    whitespace separated.  The vector is always taken from the *last*
    ``embedding_size`` fields and everything before it is the word, so
    entries whose word itself contains spaces no longer break the float
    conversion.

    Args:
        gloveFile: path of the GloVe text file.
        embedding_size: number of vector components per line.  When None it
            is inferred from the first non-empty line as ``fields - 1``
            (this assumes that first word contains no whitespace).

    Returns:
        dict mapping each word to a 1-D float numpy array.

    Raises:
        ValueError: if a line has too few fields or a non-numeric component.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the file even if parsing fails midway.
    with open(gloveFile, 'r') as f:
        for line_no, line in enumerate(f, 1):
            splitLine = line.split()
            if not splitLine:
                # Tolerate blank lines instead of failing on splitLine[0].
                continue
            if embedding_size is None:
                embedding_size = len(splitLine) - 1
            if embedding_size < 1 or len(splitLine) <= embedding_size:
                raise ValueError(
                    "Malformed GloVe line %d in %s: expected a word followed by %s values"
                    % (line_no, gloveFile, embedding_size))
            word = ' '.join(splitLine[:-embedding_size])
            embedding = np.array([float(val) for val in splitLine[-embedding_size:]])
            model[word] = embedding
    print("Done. %d  words loaded!" % len(model))
    return model

In [None]:
# %%time
# model = loadGloveModel("glove.840B.300d.txt")

In [None]:
def conv(inputs, output_size, bias = None, activation = None, kernel_size = 1, name = "conv"):
    """Apply a ``1 x kernel_size`` convolution (stride 1, VALID padding).

    Rank-3 inputs are given a temporary axis at position 1 so that
    ``tf.nn.conv2d`` can be used; that axis is removed again before returning.

    Args:
        inputs: rank-3 or rank-4 tensor; the last axis is the channel axis.
        output_size: number of output channels.
        bias: when truthy, a zero-initialised bias variable is added.
        activation: optional callable applied to the result.
        kernel_size: filter width along the second-to-last axis.
        name: variable scope holding the ``kernel_`` / ``bias_`` variables.

    Returns:
        The convolved tensor, with the same rank as ``inputs``.
    """
    with tf.variable_scope(name):
        needs_squeeze = len(inputs.shape) == 3
        if needs_squeeze:
            inputs = tf.expand_dims(inputs, axis=1)

        in_channels = inputs.shape.as_list()[-1]
        kernel_ = tf.get_variable(
            "kernel_",
            [1, kernel_size, in_channels, output_size],
            dtype=tf.float32,
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            initializer=tf.contrib.layers.xavier_initializer())
        outputs = tf.nn.conv2d(inputs, kernel_, [1, 1, 1, 1], "VALID")

        if bias:
            bias_ = tf.get_variable(
                "bias_",
                [1, 1, 1, output_size],
                regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
                initializer=tf.zeros_initializer())
            outputs = outputs + bias_

        if needs_squeeze:
            outputs = tf.squeeze(outputs, axis=1)

        if activation is None:
            return outputs
        return activation(outputs)

In [None]:
def depthconv(x, kernel_size, output_filters, scope_name):
    """Depthwise-separable convolution followed by a bias add and ReLU.

    Args:
        x: rank-4 input tensor; the last axis is the channel axis.
        kernel_size: pair giving the depthwise filter's two spatial extents.
        output_filters: number of channels produced by the pointwise filter.
        scope_name: variable scope for the filters and the bias.

    Returns:
        ``relu(separable_conv2d(x) + bias)`` computed with SAME padding and
        unit strides.
    """
    with tf.variable_scope(scope_name):
        in_channels = x.shape.as_list()[-1]
        kernel_h, kernel_w = kernel_size[0], kernel_size[1]

        depthwise_filter = tf.get_variable(
            "depthwise_filter",
            (kernel_h, kernel_w, in_channels, 1),
            dtype=tf.float32,
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            initializer=tf.contrib.layers.xavier_initializer())
        pointwise_filter = tf.get_variable(
            "pointwise_filter",
            (1, 1, in_channels, output_filters),
            dtype=tf.float32,
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            initializer=tf.contrib.layers.xavier_initializer())

        outputs = tf.nn.separable_conv2d(
            x,
            depthwise_filter,
            pointwise_filter,
            strides=(1, 1, 1, 1),
            padding="SAME")

        b = tf.get_variable(
            "bias",
            outputs.shape[-1],
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            initializer=tf.zeros_initializer())
        return tf.nn.relu(outputs + b)

In [None]:
def dot_product_attention(q,k,v,scope="dot_product_attention"):
    """Scaled dot-product attention.

    q: a Tensor with shape [batch, heads, length_q, depth_k]
    k: a Tensor with shape [batch, heads, length_kv, depth_k]
    v: a Tensor with shape [batch, heads, length_kv, depth_v]

    Returns ``softmax(q . k^T / sqrt(depth_k)) . v``.
    """
    with tf.variable_scope(scope):
        depth_k = k.shape.as_list()[-1]
        # Similarity scores: [batch, num_heads, query_length, memory_length]
        scaled_scores = tf.matmul(q, k, transpose_b=True) / (depth_k ** 0.5)
        weights = tf.nn.softmax(scaled_scores, name="attention_weights")
        return tf.matmul(weights, v)

In [None]:
def multihead_attention(queries, units, num_heads, memory = None, scope = "Multi_Head_Attention"):
    """Multi-head scaled dot-product attention.

    Args:
        queries: rank-3 tensor [batch, length_q, channels] with statically
            known batch and length dimensions.
        units: total projected width; it is split evenly across the heads,
            so it must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        memory: tensor attended over, [batch, length_kv, channels].  When
            None the queries attend to themselves (self-attention).
        scope: variable scope for the projection variables.

    Returns:
        Tensor of shape [batch, length_q, units].
    """
    with tf.variable_scope(scope):
        # Self attention
        if memory is None:
            memory = queries
        # One projection produces keys and values together (2 * units wide).
        memory = conv(memory, 2 * units, name = "memory_projection")
        query = conv(queries, units, name = "query_projection")

        qshapes = query.shape.as_list()
        Q = tf.reshape(query, [qshapes[0], qshapes[1], num_heads, -1])
        Q = tf.transpose(Q, [0, 2, 1, 3])

        # Reshape the memory with its OWN static shape: its length may differ
        # from the query length when an explicit memory is supplied.
        mshapes = memory.shape.as_list()
        M = tf.reshape(memory, [mshapes[0], mshapes[1], num_heads * 2, -1])
        M = tf.transpose(M, [0, 2, 1, 3])
        # First num_heads slices are the keys, the remaining ones the values.
        K, V = tf.split(M, 2, axis=1)

        x = dot_product_attention(Q, K, V)

        # Merge the heads back: [batch, heads, length_q, d] -> [batch, length_q, heads * d]
        shapes = x.shape.as_list()
        return tf.reshape(tf.transpose(x, [0, 2, 1, 3]), [shapes[0], shapes[2], -1])

In [None]:
def encoderblock(x, kernel_size, output_filters, num_conv_layers, scope_name, num_blocks=1, reuse=False):
    """Stack of encoder blocks: convolutions, self-attention, feed-forward.

    The input is first projected to ``output_filters`` channels.  Each block
    then applies ``num_conv_layers`` residual steps of layer-norm followed by
    a depthwise-separable convolution, then layer-norm + 8-head
    self-attention, then layer-norm + a ReLU convolution with bias ("FFN").

    Args:
        x: rank-3 tensor [batch, length, channels].
        kernel_size: kernel size pair handed to ``depthconv``.
        output_filters: channel width used throughout the block.
        num_conv_layers: convolution steps per block.
        scope_name: enclosing variable scope.
        num_blocks: number of blocks stacked one after another.
        reuse: passed to ``tf.variable_scope`` to share existing variables.

    Returns:
        Rank-3 tensor produced by the last block.
    """
    with tf.variable_scope(scope_name, reuse=reuse):
        x = conv(tf.expand_dims(x, axis=1), output_filters, name="conv0")
        for block_idx in range(num_blocks):
            with tf.variable_scope("Block" + str(block_idx)):
                for layer_idx in range(num_conv_layers):
                    # The previous block returns a rank-3 tensor; restore the
                    # extra axis needed by the separable convolution.
                    if len(x.shape.as_list()) == 3:
                        x = tf.expand_dims(x, axis=1)
                    normed = tf.contrib.layers.layer_norm(x)
                    conv_out = depthconv(normed, kernel_size, output_filters,
                                         'dconv' + str(layer_idx))
                    x = x + conv_out

                x = tf.squeeze(x, axis=1)
                attn_in = tf.contrib.layers.layer_norm(x)
                attn_out = multihead_attention(attn_in, output_filters, 8)
                ffn_in = tf.contrib.layers.layer_norm(attn_out)
                x = conv(ffn_in, output_filters, True, activation=tf.nn.relu, name="FFN")
        return x

In [None]:
# q=tf.placeholder(tf.float32,[batch_size,q_words,embedding_dim])
# print encoderblock(q,(7,1),128,4,"random")

In [None]:
def highway(x, size = None, activation = None, num_layers = 2, scope = "highway"):
    """Highway network built from ``conv`` projections.

    Args:
        x: input tensor; its last axis is the feature axis.
        size: layer width.  When None the input's own last dimension is used;
            otherwise the input is first projected to ``size``.
        activation: optional callable applied to the transform branch.
        num_layers: number of highway layers.
        scope: variable scope for all projections.

    Returns:
        Tensor where every layer computes ``H * T + x * (1 - T)`` with a
        sigmoid gate ``T``.
    """
    with tf.variable_scope(scope):
        if size is not None:
            x = conv(x, size, name="input_projection")
        else:
            size = x.shape.as_list()[-1]

        for layer in range(num_layers):
            gate = conv(x, size, bias=True, activation=tf.sigmoid,
                        name="gate_%d" % layer)
            transformed = conv(x, size, bias=True, activation=activation,
                               name="activation_%d" % layer)
            x = transformed * gate + x * (1.0 - gate)
        return x

In [None]:
# Trainable vector used by get_word_embedding for words missing from the
# GloVe dictionary.
unk = tf.get_variable(
    "unk",
    (word_embedding_dim),
    dtype=tf.float32,
    regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
    initializer=tf.contrib.layers.xavier_initializer())

# Trainable character-embedding table, one row per vocabulary entry.
char_emb = tf.get_variable(
    "char_emb",
    (no_of_chars, char_embedding_dim),
    dtype=tf.float32,
    regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
    initializer=tf.contrib.layers.xavier_initializer())

# Question inputs: word vectors and per-word character indices.
question_word = tf.placeholder(tf.float32, [batch_size, q_words, word_embedding_dim])
question_char = tf.placeholder(tf.int32, [batch_size, q_words, None])

# Context inputs: word vectors and per-word character indices.
context_word = tf.placeholder(tf.float32, [batch_size, c_words, word_embedding_dim])
context_char = tf.placeholder(tf.int32, [batch_size, c_words, None])

# Answer span boundaries, one index per example.
start_ans = tf.placeholder(tf.int32, [batch_size])
end_ans = tf.placeholder(tf.int32, [batch_size])

In [None]:
# Put a constant all-zero row in front of the trainable table so that
# character index 0 (the padding index) maps to a zero vector.
zero_char = tf.zeros(shape=[1, char_embedding_dim], dtype=tf.float32)
char_emb = tf.concat([zero_char, char_emb], axis=0)
print(char_emb)

In [None]:
question_char = tf.nn.embedding_lookup(char_emb,question_char)
print question_char
context_char = tf.nn.embedding_lookup(char_emb,context_char)
print context_char

In [None]:
with tf.variable_scope("Input_Embedding_Layer"):
    question_char1 = conv(question_char,char_embedding_dim,kernel_size=5,name="q_char_conv0")
    question_char1 = tf.reduce_max(question_char1,axis=2)
    question_emb = tf.concat([question_word,question_char1],axis=-1)
    question_emb = highway(question_emb, scope="q_highway")
    print question_emb
    
    context_char1 = conv(context_char,char_embedding_dim,kernel_size=5,name="c_char_conv0")
    context_char1 = tf.reduce_max(context_char1,axis=2)
    context_emb = tf.concat([context_word,context_char1],axis=-1)
    context_emb = highway(context_emb, scope="c_highway")
    print context_emb

In [None]:
with tf.variable_scope("Embedding_encoding_layer"):
    # Question and context go through separately scoped encoder blocks that
    # share the same hyperparameters (1x7 kernels, 128 filters, 4 conv layers).
    question = encoderblock(question_emb, kernel_size=(1, 7), output_filters=128,
                            num_conv_layers=4, scope_name="q_encoder_block")
    print(question)

    context = encoderblock(context_emb, kernel_size=(1, 7), output_filters=128,
                           num_conv_layers=4, scope_name="c_encoder_block")
    print(context)

In [None]:
with tf.variable_scope("Context-query_attention"):
    # Broadcast both encodings to [batch, c_words, q_words, channels] so every
    # (context word, question word) pair can be scored.
    q = tf.tile(tf.expand_dims(question, axis=1), [1, c_words, 1, 1])
    c = tf.tile(tf.expand_dims(context, axis=2), [1, 1, q_words, 1])

    # Similarity score per pair, computed from [q, c, q * c].
    pair_features = tf.concat([q, c, q * c], axis=-1)
    s = tf.squeeze(conv(pair_features, 1, name="similarity_matrix"), axis=-1)
    print(s)

    # Context-to-query attention: normalise over the question axis.
    s_ = tf.nn.softmax(s, axis=-1)
    a = tf.matmul(s_, question)
    print(a)

    # Query-to-context attention: additionally normalise over the context axis.
    s_over_context_t = tf.transpose(tf.nn.softmax(s, axis=1), [0, 2, 1])
    b = tf.matmul(s_, tf.matmul(s_over_context_t, context))
    print(b)

In [None]:
with tf.variable_scope("Model_encoder_layer"):
    enc_input = tf.concat([context, a, context * a, context * b], axis=-1)
    print(enc_input)
    enc_input = conv(enc_input, 128, name="conv0")

    # First pass creates the "encoder_layer" variables ...
    enc_input = encoderblock(enc_input, (1, 7), 128, 2, "encoder_layer", num_blocks=7)
    print(enc_input)
    output_list = [enc_input]

    # ... and every further pass re-applies the same weights (reuse=True) to
    # the output of the previous pass.
    for _pass in range(1, model_encoder_layers):
        temp = encoderblock(output_list[-1], (1, 7), 128, 2, "encoder_layer",
                            num_blocks=7, reuse=True)
        print(temp)
        output_list.append(temp)

In [None]:
with tf.variable_scope("Output_Layer"):
    # Unnormalised scores per context position for the span start / end.
    start_logits = tf.squeeze(conv(tf.concat([output_list[0], output_list[1]],axis = -1),1, bias = False, name = "start_pointer"),-1)
    end_logits = tf.squeeze(conv(tf.concat([output_list[0], output_list[2]],axis = -1),1, bias = False, name = "end_pointer"), -1)

    # Position probabilities (used for prediction by the evaluation loop).
    logits1 = tf.nn.softmax(start_logits)
    logits2 = tf.nn.softmax(end_logits)
    print(logits1)

    start = tf.one_hot(start_ans,c_words)
    end = tf.one_hot(end_ans,c_words)
    print(start)

    # Cross-entropy against the one-hot targets.  The log is taken BEFORE the
    # one-hot mask is applied: log(one_hot * prob) would evaluate log(0) at
    # every non-answer position and make the loss infinite.  log_softmax on
    # the raw logits is also numerically stable.  A label outside
    # [0, c_words) yields an all-zero one-hot row and so contributes zero.
    start_loss = -tf.reduce_sum(start * tf.nn.log_softmax(start_logits), axis=-1)
    end_loss = -tf.reduce_sum(end * tf.nn.log_softmax(end_logits), axis=-1)
    # NOTE(review): the L2 regularizers attached to the variables are not
    # added to this loss anywhere in the visible cells.
    losses = tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)
    print(losses)

In [None]:
# Step counter; minimize() increments it once per training step.
global_step = tf.Variable(0, trainable=False)

# Learning rate starts at 0.001 and is multiplied by 0.99 every 1000 steps
# (staircase, i.e. in discrete jumps).
starter_learning_rate = 0.001
learning_rate = tf.train.exponential_decay(
    starter_learning_rate,
    global_step,
    decay_steps=1000,
    decay_rate=0.99,
    staircase=True)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
minimize = optimizer.minimize(losses, global_step=global_step)

In [None]:
# Write the graph definition for TensorBoard.  No session is needed just to
# reach the graph, so the default graph is passed directly (a later cell
# creates the session used for training).  The log directory no longer
# carries a trailing space, and the writer is flushed so the event file is
# on disk right away.
train_writer = tf.summary.FileWriter('./logs/1/train', tf.get_default_graph())
train_writer.flush()

In [None]:
# Open the session used for the rest of the notebook and initialise every
# variable defined so far.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [None]:
# Parse the training set; the loops below read its 'data' list.
with open('train_data.json') as train_file:
    train_data = json.load(train_file)

In [None]:
# Peek at the first training example to check the record layout.
train_data['data'][0]

In [None]:
%%time
# Load the GloVe table; `model` is the word -> vector dictionary that
# get_word_embedding looks words up in.
model = loadGloveModel("GloVe/glove.840B.300d.txt")

In [None]:
# Saver used by the training loop to checkpoint the model when F1 improves.
saver = tf.train.Saver()

In [None]:
def get_word_embedding(l, word_limit):
    """Build a fixed-size matrix of word vectors for one token sequence.

    Tokens found in the global GloVe dictionary ``model`` use its vector; all
    other tokens use the current value of the trainable ``unk`` variable.
    Sequences shorter than ``word_limit`` are padded with zero rows, longer
    ones are truncated.

    Args:
        l: sequence of tokens.
        word_limit: number of rows in the returned matrix.

    Returns:
        numpy array of shape (word_limit, word_embedding_dim).
    """
    embedded = np.zeros((word_limit, word_embedding_dim))
    unk_vector = None
    # Only the first word_limit tokens can end up in the result, so the rest
    # are never looked up.
    for row, token in enumerate(l[:word_limit]):
        # Direct dict lookup: `token in model.keys()` scans a list of every
        # key on Python 2.
        vector = model.get(token)
        if vector is None:
            if unk_vector is None:
                # Evaluate the variable at most once per call instead of once
                # per unknown word.
                unk_vector = sess.run(unk)
            vector = unk_vector
        embedded[row] = vector
    return embedded

In [None]:
def get_char_embedding(l, word_limit):
    """Build a fixed-size matrix of character indices for one token sequence.

    Every token becomes one row of indices into the global ``vocab`` list,
    padded with 0 up to ``max_word_len``.  Tokens longer than
    ``max_word_len`` are truncated so that all rows have equal length, and
    characters missing from ``vocab`` map to 0 (the padding index) instead of
    raising ValueError.  Sequences shorter than ``word_limit`` are padded
    with all-zero rows, longer ones are truncated.

    Args:
        l: sequence of tokens (each token a sequence of characters).
        word_limit: number of rows in the returned matrix.

    Returns:
        Integer numpy array of shape (word_limit, max_word_len).
    """
    # One pass over vocab replaces a linear vocab.index() scan per character;
    # setdefault keeps the first position of any duplicate entry, matching
    # what list.index would return.
    char_to_index = {}
    for index, entry in enumerate(vocab):
        char_to_index.setdefault(entry, index)

    indices = np.zeros((word_limit, max_word_len), dtype=np.int64)
    for row, token in enumerate(l[:word_limit]):
        for col, char in enumerate(token[:max_word_len]):
            indices[row, col] = char_to_index.get(char, 0)
    return indices

In [None]:
def compute_f1(ans_predicted, ans_actual):
    """Accumulate token-overlap counts between predicted and gold answers.

    Answers are paired positionally; for each pair the multiset intersection
    of their tokens is the number of matching tokens.

    Args:
        ans_predicted: sequence of predicted answers, each a sequence of tokens.
        ans_actual: sequence of gold answers, each a sequence of tokens.

    Returns:
        Tuple ``(tp, fp, tn)``: matched tokens, predicted tokens that did not
        match, and gold tokens that were not predicted.  (The third value is
        what is usually called false negatives; the name is kept because the
        caller uses it.)
    """
    # Imported here because the notebook's import cell does not bring in
    # `collections`.
    from collections import Counter

    tp = 0
    fp = 0
    tn = 0
    for gold_toks, pred_toks in zip(ans_actual, ans_predicted):
        common = Counter(gold_toks) & Counter(pred_toks)
        num_same = sum(common.values())
        tp += num_same
        fp += len(pred_toks) - num_same
        tn += len(gold_toks) - num_same
    return tp, fp, tn

In [None]:
# Train for a fixed number of epochs; after each epoch evaluate token-level F1
# on the test set and checkpoint the model whenever F1 improves.
#
# NOTE(review): `test_data` is not defined in any visible cell; it has to be
# loaded (with 'question', 'context' and 'answer' fields) before this runs.
no_of_batches = int(len(train_data['data'])/batch_size)
epoch = 60
prev_f1=0
cur_f1=0
max_f1=0

# Prediction ops are created once, outside the loops, so the graph does not
# grow on every evaluation batch.  axis=1 selects the best context position
# for each example (the default axis would reduce over the batch instead).
pred_start = tf.argmax(logits1, axis=1)
pred_end = tf.argmax(logits2, axis=1)

for i in range(epoch):
    # ---- Training pass ------------------------------------------------------
    q_w, q_c, c_w, c_c, a_s, a_e = [], [], [], [], [], []
    for t in train_data['data']:
        # Examples marked with answer_start == -1 are skipped.
        if t['answer_start']==-1:
            continue
        q_w.append(get_word_embedding(t['question'],q_words))
        q_c.append(get_char_embedding(t['question'],q_words))
        c_w.append(get_word_embedding(t['context'],c_words))
        c_c.append(get_char_embedding(t['context'],c_words))
        a_s.append(t['answer_start'])
        a_e.append(t['answer_end'])
        if len(a_s)==batch_size:
            feed_dict={question_word:np.array(q_w),
                       question_char:np.array(q_c),
                       context_word:np.array(c_w),
                       context_char:np.array(c_c),
                       start_ans:np.array(a_s),
                       end_ans:np.array(a_e)}
            sess.run(minimize,feed_dict)
            # Start a fresh batch and keep going: every full batch in the
            # epoch is used for training (a trailing partial batch is dropped
            # because the placeholders have a fixed batch size).
            q_w, q_c, c_w, c_c, a_s, a_e = [], [], [], [], [], []
    print("Epoch -  " + str(i))

    # ---- Evaluation pass ----------------------------------------------------
    tp=0
    fp=0
    tn=0
    q_w, q_c, c_w, c_c = [], [], [], []
    ans_ac = []
    contexts = []
    for t in test_data['data']:
        q_w.append(get_word_embedding(t['question'],q_words))
        q_c.append(get_char_embedding(t['question'],q_words))
        c_w.append(get_word_embedding(t['context'],c_words))
        c_c.append(get_char_embedding(t['context'],c_words))
        ans_ac.append(t['answer'])
        contexts.append(t['context'])
        if len(contexts)==batch_size:
            feed_dict={question_word:np.array(q_w),
                       question_char:np.array(q_c),
                       context_word:np.array(c_w),
                       context_char:np.array(c_c)}
            # One run fetches both span boundaries.
            ans_st, ans_e = sess.run([pred_start, pred_end],feed_dict)
            # NOTE(review): the slice excludes position ans_e; confirm whether
            # 'answer_end' in the data is inclusive.
            ans_pred = [ctx[s:e] for ctx, s, e in zip(contexts, ans_st, ans_e)]
            # Score only this batch so that no example is counted twice.
            x,y,z=compute_f1(ans_pred, ans_ac)
            tp+=x
            fp+=y
            tn+=z
            # Reset the buffers so the next feed has exactly batch_size rows.
            q_w, q_c, c_w, c_c = [], [], [], []
            ans_ac = []
            contexts = []

    # Guard against empty predictions / no overlap instead of dividing by zero.
    precision = 1.0 * tp / (tp+fp) if (tp+fp) else 0.0
    recall = 1.0 * tp / (tp+tn) if (tp+tn) else 0.0
    if precision + recall:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    print("F1 Score:" + str(f1))
    if f1>max_f1:
        save_path = saver.save(sess, "./saved_model/model.ckpt")
        print("Model saved in path: %s" % save_path)
        max_f1 = f1
    else:
        print("Not Saved")
