## Model

In [9]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe
# Turn on eager execution (TF 1.x contrib API) so that the small demo cells
# further down, e.g. the tf.matrix_band_part call, return concrete values.
# NOTE(review): the model cells below use graph-style APIs (tf.get_variable,
# batch.get_next()); they are shown unexecuted in this notebook.
tfe.enable_eager_execution()

In [None]:
def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True):
    """Wire up inputs, embedding tables, masks, the forward graph and the train op.

    Args:
        config: settings object. This method reads ``batch_size``,
            ``char_limit``, ``para_limit``, ``ques_limit`` and ``grad_clip``
            from it and keeps it on ``self.config`` for ``ready``.
        batch: object whose ``get_next()`` yields seven tensors: context word
            ids, question word ids, context char ids, question char ids,
            start labels, end labels and the QA id.
        word_mat: initial value of the frozen word-embedding table; handed
            straight to ``tf.constant``.
        char_mat: initial value of the trainable char-embedding table; handed
            straight to ``tf.constant``.
        trainable: when truthy, additionally create the learning-rate
            variable, the Adadelta optimizer and ``self.train_op``.
        opt: when truthy, crop every input to the longest sequence present in
            the batch; otherwise use the fixed limits from ``config``.
    """
    self.config = config
    self.global_step = tf.get_variable(
        'global_step', shape=[], dtype=tf.int32,
        initializer=tf.constant_initializer(0), trainable=False)

    # c: context; q: question; ch: context chars; qh: question chars;
    # y1: start label; y2: end label; qa_id: example id
    (self.c, self.q, self.ch, self.qh,
     self.y1, self.y2, self.qa_id) = batch.get_next()

    self.is_train = tf.get_variable(
        "is_train", shape=[], dtype=tf.bool, trainable=False)

    # word embedding table (kept frozen)
    word_init = tf.constant(word_mat, dtype=tf.float32)
    self.word_mat = tf.get_variable(
        "word_mat", initializer=word_init, trainable=False)

    # char embedding table
    char_init = tf.constant(char_mat, dtype=tf.float32)
    self.char_mat = tf.get_variable("char_mat", initializer=char_init)

    # padding masks (id 0 casts to False); used by attention and the pointer network
    self.c_mask = tf.cast(self.c, tf.bool)
    self.q_mask = tf.cast(self.q, tf.bool)

    # number of non-padding words per example
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

    if not opt:
        # no cropping: fall back to the configured fixed limits
        self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit
    else:
        batch_size, char_limit = config.batch_size, config.char_limit

        # longest context / question (in words) inside this batch
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)

        # word ids cropped to [bs, c_maxlen] and [bs, q_maxlen]
        self.c = tf.slice(self.c, [0, 0], [batch_size, self.c_maxlen])
        self.q = tf.slice(self.q, [0, 0], [batch_size, self.q_maxlen])

        # masks cropped the same way
        self.c_mask = tf.slice(
            self.c_mask, [0, 0], [batch_size, self.c_maxlen])
        self.q_mask = tf.slice(
            self.q_mask, [0, 0], [batch_size, self.q_maxlen])

        # char ids cropped to [bs, c_maxlen, char_limit] and [bs, q_maxlen, char_limit]
        self.ch = tf.slice(
            self.ch, [0, 0, 0], [batch_size, self.c_maxlen, char_limit])
        self.qh = tf.slice(
            self.qh, [0, 0, 0], [batch_size, self.q_maxlen, char_limit])

        # one-hot start / end labels cropped to [bs, c_maxlen]
        self.y1 = tf.slice(self.y1, [0, 0], [batch_size, self.c_maxlen])
        self.y2 = tf.slice(self.y2, [0, 0], [batch_size, self.c_maxlen])

    # number of non-padding chars per context word, flattened to 1-D
    ch_nonzero = tf.cast(tf.cast(self.ch, tf.bool), tf.int32)
    self.ch_len = tf.reshape(tf.reduce_sum(ch_nonzero, axis=2), [-1])
    # number of non-padding chars per question word, flattened to 1-D
    qh_nonzero = tf.cast(tf.cast(self.qh, tf.bool), tf.int32)
    self.qh_len = tf.reshape(tf.reduce_sum(qh_nonzero, axis=2), [-1])

    # build the forward graph; sets self.loss, self.yp1 and self.yp2
    self.ready()

    if not trainable:
        return

    self.lr = tf.get_variable(
        "lr", shape=[], dtype=tf.float32, trainable=False)
    self.opt = tf.train.AdadeltaOptimizer(
        learning_rate=self.lr, epsilon=1e-6)

    # clip by global norm before applying the update
    grads_and_vars = self.opt.compute_gradients(self.loss)
    gradients, variables = zip(*grads_and_vars)
    capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip)
    self.train_op = self.opt.apply_gradients(
        zip(capped_grads, variables), global_step=self.global_step)


In [None]:
def ready(self):
    """Build the forward graph and set ``self.yp1``, ``self.yp2`` and ``self.loss``.

    Stages, each in its own variable scope: char + word embedding ("emb"),
    shared context/question encoder ("encoding"), question-to-context
    attention ("attention"), context self-attention ("match"), pointer
    network ("pointer") and span prediction / loss ("predict").

    Reads the tensors prepared in ``__init__`` (``self.c``, ``self.q``,
    ``self.ch``, ``self.qh``, masks, lengths, labels). The helpers
    ``cudnn_gru``, ``native_gru``, ``dropout``, ``dot_attention``, ``summ``
    and ``ptr_net`` are not defined in this cell.
    """
    config = self.config
    # N: batch size; PL / QL: context / question length; CL: chars per word;
    # d: hidden size; dc: char embedding dim; dg: char GRU hidden size
    N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
    # choose the GRU implementation according to the config flag
    gru = cudnn_gru if config.use_cudnn else native_gru

    with tf.variable_scope("emb"):
        # char embedding
        with tf.variable_scope("char"):
            # look up char vectors and fold the word axis into the batch axis
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc]) # [bs * c_maxlen, char_limit, char_dim]
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc]) # [bs * q_maxlen, char_limit, char_dim]

            # variational dropout
            # same dropout mask for each timestep (per the original note;
            # the `dropout` helper is defined elsewhere -- TODO confirm)
            ch_emb = dropout(
                ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)  # [bs * c_maxlen, char_limit, char_dim]
            qh_emb = dropout(
                qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)  # [bs * q_maxlen, char_limit, char_dim]

            # bi_gru for context: keep only the final forward / backward states
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)

            # bi_gru for question: the same two cell objects are passed again
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)

            # unfold the word axis again
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) # [bs, q_maxlen, 2 * char_hidden]
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) # [bs, c_maxlen, 2 * char_hidden]

        # word embedding
        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) # [bs, c_maxlen, word_emb_dim]
            q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) # [bs, q_maxlen, word_emb_dim]

        # concat word and char-level representations along the feature axis
        c_emb = tf.concat([c_emb, ch_emb], axis=2) # [bs, c_maxlen, word_emb_dim + 2 * char_hidden]
        q_emb = tf.concat([q_emb, qh_emb], axis=2) # [bs, q_maxlen, word_emb_dim + 2 * char_hidden]

    # Q and C encoding: one `rnn` object is applied to both context and question
    with tf.variable_scope("encoding"):
        rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape(
        ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
        # NOTE(review): the last dimension depends on the `gru` helper; the
        # pointer stage below slices the last 2 * d features of `q` -- TODO confirm
        c = rnn(c_emb, seq_len=self.c_len) # [bs, c_maxlen, ?]
        q = rnn(q_emb, seq_len=self.q_len) # [bs, q_maxlen, ?]

    # C Q attention
    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                               keep_prob=config.keep_prob, is_train=self.is_train) # original note: [bs, c_maxlen, 2 * hidden_size] (unverified here)
        rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
        ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
        att = rnn(qc_att, seq_len=self.c_len) # original note: [bs, c_maxlen, hidden_size] (unverified here)

    # C C self attention
    with tf.variable_scope("match"):
        self_att = dot_attention(
            att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) # original note: [bs, c_maxlen, 2 * hidden_size] (unverified here)
        rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape(
        ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
        match = rnn(self_att, seq_len=self.c_len)  # original note: [bs, c_maxlen, hidden_size] (unverified here)

    # pointer network
    # logits 1: start position logits; logits 2: end position logits
    with tf.variable_scope("pointer"):
        # summarise the question (last 2 * d features of `q`) into the initial pointer state
        init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                    keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        pointer = ptr_net(batch=N, hidden=init.get_shape().as_list(
        )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        logits1, logits2 = pointer(init, match, d, self.c_mask)

    # prediction and loss
    with tf.variable_scope("predict"):
        ##### for prediction
        ##### During prediction, we choose the best span from token i to token i' such that i<=i'<=i+15 and p1*p2 is maximized.
        # outer product: p1*p2
        # tf.expand_dims(tf.nn.softmax(logits1), axis=2): [bs, c_maxlen, 1]
        # tf.expand_dims(tf.nn.softmax(logits2), axis=1): [bs, 1, c_maxlen]
        # outer: [bs, c_maxlen, c_maxlen]
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))  

        # keep only entries with start <= end <= start + 15 (main diagonal plus
        # 15 super-diagonals); everything else is zeroed, the shape is unchanged
        outer = tf.matrix_band_part(outer, 0, 15)  # [bs, c_maxlen, c_maxlen]

        # yp1: predicted start index; yp2: predicted end index (argmax positions, not probabilities)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        ##### for training: see the loss-formula image below
        # labels are wrapped in stop_gradient so no gradient flows into them
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                            logits=logits1, labels=tf.stop_gradient(self.y1))  # padding positions are included in the loss
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                            logits=logits2, labels=tf.stop_gradient(self.y2))  # padding positions are included in the loss
        # mean over the batch of (start loss + end loss)
        self.loss = tf.reduce_mean(losses + losses2)


<img src='../img/loss_formula.png' width=800>

In [2]:
# Toy stand-ins for the start / end probability vectors of one example:
# `a` is a column vector (c_maxlen, 1) and `b` is a row vector (1, c_maxlen).
# The shapes are derived from `c_maxlen` instead of repeating the literal 3.
# NOTE: no random seed is set, so the printed values differ between runs.
c_maxlen = 3
a = np.random.randint(0, 10, [c_maxlen, 1])
b = np.random.randint(0, 10, [1, c_maxlen])

In [4]:
# column vector, shape (3, 1)
a

array([[5],
       [6],
       [6]])

In [5]:
# row vector, shape (1, 3)
b

array([[0, 6, 2]])

In [6]:
# element-wise product with broadcasting: (3, 1) * (1, 3) -> (3, 3) table of all pairwise products
a * b

array([[ 0, 30, 10],
       [ 0, 36, 12],
       [ 0, 36, 12]])

In [7]:
# matrix product of (3, 1) and (1, 3): the same outer product as the broadcast above,
# which is what the tf.matmul of the two expanded softmax vectors computes in `ready`
a @ b

array([[ 0, 30, 10],
       [ 0, 36, 12],
       [ 0, 36, 12]])

In [16]:
# 15 x 15 random integer matrix used to visualise tf.matrix_band_part in the next cell
# (unseeded, so the values change on every run)
c = np.random.randint(0,20,[15,15])
c

array([[ 2, 10,  4, 13,  3,  6,  8,  3, 10,  6,  5,  4,  7, 14, 11],
       [10,  7, 17, 13, 13, 19, 16, 15, 15, 17, 17,  2,  6, 17, 16],
       [ 6,  6,  7, 16, 17,  2, 19, 14,  7, 17, 12, 15,  0,  2, 16],
       [ 3, 19,  7,  4,  6, 17,  9, 15, 12, 18,  4, 11, 16, 11,  5],
       [ 8, 18, 11,  2,  3,  8, 13,  3,  6, 19,  9, 13,  2, 15, 18],
       [14,  1,  4, 16, 10, 11, 17,  6, 16, 11,  6, 15,  6,  3,  2],
       [16, 18,  9, 13,  5, 10,  4, 15, 15,  6,  2,  3,  6, 19, 18],
       [ 9,  6,  0, 14, 14,  8,  1,  3, 14, 13, 12, 13, 14,  0, 13],
       [ 2, 10,  1, 15,  1, 11, 13,  2,  3, 14,  4, 15,  9, 15, 19],
       [19,  9,  3, 18,  6,  5, 13, 13, 19, 17,  7, 12,  4,  4,  9],
       [16, 15, 10,  8, 12, 10, 15, 17,  2, 14, 19,  9,  3, 19,  9],
       [15, 11,  6,  7,  0, 13,  7,  9,  6,  9, 16, 11, 18, 14,  9],
       [13,  9,  6,  2,  7, 19,  2,  7,  2,  4,  3,  5,  1, 19,  7],
       [ 2,  2,  6, 14,  0, 13,  6, 15, 17,  3,  9,  6,  5,  6, 10],
       [ 3,  6,  6,  3, 11, 18, 11

In [17]:
# num_lower=0, num_upper=5: keep the main diagonal and the 5 diagonals above it,
# zero everything else; the result keeps the (15, 15) shape
tf.matrix_band_part(c, 0, 5)

<tf.Tensor: id=12, shape=(15, 15), dtype=int64, numpy=
array([[ 2, 10,  4, 13,  3,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  7, 17, 13, 13, 19, 16,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  7, 16, 17,  2, 19, 14,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  6, 17,  9, 15, 12,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  3,  8, 13,  3,  6, 19,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 11, 17,  6, 16, 11,  6,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  4, 15, 15,  6,  2,  3,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  3, 14, 13, 12, 13, 14,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  3, 14,  4, 15,  9, 15,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 17,  7, 12,  4,  4,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 19,  9,  3, 19,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 11, 18, 14,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 19,  7],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [None]:
def get_loss(self):
    """Return ``self.loss``, the mean start + end cross-entropy set by ``ready``."""
    loss = self.loss
    return loss

def get_global_step(self):
    """Return ``self.global_step``, the non-trainable step counter variable."""
    step = self.global_step
    return step