In [1]:
import tensorflow as tf
import numpy as np

In [2]:
embeddings = tf.constant(np.load('./data/squad/glove.trimmed.100.npz')['glove'], name='W')

In [3]:
a = open('./data/squad/train.ids.context', 'r')

In [4]:
lines = a.readlines()

In [5]:
b = map(int, lines[0].strip().split(' '))

In [6]:
def build_mask(x):
    return tf.ones_like(x, dtype=tf.int32)

def make_dataset(filenames, batch_size=10, epoch_size=2, with_mask=False):
    dataset = tf.contrib.data.TextLineDataset(filenames)
    
    dataset = (dataset.map(lambda line: tf.string_split([line]).values)
                    .map(lambda strings: tf.string_to_number(strings, out_type=tf.int32)))
    if with_mask:
        dataset = dataset.map(lambda x: (x, build_mask(x)))
                    
    
    return dataset

In [7]:
dataset1 = make_dataset(['./data/squad/train.ids.context'], with_mask=True)
dataset2 = make_dataset(['./data/squad/train.ids.question'], with_mask=True)
dataset3 = make_dataset(['./data/squad/train.span'])

dataset = tf.contrib.data.Dataset.zip((dataset1, dataset2, dataset3))
dataset = (dataset.padded_batch(10, padded_shapes=(([None], [None]), ([None], [None]), ([None])))
                    .repeat(2)
                    .shuffle(buffer_size=10000))
iterator = dataset.make_one_shot_iterator()
context_tuple, question_tuple, span = iterator.get_next()
context, cmask = context_tuple
question, qmask = question_tuple



In [8]:
context_embed = tf.nn.embedding_lookup(embeddings, context)
question_embed = tf.nn.embedding_lookup(embeddings, question)



In [9]:
from tensorflow.python.util import nest
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops.rnn_cell import DropoutWrapper
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn
from functools import reduce
from operator import mul


def flatten(tensor, keep):
    fixed_shape = tensor.get_shape().as_list()
    start = len(fixed_shape) - keep
    left = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start)])
    out_shape = [left] + [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start, len(fixed_shape))]
    flat = tf.reshape(tensor, out_shape)
    return flat


def reconstruct(tensor, ref, keep):
    ref_shape = ref.get_shape().as_list()
    tensor_shape = tensor.get_shape().as_list()
    ref_stop = len(ref_shape) - keep
    tensor_start = len(tensor_shape) - keep
    pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)]
    keep_shape = [tensor_shape[i] or tf.shape(tensor)[i] for i in range(tensor_start, len(tensor_shape))]
    # pre_shape = [tf.shape(ref)[i] for i in range(len(ref.get_shape().as_list()[:-keep]))]
    # keep_shape = tensor.get_shape().as_list()[-keep:]
    target_shape = pre_shape + keep_shape
    out = tf.reshape(tensor, target_shape)
    return out


def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0,
           is_train=None):
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
                     for arg in flat_args]
    flat_out = rnn_cell_impl._linear(flat_args, output_size, bias, 
                                     bias_initializer=tf.zeros_initializer())
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1])
    if wd:
        add_wd(wd)

    return out


def highway_layer(arg, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
    with tf.variable_scope(scope or "highway_layer"):
        d = arg.get_shape()[-1]
        with tf.variable_scope('trans'):
            trans = linear([arg], d, bias, bias_start=bias_start, scope='trans', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
            trans = tf.nn.relu(trans)
        with tf.variable_scope('gate'):
            gate = linear([arg], d, bias, bias_start=bias_start, scope='gate', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
            gate = tf.nn.sigmoid(gate)
        out = gate * trans + (1 - gate) * arg
        return out

def highway_network(arg, num_layers, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
    with tf.variable_scope(scope or "highway_network"):
        prev = arg
        cur = None
        for layer_idx in range(num_layers):
            cur = highway_layer(prev, bias, bias_start=bias_start, scope="layer_{}".format(layer_idx), wd=wd,
                                input_keep_prob=input_keep_prob, is_train=is_train)
            prev = cur
        return cur
    
# TODO do you need this?   
class SwitchableDropoutWrapper(DropoutWrapper):
    def __init__(self, cell, is_train, input_keep_prob=1.0, output_keep_prob=1.0,
             seed=None):
        super(SwitchableDropoutWrapper, self).__init__(cell, input_keep_prob=input_keep_prob, output_keep_prob=output_keep_prob,
                                                       seed=seed)
        self.is_train = is_train

    def __call__(self, inputs, state, scope=None):
        outputs_do, new_state_do = super(SwitchableDropoutWrapper, self).__call__(inputs, state, scope=scope)
        tf.get_variable_scope().reuse_variables()
        outputs, new_state = self._cell(inputs, state, scope)
        outputs = tf.cond(self.is_train, lambda: outputs_do, lambda: outputs)
        if isinstance(state, tuple):
            new_state = state.__class__(*[tf.cond(self.is_train, lambda: new_state_do_i, lambda: new_state_i)
                                       for new_state_do_i, new_state_i in zip(new_state_do, new_state)])
        else:
            new_state = tf.cond(self.is_train, lambda: new_state_do, lambda: new_state)
        return outputs, new_state
    
def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                              initial_state_fw=None, initial_state_bw=None,
                              dtype=None, parallel_iterations=None,
                              swap_memory=False, time_major=False, scope=None):
    assert not time_major

    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
    
    print(inputs.get_shape(), flat_inputs.get_shape(), 'Flattening')
    flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')

    (flat_fw_outputs, flat_bw_outputs), final_state = \
        _bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
                                   initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
                                   dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory,
                                   time_major=time_major, scope=scope)

    fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
    bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
    print(flat_fw_outputs.get_shape(), fw_outputs.get_shape(), 'Reconstruct')
    # FIXME : final state is not reshaped!
    return (fw_outputs, bw_outputs), final_state



In [10]:
with tf.variable_scope('context'):
    context_embed = highway_network(context_embed, 2, True, wd=0.0, is_train=False)

with tf.variable_scope('question'):
    question_embed = highway_network(question_embed, 2, True, wd=0.0, is_train=False)


In [11]:
from tensorflow.python.ops.rnn_cell import BasicLSTMCell
from tensorflow.python.ops.rnn_cell import DropoutWrapper

# TODO the input_keep_prob should be 1.0 for inference
cell = BasicLSTMCell(100, state_is_tuple=True)

is_train = tf.constant(False)
d_cell = SwitchableDropoutWrapper(cell, is_train, input_keep_prob=0.5)

c_len = tf.reduce_sum(tf.cast(cmask, 'int32'), 1)  # [N]
q_len = tf.reduce_sum(tf.cast(qmask, 'int32'), 1)  # [N]

# TODO figure out why these embeds are in float64
context_embed = tf.cast(context_embed, tf.float32)
question_embed = tf.cast(question_embed, tf.float32)

#TODO remove reuse for production
with tf.variable_scope('prepro'):
    (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = _bidirectional_dynamic_rnn(d_cell, d_cell, question_embed, q_len, dtype='float', scope='u1') # [N, J, d], [N, d]

    tf.get_variable_scope().reuse_variables()

    (fw_h, bw_h), _ = _bidirectional_dynamic_rnn(cell, cell, context_embed, c_len, dtype='float', scope='u1')  # [N, M, JX, 2d]



In [35]:
sess.run([tf.shape(c_len)])

[array([10], dtype=int32)]

In [12]:
u = tf.concat([fw_u, bw_u], 2)
h = tf.concat([fw_h, bw_h], 2)

In [13]:
VERY_BIG_NUMBER = 1e30
VERY_SMALL_NUMBER = 1e-30
VERY_POSITIVE_NUMBER = VERY_BIG_NUMBER
VERY_NEGATIVE_NUMBER = -VERY_BIG_NUMBER

def exp_mask(val, mask, name=None):
    """Give very negative number to unmasked elements in val.
    For example, [-3, -2, 10], [True, True, False] -> [-3, -2, -1e9].
    Typically, this effectively masks in exponential space (e.g. softmax)
    Args:
        val: values to be masked
        mask: masking boolean tensor, same shape as tensor
        name: name for output tensor

    Returns:
        Same shape as val, where some elements are very small (exponentially zero)
    """
    if name is None:
        name = "exp_mask"
    return tf.add(val, (1 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER, name=name)


def linear_logits(args, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
    with tf.variable_scope(scope or "Linear_Logits"):
        logits = linear(args, 1, bias, bias_start=bias_start, squeeze=True, scope='first',
                        wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
        if mask is not None:
            logits = exp_mask(logits, mask)
        return logits


In [14]:
# Attention Layer
with tf.variable_scope('attention_layer'):
    JQ = tf.shape(u)[1]
    JX = tf.shape(h)[1]

    h_aug = tf.tile(tf.expand_dims(h, 2), [1, 1, JQ, 1])
    u_aug = tf.tile(tf.expand_dims(u, 1), [1, JX, 1, 1])

    h_mask_aug = tf.tile(tf.expand_dims(cmask, 2), [1, 1, JQ])
    u_mask_aug = tf.tile(tf.expand_dims(qmask, 1), [1, JX, 1])
    hu_mask = tf.cast(h_mask_aug, tf.bool) & tf.cast(u_mask_aug, tf.bool)
    hu_aug = h_aug * u_aug
    u_logits = linear_logits([h_aug, u_aug, hu_aug], True, scope='u_logits', mask=hu_mask)
    u_logits_reshaped = flatten(u_logits, 1)
    u_softmax = tf.nn.softmax(u_logits_reshaped)
    out = reconstruct(u_softmax, u_logits, 1)
    target_rank = len(u_aug.get_shape().as_list())
    u_a = tf.reduce_sum(tf.expand_dims(out, -1) * u_aug, target_rank - 2)

In [15]:
h_logits = tf.reduce_sum(u_logits, 2)
h_logits_reshaped = flatten(h_logits, 1)
h_softmax = tf.nn.softmax(h_logits_reshaped)
out = reconstruct(h_softmax, h_logits, 1)
target_rank = len(h.get_shape().as_list())
h_a = tf.reduce_sum(tf.expand_dims(out, -1) * h, target_rank - 2)
h_a = tf.tile(tf.expand_dims(h_a, 1), [1, JX, 1])



In [16]:
p0 = tf.concat([h, u_a, h * u_a, h * h_a], 2)


In [17]:
with tf.variable_scope("main"):
    cell = BasicLSTMCell(100, state_is_tuple=True)
    first_cell = SwitchableDropoutWrapper(cell, is_train, input_keep_prob=0.5)
    (fw_g0, bw_g0), _ = _bidirectional_dynamic_rnn(first_cell, first_cell, p0, c_len, dtype='float', scope='g0')  # [N, JX, 2d]
    g0 = tf.concat([fw_g0, bw_g0], 2)

    cell = BasicLSTMCell(100, state_is_tuple=True)
    first_cell = SwitchableDropoutWrapper(cell, is_train, input_keep_prob=0.5)
    (fw_g1, bw_g1), _ = _bidirectional_dynamic_rnn(first_cell, first_cell, g0, c_len, dtype='float', scope='g1')  # [N, JX, 2d]
    g1 = tf.concat([fw_g1, bw_g1], 2)


In [18]:
logits = linear_logits([g1, p0], 100, 0.0, scope='logits1', mask=cmask)

In [19]:
def softmax(logits, mask=None, scope=None):
    with tf.name_scope(scope or "Softmax"):
        if mask is not None:
            logits = exp_mask(logits, mask)
        flat_logits = flatten(logits, 1)
        flat_out = tf.nn.softmax(flat_logits)
        out = reconstruct(flat_out, logits, 1)

        return out

def softsel(target, logits, mask=None, scope=None):
    """

    :param target: [ ..., J, d] dtype=float
    :param logits: [ ..., J], dtype=float
    :param mask: [ ..., J], dtype=bool
    :param scope:
    :return: [..., d], dtype=float
    """
    with tf.name_scope(scope or "Softsel"):
        a = softmax(logits, mask=mask)
        target_rank = len(target.get_shape().as_list())
        out = tf.reduce_sum(tf.expand_dims(a, -1) * target, target_rank - 2)
        return out


In [20]:
# TODO use batch _size
a1i = softsel(tf.reshape(g1, [10, JX, 2 * 100]), tf.reshape(logits, [10, JX]))
a1i = tf.tile(tf.expand_dims(a1i, 1), [1, JX, 1])

In [21]:
flat_logits1 = tf.reshape(logits, [-1, JX])
flat_yp = tf.nn.softmax(flat_logits1)  # [-1, M*JX]
yp = tf.reshape(flat_yp, [-1, JX])


In [22]:
cell = BasicLSTMCell(100, state_is_tuple=True)
d_cell = SwitchableDropoutWrapper(cell, is_train, input_keep_prob=0.5)
(fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell, d_cell, tf.concat([p0, g1, a1i, g1 * a1i], 2),
                                                          c_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
g2 = tf.concat([fw_g2, bw_g2], 2)
logits2 = linear_logits([g2, p0], 100, 0.0, scope='logits2', mask=cmask)
flat_logits2 = tf.reshape(logits2, [-1, JX])
flat_yp = tf.nn.softmax(flat_logits2)  # [-1, M*JX]
yp2 = tf.reshape(flat_yp, [-1, JX])

(TensorShape([Dimension(10), Dimension(None), Dimension(1400)]), TensorShape([Dimension(10), Dimension(None), Dimension(1400)]), 'Flattening')
(TensorShape([Dimension(10), Dimension(None), Dimension(100)]), TensorShape([Dimension(10), Dimension(None), Dimension(100)]), 'Reconstruct')


In [33]:
span0, span1 = tf.split(span, num_or_size_splits=2, axis=1)

loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(span0, [-1]), logits=flat_logits1, name='loss1')
loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(span1, [-1]), logits=flat_logits2, name='loss2')
loss = tf.reduce_mean(tf.reshape(loss1, [-1, 1]) * tf.to_float(cmask)) + tf.reduce_mean(tf.reshape(loss2, [-1, 1]) * tf.to_float(cmask))
#sess.run([tf.shape(tf.reshape(span0, [-1, 1]))])

In [34]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run([tf.shape(g0), tf.shape(g1), tf.shape(logits), tf.shape(yp), loss, tf.shape(loss2)])


[array([ 10, 168, 200], dtype=int32),
 array([ 10, 168, 200], dtype=int32),
 array([ 10, 168], dtype=int32),
 array([ 10, 168], dtype=int32),
 7.1617618,
 array([10], dtype=int32)]