In [1]:
import tensorflow as tf
import numpy as np
import re
import random
import json
import collections

In [2]:
def tokenize(string):
    """Remove every parenthesis from `string` and split what is left on whitespace."""
    without_parens = re.sub(r'\(|\)', '', string)
    tokens = without_parens.split()
    return tokens

In [3]:
# Maps the "gold_label" string of an example to its integer class id.
# NOTE(review): "hidden" shares id 0 with "entailment" -- presumably a
# placeholder for unlabeled examples; confirm before evaluating on such data.
LABEL_MAP = {
    "entailment": 0,
    "neutral": 1,
    "contradiction": 2,
    "hidden" : 0
}
# Reserved vocabulary tokens.
PADDING = "<PAD>"
UNKNOWN = "<UNK>"
def load_uwre_data(path):
    """
    Load UWRE examples from a JSON-lines file (one JSON object per line).

    Examples whose "gold_label" is not a key of LABEL_MAP are dropped. Each
    kept example gets two extra fields: "label" (the integer class id from
    LABEL_MAP) and "genre" (always "uwre"). Blank lines are ignored.

    Args:
        path: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A list of example dicts, shuffled deterministically (seed 1).

    Raises:
        KeyError: if a record has no "gold_label" field.
        ValueError: if a non-blank line is not valid JSON.
    """
    data = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            # A trailing or stray empty line would make json.loads raise.
            if not line.strip():
                continue
            loaded_example = json.loads(line)
            if loaded_example["gold_label"] not in LABEL_MAP:
                continue
            loaded_example["label"] = LABEL_MAP[loaded_example["gold_label"]]
            loaded_example["genre"] = "uwre"
            data.append(loaded_example)
    # A private generator yields the same permutation as seeding the global
    # one with 1, without resetting the random state of the rest of the program.
    random.Random(1).shuffle(data)
    return data


In [4]:
def build_uwre_dictionary(training_datasets):
    """
    Extract the vocabulary of the training data and assign an index to each word.

    Every 'sentence1' and 'sentence2' field is tokenized with `tokenize`.
    PADDING always gets index 0 and UNKNOWN index 1; the remaining words follow
    in sorted order, so the mapping is identical on every run (iterating a set
    of strings is not).

    Args:
        training_datasets: Iterable of datasets, each an iterable of example
            dicts with 'sentence1' and 'sentence2' string fields.

    Returns:
        dict mapping each word to a unique index in range(len(result)).
    """
    word_counter = collections.Counter()
    for dataset in training_datasets:
        for example in dataset:
            word_counter.update(tokenize(example['sentence1']))
            word_counter.update(tokenize(example['sentence2']))

    reserved = [PADDING, UNKNOWN]
    # Skip corpus tokens that collide with the reserved ones; otherwise they
    # would overwrite indices 0/1 and leave gaps in the index range.
    words = sorted(word for word in word_counter if word not in reserved)
    vocabulary = reserved + words

    return {word: index for index, word in enumerate(vocabulary)}

In [5]:
def loadEmbedding_rand(path, word_indices, dim=300):
    """
    Load GloVe-style embeddings, with a random normal initialization for OOV words.

    Each line of the file is expected to hold a word followed by `dim`
    whitespace-separated numbers. When a word itself contains spaces, the
    last `dim` fields are taken as the vector and the first field as the word.
    Lines with fewer than `dim` + 1 fields (blank lines, headers) are skipped.

    Args:
        path: Path of the UTF-8 encoded embedding text file.
        word_indices: dict mapping word -> row index in the returned matrix.
        dim: Embedding dimension (defaults to 300).

    Returns:
        float32 array of shape (len(word_indices), dim). Rows 0 and 1 are all
        zeros unless a word mapped to them appears in the file; rows of words
        missing from the file keep their random normal values.
    """
    n = len(word_indices)
    emb = np.random.normal(size=(n, dim)).astype(np.float32)

    # Indices 0 and 1 are the ones build_uwre_dictionary gives to PADDING and UNKNOWN.
    emb[0:2, :] = 0.0

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            s = line.split()
            if len(s) < dim + 1 or s[0] not in word_indices:
                continue
            # s[-dim:] equals s[1:] for regular lines and drops the extra
            # word pieces of multi-token entries.
            emb[word_indices[s[0]], :] = np.asarray(s[-dim:], dtype=np.float32)
    return emb

In [6]:
def length(sequence):
    """
    Count the non-zero entries of each sequence and build a mask over them.

    An entry equal to 0 is treated as padding; any other entry counts as a token.

    Input of shape: (batch_size, max_seq_length)
    Output shapes,
    length: (batch_size)
    mask: (batch_size, max_seq_length, 1)
    """
    # 1 where the entry is non-zero, 0 where it is zero.
    is_token = tf.sign(tf.abs(sequence))
    token_counts = tf.reduce_sum(is_token, axis=1)
    return (
        tf.cast(token_counts, tf.int32),
        tf.cast(tf.expand_dims(is_token, -1), tf.float32),
    )

In [7]:
def biLSTM(inputs, dim, seq_len, name):
    """
    Run a bi-directional LSTM layer over `inputs`.

    Args:
        inputs: Input tensor handed to tf.nn.bidirectional_dynamic_rnn.
        dim: Number of units of the forward and of the backward LSTM cell.
        seq_len: Passed as `sequence_length`; must be a vector with one
            length per example in the batch, not a scalar.
        name: Used for the name scope and as the RNN variable scope.

    Returns:
        The two values returned by tf.nn.bidirectional_dynamic_rnn:
        a (forward, backward) tuple of per-timestep outputs and a
        (forward, backward) tuple of final LSTM states.
    """
    with tf.name_scope(name):
        with tf.variable_scope('forward' + name):
            fwd_cell = tf.contrib.rnn.LSTMCell(num_units=dim)
        with tf.variable_scope('backward' + name):
            bwd_cell = tf.contrib.rnn.LSTMCell(num_units=dim)

        rnn_outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=fwd_cell,
            cell_bw=bwd_cell,
            inputs=inputs,
            sequence_length=seq_len,
            dtype=tf.float32,
            scope=name,
        )

    return rnn_outputs, final_states

In [8]:
def reader(inputs, lengths, output_size, contexts=(None, None), scope=None):
    """Dynamic bi-LSTM reader that can start from the states of another RNN.

    Args:
        inputs (tensor): The inputs into the bi-LSTM.
        lengths (tensor): The lengths of the sequences.
        output_size (int): Number of units of each LSTM cell of the reader.
        contexts (tuple): (forward, backward) initial states for the two
            LSTMs; (None, None) lets TensorFlow use its default initial state.
        scope (string): The TensorFlow variable scope for the reader;
            "reader" is used when it is None.
    Returns:
        outputs: The outputs returned by tf.nn.bidirectional_dynamic_rnn.
        states: The final states returned by tf.nn.bidirectional_dynamic_rnn.
    """
    with tf.variable_scope(scope or "reader"):
        forward_cell = tf.contrib.rnn.LSTMCell(
            output_size, initializer=tf.contrib.layers.xavier_initializer())
        backward_cell = tf.contrib.rnn.LSTMCell(
            output_size, initializer=tf.contrib.layers.xavier_initializer())
        return tf.nn.bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            inputs,
            sequence_length=lengths,
            initial_state_fw=contexts[0],
            initial_state_bw=contexts[1],
            dtype=tf.float32
        )

In [9]:
# Load the training examples (labelled, tagged with a genre and shuffled by load_uwre_data).
training_uwre = load_uwre_data('./mini_uwre.json')

In [10]:
# Build the word -> index mapping from the sentences of the training set.
word_indices = build_uwre_dictionary([training_uwre])

In [11]:
# Vocabulary size, counting the reserved PADDING and UNKNOWN entries.
len(word_indices)

216

In [12]:
# One embedding row per vocabulary entry; words missing from the GloVe file keep a random row.
loaded_embeddings = loadEmbedding_rand('./mini_glove.txt', word_indices)

In [13]:
# Sanity check: the shape should be (vocabulary size, 300).
np.shape(loaded_embeddings)

(216, 300)

In [14]:
# Dimensions hard-coded in the cells below:
#   embedding_dim = emb_dim = 300 (embedding dimension)
#   dim = hidden_dim = 300 (hidden embedding dimension)
#   sequence_length = seq_length = 50
# Alias for the loaded matrix; it is the initial value of the embedding variable E.
embeddings = loaded_embeddings


In [15]:
## Define the placeholders
# Integer matrices with one row per example and 50 columns; the lookup in
# emb_drop treats the entries as row indices into E.
# NOTE(review): presumably sentences are padded with index 0 (PADDING) up to
# 50 tokens -- the feeding code is not shown here; confirm.
premise_x = tf.placeholder(tf.int32, [None, 50]) # sequence_length = 50
hypothesis_x = tf.placeholder(tf.int32, [None, 50])# sequence_length = 50
# One integer per example; presumably the class id from LABEL_MAP (unused in the visible cells).
y = tf.placeholder(tf.int32, [None])
# Scalar handed to tf.nn.dropout in emb_drop.
keep_rate_ph = tf.placeholder(tf.float32, [])

In [16]:
# Inspect the static shape and dtype of the hypothesis placeholder.
hypothesis_x

<tf.Tensor 'Placeholder_1:0' shape=(?, 50) dtype=int32>

In [17]:
## Define parameters
# Embedding matrix, initialized from the loaded embeddings. The second
# positional argument of tf.Variable is `trainable`.
E = tf.Variable(embeddings, True)

# Weight matrix (2400 = 300 * 8 rows, 300 columns) and bias of size 300.
# NOTE(review): presumably the MLP layer applied to the concatenated sentence
# features -- its use is not shown in these cells; confirm.
W_mlp = tf.Variable(tf.random_normal([300 * 8, 300], stddev=0.1))
b_mlp = tf.Variable(tf.random_normal([300], stddev=0.1))

# Weight matrix (300 x 3) and bias of size 3; LABEL_MAP has three distinct
# class ids, so this is presumably the classification layer -- confirm.
W_cl = tf.Variable(tf.random_normal([300, 3], stddev=0.1))
b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

In [18]:
## Function for embedding lookup and dropout at embedding layer
def emb_drop(x):
    """Look up the rows of `E` for the ids in `x`, then apply dropout with `keep_rate_ph`."""
    looked_up = tf.nn.embedding_lookup(E, x)
    return tf.nn.dropout(looked_up, keep_rate_ph)

In [19]:
# Get lengths of unpadded sentences
# `length` counts the non-zero ids per row and also returns a float mask with
# a trailing axis of size 1 that marks the non-zero positions.
prem_seq_lengths, mask_prem = length(premise_x)
hyp_seq_lengths, mask_hyp = length(hypothesis_x)

In [20]:
# Inspect the static shape and dtype of the hypothesis mask.
mask_hyp

<tf.Tensor 'Cast_1:0' shape=(?, 50, 1) dtype=float32>

In [21]:
# Inspect the static shape and dtype of the premise mask.
mask_prem

<tf.Tensor 'Cast:0' shape=(?, 50, 1) dtype=float32>

In [22]:
### First cbiLSTM layer ###
# Embed both sentences (lookup in E followed by dropout) to get the LSTM inputs.
premise_in = emb_drop(premise_x)
hypothesis_in = emb_drop(hypothesis_x)

In [29]:
# Inspect the embedded hypothesis tensor (ids -> 300-dimensional vectors).
hypothesis_in

<tf.Tensor 'dropout_1/mul:0' shape=(?, 50, 300) dtype=float32>

In [24]:
# Encode the hypothesis first. `sequence_length` must be a vector with one
# entry per example, so pass the computed lengths rather than the padded size 50.
hypothesis_outs, c2 = biLSTM(hypothesis_in, dim=300, seq_len=hyp_seq_lengths, name='hypothesis')
with tf.variable_scope("conditional_first_premise_layer") as fstPremise_scope:
    # The premise reader starts from the hypothesis' final states (c2), so its
    # cells need the same size as the hypothesis LSTM (300). There is no
    # `self` at notebook level, hence the literal instead of `self.dim`.
    premise_outs, c1 = reader(premise_in, prem_seq_lengths, 300, c2, scope=fstPremise_scope)