In [11]:
import os
import sys
import random
import argparse
import json
import nltk
import numpy as np
from tqdm import tqdm
import importlib
import unicodedata
from six.moves.urllib.request import urlretrieve

import tensorflow as tf
from tensorflow.nn import embedding_lookup
from tensorflow import keras
from tensorflow.python.ops.rnn_cell import DropoutWrapper
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.ops import rnn_cell

from text_helper import maybe_download
from batch_helper import get_batch_generator

# NOTE(review): reloading `sys` looks like a leftover from the Python 2
# `reload(sys); sys.setdefaultencoding(...)` idiom. It is kept unchanged here, but
# it is presumably unnecessary on Python 3 -- confirm before removing.
importlib.reload(sys)

# Seed every source of randomness used by this notebook so reruns are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Graph-level TensorFlow seed (weight initializers, dropout). It has to be set
# before the ops are created, which is why it lives in the first cell.
tf.set_random_seed(SEED)

In [4]:
# Directory that holds the SQuAD JSON files, the preprocessed files and the GloVe
# vectors. It can be overridden with the SQUAD_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original location remains the default.
DEFAULT_DATA_DIR = os.environ.get(
    "SQUAD_DATA_DIR",
    "/users/vijay/MIDS/w266/Project/AnsweringMachines/dataset/",
)
# Base URL the SQuAD dataset files are downloaded from.
SQUAD_BASE_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

In [5]:
# Special vocabulary tokens. These are byte strings, whereas the words get_glove
# reads from the GloVe text file are `str`, so word2id ends up with mixed key types.
_PAD = b"<pad>"
_UNK = b"<unk>"
# Tokens that get_glove places at the start of the vocabulary, before the GloVe words.
_START_VOCAB = [_PAD, _UNK]
# Word ids of the special tokens; they match the positions of _PAD and _UNK in _START_VOCAB.
PAD_ID = 0
UNK_ID = 1

In [6]:
def download_and_process(data_dir = DEFAULT_DATA_DIR):
    """
    Download the SQuAD v1.1 train and dev sets into data_dir and preprocess them.

    The preprocessing step is expected to produce 4 files per dataset (train & dev):
      span: has the start & end span numbers for the answer (e.g. 34 35)
      answer: the text version of span above (e.g. "hundred yue")
      question: text tokens of the question (e.g. "who populated the ...")
      context: text tokens of the context paragraph (e.g. "the area of modern zhejiang was ...")

    Inputs:
      data_dir: directory used both for the downloaded JSON files and for the
        preprocessed output. It is created if it does not exist.

    NOTE(review): data_from_json, total_exs and preprocess_and_write are neither
    defined nor imported in the cells visible here (only maybe_download is imported
    from text_helper) -- confirm they are in scope before calling this function.
    """

    print( "Will download SQuAD datasets to {}".format(data_dir))
    print( "Will put preprocessed SQuAD datasets in {}".format(data_dir))

    # exist_ok=True replaces the exists()/makedirs() pair, which could fail if the
    # directory appeared between the check and the creation.
    os.makedirs(data_dir, exist_ok=True)

    train_filename = "train-v1.1.json"
    dev_filename = "dev-v1.1.json"

    # download train set
    maybe_download(SQUAD_BASE_URL, train_filename, data_dir, None)

    # read train set
    train_data = data_from_json(os.path.join(data_dir, train_filename))
    print( "Train data has %i examples total" % total_exs(train_data))

    # preprocess train set and write to file
    preprocess_and_write(train_data, 'train', data_dir)

    # download dev set
    maybe_download(SQUAD_BASE_URL, dev_filename, data_dir, None)

    # read dev set
    dev_data = data_from_json(os.path.join(data_dir, dev_filename))
    print( "Dev data has %i examples total" % total_exs(dev_data))

    # preprocess dev set and write to file
    preprocess_and_write(dev_data, 'dev', data_dir)

In [7]:
def get_glove(glove_path, embedding_size, vocab_size=int(4e5)):
    """Reads from original GloVe .txt file and returns embedding matrix and
    mappings from words to word ids.

    Input:
      glove_path: path to glove.6B.{glove_dim}d.txt
      embedding_size: integer; needs to match the dimension in glove_path
      vocab_size: integer; number of word lines expected in the GloVe file,
        NOT counting the special tokens. Defaults to 400000 (glove.6B).

    Returns:
      emb_matrix: Numpy array shape (vocab_size + 2, glove_dim) containing glove
        embeddings (plus PAD and UNK embeddings in first two rows); with the
        default vocab_size this is (400002, glove_dim).
        The rows of emb_matrix correspond to the word ids given in word2id and id2word
      word2id: dictionary mapping word (string) to word id (int)
      id2word: dictionary mapping word id (int) to word (string)

    Raises:
      ValueError: if the file contains more than vocab_size word lines.
      AssertionError: if a line does not carry exactly embedding_size values.
    """
    print("Loading GloVe vectors from file: {:s}".format(glove_path))

    # One row per GloVe word plus one per special token. (Previously the special
    # tokens were counted twice, leaving two unused all-zero rows at the end and a
    # progress-bar total that the file could never reach.)
    num_special = len(_START_VOCAB)
    num_rows = vocab_size + num_special
    emb_matrix = np.zeros((num_rows, embedding_size))
    word2id = {}
    id2word = {}

    # initialize the extra tokens with random vectors
    emb_matrix[:num_special, :] = np.random.randn(num_special, embedding_size)

    # add the extra tokens to the dictionaries; they take ids 0 .. num_special - 1
    # NOTE(review): the special tokens are bytes while GloVe words are str, so the
    # dictionaries hold mixed key types -- confirm that callers expect this.
    for idx, word in enumerate(_START_VOCAB):
        word2id[word] = idx
        id2word[idx] = word
    idx = num_special

    # GloVe files contain non-ASCII tokens, so do not rely on the platform's
    # default text encoding.
    with open(glove_path, 'r', encoding='utf-8') as fp:
        for line in tqdm(fp, total=vocab_size):
            fields = line.strip().split(" ")
            word = fields[0]
            vector = list(map(float, fields[1:]))
            assert len(vector) == embedding_size, (
                "Expected {} values for word {!r}, got {}".format(
                    embedding_size, word, len(vector)))
            if idx >= num_rows:
                raise ValueError(
                    "GloVe file {} has more than vocab_size={} words".format(
                        glove_path, vocab_size))
            emb_matrix[idx, :] = vector
            word2id[word] = idx
            id2word[idx] = word
            idx += 1

    return emb_matrix, word2id, id2word

In [8]:
# Clear all flags so that the cell below can be executed multiple times
def del_all_flags(FLAGS):
    """Delete every flag currently registered on FLAGS.

    The flag names returned by FLAGS._flags() are copied into a list first, so
    that mapping is not iterated while attributes are being removed.
    """
    for flag_name in list(FLAGS._flags()):
        delattr(FLAGS, flag_name)

In [9]:
# Hyperparameters and run configuration. The model and training code below read
# these as notebook-level globals.

# Dropout rate; training feeds keep_prob = 1.0 - dropout to the model.
dropout=0.3
# Number of examples per batch requested from get_batch_generator.
batch_size=32
# Hidden size of each direction of the GRU encoder.
hidden_size_encoder=150
# Number of outputs of the fully connected layer applied to the blended representations.
hidden_size_fully_connected=200
# Length of the context placeholders (second dimension of context_ids / context_mask).
context_len=300
# Length of the question placeholders (second dimension of question_ids / question_mask).
question_len=30
# Dimension of the GloVe vectors; passed to get_glove.
embedding_size=100
# Directory the GloVe file and the train/dev data files are read from.
data_dir=DEFAULT_DATA_DIR
# Number of passes over the training data in SQuAD_Model.train.
num_epochs=10
# Learning rate of the Adam optimizer.
learning_rate=5e-5
# Threshold passed to tf.clip_by_global_norm when clipping gradients.
max_gradient_norm=5.0

In [10]:
# Define path for glove vecs. The file name is derived from embedding_size so that
# the file and the dimension passed to get_glove cannot drift apart.
glove_path = os.path.join(data_dir, "glove.6B.{}d.txt".format(embedding_size))
assert os.path.exists(glove_path), "GloVe file not found: {}".format(glove_path)


# Load embedding matrix and vocab mappings
emb_matrix, word2id, id2word = get_glove(glove_path, embedding_size)

  1%|          | 3506/400002 [00:00<00:11, 35059.28it/s]

Loading GloVe vectors from file: /users/vijay/MIDS/w266/Project/AnsweringMachines/dataset/glove.6B.100d.txt


100%|█████████▉| 400000/400002 [00:09<00:00, 42277.57it/s]


In [18]:
class RNNEncoder(object):
    """
    General-purpose module to encode a sequence using a RNN.
    It feeds the input through a RNN and returns all the hidden states.

    Note: In lecture 8, we talked about how you might use a RNN as an "encoder"
    to get a single, fixed size vector representation of a sequence
    (e.g. by taking element-wise max of hidden states).
    Here, we're using the RNN as an "encoder" but we're not taking max;
    we're just returning all the hidden states. The terminology "encoder"
    still applies because we're getting a different "encoding" of each
    position in the sequence, and we'll use the encodings downstream in the model.

    This code uses a bidirectional GRU, but you could experiment with other types of RNN.
    """

    def __init__(self, hidden_size, keep_prob):
        """
        Inputs:
          hidden_size: int. Hidden size of the RNN
          keep_prob: Tensor containing a single scalar that is the keep probability (for dropout)
        """
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob
        # One GRU cell per direction. The cells themselves apply no dropout;
        # dropout is applied only to the concatenated outputs in build_graph.
        self.rnn_cell_fw = tf.compat.v1.keras.layers.GRUCell(units=self.hidden_size)
        self.rnn_cell_bw = tf.compat.v1.keras.layers.GRUCell(units=self.hidden_size)

    def build_graph(self, inputs, masks, scopename):
        """
        Inputs:
          inputs: Tensor shape (batch_size, seq_len, input_size)
          masks: Tensor shape (batch_size, seq_len).
            Has 1s where there is real input, 0s where there's padding.
            This is used to make sure tf.nn.bidirectional_dynamic_rnn doesn't iterate through masked steps.
          scopename: string. Name of the variable scope the ops are created in.

        Returns:
          out: Tensor shape (batch_size, seq_len, hidden_size*2).
            This is all hidden states (fw and bw hidden states are concatenated).
        """
        with vs.variable_scope(scopename, reuse=tf.AUTO_REUSE):

            # Number of real (non-padding) positions per example.
            # `axis` replaces the deprecated `reduction_indices` argument.
            input_lens = tf.reduce_sum(masks, axis=1) # shape (batch_size)

            # Note: fw_out and bw_out are the hidden states for every timestep.
            # Each is shape (batch_size, seq_len, hidden_size).
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(self.rnn_cell_fw,
                                                                  self.rnn_cell_bw,
                                                                  inputs,
                                                                  sequence_length=input_lens,
                                                                  dtype=tf.float32)

            # Concatenate the forward and backward hidden states
            # The shape is now (batch_size, seq_len, hidden_size*2)
            out = tf.concat([fw_out, bw_out], 2)

            # Apply dropout
            out = tf.nn.dropout(out, rate=1-self.keep_prob)

            return out

In [None]:
def masked_softmax(logits, mask, dim):
    """
    Softmax over one dimension of logits that ignores padded positions.

    Inputs:
      logits: Tensor. The softmax is taken over dimension dim.
      mask: Tensor broadcastable against logits.
        Has 1s where logits holds real data and 0s where it holds padding.
      dim: int. Dimension over which to take the softmax.

    Returns:
      masked_logits: Tensor, same shape as logits.
        Equal to logits, except that 1e30 has been subtracted
        (i.e. a very large negative number added) at the padding locations.
      prob_dist: Tensor, same shape as logits.
        Softmax of masked_logits over the given dimension; effectively 0 at the
        padding locations and summing to 1 over that dimension.
    """
    # 1.0 at padding positions, 0.0 at real positions
    padding_indicator = 1 - tf.cast(mask, 'float')
    # -large where there's padding, 0 elsewhere
    exp_mask = padding_indicator * (-1e30)
    # push the padded logits to -large so they receive no probability mass
    masked_logits = tf.add(logits, exp_mask)
    return masked_logits, tf.nn.softmax(masked_logits, dim)

In [15]:
class SimpleSoftmaxLayer(object):
    """
    Turns a sequence of hidden states (e.g. one per context position) into a
    probability distribution over the positions of that sequence.
    """

    def __init__(self):
        # The layer has no configuration of its own.
        pass

    def build_graph(self, inputs, masks):
        """
        Projects every hidden state down to a single score, then takes a
        masked softmax over the sequence.

        Inputs:
          inputs: Tensor shape (batch_size, seq_len, hidden_size)
          masks: Tensor shape (batch_size, seq_len)
            Has 1s where there is real input, 0s where there's padding.

        Outputs:
          logits: Tensor shape (batch_size, seq_len)
            The projected scores, with -1e30 (i.e. very large negative number)
            added at the padded locations.
          prob_dist: Tensor shape (batch_size, seq_len)
            Softmax over the logits: 0 at the padded locations, the remaining
            entries sum to 1.
        """
        with vs.variable_scope("SimpleSoftmaxLayer", reuse=tf.AUTO_REUSE):

            # Linear down-projection (no activation): (batch_size, seq_len, 1)
            projected = tf.contrib.layers.fully_connected(inputs, num_outputs=1, activation_fn=None)
            # Drop the trailing singleton dimension: (batch_size, seq_len)
            scores = tf.squeeze(projected, axis=[2])

            # Softmax over the sequence dimension, skipping padding
            return masked_softmax(scores, masks, 1)

In [16]:
class BasicAttn(object):
    """Module for basic (dot-product) attention.

    The module is written in terms of "keys" and "values": in the terminology of
    "X attends to Y", the keys attend to the values.

    In the baseline model the keys are the context hidden states and the values
    are the question hidden states. The generic names are used (rather than
    context and question) so the module can be reused with other inputs.
    """

    def __init__(self, keep_prob, key_vec_size, value_vec_size):
        """
        Inputs:
          keep_prob: tensor containing a single scalar that is the keep probability (for dropout)
          key_vec_size: size of the key vectors. int
          value_vec_size: size of the value vectors. int
        """
        self.keep_prob = keep_prob
        self.key_vec_size = key_vec_size
        self.value_vec_size = value_vec_size

    def build_graph(self, values, values_mask, keys):
        """
        Lets every key attend to the values and returns, per key, the attention
        distribution and the resulting attention output vector.

        Inputs:
          values: Tensor shape (batch_size, num_values, value_vec_size).
          values_mask: Tensor shape (batch_size, num_values).
            1s where there's real input, 0s where there's padding
          keys: Tensor shape (batch_size, num_keys, value_vec_size)

        Outputs:
          attn_dist: Tensor shape (batch_size, num_keys, num_values).
            For each key the distribution sums to 1 and is 0 at the value
            locations that correspond to padding.
          output: Tensor shape (batch_size, num_keys, value_vec_size).
            The attention output: the sum of the values weighted by the
            attention distribution, after dropout.
        """
        with vs.variable_scope("BasicAttn", reuse=tf.AUTO_REUSE):

            # Dot-product score between every key and every value
            values_t = tf.transpose(values, perm=[0, 2, 1]) # (batch_size, value_vec_size, num_values)
            scores = tf.matmul(keys, values_t) # (batch_size, num_keys, num_values)
            print("Basic attn keys", keys.shape)
            print("Basic attn values", values_t.shape)
            print("Basic attn logits", scores.shape)

            # Broadcast the value mask across the keys: (batch_size, 1, num_values)
            scores_mask = tf.expand_dims(values_mask, 1)
            # Softmax over the values dimension, ignoring padded values
            _, attn_dist = masked_softmax(scores, scores_mask, 2)

            # Weighted sum of the values: (batch_size, num_keys, value_vec_size)
            weighted_values = tf.matmul(attn_dist, values)

            # Apply dropout
            output = tf.nn.dropout(weighted_values, rate=1-self.keep_prob)

            return attn_dist, output

In [14]:
class SQuAD_Model(object):
    """Top-level Question Answering module.

    Besides its constructor arguments, this class reads the notebook-level
    hyperparameters as globals: context_len, question_len, hidden_size_encoder,
    hidden_size_fully_connected, max_gradient_norm, learning_rate, dropout,
    batch_size and num_epochs. They must be defined before the model is built
    or trained. train() additionally uses the `time` and `logging` modules, which
    this notebook imports in a later cell, so that cell must run before train().
    """

    def __init__(self, id2word, word2id, emb_matrix):
        """
        Initializes the QA model.

        Inputs:
          id2word: dictionary mapping word idx (int) to word (string)
          word2id: dictionary mapping word (string) to word idx (int)
          emb_matrix: numpy array shape (num_words, embedding_size) containing
            pre-trained GloVe embeddings, as returned by get_glove
        """
        print("Initializing the SQuAD_Model...")

        self.id2word = id2word
        self.word2id = word2id
        self.emb_matrix = emb_matrix

        with tf.variable_scope("QAModel",
                               initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, uniform=True),
                               reuse=tf.AUTO_REUSE):
            # Placeholders for one batch of inputs
            self.context_ids = tf.placeholder(tf.int32, shape=[None, context_len])
            self.context_mask = tf.placeholder(tf.int32, shape=[None, context_len])
            self.question_ids = tf.placeholder(tf.int32, shape=[None, question_len])
            self.question_mask = tf.placeholder(tf.int32, shape=[None, question_len])
            self.answer_span = tf.placeholder(tf.int32, shape=[None, 2])

            # Add a placeholder to feed in the keep probability (for dropout).
            # This is necessary so that we can instruct the model to use dropout when training, but not when testing
            self.keep_prob = tf.placeholder_with_default(1.0, shape=())
            self.hidden_size_fully_connected = hidden_size_fully_connected

            # Look up the embeddings of the context and question ids;
            # defines self.context_embeds & self.question_embeds
            self.add_embedding_layer(emb_matrix)

            # Build the graph from the embeddings up to the start/end distributions
            self.build_graph()

            # Add the loss on top of the start/end logits
            self.add_loss()

        # Define trainable parameters, gradient, gradient norm, and clip by gradient norm
        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
        self.param_norm = tf.global_norm(params)

        # Define optimizer and updates
        # (updates is what you need to fetch in session.run to do a gradient update)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate) # you can try other optimizers
        self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

        self.summaries = tf.summary.merge_all()

    def add_embedding_layer(self, emb_matrix):
        """
        Adds word embedding layer to the graph.

        Inputs:
          emb_matrix: numpy array shape (num_words, embedding_size).
            The GloVe vectors, plus vectors for PAD and UNK.

        Defines:
          self.context_embeds, self.question_embeds: the embedded context and
            question ids.
        """
        # The embeddings are a constant, i.e. they are not trained.
        embedding_matrix = tf.constant(emb_matrix, dtype=tf.float32, name="emb_matrix")
        self.context_embeds = tf.nn.embedding_lookup(embedding_matrix, self.context_ids)
        self.question_embeds = tf.nn.embedding_lookup(embedding_matrix, self.question_ids)

    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings
           to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and
            end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross
            entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.

        encoder = RNNEncoder(hidden_size=hidden_size_encoder, keep_prob=self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embeds, self.context_mask, scopename='RNNEncoder')
        question_hiddens = encoder.build_graph(self.question_embeds, self.question_mask, scopename='RNNEncoder')

        # Use context hidden states to attend to question hidden states - Basic Attention
        last_dim = context_hiddens.get_shape().as_list()[-1]
        print("last dim", last_dim)

        attn_layer = BasicAttn(self.keep_prob, last_dim,
                                   last_dim)
        _, attn_output = attn_layer.build_graph(question_hiddens,
                                                self.question_mask,
                                                context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.hidden_size_fully_connected) # blended_reps_final is shape (batch_size, context_len, hidden_size_fully_connected)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist", reuse=tf.AUTO_REUSE):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist", reuse=tf.AUTO_REUSE):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)


    def add_loss(self):
        """
        Add loss computation to the graph.

        Uses:
              self.logits_start, self.logits_end: shape (batch_size, context_len)
                IMPORTANT: Assumes that the logits are masked (i.e. have -large in masked locations).
                That's because the tf.nn.sparse_softmax_cross_entropy_with_logits
                function applies softmax and then computes cross-entropy loss.
                So you need to apply masking to the logits (by subtracting large
                number in the padding location) BEFORE you pass to the
                sparse_softmax_cross_entropy_with_logits function.

              self.answer_span: shape (batch_size, 2)
                Contains the gold start and end locations

            Defines:
              self.loss_start, self.loss_end, self.loss: all scalar tensors
        """
        with vs.variable_scope("loss", reuse=tf.AUTO_REUSE):

            # Calculate loss for prediction of start position
            loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_start, labels=self.answer_span[:, 0]) # loss_start has shape (batch_size)
            self.loss_start = tf.reduce_mean(loss_start) # scalar. avg across batch
            tf.summary.scalar('loss_start', self.loss_start) # log to tensorboard

            # Calculate loss for prediction of end position
            loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_end, labels=self.answer_span[:, 1])
            self.loss_end = tf.reduce_mean(loss_end)
            tf.summary.scalar('loss_end', self.loss_end)

            # Add the two losses
            self.loss = self.loss_start + self.loss_end
            tf.summary.scalar('loss', self.loss)

    def run_train_iter(self, session, batch):
        """
        This performs a single training iteration (forward pass, loss computation, backprop, parameter update)

        Inputs:
          session: TensorFlow session
          batch: a Batch object; its context_ids, context_mask, qn_ids, qn_mask
            and ans_span attributes are fed to the placeholders

        Returns:
          loss: The loss (averaged across the batch) for this batch.
          global_step: The current number of training iterations we've done
          param_norm: Global norm of the parameters
          gradient_norm: Global norm of the gradients
        """
        # Match up our input data with the placeholders
        input_feed = {
            self.context_ids: batch.context_ids,
            self.context_mask: batch.context_mask,
            self.question_ids: batch.qn_ids,
            self.question_mask: batch.qn_mask,
            self.answer_span: batch.ans_span,
            self.keep_prob: 1.0 - dropout, # apply dropout
        }

        # output_feed contains the things we want to fetch.
        output_feed = [self.updates, self.summaries, self.loss, self.global_step, self.param_norm, self.gradient_norm]

        # Run the model. The merged summaries are fetched but not written anywhere.
        _, _, loss, global_step, param_norm, gradient_norm = session.run(fetches=output_feed, feed_dict=input_feed)

        return loss, global_step, param_norm, gradient_norm

    def train(self, session, train_context_path, train_qn_path, train_ans_path, dev_qn_path, dev_context_path, dev_ans_path):
        """
        Main training loop.

        Inputs:
          session: TensorFlow session
          {train/dev}_{qn/context/ans}_path: paths to {train/dev}.{context/question/answer} data files.
            The dev paths are accepted but not used by this loop.
        """

        # Print number of model parameters.
        # The count uses the variables' static shapes, so no extra ops are added to
        # the graph and nothing has to be evaluated in a session.
        tic = time.time()
        params = tf.trainable_variables()
        num_params = sum(int(np.prod(v.get_shape().as_list())) for v in params)
        toc = time.time()
        logging.info("Number of params: %d (retrieval took %f secs)" % (num_params, toc - tic))

        epoch = 0
        print_every = 1  # log every print_every-th iteration

        logging.info("Beginning training loop...")
        while epoch < num_epochs:
            epoch += 1

            # Loop over batches
            for batch in get_batch_generator(self.word2id, train_context_path, train_qn_path, train_ans_path, batch_size, context_len=context_len, question_len=question_len, discard_long=True):

                # Run training iteration
                iter_tic = time.time()
                loss, global_step, param_norm, grad_norm = self.run_train_iter(session, batch)
                iter_toc = time.time()
                iter_time = iter_toc - iter_tic

                # Sometimes print info to screen
                if global_step % print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, loss %.5f, grad norm %.5f, param norm %.5f, batch time %.3f' %
                        (epoch, global_step, loss, grad_norm, param_norm, iter_time))


In [19]:
# Build the model graph from the GloVe vocabulary mappings and embedding matrix.
# NOTE(review): the output recorded for this cell is a TypeError ("Using a
# `tf.Tensor` as a Python `bool` is not allowed"), so model construction did not
# succeed in the saved run; the recorded output does not show where it was raised.
qm = SQuAD_Model(id2word=id2word, word2id=word2id, emb_matrix=emb_matrix)

Initializing the SQuAD_Model...


TypeError: Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.

In [None]:
# Filepaths of the train/dev data files (tokenized contexts, tokenized questions
# and answer spans), all located in data_dir.
(train_context_path, train_qn_path, train_ans_path,
 dev_context_path, dev_qn_path, dev_ans_path) = [
    os.path.join(data_dir, filename)
    for filename in (
        "train.context", "train.question", "train.span",
        "dev.context", "dev.question", "dev.span",
    )
]

In [None]:
# Sanity check: static shape of the embedded context of the first example in a batch.
qm.context_embeds[0].shape

In [None]:
import time
import logging
import re
logging.basicConfig(level=logging.INFO)

In [None]:
# Open a session, initialise every graph variable, then run the training loop.
with tf.Session() as session:
    init_op = tf.global_variables_initializer()
    session.run(init_op)
    qm.train(
        session,
        train_context_path=train_context_path,
        train_qn_path=train_qn_path,
        train_ans_path=train_ans_path,
        dev_qn_path=dev_qn_path,
        dev_context_path=dev_context_path,
        dev_ans_path=dev_ans_path,
    )