# Question Answering with SQUAD

### Setup (if using Google Colaboratory)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [2]:
%cd /content/drive/My\ Drive/NLP\ HW5

/content/drive/My Drive/NLP HW5


### Imports

In [1]:
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.notebook import tqdm
random.seed(1)

## Load Data

Read in the pickle files provided to you. We have preprocessed and tokenized all the question answering data form you. Just run the following cells to initialize your word and char to index maps as well as training data. 

For `word2idx` and `char2idx`, index 0 is reserved for *<unk>* and 1 is reserved for *<pad>*. 

In [2]:
word2idx_file = 'data/word2idx.pickle'
char2idx_file = 'data/char2idx.pickle'
train_pkl = 'data/train.pkl'
valid_pkl = 'data/valid.pkl'

In [3]:
word2idx = pickle.load(open(word2idx_file, 'rb')) # word2idx 
idx2word = {v : k for k, v in word2idx.items()}
word_vocab = list(word2idx.keys())
print("Vocab size:", len(word_vocab))
# for i in range(5):
    # print(word_vocab[i], word2idx[word_vocab[i]], idx2word[word2idx[word_vocab[i]]])
# Print <unk> and <pad>
print(idx2word[0], word2idx['<unk>'])
print(idx2word[1], word2idx['<pad>'])

Vocab size: 23284
<unk> 0
<pad> 1


In [4]:
MAX_WORD_LEN = max([len(x) for x in word2idx.keys()])
MAX_WORD_LEN

30

In [5]:
char2idx = pickle.load(open(char2idx_file, 'rb')) # char2idx 
idx2char = {v : k for k, v in char2idx.items()}
char_vocab = list(char2idx.keys())
print("Unique chars:", len(char_vocab))
print(idx2char[0], char2idx['<unk>'])
print(idx2char[1], char2idx['<pad>'])

Unique chars: 96
<unk> 0
<pad> 1


In [6]:
val_df = pd.read_pickle(valid_pkl).reset_index(drop=True)
train_df = pd.read_pickle(train_pkl).reset_index(drop=True)
print("Train samples", len(train_df))
print("Valid samples", len(val_df))
train_df.head()

Train samples 8504
Valid samples 2013


Unnamed: 0,id,context,question,label,answer,context_ids,question_ids,label_idx
0,56be85543aeaaa14008c9063,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"[269, 286]",in the late 1990s,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[55, 21, 98, 412, 1067, 510, 5]","[56, 59]"
1,56be85543aeaaa14008c9065,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"[207, 226]",singing and dancing,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[11, 270, 21, 98, 5164, 7, 90, 128, 12, 1204, ...","[44, 46]"
2,56be85543aeaaa14008c9066,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"[526, 530]",2003,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[55, 21, 98, 1571, 550, 16, 609, 8, 176, 10, 7...","[112, 112]"
3,56bf6b0f3aeaaa14008c9601,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"[166, 180]","Houston, Texas","[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[31, 25, 82, 8, 153, 21, 98, 293, 4104, 113, 5]","[36, 38]"
4,56bf6b0f3aeaaa14008c9602,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"[276, 286]",late 1990s,"[72, 9510, 3491, 19, 2772, 28, 13132, 9511, 19...","[31, 35, 857, 21, 98, 176, 765, 5]","[58, 59]"


In [7]:
sample = train_df.sample(1).iloc[0]
print("Question:", sample.question)
print("Question Ids:", sample.question_ids)
print("Answer:", sample.answer)
print("Answer Ids:", [word2idx[x] for x in sample.answer.split()])
[char_start, char_end] = sample.label
print("Context:", sample.context[char_start - 30 : char_end + 30])
[token_start, token_end] = sample.label_idx
print("Context Ids:", sample.context_ids[token_start - 5: token_end + 5])

Question: Who fought over the movie rights for Thunderball?
Question Ids: [32, 5118, 101, 2, 477, 397, 15, 4053, 5]
Answer: Ian Fleming and Kevin McClory
Answer Ids: [5748, 3372, 8, 10385, 4052]
Context: tion starting in 1961 between Ian Fleming and Kevin McClory over the film rights to the n
Context Ids: [8096, 1300, 7, 6698, 111, 5748, 3372, 8, 10385, 4052, 101, 2, 149, 397]


In [8]:
# Counts
print("Train samples", len(train_df))
print("Val samples", len(val_df))

Train samples 8504
Val samples 2013


In [9]:
gold_val_answers = val_df[['id', 'answer']].groupby('id')['answer'].apply(list)
gold_val_answers.head()

id
56ddde6b9a695914005b9628                     [France, France, France, France]
56ddde6b9a695914005b9629    [10th and 11th centuries, in the 10th and 11th...
56ddde6b9a695914005b962a    [Denmark, Iceland and Norway, Denmark, Iceland...
56ddde6b9a695914005b962b                         [Rollo, Rollo, Rollo, Rollo]
56ddde6b9a695914005b962c    [10th century, the first half of the 10th cent...
Name: answer, dtype: object

# 1. BiLSTM + Attention (same as [BiDAF](https://arxiv.org/abs/1611.01603) but different encoder)

> Let's implement a variant of BiDAF using the same LSTM code from HW4 as encoder. 


<img src="img/BiDAF.png">

Here, we will swap out *Contextual Embed Layer*, *Word Embed Layer*, and *Character Embed Layer* with our own LSTM-based encoder. 

In [10]:
# you will need to import again after every change you make
#    autoreload does not work here for some reason (let us 
#    know on piazza if you figure out a fix)
from MyQA_s import *

In [11]:
sampleInput = train_df.iloc[0]
context = sampleInput.context_ids # tokenized context converted to ids
question = sampleInput.question_ids # tokenized query converted to ids
labels = sampleInput.label_idx # [start_idx, end_idx] of answer
# create char inputs for context and question
context_chars = []
for w in context:
    w = idx2word[w]
    padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
    context_chars += padding + list(w)
print(context_chars)
context_chars = prepare_sequence(context_chars, char2idx)
question_chars = []
for w in question:
    w = idx2word[w]
    padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
    question_chars += padding + list(w)
question_chars = prepare_sequence(question_chars, char2idx)
context = torch.tensor(context, dtype=torch.long)
question = torch.tensor(question, dtype=torch.long)

['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'B', 'e', 'y', 'o', 'n', 'c', 'é', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'G', 'i', 's', 'e', 'l', 'l', 'e', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'K', 'n', 'o', 'w', 'l', 'e', 's', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '-', '<pad>'

In [12]:
# Hyperparameters (you will need to change these)
# Encoding layer
EMBEDDING_DIM = 6
# EMBEDDING_DIM = 12
# HIDDEN_DIM = 3
HIDDEN_DIM = 6
LEARNING_RATE = 0.1
LSTM_LAYERS = 1
DROPOUT = 0
EPOCHS = 2
CHAR_EMBEDDING_DIM = 3
# CHAR_EMBEDDING_DIM = 6
CHAR_HIDDEN_DIM = 3
# CHAR_HIDDEN_DIM = 6
BIDIRECTIONAL_LSTM = True
# Modeling layer
LSTM_LAYERS_MODELING = 2

### Part 1: Encoder

We will use the same POSTagger architecture to encode our passage and questions. The minor distinction here is, you have to encode inputs twice. Once for passage ("context") and one for the question ("query"). *Check that this works before moving on the the next part.*
We will also omit the HighWay Layer for this HW. However, you are welcome to add it for Part 2. 

Implement code in `LSTMEncoder` class in `MyQA.py`. 

In [13]:
directions = 1 if not BIDIRECTIONAL_LSTM else 2
encode = LSTMEncoder(EMBEDDING_DIM, HIDDEN_DIM, 
                     CHAR_EMBEDDING_DIM, CHAR_HIDDEN_DIM, 
                     len(char2idx), len(word2idx), lstm_layers=LSTM_LAYERS, 
                     bidirectional=BIDIRECTIONAL_LSTM, dropout=DROPOUT)
context_enc, query_enc = encode(context, context_chars, question, question_chars)

# verify this with various hyperparameters (by running this cell multiple times)

assert context_enc.shape == (len(context), HIDDEN_DIM*directions)
assert query_enc.shape == (len(question), HIDDEN_DIM*directions)

print("SUCCESS")

SUCCESS


### Part 2: Attention Layer

Implement the Attention Flow Layer same as in BiDAF paper inside `AttentionFlow`. More instructions are included in `MyQA.py`. 



In [14]:
attention = AttentionFlow(HIDDEN_DIM * directions * 3)
G = attention(context_enc, query_enc)

# verify this with various hyperparameters (by running this cell multiple times)
assert G.shape == (len(context), HIDDEN_DIM * directions * 4)

print("SUCCESS")

  a_t = self.softmax(torch.tensor(S[t]))


RuntimeError: Expected object of scalar type float but got scalar type double for sequence element 1.

### Part 3: Modeling Layer

Implement the `ModelingLayer` class in `MyQA.py` same as in BiDAF paper. More instructions are included in `MyQA.py`

In [17]:
modeling = ModelingLayer(HIDDEN_DIM * directions * 4, HIDDEN_DIM, 
                         num_layers=LSTM_LAYERS_MODELING, dropout=DROPOUT, 
                         bidirectional=BIDIRECTIONAL_LSTM)
M = modeling(G)

# verify this with various hyperparameters (by running this cell multiple times)
assert M.shape == (len(context), HIDDEN_DIM * directions)

print("SUCCESS")

SUCCESS


### Part 4: Output Layer

Implement the `OutputLayer` class in `MyQA.py` that will yield start_idx and end_idx vectors.


In [18]:
output = OutputLayer(HIDDEN_DIM*10, HIDDEN_DIM * directions, 
                     HIDDEN_DIM, bidirectional=BIDIRECTIONAL_LSTM)
start, end = output(G, M)

# verify this with various hyperparameters (by running this cell multiple times)
start = start.unsqueeze(0)
end = end.unsqueeze(0)
assert start.shape == (1, len(context))
assert end.shape == (1, len(context))

print("SUCCESS")

SUCCESS


  start = self.start_softmax(start)
  end = self.end_softmax(end)


### Finally implement the `BiDAF` class

Now combine all the modules you just wrote into `BiDAF` class in `MyQA.py`

In [19]:
model = BiDAF(EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM, 
               CHAR_HIDDEN_DIM, len(char2idx), len(word2idx), 
               bidirectional=BIDIRECTIONAL_LSTM, phrase_LSTM_layers=LSTM_LAYERS, 
               modeling_LSTM_layers=LSTM_LAYERS_MODELING, dropout=DROPOUT)
start, end = model(context, context_chars, question, question_chars)
start = start.unsqueeze(0)
end = end.unsqueeze(0)

# verify this with various hyperparameters (by running this cell multiple times)
assert start.shape == (1, len(context))
assert end.shape == (1, len(context))

print("SUCCESS")

SUCCESS


  a_t = self.softmax(torch.tensor(S[t]))
  start = self.start_softmax(start)
  end = self.end_softmax(end)


# 2. Implement Training and Eval

For this homework, you are required to just implement the train and eval functions. You don't need to train it until convergence or until it reaches high accuracy. However, as you might've noticed, the hyperparameters are the same as HW4. After implementing train and eval, your goal is to train the network for 2 epochs and tune hyperparameters so that the loss is decreasing consistently. We will verify this from your outputs. 

The idea here is to see the network we constructed above work. 

In [20]:
def get_eval_scores(predictions):
    """2D list of [[ID, PRED], [ID, PRED], ...]"""
    f1 = exact_match = 0
    for row in predictions:
        [ID, PRED] = row
        goldans = gold_val_answers.loc[ID]
        f1 += max(compute_f1(g, PRED) for g in goldans)
        exact_match += max(compute_exact(g, PRED) for g in goldans)
    num_samples = len(predictions)
    return f1 / num_samples, exact_match / num_samples

In [None]:
def train_BiDAF(epoch, model, loss_function, optimizer):
    train_loss = 0
    train_examples = len(train_df)
    itr = 0
    for i, row in tqdm(train_df.iterrows(), total=len(train_df)):
        # for i, row in train_df.iterrows():
        context = row.context_ids # tokenized context converted to ids
        question = row.question_ids # tokenized query converted to ids
        labels = row.label_idx # [start_idx, end_idx] of answer
        #############################################################################
        # TODO: Implement the training loop
        # Find the gradient with respect to the loss and update the model 
        #   parameters using the optimizer.
        # `context` and `question` are both already tokenized and converted to ids
        # You need to prepare your character level input. Feel free to use the 
        #   prepare_sequence method for this. 
        #############################################################################
        context_chars = []
        for w in context:
            w = idx2word[w]
            padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
            context_chars += padding + list(w)
        context_chars = prepare_sequence(context_chars, char2idx)
        question_chars = []
        for w in question:
            w = idx2word[w]
            padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
            question_chars += padding + list(w)
        question_chars = prepare_sequence(question_chars, char2idx)
        context = torch.tensor(context)
        question = torch.tensor(question)

        start_logits, end_logits = model(context, context_chars, question, question_chars)
        model.zero_grad()
        optimizer.zero_grad()
        loss = loss_function(start_logits, labels[0], end_logits, labels[1])
        # print(model.output.start_linear.weight.grad)
        train_loss = train_loss + loss.detach().data
        
        loss.backward()
        optimizer.step()
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        if ((i+1) % 100) == 0:
            avg_train_loss = train_loss / i
#             print("\nLoss:", avg_train_loss)
            avg_val_loss, f1_avg, exact_match_avg = eval_BiDAF(model, loss_function, rand_samples=10)
            print("Iter: {}/{}\tAvg Train Loss: {:.4f}\tAvg Val Loss: {:.4f}\tVal F1: {:.0f} \tVal ExactMatch: {:.0f}".format(i, train_examples, avg_train_loss, avg_val_loss, f1_avg, exact_match_avg))
    
    avg_train_loss = train_loss / train_examples
    avg_val_loss, f1_avg, exact_match_avg = eval_BiDAF(model, loss_function, rand_samples=1000)        
    print("Iter: {}/{}\tAvg Train Loss: {:.4f}\tAvg Val Loss: {:.4f}\tVal F1: {:.0f} \tVal ExactMatch: {:.0f}".format(i, train_examples, avg_train_loss, avg_val_loss, f1_avg, exact_match_avg))

def eval_BiDAF(model, loss_function, rand_samples=-1):
    val_loss = 0
    if rand_samples != -1:
        validation_set = val_df.sample(rand_samples)
    else:
        validation_set = val_df
    val_examples = len(validation_set)
    predictions = []
    with torch.no_grad():
        for i, row in tqdm(validation_set.iterrows(), total=len(validation_set)):
            # for i, row in validation_set.iterrows(): # if tqdm is not installed 
            context = row.context_ids # tokenized context converted to ids
            orig_context = prepare_sequence(context, idx2word)
            question = row.question_ids # tokenized query converted to ids
            orig_question = prepare_sequence(question, idx2word)
            [true_start, true_end] = row.label_idx # [start_idx, end_idx] of answer
            pred_start, pred_end = None, None # maxidx of the softmax outputs
            #############################################################################
            # TODO: Implement the training loop
            # Find the gradient with respect to the loss and update the model 
            #   parameters using the optimizer.
            # `context` and `question` are both already tokenized and converted to ids
            # You need to prepare your character level input. Feel free to use the 
            #   prepare_sequence method for this. 
            #############################################################################
            context_chars = []
            for w in orig_context:
                padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
                context_chars += padding + list(w)
            context_chars = prepare_sequence(context_chars, char2idx)
            question_chars = []
            for w in orig_question:
                padding = ['<pad>'] * (MAX_WORD_LEN - len(w))
                question_chars += padding + list(w)
            question_chars = prepare_sequence(question_chars, char2idx)
            context = torch.tensor(context)
            question = torch.tensor(question)
            start_logits, end_logits = model(context, context_chars, question, question_chars)
            loss = loss_function(start_logits, true_start, end_logits, true_end)
            val_loss = val_loss + loss.detach().data
            pred_start = torch.argmax(start_logits)
            pred_end = torch.argmax(end_logits)
            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
            predictions.append([row.id, orig_context[pred_start : pred_end + 1]])
            
    f1_avg, exact_match_avg = get_eval_scores(predictions)
    avg_val_loss = val_loss / val_examples
    return avg_val_loss, f1_avg*100, exact_match_avg*100

def log_sum_loss(logits_start, start, logits_end, end):
    loss = - torch.log(logits_start[start]) - torch.log(logits_end[end])
    return loss

#############################################################################
# TODO: define loss function and optimizer to be used in train_BiDAF 
#############################################################################
model = BiDAF(EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM, 
               CHAR_HIDDEN_DIM, len(char2idx), len(word2idx), 
               bidirectional=BIDIRECTIONAL_LSTM, phrase_LSTM_layers=LSTM_LAYERS, 
               modeling_LSTM_layers=LSTM_LAYERS_MODELING, dropout=DROPOUT)
loss_function = log_sum_loss # TODO
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) # TODO
for epoch in range(1, EPOCHS + 1):
    train_BiDAF(epoch, model, loss_function, optimizer)
#     eval_BiDAF(model, loss_function, optimizer)
#############################################################################
#                             END OF YOUR CODE                              #
#############################################################################

HBox(children=(FloatProgress(value=0.0, max=8504.0), HTML(value='')))

  a_t = self.softmax(torch.tensor(S[t]))
  start = self.start_softmax(start)
  end = self.end_softmax(end)


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 99/8504	Avg Train Loss: 12.3702	Avg Val Loss: 10.3166	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 199/8504	Avg Train Loss: 14.6799	Avg Val Loss: 9.6961	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 299/8504	Avg Train Loss: 14.0987	Avg Val Loss: 11.1911	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 399/8504	Avg Train Loss: 13.4802	Avg Val Loss: 9.9622	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 499/8504	Avg Train Loss: 13.1452	Avg Val Loss: 8.5801	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 599/8504	Avg Train Loss: 12.6280	Avg Val Loss: 9.5306	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 699/8504	Avg Train Loss: 12.4617	Avg Val Loss: 9.4880	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 799/8504	Avg Train Loss: 12.2055	Avg Val Loss: 10.7511	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 899/8504	Avg Train Loss: 11.9622	Avg Val Loss: 9.2257	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 999/8504	Avg Train Loss: 11.9130	Avg Val Loss: 8.9993	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1099/8504	Avg Train Loss: 12.0731	Avg Val Loss: 12.9293	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1199/8504	Avg Train Loss: 11.9165	Avg Val Loss: 8.2997	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1299/8504	Avg Train Loss: 11.7168	Avg Val Loss: 8.7780	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1399/8504	Avg Train Loss: 11.6348	Avg Val Loss: 9.5933	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1499/8504	Avg Train Loss: 11.5263	Avg Val Loss: 9.9014	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1599/8504	Avg Train Loss: 11.4483	Avg Val Loss: 10.3342	Val F1: 0 	Val ExactMatch: 0


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Iter: 1699/8504	Avg Train Loss: 11.4419	Avg Val Loss: 9.6259	Val F1: 0 	Val ExactMatch: 0
