In [None]:
import json
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('punkt')

# Stanford question answering dataset (SQuAD)

Today we are going to work with a popular NLP dataset.

Here is the description of the original problem:

```
Stanford Question Answering Dataset (SQuAD) is a new reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage. With 100,000+ question-answer pairs on 500+ articles, SQuAD is significantly larger than previous reading comprehension datasets.
```


We are not going to solve it :) Instead, we will answer questions in a different way: given a question, we will find a **sentence** that contains the answer, searching not within the question's own context paragraph but across the **whole databank** of sentences.

Just watch the hands

In [None]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json

In [None]:
# Parse the downloaded SQuAD training file. The context manager closes the
# file handle, and the explicit encoding keeps decoding of the JSON text
# independent of the platform's default encoding.
with open('train-v1.1.json', encoding='utf-8') as squad_file:
    data = json.load(squad_file)

In [None]:
# Display the first record of data['data'] to see how the raw JSON is organised.
data['data'][0]

The code here is very similar to `week5/`

In [None]:
from nltk.tokenize import RegexpTokenizer
from collections import Counter,defaultdict
# Regexp-based tokenizer: every match of the pattern below becomes one token.
tokenizer = RegexpTokenizer(r"\w+|\d+")


def tokenize(value):
    """Lower-case `value` and return the list of tokens found in it."""
    lowered = value.lower()
    return tokenizer.tokenize(lowered)


# Occurrence count of every token over all paragraph contexts in the dataset.
token_counts = Counter()

for article in tqdm.tqdm_notebook(data['data']):
    for paragraph in article['paragraphs']:
        context_tokens = tokenize(paragraph['context'])
        token_counts.update(context_tokens)

In [None]:
# Frequency threshold for the vocabulary. The comparison below is strict:
# a token is kept only when it occurs MORE than `min_count` times.
min_count = 4

tokens = [
    token
    for token, count in token_counts.items()
    if count > min_count
]

In [None]:
# Ids 0 and 1 are not given to any vocabulary token (vectorize uses them as
# PAD and UNK by default), so real tokens are numbered from 2 upwards.
dict_size = 2 + len(tokens)

# Forward mapping token -> id, and its inverse id -> token.
token_to_id = {token: idx for idx, token in enumerate(tokens, start=2)}
id_to_token = {idx: token for token, idx in token_to_id.items()}

In [None]:
# Sanity checks for the vocabulary mappings built above.
# Two different words must receive two different ids
# (a KeyError here means one of the words did not make it into the vocabulary).
assert token_to_id['me'] != token_to_id['woods']
# The two mappings must be inverses of each other.
assert token_to_id[id_to_token[42]]==42
# Every entry of `tokens` must have its own entry in the mapping.
assert len(token_to_id)==len(tokens)
# Id 0 must not be assigned to any vocabulary token.
assert 0 not in id_to_token

In [None]:
from nltk.tokenize import sent_tokenize
def build_dataset(train_data):
    '''Build (question, answer sentence) pairs from SQuAD articles.

    Takes SQuAD data: an iterable of articles, each holding 'paragraphs' with
    a 'context' string and a list of 'qas'.
    Returns a list of tuples - a set of pairs (q, a_+), where a_+ is the
    sentence of the paragraph in which the first listed answer starts.
    '''
    pairs = []
    for article in tqdm.tqdm_notebook(train_data):
        for paragraph in article['paragraphs']:
            context = paragraph['context']

            # (end offset, sentence) for every sentence of the paragraph.
            # Offsets are located in the real text rather than estimated as
            # len(sent) + 2: that estimate drifts by one character per
            # sentence when sentences are separated by a single space, which
            # pairs answers near a sentence start with the previous sentence.
            sentence_ends = []
            cursor = 0
            for sent in sent_tokenize(context):
                start = context.find(sent, cursor)
                if start == -1:
                    # Sentence text not found verbatim: continue from the
                    # current position instead of failing.
                    start = cursor
                cursor = start + len(sent)
                sentence_ends.append((cursor, sent))

            for qa in paragraph['qas']:
                answer_start = qa['answers'][0]['answer_start']
                found = False
                for end, sent in sentence_ends:
                    if answer_start < end:
                        pairs.append((qa['question'], sent))
                        found = True
                        break
                if not found and sentence_ends:
                    # The answer starts after the last located sentence end;
                    # the last sentence is the closest match.
                    pairs.append((qa['question'], sentence_ends[-1][1]))
                    found = True
                assert found
    return pairs

In [None]:
from sklearn.model_selection import train_test_split
# The split is made over the records of data['data'], so all pairs built from
# one record end up on the same side. The fixed seed keeps the split identical
# across kernel restarts, which makes validation numbers comparable between runs.
train_data, val_data = train_test_split(data['data'], test_size=0.1, random_state=42)

data_train = build_dataset(train_data)
data_val = build_dataset(val_data)

In [None]:
# Show one (question, answer sentence) pair from the validation set.
data_val[2]

In [1]:
def vectorize(strings, token_to_id, UNK=1, PAD=0):
    '''Turn an iterable of strings into a padded int32 matrix of token ids.

    Each string is split with `tokenize`; every token is replaced by its id
    from `token_to_id`, or by UNK when it is not in the dict. Rows shorter
    than the longest sequence are right-padded with PAD.

    Returns an array of shape [len(strings), max_len], where max_len is the
    longest sequence length and at least 1. An empty `strings` therefore
    gives a (0, 1) array instead of raising.
    '''
    sequences = [[token_to_id.get(token, UNK) for token in tokenize(s)]
                 for s in strings]

    # The extra 1 covers both an empty batch and a batch whose strings
    # contain no tokens at all.
    max_len = max([len(seq) for seq in sequences] + [1])

    # Start from an all-PAD matrix and copy each sequence into its row.
    token_matrix = np.full((len(sequences), max_len), PAD, dtype='int32')
    for row, seq in enumerate(sequences):
        if seq:
            token_matrix[row, :len(seq)] = seq

    return token_matrix

In [2]:
# Check vectorize on two strings of different token counts.
test = vectorize(["Hello, adshkjasdhkas, world", "data"], token_to_id, 1)
# Two rows, padded to the length of the longer one.
assert test.shape==(2,3)
# Second column: the expected ids are 1 (the UNK value passed above) for the
# first row and 0 (the default PAD) for the second.
assert (test[:,1]==(1,0)).all()
print("Correct!")

NameError: name 'token_to_id' is not defined

# Deep Learning

The beginning is the same as always

In [None]:
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import *

margin = 0.1

In [None]:
def build_encoder(lstm_size=50, embeddings_size=50, target_space_dim=50, PAD=0):
    '''
    Build a lasagne network that converts input sequence to a fixed-size vector.
    Must have a single input layer that accepts int32[batch,max_len]

    The returned network must have output shape (None, target_space_dim);
    this is asserted before returning. `lstm_size` and `embeddings_size` are
    not used by the code present here - presumably they size the layers that
    the exercise asks you to add (TODO confirm).
    '''
    inp = InputLayer([None, None], dtype='int32')
    # Elementwise "input id != PAD" mask over the input matrix.
    mask = ExpressionLayer(inp, lambda ix: T.neq(ix,PAD))

    #YOUR CODE HERE: build at least a single-layer LSTM with embedding input.
    # use mask_input=mask and select final (or maximum) outputs
    # NOTE: the code added above must define `net`; until it does, the
    # assertion below fails with a NameError.
    assert tuple(net.output_shape) == (None,target_space_dim)
    return net

# Two separate encoder networks: one for questions, one for answer sentences.
question_encoder = build_encoder()
answer_encoder = build_encoder()

We are going to use a single encoder for both positive and negative answers.

In [None]:
# Symbolic int32 matrices of token ids for the three inputs of a triplet.
questions = T.imatrix(name="word_ids_questions")
answers_positive = T.imatrix(name="word_ids_answers_positive")
answers_negative = T.imatrix(name="word_ids_answers_negative")

# Encoder output for the positive answers.
positive_output = get_output(answer_encoder,answers_positive)
negative_output = #YOUR CODE: answer encoder's vector for negative answers
anchor_output = #YOUR CODE: question encoder's vector for questions


In [None]:
# compute dot products to get similarity. Also: you can use T.batched_dot for speed
# (elementwise product summed over axis 1 gives one score per row)
positive_dot = T.sum(anchor_output*positive_output, axis=1)
negative_dot = T.sum(anchor_output*negative_output, axis=1)


# compute triplet loss (pairwise hinge loss) as per formulae in the lecture.
# please use T.maximum and not T.max!
# NOTE(review): presumably this is where the `margin` constant defined in the
# imports cell is meant to be used - confirm against the lecture formula.
loss = #YOUR CODE

# Fraction of triplets whose positive answer scores higher than the negative one.
recall = T.mean(positive_dot > negative_dot)

In [None]:
# Trainable parameters of both encoders.
allparams = get_all_params([answer_encoder,question_encoder],trainable=True)

# Adam update rules for `loss` with respect to those parameters.
updates = lasagne.updates.adam(loss, allparams)
# One training step: returns [loss, recall] for the batch and applies `updates`.
train_op = theano.function([questions, answers_positive, answers_negative],
                           [loss, recall],
                           updates=updates)

# Same inputs and outputs as train_op, but without applying any updates.
validate_op = theano.function([questions, answers_positive, answers_negative], [loss, recall])

### Training on minibatches

In [None]:
batch_size = 200
def iterate_batches(data, only_positives=False):
    """Yield consecutive mini-batches of vectorized samples from a data list.

    Every sample of `data` is indexed as sample[0] (used for 'anchor') and
    sample[1] (used for 'positive'). Each yielded dict maps an input type to
    its token matrix: always 'positive', plus 'anchor' and 'negative' unless
    only_positives is set (positives alone are what the index needs).
    'negative' holds, for every sample of the batch, the sample[1] of a
    uniformly drawn element of the whole data list.
    """
    start = 0
    while start < len(data):
        chunk = data[start:start + batch_size]

        answer_sentences = [pair[1] for pair in chunk]
        batch = {'positive': vectorize(answer_sentences, token_to_id)}

        if not only_positives:
            question_texts = [pair[0] for pair in chunk]
            batch['anchor'] = vectorize(question_texts, token_to_id)

            # One random draw per sample of the current batch.
            random_sentences = []
            for _ in range(len(chunk)):
                random_index = np.random.randint(0, len(data))
                random_sentences.append(data[random_index][1])
            batch['negative'] = vectorize(random_sentences, token_to_id)

        yield batch
        start += batch_size

In [None]:
def validate():
    """Evaluate the model on data_val and return (mean loss, mean recall).

    Both values are plain averages of the per-batch results of validate_op
    over all batches produced by iterate_batches(data_val). When no batch is
    produced (empty data_val) the averages are undefined and (nan, nan) is
    returned instead of failing with a division by zero.
    """
    total_loss, total_recall = 0, 0
    batches = 0
    for batch in iterate_batches(data_val):
        batches += 1
        current_loss, current_recall = validate_op(batch['anchor'],
                                                   batch['positive'],
                                                   batch['negative'])
        total_loss += current_loss
        total_recall += current_recall

    if batches == 0:
        return (float('nan'), float('nan'))

    total_loss /= batches
    total_recall /= batches

    if total_recall > 0.9:
        print('Cool! If recall is right, you earned (3 pts)')
    return (total_loss, total_recall)

In [None]:
# Train for num_epochs passes over data_train; `step` counts training steps
# across all epochs.
num_epochs = 100
step = 0
for j in range(num_epochs):
    for i, (batch) in  enumerate(iterate_batches(data_train)):
        # One optimisation step on the current triplet batch.
        current_loss, current_recall =  train_op(batch['anchor'],
                                                 batch['positive'],
                                                 batch['negative'])
        step+=1
        print("Current step: %s. Current loss is %s, Current recall is %s" % (step, current_loss, current_recall))
        # `i` restarts at 0 every epoch, so validation runs on the first batch
        # of each epoch and then on every 100th batch within it.
        if i%100==0:
            print("Validation. Loss: %s, Recall: %s" %validate())

In [None]:
class Index(object):
    """Represents index of calculated embeddings.

    This is a prototype to be implemented. The check cell that follows
    expects an instance to expose `sent` (the stored sentences) and `vectors`
    (their embeddings, one row per sentence).
    """
    def __init__(self, data):
        """Class constructor takes a dataset and stores all unique sentences and their embeddings"""
        # NotImplemented is a comparison sentinel and is not callable;
        # NotImplementedError is the exception for an unimplemented method.
        raise NotImplementedError()
    def predict(self, query, top_size =1):
        """
        Function takes:
         - query is a string, containing question
         - top_size is the number of answers to return
        Function returns:
         - a list with len of top_size, containing the closest answers from the index
        You may want to use np.argpartition
          """
        raise NotImplementedError()
    def calculate_FHS(self, D):
        """Prototype for home assignment. Returns a float number"""
        raise NotImplementedError()
        
        
        

In [None]:
# Build the index from the validation pairs.
index = Index(data_val)

In [None]:
# Sanity checks for the Index implementation.
# One embedding per stored sentence, and sentences are plain strings.
assert len(index.vectors) == len(index.sent)
assert type(index.sent[1])==str
# `target_space_dim` is only a parameter of build_encoder, not a notebook-level
# name, so the embedding width is read from the answer encoder's output shape
# (build_encoder asserts that shape to be (None, target_space_dim)).
assert index.vectors.shape == (len(index.sent), answer_encoder.output_shape[-1])
# predict must return exactly top_size strings ...
p  = index.predict("Hey", top_size=3)
assert len(p) == 3
assert type(p[0])==str
# ... and different queries must not produce identical result lists.
assert index.predict("Hello", top_size=50)!=index.predict("Not Hello", top_size=50)
print("Ok (2 pts)")

In [None]:
# Example query: show the 10 answers returned by the index for one question.
index.predict('To show their strength in the international Communist movement, what did China do?', top_size=10)

In [None]:
# Show a randomly chosen pair among the first 100 entries of data_val.
data_val[np.random.randint(0, 100)]

# Home assignment
**Task 1.** (3 pts) Implement **semihard** sampling strategy. Use **in-graph** sampling. You have a prototype above

**Task 2.1.** (1 pt) Calculate the **FHS** (First Hit Success) metric on the whole validation dataset (for each query, over the whole `data_val` index). The prototype of the function is in the `Index` class. Compare different models based on this metric. Add a table with FHS values to your report.

**Task 2.2.** Add calculation of other representative metrics. You may want to calculate different recalls on a mini-batch, or some ranking metrics.   

**Task 3.** (2 pt) Do experiments with deep architecture and find out the best one. Analyse your results and write a conclusion. 

**describe your results here**

Bonus task 1. (2++ pts) Add manual negatives to the model. What can be a good manual negative in this case?

Bonus task 2. (2++ pts) Implement a more efficient Nearest Neighbors Search method. How well does it perform on our dataset?



