In [None]:
import json
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('punkt')

# Stanford question answering dataset (SQuAD)

Today we are going to work with a popular NLP dataset.

Here is the description of the original problem:

```
Stanford Question Answering Dataset (SQuAD) is a new reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage. With 100,000+ question-answer pairs on 500+ articles, SQuAD is significantly larger than previous reading comprehension datasets.
```


We are not going to solve it :) Instead, we will answer the question in a different way: given a question, we will find a **sentence** containing the answer, searching not just within the question's own context paragraph but across the **whole databank** of sentences.

Watch closely.

In [None]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json

In [None]:
# Read the SQuAD training set. The context manager closes the file handle
# deterministically, and the explicit encoding avoids locale-dependent decoding.
with open('train-v1.1.json', encoding='utf-8') as squad_file:
    data = json.load(squad_file)

In [None]:
data['data'][0]

The code here is very similar to `week5/`

In [None]:
from nltk.tokenize import RegexpTokenizer
from collections import Counter,defaultdict
# Shared word-level tokenizer. NOTE: the `\d+` alternative never fires on its
# own, because `\w` already matches digits.
tokenizer = RegexpTokenizer(r"\w+|\d+")

# Frequency of every token seen in the corpus
token_counts = Counter()


def tokenize(value):
    """Lower-case ``value`` and split it into tokens with the shared ``tokenizer``."""
    lowered = value.lower()
    return tokenizer.tokenize(lowered)



# Accumulate token frequencies over the context of every paragraph of every article.
for article in tqdm.tqdm_notebook(data['data']):
    for paragraph in article['paragraphs']:
        context_tokens = tokenize(paragraph['context'])
        token_counts.update(context_tokens)

In [None]:
min_count = 4

# Keep only the tokens that occur strictly more than `min_count` times.
tokens = [token for token, count in token_counts.items() if count > min_count]

In [None]:
# Real tokens get ids starting from 2; ids 0 and 1 are never assigned to a token
# (the rest of the notebook pads with 0 and passes 1 as UNK).
dict_size = 2 + len(tokens)
id_to_word = {}
word_to_id = {}

token_to_id = {token: idx for idx, token in enumerate(tokens, start=2)}

id_to_token = {idx: token for idx, token in enumerate(tokens, start=2)}

In [None]:
# Sanity checks on the vocabulary mappings.
assert token_to_id['me'] != token_to_id['woods']  # distinct tokens get distinct ids
assert token_to_id[id_to_token[42]]==42  # the two mappings invert each other
assert len(token_to_id)==len(tokens)  # no token was dropped or merged
assert 0 not in id_to_token  # id 0 is not assigned to any token

In [None]:
from nltk.tokenize import sent_tokenize
def build_dataset(train_data):
    '''Build (question, answer sentence) pairs from SQuAD articles.

    Takes:
     - train_data - an iterable of SQuAD articles; every article has a
       'paragraphs' list whose items carry a 'context' string and a 'qas' list
    Returns:
     - a list of tuples (q, a_+): the question text and the sentence of the
       context in which the first listed answer starts

    Every tokenized sentence is located inside the context itself, so the
    character offsets are exact no matter how much whitespace separates
    neighbouring sentences.
    '''
    D = []
    for q in tqdm.tqdm_notebook(train_data):
        for p in q['paragraphs']:
            context = p['context']
            offsets = []  # (exclusive end offset inside context, sentence)
            cursor = 0
            for sent in sent_tokenize(context):
                start = context.find(sent, cursor)
                if start < 0:
                    # The sentence is not a verbatim slice of the context;
                    # assume it begins where the previous one ended.
                    start = cursor
                cursor = start + len(sent)
                offsets.append((cursor, sent))
            if offsets:
                # Let the last sentence absorb any trailing characters, so an
                # answer starting anywhere inside the context is matched.
                last_end, last_sent = offsets[-1]
                offsets[-1] = (max(last_end, len(context)), last_sent)

            for qa in p['qas']:
                # Only the first listed answer is used to pick the sentence.
                answer_start = qa['answers'][0]['answer_start']
                found = False
                for end, sent in offsets:
                    if answer_start < end:
                        D.append((qa['question'], sent))
                        found = True
                        break
                assert found, "answer_start %s lies outside of the context" % answer_start
    return D

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the articles for validation. The split is seeded so that the
# same partition (and therefore comparable metrics) is produced on every run.
train_data, val_data = train_test_split(data['data'], test_size=0.1, random_state=42)

Dtrain = build_dataset(train_data)
Dval = build_dataset(val_data)

In [None]:
Dval[2]

In [None]:
def vectorize(strings, token_to_id, UNK):
    '''Transform an array of strings into a padded matrix of token ids.

    Takes:
     - strings - a sequence of strings
     - token_to_id - a dict mapping a token to its integer id
     - UNK - the integer id used for tokens missing from token_to_id
    Returns a pair:
     - token_matrix - an integer matrix [len(strings), max_len]; every string
       is tokenized, mapped to ids and right-padded with 0 up to max_len
     - num_tokens - an integer vector [len(strings)] with the unpadded number
       of tokens of every string

    max_len is never smaller than 1, so an empty batch yields a (0, 1) matrix
    and a batch of empty strings yields a column of zeros.
    '''
    token_rows = [
        [token_to_id.get(token, UNK) for token in tokenize(s)]
        for s in strings
    ]
    lengths = [len(row) for row in token_rows]

    # empty batch plug: never produce a zero-width matrix
    max_len = max(lengths + [1])

    # Preallocating fixes both the rank and the integer dtype, even when there
    # are no rows at all.
    token_matrix = np.zeros((len(token_rows), max_len), dtype=int)
    for row_idx, row in enumerate(token_rows):
        if row:
            token_matrix[row_idx, :len(row)] = row

    return token_matrix, np.array(lengths, dtype=int)

In [None]:
# The check expects three tokens in the first string with the middle one mapped
# to UNK (1), and the second string padded with 0 in that column.
test = vectorize(["Hello, adshkjasdhkas, world", "data"], token_to_id, 1)[0]
assert test.shape==(2,3)
assert (test[:,1]==(1,0)).all()
print("Correct!")

# Deep Learning

The beginning is the same as always.

In [None]:
%env CUDA_VISIBLE_DEVICES = ""
import tensorflow as tf
tf.reset_default_graph()

In [None]:
target_space_dim = 50 # Here we define dimension of the target space

In [None]:
embeddings_size = 50
# NOTE(review): exercise placeholder - presumably a trainable
# [dict_size, embeddings_size] matrix; confirm when implementing.
word_embeddings_matrix = <YOUR CODE>

In [None]:
def get_signle_input(name):
    '''Create the two placeholders that feed one input branch.

    Returns a (word_ids, num_words) pair: an int32 placeholder of shape
    [None, None] named "word_ids_<name>" and an int32 placeholder of shape
    [None] named "num_words_<name>".
    '''
    word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="word_ids_%s" % name)
    num_words = tf.placeholder(dtype=tf.int32, shape=[None], name="num_words_%s" % name)
    return word_ids, num_words

In [None]:
def encode(word_ids, num_words,name, reuse=False):
    '''Encode a batch of word-id sequences into the target space.

    The function takes:
     - word_ids - a matrix with word ids
     - num_words - a vector showing how many words are in each sample
     - name - name for variables
     - reuse - whether the weights are reused
     Returns:
     - output - a matrix [batch_size, target_space_dim]
    '''
    <YOUR CODE>
    return output


We are going to use a single `encode` function, but with different weights for each input. You can also use different encoders for the anchor and for the negatives/positives.

Negative sampling can be either `in-graph` or `out-graph`. We start with out-graph. In the home assignment you are going to use in-graph.

In [None]:
def sample_semihard_outputs(anchor_output, positive_output):
    """Prototype for the home assignment: sample negatives in-graph.

    Is meant to return negative_output; not implemented yet.
    """
    raise NotImplementedError()

In [None]:
# One (word_ids, num_words) placeholder pair per branch of the triplet.
inputs = {name: get_signle_input(name) for name in ['anchor', 'positive', 'negative']}
margin = 0.1  # NOTE(review): presumably the margin of the triplet loss below - confirm
anchor_output = encode(*inputs['anchor'], 'anchor')
positive_output = <YOUR CODE>
negative_output = <YOUR CODE>

positive_dot = <YOUR CODE>
negative_dot = <YOUR CODE>

loss = <YOUR CODE>
# Share of elements for which positive_dot is strictly greater than negative_dot.
recall = tf.reduce_mean(tf.cast(tf.greater(positive_dot, negative_dot), tf.float32))

In [None]:
batch_size = 200
def iterate_batches(data, only_positives=False):
    """Iterates over a D in consecutive chunks of at most batch_size samples.
    Yields a dict containing an entry for each input type; the 'positive' entry
    is the pair returned by vectorize for the second element of every sample.
    only_positives indicates whether we iterate over positives only (needed for
    the index) instead of over full triplets.
    """
    i = 0
    while i < len(data):
        batch = dict()
        data_batch = data[i:i+batch_size]
        
        
        # sample[1] is vectorized with 1 passed as the UNK id
        batch['positive'] = vectorize([sample[1] for sample in data_batch], token_to_id, 1)
        if not only_positives:
            <YOUR CODE>
        
       
        
        yield batch
        i+=batch_size

In [None]:
optimizer = tf.train.AdamOptimizer() # <your code here>
# The step counter is bookkeeping, not a model weight: keep it out of the
# trainable collection so it is not handed to the optimizer through var_list.
global_step = tf.Variable(initial_value=0, trainable=False)
train_op = optimizer.minimize(
  loss=loss,
  global_step=global_step, var_list=tf.trainable_variables())

In [None]:
#list(iterate_batches(D))

In [None]:
def get_inputs(batch):
    """Map a batch dict onto the graph placeholders.

    For every input name in ``batch`` the first and second elements of its value
    are fed to the first and second placeholders stored in ``inputs[name]``.
    Returns the resulting feed dict.
    """
    feed_dict = {}
    for name, tensors in batch.items():
        placeholders = inputs[name]
        feed_dict.update({
            placeholders[0]: tensors[0],
            placeholders[1]: tensors[1],
        })
    return feed_dict

In [None]:
# Open the session used by the cells below and initialize all graph variables.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [None]:
def validate():
    """Evaluate the model on Dval.

    Runs loss and recall on every batch of Dval and returns the tuple
    (mean loss, mean recall), where each mean is taken over batches.
    Prints a congratulation when the mean recall exceeds 0.9.
    """
    loss_sum = 0
    recall_sum = 0
    n_batches = 0
    for n_batches, batch in enumerate(iterate_batches(Dval), start=1):
        batch_loss, batch_recall = sess.run([loss, recall], get_inputs(batch))
        loss_sum += batch_loss
        recall_sum += batch_recall

    mean_loss = loss_sum / n_batches
    mean_recall = recall_sum / n_batches
    if mean_recall > 0.9:
        print('Cool! If recall is right, you earned (3 pts)')
    return (mean_loss, mean_recall)

In [None]:
num_epochs = 100
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(iterate_batches(Dtrain)):
        # One optimization step; also fetch the metrics of this batch.
        fetches = [train_op, global_step, loss, recall]
        _, step, current_loss, current_recall = sess.run(fetches, get_inputs(batch))
        print("Current step: %s. Current loss is %s, Current recall is %s" % (step, current_loss, current_recall))
        # Validate on the first batch of every epoch and then every 100 batches.
        if batch_idx % 100 == 0:
            print("Validation. Loss: %s, Recall: %s" % validate())

In [None]:
class Index(object):
    """Represents index of calculated embeddings"""
    def __init__(self, D):
        """Class constructor takes a dataset and stores all unique sentences and their embeddings

        NOTE(review): the checks later in the notebook read the attributes
        `sent` and `vectors` - presumably the sentences and their embeddings.
        """
        <YOUR CODE>
        
    def predict(self, query, top_size =1):
        """
        Function takes:
         - query is a string, containing question
         - top_size is the number of answers to return
        Function returns:
         - a list with len of top_size, containing the closest answers from the index
        You may want to use np.argpartition
          """
        <YOUR CODE>
    
    def calculate_FHS(self, D):
        """Prototype for home assignment. Returns a float number"""
        raise NotImplementedError
        
        
        

In [None]:
index = Index(Dval)

In [None]:
# One embedding per stored sentence, each of dimension target_space_dim.
assert len(index.vectors) == len(index.sent)
assert type(index.sent[1])==str
assert index.vectors.shape == (len(index.sent), target_space_dim)
# predict must return exactly top_size strings ...
p  = index.predict("Hey", top_size=3)
assert len(p) == 3
assert type(p[0])==str
# ... and the result must depend on the query.
assert index.predict("Hello", top_size=50)!=index.predict("Not Hello", top_size=50)
print("Ok (2 pts)")

In [None]:
index.predict('To show their strength in the international Communist movement, what did China do?', top_size=10)

In [None]:
Dval[np.random.randint(0, 100)]

# Home assignment
**Task 1.** (3 pts) Implement **semihard** sampling strategy. Use **in-graph** sampling. You have a prototype above

**Task 2.1.** (1 pt) Calculate the **FHS** (First Hit Success) metric on the whole validation dataset (for each query, over the whole `Dval` index). The prototype of the function is in the `Index` class. Compare different models based on this metric. Add a table with FHS values to your report.

**Task 2.2.** Add calculation of other representative metrics. You may want to calculate different recalls on a mini-batch, or some ranking metrics.   

**Task 3.** (2 pt) Do experiments with deep architecture and find out the best one. Analyse your results and write a conclusion. 

**describe your results here**

Bonus task 1. (2++ pts) Add manual negatives to the model. What can be a good manual negative in this case?

Bonus task 2. (2++ pts) Implement a more efficient Nearest Neighbors Search method. How well does it perform on our dataset?



