__This seminar__ teaches you about metric learning for NLP.

In [None]:
import json
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('punkt')

# Stanford question answering dataset (SQuAD)

_This seminar is based on the original notebook by [Oleg Vasilev](https://github.com/Omrigan/)_

Today we are going to work with a popular NLP dataset.

Here is the description of the original problem:

```
Stanford Question Answering Dataset (SQuAD) is a new reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage. With 100,000+ question-answer pairs on 500+ articles, SQuAD is significantly larger than previous reading comprehension datasets.
```


We are not going to solve it :) Instead, we will try to answer the question in a different way: given the question, we will find a **sentence** containing the answer, searching not just within the question's own context but across the **whole databank** of sentences.

In [None]:
# download the data
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json

In [None]:
# Load the SQuAD training set. The context manager closes the file handle
# right after parsing; JSON is UTF-8, so do not depend on the platform default encoding.
with open('train-v1.1.json', encoding='utf-8') as squad_file:
    data = json.load(squad_file)

In [None]:
data['data'][0]['paragraphs'][0]

### The NLP part

The code here is very similar to `week10/`: preprocess text into tokens, create dictionaries, etc.

In [None]:
from nltk.tokenize import RegexpTokenizer
from collections import Counter,defaultdict
tokenizer = RegexpTokenizer(r"\w+|\d+")


def tokenize(value):
    """Lower-case `value` and split it into tokens with the regexp tokenizer."""
    lowered = value.lower()
    return tokenizer.tokenize(lowered)


# Dictionary of tokens: how many times each token occurs in the paragraph contexts
token_counts = Counter()

for article in tqdm.tqdm_notebook(data['data']):
    for paragraph in article['paragraphs']:
        context_tokens = tokenize(paragraph['context'])
        token_counts.update(context_tokens)

In [None]:
min_count = 4

# vocabulary: the two service tokens first, then every token
# that occurs strictly more than min_count times
tokens = ["_PAD_", "_UNK_"]
tokens.extend(word for word, freq in token_counts.items() if freq > min_count)

# inverse mapping: token -> its position in `tokens`
token_to_id = dict(zip(tokens, range(len(tokens))))


In [None]:
assert token_to_id['me'] != token_to_id['woods']
assert token_to_id[tokens[42]]==42
assert len(token_to_id)==len(tokens)

In [None]:
PAD_ix = token_to_id["_PAD_"]
UNK_ix = token_to_id['_UNK_']

#good old as_matrix for the third time
def as_matrix(sequences, max_len=None):
    """Convert a batch of texts into a padded int32 matrix of token ids.

    :param sequences: list/tuple of raw strings (tokenized here with `tokenize`)
        or of already tokenized sequences of words
    :param max_len: number of columns of the result; longer sequences are truncated.
        If None (or 0), the length of the longest sequence is used.
    :returns: int32 array of shape [len(sequences), max_len]; words missing from
        `token_to_id` become UNK_ix, unused trailing positions hold PAD_ix
    """
    # an empty batch has neither a first element to inspect nor a maximum length
    if len(sequences) == 0:
        return np.full((0, max_len or 0), PAD_ix, dtype='int32')

    # the type of the first element decides whether tokenization is still needed
    if isinstance(sequences[0], (str, bytes)):
        sequences = [tokenize(s) for s in sequences]

    max_len = max_len or max(map(len, sequences))

    matrix = np.full((len(sequences), max_len), PAD_ix, dtype='int32')
    for i, seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_ix) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix

In [None]:
test = as_matrix(["Definitely, thOsE tokens areN'T LowerCASE!!", "I'm the monument to all your sins."])
print(test)
assert test.shape==(2,8)
print("Correct!")

### Build the dataset

In [None]:
from nltk.tokenize import sent_tokenize
def build_dataset(train_data):
    '''Takes SQuAD data
    Returns a list of tuples - a set of pairs (q, a_+)

    For every question, a_+ is the sentence of the paragraph context in which
    the first listed answer starts. Questions without any answer, or whose
    answer start lies beyond the last sentence, are skipped.
    '''
    dataset = []
    for row in tqdm.tqdm_notebook(train_data):
        for paragraph in row['paragraphs']:
            context = paragraph['context']

            # (end offset in context, sentence) for every sentence, in order.
            # Each sentence is located in the context itself rather than assuming a
            # fixed separator width: a per-sentence estimate drifts further with
            # every sentence and assigns answers to the wrong sentence.
            sentence_ends = []
            cursor = 0
            for sent in sent_tokenize(context):
                start = context.find(sent, cursor)
                if start == -1:
                    # sentence text not found verbatim; assume it starts at the cursor
                    start = cursor
                cursor = start + len(sent)
                sentence_ends.append((cursor, sent))

            for qa in paragraph['qas']:
                answers = qa.get('answers')
                if not answers:
                    # nothing to locate for a question without answers
                    continue
                question, answer = qa['question'], answers[0]

                # find a sentence that contains an answer:
                # the first sentence that ends after the answer starts
                for end, sent in sentence_ends:
                    if answer['answer_start'] < end:
                        dataset.append((question, sent))
                        break
    return dataset

In [None]:
from sklearn.model_selection import train_test_split
# Split the top-level entries of the dataset, holding out 10% of them for validation.
# NOTE: no random_state is passed, so the split differs between runs.
train_data, val_data = train_test_split(data['data'], test_size=0.1)

# From here on both names are rebound to the lists of (question, sentence) pairs
# returned by build_dataset instead of the raw SQuAD entries.
train_data = build_dataset(train_data)
val_data = build_dataset(val_data)

In [None]:
for i in range(2, 18, 6):
    print("Q: %s\nA: %s\n" % val_data[i])

# Building the model

Any self-respecting DSSM must have one or several vectorizers. In our case,
* Question vectorizer
* Answer vectorizer

It is perfectly legal to share some layers between them, but make sure they are at least a little different.

In [None]:
import torch, torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class GlobalMaxPooling(nn.Module):
    """Layer that reduces its input to the maximum values along one dimension."""

    def __init__(self, dim=-1):
        """
        :param dim: dimension to take the maximum over (the last one by default)
        """
        # Name the class explicitly: super(self.__class__, self) resolves to the
        # subclass when inherited from and then recurses into this __init__ forever.
        super(GlobalMaxPooling, self).__init__()
        self.dim = dim

    def forward(self, x):
        """Return the maxima of `x` over `self.dim`; that dimension is dropped."""
        # Tensor.max(dim=...) returns a (values, indices) pair; only values are needed
        return x.max(dim=self.dim)[0]

In [None]:
# we might as well create a global embedding layer here

GLOBAL_EMB = nn.Embedding(len(tokens), 64, padding_idx=PAD_ix)

In [None]:
class QuestionVectorizer(nn.Module):
    def __init__(self, n_tokens=len(tokens), out_size=64, use_global_emb=True):
        """ 
        A simple sequential encoder for questions.
        Use any combination of layers you want to encode a variable-length input 
        to a fixed-size output vector
        
        If use_global_emb is True, use GLOBAL_EMB as your embedding layer
        """
        super(self.__class__, self).__init__()
        if use_global_emb:
            self.emb = GLOBAL_EMB
        else:
            self.emb = <YOUR CODE>
            
        <YOUR CODE>
        
    def forward(self, text_ix):
        """
        :param text_ix: int64 Variable of shape [batch_size, max_len]
        :returns: float32 Variable of shape [batch_size, out_size]
        """
        <YOUR CODE>
        return <YOUR CODE>

In [None]:
class AnswerVectorizer(nn.Module):
    def __init__(self, n_tokens=len(tokens), out_size=64, use_global_emb=True):
        """ 
        A simple sequential encoder for answers.
        x -> emb -> conv -> global_max -> relu -> dense
        
        If use_global_emb is True, use GLOBAL_EMB as your embedding layer
        """
        super(self.__class__, self).__init__()
        if use_global_emb:
            self.emb = GLOBAL_EMB
        else:
            self.emb = <YOUR CODE>
            
        <YOUR CODE>
        
    def forward(self, text_ix):
        """
        :param text_ix: int64 Variable of shape [batch_size, max_len]
        :returns: float32 Variable of shape [batch_size, out_size]
        """
        <YOUR CODE>
        return <YOUR CODE>

In [None]:
for vectorizer in [QuestionVectorizer(out_size=100), AnswerVectorizer(out_size=100)]:
    print("Testing %s ..." % vectorizer.__class__.__name__)
    dummy_x = Variable(torch.LongTensor(test))
    dummy_v = vectorizer(dummy_x)

    assert isinstance(dummy_v, Variable)
    assert tuple(dummy_v.shape) == (dummy_x.shape[0], 100)

    del vectorizer
    print("Seems fine")

In [None]:
from itertools import chain

question_vectorizer = QuestionVectorizer()
answer_vectorizer = AnswerVectorizer()

# By default both vectorizers hold the same GLOBAL_EMB module, so chaining their
# parameters yields the shared embedding weight twice. Keep only the first
# occurrence of every tensor: duplicates inside one parameter group trigger a
# PyTorch warning and can make the optimizer update that weight more than once per step.
trainable_params = []
seen_param_ids = set()
for param in chain(question_vectorizer.parameters(),
                   answer_vectorizer.parameters()):
    if id(param) not in seen_param_ids:
        seen_param_ids.add(id(param))
        trainable_params.append(param)

opt = torch.optim.Adam(trainable_params)

We are going to use the same kind of encoder for every input, but with different weights for questions and answers. You are free to use different encoder architectures for the anchor and for the negatives/positives.

Negative sampling can be either `in-graph` or `out-graph`. We start with out-graph. In the home assignment you are going to use in-graph.

In [None]:
def generate_batch(data, batch_size=None, replace=False, volatile=False, max_len=None):
    """ Samples training/validation batch with random negatives

    :param data: sequence of (question, answer sentence) pairs
    :param batch_size: number of rows to sample; if None, every row of `data` is used
        once, in order, and negatives are a random permutation of the same rows
    :param replace: whether the sampled rows may repeat (only used with batch_size)
    :param volatile: forwarded to Variable(...) as is
    :param max_len: forwarded to as_matrix
    :returns: three LongTensor Variables: anchors, positives, negatives
    """
    n_rows = len(data)
    if batch_size is None:
        batch_ix = range(n_rows)
        negative_ix = np.random.permutation(np.arange(n_rows))
    else:
        batch_ix = np.random.choice(n_rows, batch_size, replace=replace)
        negative_ix = np.random.choice(n_rows, batch_size, replace=True)

    sampled_pairs = [data[i] for i in batch_ix]
    anchors, positives = zip(*sampled_pairs)

    # sample random rows as negatives.
    # Note: you can do better by sampling "hard" negatives
    negatives = [data[i][1] for i in negative_ix]

    def to_variable(texts):
        # texts -> padded matrix of token ids -> LongTensor wrapped into a Variable
        ids = torch.LongTensor(as_matrix(texts, max_len=max_len))
        return Variable(ids, volatile=volatile)

    anchors = to_variable(anchors)
    positives = to_variable(positives)
    negatives = to_variable(negatives)
    return anchors, positives, negatives

In [None]:
_dummy_anchors, _dummy_positives, _dummy_negatives = generate_batch(train_data, 2)

print("Q:")
print(_dummy_anchors)
print("A+:")
print(_dummy_positives)
print("A-:")
print(_dummy_negatives)

In [None]:
def compute_loss(anchors, positives, negatives, delta=1):
    """ 
    Compute the triplet loss:
    
    max(0, delta + sim(anchors, negatives) - sim(anchors, positives))
    
    where sim is a dot-product between vectorized inputs
    
    :param anchors: batch of questions, e.g. the first output of generate_batch
    :param positives: batch of matching answers, e.g. the second output of generate_batch
    :param negatives: batch of non-matching answers, e.g. the third output of generate_batch
    :param delta: the margin from the formula above
    """
    <YOUR CODE>
    return <YOUR CODE>

In [None]:
def compute_recall(anchors, positives, negatives, delta=1):
    """
    Compute the probability (ratio) at which sim(anchors, negatives) is greater than sim(anchors, positives)
    
    NOTE(review): as worded, this ratio is the share of triplets ranked WRONGLY; a metric
    called "recall" is presumably meant to count sim(anchors, positives) > sim(anchors, negatives).
    Confirm the intended direction before implementing.
    """
    <YOUR CODE>
    return <YOUR CODE>

In [None]:
print(compute_loss(_dummy_anchors, _dummy_positives, _dummy_negatives))
print(compute_recall(_dummy_anchors, _dummy_positives, _dummy_negatives))

### Training loop

In [None]:
num_epochs = 100
max_len = 100
batch_size = 32
batches_per_epoch = 100

In [None]:
from tqdm import tnrange
def iterate_minibatches(data, batch_size=32, max_len=None,
                        max_batches=None, shuffle=True, verbose=True):
    """Yield batches built by generate_batch from consecutive slices of `data`.

    :param data: sequence of rows accepted by generate_batch
    :param batch_size: number of rows per batch (the last batch may be smaller)
    :param max_len: forwarded to generate_batch
    :param max_batches: if given, stop after this many batches
    :param shuffle: visit rows in random order instead of the original one
    :param verbose: show a notebook progress bar over the batches
    """
    order = np.arange(len(data))
    if shuffle:
        order = np.random.permutation(order)
    if max_batches is not None:
        order = order[: batch_size * max_batches]

    if verbose:
        batch_starts = tnrange(0, len(order), batch_size)
    else:
        batch_starts = range(0, len(order), batch_size)

    for lo in batch_starts:
        # generate_batch gets only the rows of this slice and no batch_size,
        # so it uses all of them
        rows = [data[i] for i in order[lo : lo + batch_size]]
        yield generate_batch(rows, max_len=max_len)

For a change, we'll ask __you__ to implement the training loop this time.

Here's a sketch of one epoch:
1. iterate over __`batches_per_epoch`__ batches from __`train_data`__
    * Compute loss, backprop, optimize
    * Compute and accumulate recall
    
2. iterate over __`batches_per_epoch`__ batches from __`val_data`__
    * Compute and accumulate recall
    
3. print stuff :)

In [None]:
<YOUR CODE>