# SQuAD - A Model

## Loading

In [3]:
"""Load spacy."""
# NOTE(review): `spacy.en` looks like the legacy spaCy 1.x import path — confirm
# against the installed spaCy version.
from spacy.en import English
# `spacy` is bound to the English pipeline *instance*, not to the package:
# transform_data_annotate calls it directly as spacy(text).
spacy = English()
print('Spacy loaded.')

Spacy loaded.


In [4]:
import simplejson as json
def load_data_file(filepath, expected_version='1.0'):
    """Load a SQuAD-style JSON file and return its ``data`` entry.

    Args:
        filepath: Path of the JSON file to read.
        expected_version: Value the file's top-level ``version`` field must
            have. Defaults to ``'1.0'``.

    Returns:
        The value stored under the top-level ``data`` key.

    Raises:
        ValueError: If the ``version`` field differs from ``expected_version``.
        KeyError: If the ``version`` or ``data`` key is missing.
    """
    # Decode explicitly as UTF-8 so the parsed text does not depend on the
    # platform's default locale encoding.
    with open(filepath, encoding='utf-8') as data_file:
        parsed_file = json.load(data_file)
    if parsed_file['version'] != expected_version:
        raise ValueError('Dataset version unrecognized.')
    return parsed_file['data']
    
# Read the v1.0 train and dev files from ./data; each name holds the `data`
# entry returned by load_data_file.
train, dev = load_data_file('./data/train-v1.0.json'), load_data_file('./data/dev-v1.0.json')

## Statistics

## Overlap between train and dev contexts

In [5]:
from tqdm import tqdm

def simple_tokenize(text):
    """Split ``text`` on whitespace and return the pieces lowercased, in order."""
    lowered_pieces = []
    for piece in text.split():
        lowered_pieces.append(piece.lower())
    return lowered_pieces

def get_context_vocab(data):
    """Collect the distinct tokens found in the paragraph contexts of ``data``.

    Every ``paragraph['context']`` of every article is run through
    ``simple_tokenize`` and the resulting tokens are gathered into one set.
    """
    vocab = set()
    for article in tqdm(data):
        for para in article['paragraphs']:
            vocab.update(simple_tokenize(para['context']))
    return vocab

# Vocabulary of the train contexts; the answer-overlap cell reuses this name.
train_context_words = get_context_vocab(train)
print("# Unique Context Words in train:", len(train_context_words))
# Same vocabulary computation for the dev contexts.
dev_words = get_context_vocab(dev)
print("# Unique Context Words in dev:", len(dev_words))
# Set difference: tokens that occur in dev contexts but in no train context.
difference_words = dev_words - train_context_words
print("# Unique Context Words in dev not in train:", len(difference_words))

100%|██████████| 442/442 [00:01<00:00, 417.97it/s]
100%|██████████| 48/48 [00:00<00:00, 365.82it/s]

# Unique Context Words in train: 169632
# Unique Context Words in dev: 38449
# Unique Context Words in dev not in train: 12572





## Overlap between train context and dev answers

In [6]:
def get_answer_vocab(data):
    """Get the set of words in the answers of data.

    Every answer of every question contributes: its ``text`` is run through
    ``simple_tokenize`` and the tokens are added to the result.

    Args:
        data: Iterable of article dicts, each with a ``paragraphs`` list whose
            items carry a ``qas`` list; each qa has an ``answers`` list of
            dicts with a ``text`` entry.

    Returns:
        A set with the distinct answer tokens.
    """
    set_of_words_in_data = set()
    for article in tqdm(data):
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                # A question with an empty answer list simply adds nothing.
                for answer in qa['answers']:
                    set_of_words_in_data.update(simple_tokenize(answer['text']))
    return set_of_words_in_data

# train_context_words was computed by the context-overlap cell.
print("# Unique Context Words in train:", len(train_context_words))
dev_answer_words = get_answer_vocab(dev)
print("# Unique Answer Words in dev:", len(dev_answer_words))
# Answer tokens of dev that never appear in any train context.
difference_words = dev_answer_words - train_context_words
print("# Unique Answer Words in dev but not in context train:",
      len(difference_words))

  0%|          | 0/48 [00:00<?, ?it/s]

# Unique Context Words in train: 169632
# Unique Context Words in train: 169632


100%|██████████| 48/48 [00:00<00:00, 325.02it/s]

# Unique Answer Words in dev: 13002
# Unique Answer Words in dev but not in context train: 3249





## How do you annotate data?

In [7]:
def transform_data_annotate(data):
    """Run the ``spacy`` pipeline over every text field of the dataset.

    Builds a fresh list of article dicts. Titles, contexts, questions and
    answer texts are replaced by the result of ``spacy(...)``;
    ``answer_start`` is copied as is. Only the keys written below are carried
    over to the output.
    """
    def annotate_answer(raw_answer):
        return {
            'text': spacy(raw_answer['text']),
            'answer_start': raw_answer['answer_start'],
        }

    def annotate_qa(raw_qa):
        # Answers are parsed before the question.
        parsed_answers = [annotate_answer(item) for item in raw_qa['answers']]
        return {
            'question': spacy(raw_qa['question']),
            'answers': parsed_answers,
        }

    def annotate_paragraph(raw_paragraph):
        # All questions/answers are parsed before the paragraph context.
        parsed_qas = [annotate_qa(item) for item in raw_paragraph['qas']]
        return {
            'context': spacy(raw_paragraph['context']),
            'qas': parsed_qas,
        }

    annotated = []
    for raw_article in tqdm(data):
        parsed_paragraphs = [
            annotate_paragraph(item) for item in raw_article['paragraphs']
        ]
        annotated.append({
            'title': spacy(raw_article['title']),
            'paragraphs': parsed_paragraphs,
        })
    return annotated

# Slow step: in the recorded run, annotation took roughly five minutes for
# train and about 37 seconds for dev.
train_annotated, dev_annotated = transform_data_annotate(train), transform_data_annotate(dev)

100%|██████████| 442/442 [04:54<00:00,  1.61it/s]
100%|██████████| 48/48 [00:37<00:00,  1.28it/s]


## Make answers spans of the context

In [8]:
def get_span_contain_position(s1, s2):
    """Locate in ``s1`` the longest contiguous run that also occurs in ``s2``.

    Args:
        s1: Sequence to search in (e.g. the list of context token texts).
        s2: Sequence to look for (e.g. the list of answer token texts).

    Returns:
        A ``(start, end)`` pair such that ``s1[start:end]`` is the longest
        run of consecutive, equal elements shared with ``s2``. When several
        runs tie, the one ending earliest in ``s1`` wins. ``(0, 0)`` is
        returned when the sequences share no element.
    """
    width = len(s2) + 1
    # Only the previous row of the dynamic-programming table is ever read, so
    # keep one row instead of the whole (len(s1)+1) x (len(s2)+1) matrix.
    previous_row = [0] * width
    longest, x_longest = 0, 0
    for x, left in enumerate(s1, start=1):
        current_row = [0] * width
        for y, right in enumerate(s2, start=1):
            if left == right:
                # Extend the common run that ended on the previous diagonal cell.
                run_length = previous_row[y - 1] + 1
                current_row[y] = run_length
                # Strict comparison keeps the earliest of equally long runs.
                if run_length > longest:
                    longest = run_length
                    x_longest = x
        previous_row = current_row
    return x_longest - longest, x_longest


def transform_data_answer_spannify(data):
    """Replace every annotated answer by the matching slice of its context.

    For each answer, the token texts of ``answer['text']`` are matched against
    the token texts of the paragraph context with
    ``get_span_contain_position`` and the answer becomes
    ``context[start:end]``. The recorded ``answer_start`` is not used.

    The input is left untouched: new article, paragraph and qa dicts are
    built (shallow copies with the rewritten entry), so the function can be
    called again on the same annotated data.

    Args:
        data: Iterable of annotated article dicts whose paragraphs carry a
            ``context`` (iterable of tokens with a ``text`` attribute,
            sliceable by token index) and a ``qas`` list.

    Returns:
        A list of article dicts in which each ``qa['answers']`` is a list of
        context slices.
    """
    articles = []
    for article in tqdm(data):
        paragraphs = []
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            # Loop-invariant: the context token texts are identical for every
            # answer of this paragraph, so build them once.
            context_text = [token.text for token in context]
            qas = []
            for qa in paragraph['qas']:
                answers = []
                for answer in qa['answers']:
                    answer_text = [token.text for token in answer['text']]
                    start_index, end_index = get_span_contain_position(context_text, answer_text)
                    answers.append(context[start_index:end_index])
                # Copy instead of assigning into the caller's dict.
                qas.append(dict(qa, answers=answers))
            paragraphs.append(dict(paragraph, qas=qas))
        articles.append(dict(article, paragraphs=paragraphs))
    return articles

# Turn each annotated answer into a slice of its annotated context.
train_spanned, dev_spanned = transform_data_answer_spannify(train_annotated), transform_data_answer_spannify(dev_annotated)

100%|██████████| 442/442 [00:34<00:00, 12.70it/s]
100%|██████████| 48/48 [00:13<00:00,  3.29it/s]


In [25]:
def transform_data_into_triples(data):
    """Lazily flatten the nested dataset into (context, question, answer) tuples.

    One tuple is produced per answer, in document order; questions without
    answers produce nothing.
    """
    every_paragraph = (
        para for article in data for para in article['paragraphs']
    )
    for para in every_paragraph:
        ctx = para['context']
        for qa_entry in para['qas']:
            asked = qa_entry['question']
            for reply in qa_entry['answers']:
                yield (ctx, asked, reply)

# Materialise the generators into lists: transform_to_model_format iterates
# over its triples twice, which a one-shot generator would not allow.
train_triples, dev_triples = list(transform_data_into_triples(train_spanned)), list(transform_data_into_triples(dev_spanned))

## Put the input in the right format

In [None]:
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def transform_to_model_format(triples):
    """Convert (context, question, answer) triples into padded arrays.

    Args:
        triples: Re-iterable collection (it is traversed twice) of
            ``(context, question, answer)`` items. ``context`` and
            ``question`` must support ``len()`` and iterate over tokens that
            have an ``orth`` id; ``answer`` must expose ``start`` and ``end``
            token offsets into ``context``.

    Returns:
        A ``(contexts, questions, answers)`` tuple:
        ``contexts`` and ``questions`` are the ``pad_sequences`` results for
        the token ids, padded to the longest context / question measured in
        tokens. ``answers`` is an array with one row per triple, of the same
        width as ``contexts``, holding 1 at the padded-context positions that
        belong to the answer span and 0 elsewhere.
    """
    def get_max_lengths(triples):
        # Measure in tokens: the sequences handed to pad_sequences contain
        # one id per token, not one per character.
        max_context = max_question = 0
        for context, question, _ in tqdm(triples):
            max_context = max(max_context, len(context))
            max_question = max(max_question, len(question))
        return max_context, max_question

    max_context, max_question = get_max_lengths(triples)

    def preprocess_triples(triples):
        for context, question, answer in tqdm(triples):
            context_ids = [token.orth for token in context]
            question_ids = [token.orth for token in question]
            # The mask spans the padded context. Contexts are padded on the
            # left, so the answer offsets shift right by the pad width.
            pad_width = max_context - len(context_ids)
            answer_mask = np.zeros(max_context)
            answer_mask[pad_width + answer.start: pad_width + answer.end] = 1
            yield context_ids, question_ids, answer_mask

    contexts, questions, answers = zip(*preprocess_triples(triples))
    # padding='pre' is stated explicitly because the mask alignment above
    # depends on it.
    contexts = pad_sequences(contexts, maxlen=max_context, padding='pre')
    questions = pad_sequences(questions, maxlen=max_question)
    return contexts, questions, np.array(answers)

# Smoke test on the dev triples: print only the first returned element
# (the padded contexts).
print(transform_to_model_format(dev_triples)[0])

100%|██████████| 33615/33615 [00:02<00:00, 16427.39it/s]
 93%|█████████▎| 31380/33615 [00:01<00:00, 26016.19it/s]

In [12]:
from keras.layers import Input, Embedding, LSTM, Dense, merge
from keras.models import Model

# NOTE(review): the sizes used below (sequence length 100, vocabulary 10000,
# 512-dimensional embedding, 32 LSTM units) are hard-coded and are not derived
# from the output of transform_to_model_format; the "headline" wording also
# does not match this notebook's data — confirm before training.
# headline input: meant to receive sequences of 100 integers, between 1 and 10000.
# note that we can name any layer by passing it a "name" argument.
main_input = Input(shape=(100,), dtype='int32', name='main_input')

# this embedding layer will encode the input sequence
# into a sequence of dense 512-dimensional vectors.
x = Embedding(output_dim=512, input_dim=10000, input_length=100)(main_input)

# a LSTM will transform the vector sequence into a single vector,
# containing information about the entire sequence
lstm_out = LSTM(32)(x)

Using Theano backend.
Using gpu device 0: GeForce GT 650M (CNMeM is enabled with initial size: 70.0% of memory, cuDNN 5005)
