# SQuAD - A Model

## Loading

In [None]:
"""Load spacy."""
from spacy.en import English
# `spacy` below is the English pipeline instance (not the package); later cells
# call it directly on raw strings, e.g. `spacy(text)`.
spacy = English()
print('Spacy loaded.')

In [None]:
import simplejson as json
def load_data_file(filepath):
    """Load a dataset JSON file, check its version, and return its data.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``'data'`` key.

    Raises:
        ValueError: If the top-level ``'version'`` field is not ``'1.0'``.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding, which may not be UTF-8.
    with open(filepath, encoding='utf-8') as data_file:
        parsed_file = json.load(data_file)
    # The file handle is no longer needed once the document is parsed.
    if parsed_file['version'] != '1.0':
        raise ValueError('Dataset version unrecognized.')
    return parsed_file['data']
    
# Parse both dataset splits; the tuple is fully built before either name is bound.
train, dev = (
    load_data_file('./data/train-v1.0.json'),
    load_data_file('./data/dev-v1.0.json'),
)

## Statistics

## Overlap between train and dev contexts

In [None]:
from tqdm import tqdm

def simple_tokenize(text):
    """Split ``text`` on whitespace and return the pieces lowercased."""
    tokens = []
    for piece in text.split():
        tokens.append(piece.lower())
    return tokens

def get_context_vocab(data):
    """Collect the distinct tokens of every paragraph context in ``data``.

    Each article's ``'paragraphs'`` are visited, every ``'context'`` string is
    run through ``simple_tokenize``, and the tokens are pooled into one set.
    """
    vocab = set()
    for article in tqdm(data):
        contexts = (paragraph['context'] for paragraph in article['paragraphs'])
        for context in contexts:
            vocab.update(simple_tokenize(context))
    return vocab

# Vocabulary of the training contexts.
train_context_words = get_context_vocab(train)
print("# Unique Context Words in train:", len(train_context_words))

# Vocabulary of the dev contexts.
dev_words = get_context_vocab(dev)
print("# Unique Context Words in dev:", len(dev_words))

# Words that occur in a dev context but in no training context.
difference_words = dev_words.difference(train_context_words)
print("# Unique Context Words in dev not in train:", len(difference_words))

## Overlap between train contexts and dev answers

In [None]:
def get_answer_vocab(data):
    """Get the set of words in the answers of data.

    Args:
        data: Iterable of article dicts; each article has ``'paragraphs'``,
            each paragraph has ``'qas'``, and each qa has an ``'answers'``
            list whose items carry the answer string under ``'text'``.

    Returns:
        The set of lowercased, whitespace-split tokens (as produced by
        ``simple_tokenize``) found in any answer text. A question with an
        empty ``'answers'`` list simply contributes nothing.
    """
    set_of_words_in_data = set()
    for article in tqdm(data):
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                # Every listed answer contributes, not only the first one.
                for answer in qa['answers']:
                    set_of_words_in_data.update(simple_tokenize(answer['text']))
    return set_of_words_in_data

# Training-context vocabulary size, repeated here for side-by-side comparison.
print("# Unique Context Words in train:", len(train_context_words))
dev_answer_words = get_answer_vocab(dev)
print("# Unique Answer Words in dev:", len(dev_answer_words))
# Answer words in dev that never occur in any training context.
difference_words = dev_answer_words - train_context_words
print("# Unique Answer Words in dev but not in context train:",
      len(difference_words))

## How do you annotate data?

In [None]:
def annotate(data):
    """Run every text field of ``data`` through the ``spacy`` pipeline.

    Builds a new list of article dicts with the same nesting as the input
    (``title`` / ``paragraphs`` -> ``context`` / ``qas`` -> ``question`` /
    ``answers`` -> ``text`` / ``answer_start``). Each title, context, question
    and answer text is replaced by the result of ``spacy(...)``;
    ``answer_start`` is copied through unchanged. Only these keys are carried
    over into the result.
    """
    def annotate_qa(qa):
        # Answers are parsed before the question, innermost first.
        parsed_answers = [
            {
                'text': spacy(ans['text']),
                'answer_start': ans['answer_start'],
            }
            for ans in qa['answers']
        ]
        return {'question': spacy(qa['question']), 'answers': parsed_answers}

    def annotate_paragraph(paragraph):
        # All question/answer pairs are parsed before the context itself.
        parsed_qas = [annotate_qa(qa) for qa in paragraph['qas']]
        return {'context': spacy(paragraph['context']), 'qas': parsed_qas}

    annotated = []
    for article in tqdm(data):
        parsed_paragraphs = [
            annotate_paragraph(paragraph) for paragraph in article['paragraphs']
        ]
        annotated.append({
            'title': spacy(article['title']),
            'paragraphs': parsed_paragraphs,
        })
    return annotated

# Annotate both splits; the tuple is fully built before either name is bound.
train_annotated, dev_annotated = (
    annotate(train),
    annotate(dev),
)

In [None]:
def get_span_contain_position(s1, s2):
    """Locate in ``s1`` the longest contiguous run of items shared with ``s2``.

    Returns a ``(start, end)`` pair such that ``s1[start:end]`` is that run.
    When several runs tie for the longest, the first one encountered while
    scanning ``s1`` from the left is kept. If nothing matches, ``(0, 0)`` is
    returned.
    """
    width = len(s2) + 1
    best_len = 0
    best_end = 0
    # previous[j] = length of the common run ending at the previous item of
    # s1 and at s2[j - 1]; only one earlier row is ever consulted.
    previous = [0] * width
    for end, left in enumerate(s1, start=1):
        current = [0] * width
        for col, right in enumerate(s2, start=1):
            if left != right:
                continue
            run = previous[col - 1] + 1
            current[col] = run
            # Strict comparison keeps the earliest of equally long runs.
            if run > best_len:
                best_len, best_end = run, end
        previous = current
    return best_end - best_len, best_end

import random
# Pick a random dev article and look at the first answer to the first question
# of its first paragraph.
sample_article = random.choice(dev_annotated)
sample_paragraph = sample_article['paragraphs'][0]
sample_answer = sample_paragraph['qas'][0]['answers'][0]

sample_paragraph_text = sample_paragraph['context']
sample_question_text = sample_paragraph['qas'][0]['question']
sample_answer_text = sample_answer['text']
sample_answer_start = sample_answer['answer_start']

print(
    sample_paragraph_text,
    sample_question_text,
    sample_answer_text,
    sample_answer_start,
)
# Align the answer's token texts against the context's token texts.
start_index, end_index = get_span_contain_position(
    [token.text for token in sample_paragraph_text],
    [token.text for token in sample_answer_text],
)
print(start_index, end_index)
# Final expression of the cell: does the located slice reproduce the answer text?
sample_paragraph_text[start_index:end_index].text == sample_answer_text.text

In [None]:
def get_answers_as_context_spans(data):
    """Return a copy of ``data`` whose answers are slices of their context.

    For every answer, the ``.text`` of each item in ``answer['text']`` is
    aligned against the ``.text`` of each item in the paragraph's ``context``
    with ``get_span_contain_position``. The answer is then replaced by
    ``context[start:end]``. Because that helper returns the longest common
    contiguous run, an answer whose tokens do not all appear consecutively in
    the context yields only the longest matching part.

    The input is left untouched: articles, paragraphs and qas are shallow
    copies with the rewritten field swapped in, so calling this function
    again on the same ``data`` keeps working.

    Args:
        data: Iterable of article dicts shaped like the output of
            ``annotate`` (``paragraphs`` -> ``context`` / ``qas`` ->
            ``answers`` -> ``text``).

    Returns:
        A list of article dicts with the same keys as the input, where each
        ``qa['answers']`` is a list of context slices instead of answer dicts.
    """
    articles = []
    for article in tqdm(data):
        paragraphs = []
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            # The context token texts are the same for every answer in this
            # paragraph, so compute them once rather than per answer.
            context_text = [token.text for token in context]
            qas = []
            for qa in paragraph['qas']:
                spans = []
                for answer in qa['answers']:
                    answer_text = [token.text for token in answer['text']]
                    start_index, end_index = get_span_contain_position(
                        context_text, answer_text)
                    spans.append(context[start_index:end_index])
                # Copy instead of assigning into the caller's dicts.
                qas.append(dict(qa, answers=spans))
            paragraphs.append(dict(paragraph, qas=qas))
        articles.append(dict(article, paragraphs=paragraphs))
    return articles

# Dev set with every answer expressed as the context slice located for it.
devlet = get_answers_as_context_spans(dev_annotated)

In [None]:
def print_sample(data):
    """Lazily yield ``(context, question, answer)`` for every answer in ``data``.

    Despite the name nothing is printed: this is a generator that walks
    articles -> paragraphs -> qas -> answers in order and yields one triple
    per answer.
    """
    all_paragraphs = (
        paragraph
        for article in data
        for paragraph in article['paragraphs']
    )
    for paragraph in all_paragraphs:
        passage = paragraph['context']
        for qa in paragraph['qas']:
            prompt = qa['question']
            yield from ((passage, prompt, reply) for reply in qa['answers'])

# Peek at the first (context, question, answer) triple; kept as the bare final
# expression of the cell so its value is shown as the cell output (presumably
# by the notebook front end).
next(print_sample(devlet))