# Loading

In [1]:
import spacy
from spacy.en import English
# Build the spaCy English pipeline once; transform_data_annotate below calls
# this shared `nlp` object for every piece of text.
nlp = English()

In [2]:
import simplejson as json
def load_data_file(filepath):
    """Load a JSON dataset file, check its version, and return its records.

    The file is decoded explicitly as UTF-8 (the standard encoding for JSON)
    so the result does not depend on the platform's default locale encoding.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``'data'`` key.

    Raises:
        ValueError: If the top-level ``'version'`` field is not ``'1.0'``.
        KeyError: If the ``'version'`` or ``'data'`` key is missing.
    """
    with open(filepath, encoding='utf-8') as data_file:
        parsed_file = json.load(data_file)
    # Validation only needs the parsed object, so the file handle is already
    # closed by the time we get here.
    if parsed_file['version'] != '1.0':
        raise ValueError('Dataset version unrecognized.')
    return parsed_file['data']
    
# Parse both dataset splits; the training file is read first, then the dev file.
train = load_data_file('./data/train-v1.0.json')
dev = load_data_file('./data/dev-v1.0.json')

## Annotate data

In [4]:
from tqdm import tqdm

def transform_data_annotate(data):
    """Run every text field of the dataset through the shared ``nlp`` pipeline.

    Walks articles -> paragraphs -> questions -> answers and builds a parallel
    structure of new lists and dicts.  In the result, 'title', 'context',
    'question' and each answer's 'text' hold whatever ``nlp(text, entity=True)``
    returns; 'answer_start' is copied through unchanged.  The input structure
    is only read, never modified.

    Args:
        data: Iterable of article dicts, each with a 'title' and a list of
            'paragraphs'; paragraphs carry 'context' and 'qas'; each qa has a
            'question' and a list of 'answers' with 'text' and 'answer_start'.

    Returns:
        A list of annotated article dicts in the same order as ``data``.
    """
    def annotate(text):
        return nlp(text, entity=True)

    def annotate_answer(raw_answer):
        return {
            'text': annotate(raw_answer['text']),
            'answer_start': raw_answer['answer_start'],
        }

    def annotate_qa(raw_qa):
        # Answers are parsed before the question so the pipeline sees the
        # texts in the same order as a plain nested loop would feed them.
        parsed_answers = [annotate_answer(raw) for raw in raw_qa['answers']]
        return {
            'question': annotate(raw_qa['question']),
            'answers': parsed_answers,
        }

    def annotate_paragraph(raw_paragraph):
        # Likewise, all questions of a paragraph come before its context.
        parsed_qas = [annotate_qa(raw) for raw in raw_paragraph['qas']]
        return {
            'context': annotate(raw_paragraph['context']),
            'qas': parsed_qas,
        }

    annotated = []
    for raw_article in tqdm(data):
        parsed_paragraphs = [
            annotate_paragraph(raw) for raw in raw_article['paragraphs']
        ]
        annotated.append({
            'title': annotate(raw_article['title']),
            'paragraphs': parsed_paragraphs,
        })
    return annotated

# Annotate each split on its own; the training split is processed first.
train_annotated = transform_data_annotate(train)
dev_annotated = transform_data_annotate(dev)

100%|██████████| 442/442 [04:31<00:00,  1.57it/s]
100%|██████████| 48/48 [00:36<00:00,  1.39it/s]


## Convert answers into spans of the context

In [5]:
def get_span_contain_position(s1, s2):
    """Find where the longest run of consecutive items shared with ``s2`` sits in ``s1``.

    This is the classic longest-common-substring dynamic programme.  Only the
    previous row of the table is ever read, so two rows are kept instead of
    the full ``(len(s1) + 1) x (len(s2) + 1)`` matrix.

    Args:
        s1: Sequence to search in (anything indexable with ``len``).
        s2: Sequence whose longest contiguous overlap with ``s1`` is wanted.

    Returns:
        A ``(start, end)`` pair of indices into ``s1`` such that
        ``s1[start:end]`` is the longest run that also occurs contiguously in
        ``s2``.  When several runs tie, the one found first while scanning
        ``s1`` from the left wins.  ``(0, 0)`` is returned when the sequences
        share no item or either one is empty.
    """
    width = 1 + len(s2)
    previous_row = [0] * width
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        # A fresh all-zero row: cells for non-matching pairs stay 0.
        current_row = [0] * width
        for y in range(1, width):
            if s1[x - 1] == s2[y - 1]:
                # Extend the run that ended at the previous item of both sequences.
                current_row[y] = previous_row[y - 1] + 1
                # Strict '>' keeps the earliest longest run on ties.
                if current_row[y] > longest:
                    longest = current_row[y]
                    x_longest = x
        previous_row = current_row
    return x_longest - longest, x_longest


def transform_data_answer_spannify(data):
    """Replace each annotated answer with the matching slice of its context.

    For every answer, the token texts of the answer are aligned against the
    token texts of the paragraph context with ``get_span_contain_position``
    and the answer is replaced by ``context[start:end]``.

    The input is left untouched: articles, paragraphs and qas are shallow
    copies with only 'paragraphs', 'qas' and 'answers' swapped out, so the
    function can safely be re-run on the same annotated data.

    NOTE(review): the 'answer_start' offset stored with each answer is not
    consulted; when the answer tokens occur several times in the context the
    first longest match is used -- confirm this is acceptable.

    Args:
        data: Iterable of annotated article dicts whose paragraphs hold a
            tokenised 'context' and whose answers hold a tokenised 'text'.

    Returns:
        A list of article dicts in which every ``qa['answers']`` entry is a
        slice of the paragraph context.
    """
    articles = []
    for article in tqdm(data):
        paragraphs = []
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            # The context token texts are identical for every answer in the
            # paragraph, so build the list once instead of once per answer.
            context_text = [token.text for token in context]
            qas = []
            for qa in paragraph['qas']:
                spans = []
                for answer in qa['answers']:
                    answer_text = [token.text for token in answer['text']]
                    start_index, end_index = get_span_contain_position(context_text, answer_text)
                    spans.append(context[start_index:end_index])
                # dict(original, key=new) copies the other keys unchanged.
                qas.append(dict(qa, answers=spans))
            paragraphs.append(dict(paragraph, qas=qas))
        articles.append(dict(article, paragraphs=paragraphs))
    return articles

# Turn the answers of each split into context spans; training split first.
train_spanned = transform_data_answer_spannify(train_annotated)
dev_spanned = transform_data_answer_spannify(dev_annotated)

100%|██████████| 442/442 [00:42<00:00,  9.44it/s]
100%|██████████| 48/48 [00:12<00:00,  3.37it/s]


## Convert the input into the model format

In [6]:
def get_max_lengths(data):
    """Return the largest ``len(x.text)`` seen among contexts, questions and answers.

    NOTE(review): the lengths are taken from the ``.text`` attribute
    (presumably a character count), whereas transform_data_to_model_format
    fills its arrays one entry per token -- confirm that the resulting
    over-sized widths are intended.

    Args:
        data: Iterable of article dicts whose paragraphs hold a 'context' and
            'qas'; each qa holds a 'question' and a list of 'answers'.  All
            three kinds of object must expose a ``.text`` attribute.

    Returns:
        A ``(max_context, max_question, max_answer)`` tuple; each entry is 0
        when no object of that kind was encountered.
    """
    longest_context = longest_question = longest_answer = 0
    for article in tqdm(data):
        for paragraph in article['paragraphs']:
            longest_context = max(longest_context, len(paragraph['context'].text))
            for qa in paragraph['qas']:
                longest_question = max(longest_question, len(qa['question'].text))
                for answer in qa['answers']:
                    longest_answer = max(longest_answer, len(answer.text))
    return longest_context, longest_question, longest_answer

# Widths are measured on the training split only; they size the arrays built
# by transform_data_to_model_format further down.
max_context, max_question, max_answer = get_max_lengths(train_spanned)
print(max_context, max_question, max_answer)

100%|██████████| 442/442 [00:05<00:00, 74.31it/s]

3706 25651 239





In [None]:
import numpy as np
from scipy import sparse

def transform_data_to_model_format(data, max_context, max_question, max_answer):
    """Encode annotated articles as zero-padded sparse matrices, one row per answer.

    Token strings are mapped to integer ids through a fresh
    ``spacy.strings.StringStore``.  Every row is converted to sparse form as
    soon as it is built and the rows are stacked with ``sparse.vstack``, so no
    dense ``(n_rows x width)`` array is ever materialised.

    Args:
        data: Iterable of article dicts (as produced by
            transform_data_answer_spannify): paragraphs hold a tokenised
            'context' and 'qas'; each qa holds a tokenised 'question' and a
            list of 'answers' exposing ``.start`` and ``.end``.
        max_context: Width of a context row, and the number of positions
            labelled for each answer.
        max_question: Width of a question row.
        max_answer: Unused; kept so existing calls keep working (answer labels
            are laid out over ``max_context`` positions instead).

    Returns:
        A tuple ``(contexts, questions, answers, vocab_size)``:

        * ``contexts``  -- CSR matrix, shape ``(n, max_context)``, token ids
          padded with 0 on the right.
        * ``questions`` -- CSR matrix, shape ``(n, max_question)``, same layout.
        * ``answers``   -- CSR matrix, shape ``(n, 2 * max_context)``.  Each row
          is a ``(max_context, 2)`` label grid flattened row-major: column 0
          is 1 for positions outside ``[answer.start, answer.end)``, column 1
          is 1 for positions inside.  Recover the grid with
          ``answers.toarray().reshape(-1, max_context, 2)``.
        * ``vocab_size`` -- ``len(store)`` after all tokens were looked up.

    Raises:
        ValueError: If a context or question has more tokens than its row
            width (NumPy cannot fit the longer id list into the slice).
    """
    store = spacy.strings.StringStore()
    contexts, questions, answers = [], [], []

    def encode(tokens, width):
        # np.int64 replaces the removed ``np.int`` alias.
        # NOTE(review): 0 doubles as padding and is dropped by the sparse
        # format -- assumes the store never assigns id 0 to a real token;
        # TODO confirm for the spaCy version in use.
        ids = np.zeros((1, width), dtype=np.int64)
        ids[0, :len(tokens)] = [store[token.orth_] for token in tokens]
        return sparse.csr_matrix(ids)

    def stack(rows, width):
        # sparse.vstack rejects an empty list, so build the empty result by hand.
        if not rows:
            return sparse.csr_matrix((0, width), dtype=np.int64)
        return sparse.vstack(rows, format='csr')

    for article in tqdm(data):
        for paragraph in article['paragraphs']:
            # Encoded once per paragraph; the same row object is reused for
            # every answer that belongs to it.
            c = encode(paragraph['context'], max_context)
            for qa in paragraph['qas']:
                q = encode(qa['question'], max_question)
                for answer in qa['answers']:
                    a = np.zeros((max_context, 2), dtype=np.int64)  # not max_answer

                    a[answer.start:answer.end, 1] = 1
                    a[:answer.start, 0] = 1
                    a[answer.end:, 0] = 1

                    contexts.append(c)
                    questions.append(q)
                    # A CSR matrix must be 2-D, so each label grid becomes a
                    # single flattened row.
                    answers.append(sparse.csr_matrix(a.reshape(1, -1)))

    return (
        stack(contexts, max_context),
        stack(questions, max_question),
        stack(answers, 2 * max_context),
        len(store),
    )

# Encode the training split only; vocab_size is the size of the string store
# that was filled while encoding it.
train_contexts, train_questions, train_answers, vocab_size = transform_data_to_model_format(train_spanned, max_context, max_question, max_answer)

100%|██████████| 442/442 [00:14<00:00, 29.68it/s]


In [58]:
# Bundle the encoded training matrices and the vocabulary size into the single
# object that is written to disk afterwards.
save_obj = dict(
    contexts=train_contexts,
    questions=train_questions,
    answers=train_answers,
    vocab_size=vocab_size,
)

print("Created save object")

Created save object


In [60]:
import pickle
# Serialise the bundle to train.dat with the newest pickle protocol available.
with open('train.dat', 'wb') as sink:
    pickle.dump(save_obj, sink, protocol=pickle.HIGHEST_PROTOCOL)