<a href="https://colab.research.google.com/github/vegarab/msc-qg/blob/master/notebooks/dataloading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

import json


In [6]:
!curl -LO https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 40.1M  100 40.1M    0     0  36.9M      0  0:00:01  0:00:01 --:--:-- 36.9M


In [None]:
# End-of-sequence marker for each supported model name.
EOS_FORMATS = {
    't5': '</s>'
}

def add_eos_to_input(_input, model='t5'):
    """Return ``_input`` with the model's end-of-sequence token appended.

    The token is separated from the text by a single space.

    Args:
        _input: the text to terminate.
        model: key into ``EOS_FORMATS`` selecting which token to append.

    Returns:
        A new string of the form ``'<_input> <eos>'``.

    Raises:
        ValueError: if ``model`` has no entry in ``EOS_FORMATS``.
    """
    try:
        eos = EOS_FORMATS[model]
    except KeyError:
        supported = ', '.join(sorted(EOS_FORMATS))
        raise ValueError(
            f'Unsupported model {model!r}; expected one of: {supported}'
        ) from None

    return f'{_input} {eos}'


In [59]:
class SQUADDataset(Dataset):
    """Question-generation dataset built from a SQuAD-format JSON file.

    Each item pairs a source string ``answer: ... <sep> context: ... </s>``
    with a target string ``<question> </s>``; both are encoded by
    ``tokenizer``.

    Args:
        config: object exposing ``source_len`` and ``q_len``, used as the
            ``max_length`` for the source and target encodings respectively.
        tokenizer: object exposing ``batch_encode_plus`` whose result provides
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension (presumably a Hugging Face tokenizer -- confirm).
        datafile: path to the SQuAD JSON file.
    """

    def __init__(self, config, tokenizer, datafile):
        self.config = config
        self.tokenizer = tokenizer

        self.data = self._squad_json_to_dataframe(datafile)

        self.question = self.data['question']
        self.context = self.data['context']
        self.answer = self.data['answer']

    def __len__(self):
        """Number of rows (question/answer pairs) in the dataset."""
        return len(self.question)

    @staticmethod
    def _collapse_whitespace(value):
        """Stringify ``value`` and squeeze every whitespace run to one space."""
        return ' '.join(str(value).split())

    def _encode(self, text, max_length):
        """Encode one string, returning ``(input_ids, attention_mask)``."""
        # NOTE(review): `pad_to_max_length` is the keyword the notebook was
        # written against; newer tokenizer releases may expect
        # `padding='max_length', truncation=True` instead -- confirm against
        # the installed version.
        encoded = self.tokenizer.batch_encode_plus([text],
                                                   max_length=max_length,
                                                   pad_to_max_length=True,
                                                   return_tensors='pt')
        # Drop only the batch axis, so a length-1 sequence keeps its shape.
        return (encoded['input_ids'].squeeze(0),
                encoded['attention_mask'].squeeze(0))

    def __getitem__(self, index):
        """Build and encode the source/target pair for row ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_mask``, each a ``torch.long`` tensor.
        """
        # TODO: Setup strings based on model
        # NOTE(review): rows for questions without an answer hold NaN, which
        # is rendered as 'answer: nan' here -- confirm whether such rows
        # should be filtered out before training.
        answer = 'answer: ' + self._collapse_whitespace(self.answer.iloc[index])
        context = 'context: ' + self._collapse_whitespace(self.context.iloc[index])
        question = self._collapse_whitespace(self.question.iloc[index])

        _input = answer + ' <sep> ' + context + ' </s>'
        _output = question + ' </s>'

        source_ids, source_mask = self._encode(_input, self.config.source_len)
        target_ids, target_mask = self._encode(_output, self.config.q_len)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_mask': target_mask.to(dtype=torch.long)
        }

    def _squad_json_to_dataframe(self, datafile):
        '''
        Flatten a SQuAD JSON file into one row per (question, answer) pair.

        datafile: path to the squad json file.

        Returns a DataFrame with columns ``index`` (the question id),
        ``question``, ``context``, ``answer`` and ``answer_start``. Questions
        with an empty answer list are kept, with NaN in ``answer`` and
        ``answer_start``.
        '''
        with open(datafile, encoding='utf-8') as handle:
            _file = json.load(handle)
        _record_path = ['data', 'paragraphs', 'qas', 'answers']

        # Parsing different levels in the JSON file
        answers = pd.json_normalize(_file, _record_path)
        questions = pd.json_normalize(_file, _record_path[:-1])
        contexts = pd.json_normalize(_file, _record_path[:-2])

        # Repeat each context once per question it holds, and each question
        # id once per answer it has, so the flattened levels line up.
        questions['context'] = np.repeat(contexts['context'].values,
                                         contexts['qas'].str.len())
        answers['q_idx'] = np.repeat(questions['id'].values,
                                     questions['answers'].str.len())

        # A left merge keeps every question (answered or not) in file order
        # and yields one row per answer when a question has several.
        main = questions[['id', 'question', 'context']].merge(
            answers, how='left', left_on='id', right_on='q_idx')

        # Only use necessary columns and set appropriate names
        main = main.rename(columns={'id': 'index', 'text': 'answer'})
        main = main[['index', 'question', 'context', 'answer', 'answer_start']]

        return main


In [57]:
# No config or tokenizer is supplied: this instance is only used to look at
# the parsed DataFrame, not to encode items.
dataset = SQUADDataset(config=None, tokenizer=None, datafile='train-v2.0.json')

In [58]:
# Preview the first five parsed rows.
dataset.data.head(n=5)

Unnamed: 0,index,question,context,answer,answer_start
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0
