In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
# PyTorch Lightning: the importable module is `pytorch_lightning`.
import pytorch_lightning as pl
# scikit-learn's splitting helpers live in the `model_selection` submodule.
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup

In [None]:
# Seed the run up front so results are repeatable. seed_everything presumably
# seeds the Python, NumPy and PyTorch RNGs in one call — confirm against the
# PyTorch Lightning documentation for the installed version.
pl.seed_everything(49)

In [None]:
def extract_questions_and_answers(factoid_path):
    """Flatten a SQuAD-style JSON file into one DataFrame row per answer.

    The file is read as a nested structure of the form::

        {"data": [
            {"paragraphs": [
                {"context": str,
                 "qas": [
                     {"question": str,
                      "answers": [{"text": str, "answer_start": int}, ...]},
                     ...]},
                ...]},
            ...]}

    Args:
        factoid_path: ``pathlib.Path`` (or an object with a compatible
            ``open()`` method) pointing at the JSON file.

    Returns:
        pandas.DataFrame with the columns ``question``, ``context``,
        ``answer_text``, ``answer_start`` and ``answer_end``, one row per
        answer across *all* sections of the file. ``answer_end`` is
        ``answer_start + len(answer_text)``, i.e. an exclusive end offset
        into ``context``. Questions whose ``answers`` list is empty yield no
        rows. If the file contains no answers at all, the frame is empty but
        still has the five columns.

    Raises:
        KeyError: if one of the keys shown above is missing.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with factoid_path.open(encoding='utf-8') as file:
        data = json.load(file)

    columns = ['question', 'context', 'answer_text', 'answer_start', 'answer_end']

    # Accumulate across every section; the list must live outside the loops so
    # rows from earlier sections are not discarded.
    data_rows = []

    for section in data['data']:
        for paragraph in section['paragraphs']:
            context = paragraph['context']

            for qa in paragraph['qas']:
                question = qa['question']

                for answer in qa['answers']:
                    answer_text = answer['text']
                    answer_start = answer['answer_start']
                    # Exclusive end: context[answer_start:answer_end] == answer_text
                    # when the offsets in the file are correct.
                    answer_end = answer_start + len(answer_text)

                    data_rows.append({
                        'question': question,
                        'context': context,
                        'answer_text': answer_text,
                        'answer_start': answer_start,
                        'answer_end': answer_end,
                    })

    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(data_rows, columns=columns)

In [None]:
# Flatten the QA JSON into a DataFrame. 'train-v2.0.json' is resolved relative
# to the current working directory (presumably the SQuAD v2.0 training split —
# confirm the file's provenance).
df = extract_questions_and_answers(Path('train-v2.0.json'))

In [None]:
def color_answer(question):
    """Return the context string with the answer span highlighted in green.

    Args:
        question: mapping (for example a DataFrame row) providing
            ``context`` (str), ``answer_start`` (int) and ``answer_end``
            (int). ``answer_end`` is treated as an *exclusive* offset,
            matching ``answer_start + len(answer_text)`` as computed in
            ``extract_questions_and_answers``.

    Returns:
        str: the text before the answer passed through ``colored(..., 'white')``,
        the answer passed through ``colored(..., 'green')``, and the text after
        the answer passed through ``colored(..., 'white')``, concatenated.
    """
    answer_start, answer_end = question['answer_start'], question['answer_end']
    context = question['context']

    # answer_end is already one past the last answer character, so slice up to
    # it directly; adding 1 would pull the following character into the
    # highlight and drop it from the trailing segment.
    return (
        colored(context[:answer_start], 'white')
        + colored(context[answer_start:answer_end], 'green')
        + colored(context[answer_end:], 'white')
    )

In [None]:
# Name of the pretrained checkpoint to load.
MODEL_NAME = 't5-base'
# Tokenizer loaded via from_pretrained using the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)