# External data preprocessing notebook

## MLQA data

In [None]:
!wget https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip

In [None]:
# Root directory for the datasets; later cells build every input and output
# path from this value.
DATA_PATH = './data'

In [None]:
import zipfile

# Unpack every member of the MLQA archive under DATA_PATH.
with zipfile.ZipFile('MLQA_V1.zip', mode='r') as mlqa_archive:
    mlqa_archive.extractall(path=DATA_PATH)

In [None]:
import os
import sys
import random
import argparse
import json
import nltk
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Paths to the Hindi-context / Hindi-question MLQA files.
# NOTE: `mlqa_train_data` points at the *dev* split of the archive.
mlqa_train_data = f'{DATA_PATH}/MLQA_V1/dev/dev-context-hi-question-hi.json'
mlqa_test_data = f'{DATA_PATH}/MLQA_V1/test/test-context-hi-question-hi.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# locale's default, which can fail or garble the Hindi text on platforms
# whose default is not UTF-8.
with open(mlqa_train_data, 'r', encoding='utf-8') as file:
    train_data = json.load(file)
with open(mlqa_test_data, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

In [None]:
def preprocess(dataset, tier):
    """Flatten a nested QA dataset dict into a list of per-question records.

    Walks ``dataset['data']`` -> ``['paragraphs']`` -> ``['qas']`` and emits
    one record per question, using only the first entry of its ``answers``
    list. Questions whose ``answers`` list is empty are skipped instead of
    raising ``IndexError``.

    Args:
        dataset: Parsed JSON dict with the ``data`` / ``paragraphs`` /
            ``qas`` / ``answers`` nesting read below.
        tier: Name of the split; used only in the progress-bar description.

    Returns:
        A list of dicts with the keys ``context``, ``question``,
        ``answer_text`` and ``answer_start``.
    """
    examples = []
    skipped = 0
    for article in tqdm(dataset['data'], desc=f'preprocessing {tier}'):
        for paragraph in article['paragraphs']:
            # Each replacement swaps two characters for two characters, so
            # the context length is unchanged and the character offsets in
            # ``answer_start`` remain valid.
            context = paragraph['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            for qn in paragraph['qas']:
                answers = qn['answers']
                if not answers:
                    # Nothing to extract for a question without answers.
                    skipped += 1
                    continue
                first_answer = answers[0]
                # NOTE(review): the answer text is not normalised like the
                # context, so an answer containing '' or `` would no longer
                # match the context substring at ``answer_start``.
                examples.append({
                    'context': context,
                    'question': qn['question'],
                    'answer_text': first_answer['text'],
                    'answer_start': first_answer['answer_start'],
                })
    print('num examples:', len(examples))
    if skipped:
        print('skipped (no answers):', skipped)
    return examples

In [None]:
# Flatten both loaded MLQA splits; the second argument only labels the
# progress bar of each run.
examples_train = preprocess(dataset=train_data, tier='dev')
examples_test = preprocess(dataset=test_data, tier='test')

In [None]:
# Merge both splits into one table and tag every row with its language.
examples = [*examples_train, *examples_test]
mlqa = pd.DataFrame(data=examples)
mlqa['language'] = 'hindi'

## XQuAD data

In [None]:
!cd data/ && git clone https://github.com/deepmind/xquad.git

In [None]:
# Load the Hindi XQuAD file and flatten it with the same `preprocess` routine
# used for MLQA.
# NOTE: this rebinds `train_data` and `examples_train` from the MLQA cells.
xquad_train_file = f'{DATA_PATH}/xquad/xquad.hi.json'
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# locale's default, which can fail or garble the Hindi text on platforms
# whose default is not UTF-8.
with open(xquad_train_file, 'r', encoding='utf-8') as file:
    train_data = json.load(file)
examples_train = preprocess(train_data, 'dev')
xquad = pd.DataFrame(examples_train)
xquad['language'] = 'hindi'

## Save data to files

In [None]:
# Write both tables next to the raw data, without the index column.
mlqa_csv_path = f'{DATA_PATH}/mlqa_hindi.csv'
xquad_csv_path = f'{DATA_PATH}/xquad.csv'
mlqa.to_csv(mlqa_csv_path, index=False)
xquad.to_csv(xquad_csv_path, index=False)

In [None]:
# Show the first five rows of each table as a quick sanity check.
display(xquad.head(n=5))
display(mlqa.head(n=5))