### Preprocessing

In [115]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

def proccess_data(json):
    """Flatten nested question-answering records into one row per question.

    Parameters
    ----------
    json : iterable of dict
        Each item must provide a ``"paragraphs"`` list. Every paragraph must
        provide a ``"context"`` value and a ``"qas"`` list, and every qa entry
        must provide ``"question"``, ``"is_impossible"`` and ``"answers"``
        (a list of dicts that each have a ``"text"`` key).
        NOTE: this parameter name shadows the imported ``json`` module inside
        the function; it is kept unchanged for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        Columns ``context``, ``answers_text``, ``is_impossible`` and
        ``question``. ``answers_text`` holds the bare answer text when a
        question has exactly one answer, otherwise the list of answer texts
        (possibly empty). Fully duplicated rows are removed, keeping the first
        occurrence; the surviving rows keep their original row labels (the
        index is not reset).
    """
    columns = ["context", "answers_text", "is_impossible", "question"]
    rows = []
    for article in json:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answer = [item["text"] for item in qa["answers"]]
                # Unwrap single answers so the common case is a plain value.
                if len(answer) == 1:
                    answer = answer[0]
                rows.append({
                    "context": context,
                    "answers_text": answer,
                    "is_impossible": qa["is_impossible"],
                    "question": qa["question"],
                })

    # Passing the column list keeps the schema stable even when no rows exist.
    df = pd.DataFrame(rows, columns=columns)
    if df.empty:
        return df

    # Lists are unhashable, so drop_duplicates() raises TypeError as soon as a
    # question has zero or several answers. Detect duplicates on a copy whose
    # list cells are converted to tuples, then filter the original frame so
    # the stored values (lists included) are returned untouched.
    hashable_answers = df["answers_text"].map(
        lambda value: tuple(value) if isinstance(value, list) else value
    )
    is_duplicate = df.assign(answers_text=hashable_answers).duplicated()
    return df[~is_duplicate]

In [116]:
# Read the raw training file and keep only the value stored under "data".
file_path = "./data/raw_data/newsqa_train.json"
with open(file_path, mode='r', encoding='utf-8') as file:
    raw_train = json.load(file)
data = raw_train["data"]

In [117]:
# Flatten the loaded training records into a DataFrame, one row per question.
train = proccess_data(data)
train.head()  # Preview the first rows of the resulting frame

Unnamed: 0,context,answers_text,is_impossible,question
0,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",19,False,What was the amount of children murdered ?
1,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Sudanese region of Darfur,False,Where was one employee killed ?
2,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,Archbishop Desmond Tutu,False,who did say South Africa did not issue a visa ...
3,-LRB- CNN -RRB- -- England international footb...,29-year-old,False,How many years old was the businessman ?
4,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",a series of killings and threats by Muslim ext...,False,What frightened the families ?


In [118]:
# Read the raw dev file; its "data" entry becomes the held-out test set.
file_path = "./data/raw_data/newsqa_dev.json"
with open(file_path, mode='r', encoding='utf-8') as file:
    raw_dev = json.load(file)
data = raw_dev["data"]

# Same flattening as for the training split.
test = proccess_data(data)
test.head()  # Preview the first rows of the resulting frame

Unnamed: 0,context,answers_text,is_impossible,question
0,"TEHRAN , Iran -LRB- CNN -RRB- -- Iran 's parli...",U.S. President-elect Barack Obama,False,Iran criticizes who ?
1,"LONDON , England -LRB- CNN -RRB- -- Israeli mi...",hit and set on fire,False,What happened to the U.N. compound ?
2,WASHINGTON -LRB- CNN -RRB- -- There are no imm...,President Obama,False,Who said there is no immediate plans for deplo...
3,"LOS ANGELES , California -LRB- CNN -RRB- -- Fo...",intends to follow up with,False,Will Lieberman investigate further ?
4,-LRB- CNN -RRB- -- A Colorado prosecutor Frida...,Tim Masters,False,Who spent nine years in prison ?


### Check data for obvious mistakes

In [119]:
# proccess_data leaves answers_text as a list whenever a question does not
# have exactly one answer, so a list here means an unexpected answer count.
assert not any(type(entry) is list for entry in test["answers_text"])
assert not any(type(entry) is list for entry in train["answers_text"])

In [120]:
# Neither split may contain a question flagged as impossible to answer.
assert not (train["is_impossible"].any() or test["is_impossible"].any())

In [121]:
# Every context must be strictly longer than this many characters.
min_text_length = 100
assert min(len(text) for text in train["context"]) > min_text_length
assert min(len(text) for text in test["context"]) > min_text_length

### Set cross-validation folds

In [126]:
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

# A fresh 0..n-1 index makes the positional indices yielded by split() valid
# row labels for .loc; -1 marks rows that have not been assigned a fold yet.
train = train.reset_index(drop=True).assign(fold=-1)

# Grouping by context keeps all questions about one text in the same fold.
groups = train['context']

for fold, (train_idx, val_idx) in enumerate(gkf.split(train, groups=groups)):
    # Each row lands in exactly one validation split; record which one.
    train.loc[val_idx, "fold"] = fold

train.head()  # Preview the frame with the new 'fold' column


Unnamed: 0,context,answers_text,is_impossible,question,fold
0,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",19,False,What was the amount of children murdered ?,2
1,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Sudanese region of Darfur,False,Where was one employee killed ?,2
2,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,Archbishop Desmond Tutu,False,who did say South Africa did not issue a visa ...,4
3,-LRB- CNN -RRB- -- England international footb...,29-year-old,False,How many years old was the businessman ?,2
4,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",a series of killings and threats by Muslim ext...,False,What frightened the families ?,4


In [142]:
# Number of distinct folds each context appears in; anything above one would
# mean the same text is shared between folds.
context_folds = train.groupby(by='context')['fold'].nunique()
assert not (context_folds > 1).any()

### Save data

In [143]:
# Persist both splits. index=True is the to_csv default, spelled out here
# because it writes the row index as an unnamed leading column.
test.to_csv(path_or_buf="./data/data/test.csv", index=True)
train.to_csv(path_or_buf="./data/data/train.csv", index=True)