# Explore [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) Dataset

In [3]:
import json
from collections import defaultdict
import pandas as pd

In [3]:
# Path to the SQuAD 2.0 training JSON, given relative to the current working directory.
train_path = "../data/train-v2.0.json"

Loading the training data

In [4]:
# Read the raw SQuAD JSON into memory.
# The contexts contain non-ASCII characters, so decode explicitly as UTF-8
# instead of relying on the platform's default text encoding.
with open(train_path, "r", encoding="utf-8") as f:
    train = json.load(f)

In [5]:
# Number of top-level entries in the loaded JSON object.
len(train)

2

In [6]:
# Inspect the top-level keys of the loaded JSON object.
train.keys()

dict_keys(['version', 'data'])

## Converting the JSON file to a cleaner DataFrame Representation

In [7]:
# Drop the "version" entry so that only "data" is left.
# NOTE: this mutates `train` in place; running the cell a second time raises
# KeyError because the key is already gone.
del train["version"]

In [8]:
# Confirm that a single top-level key remains.
len(train)

1

In [9]:
# Rebind `train` to the collection stored under "data".
# NOTE: from here on `train` is no longer the top-level dict, so this cell and
# the cells above it cannot be re-run without reloading the JSON first.
train = train["data"]

In [10]:
# Number of entries under "data" (one per article).
len(train)

442

Each item in the training data corresponds to one of the 442 Wikipedia articles used to create the dataset. Each article is divided into paragraphs; every paragraph provides a 'context' passage together with several questions about it, and each question may (possibly) have several answers.

In [22]:
# Accumulator mapping question id -> record; unseen keys default to an empty dict.
qa_dict = defaultdict(dict)

In [23]:
# Flatten the nested article -> paragraph -> question -> answer structure into
# flat records stored in `qa_dict`, keyed by question id.
#
# - A question whose "answers" list is empty contributes no record.
# - Records are keyed by question id, so when a question carries several
#   answers only the last one is kept. The answer text and its start offset
#   are always taken from the same answer entry.
for article in train:
    for para in article["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            for ans in qa["answers"]:
                qa_dict[qa["id"]] = {
                    "question": qa["question"],
                    "answer": ans["text"],
                    "answer_start": ans["answer_start"],
                    "is_impossible": qa["is_impossible"],
                    "context": context,
                }

In [26]:
# Build the DataFrame: each key of `qa_dict` (question id) becomes an index
# label and each field of its record becomes a column.
train_df = pd.DataFrame.from_dict(qa_dict, orient="index")

In [27]:
# Preview the first few rows.
train_df.head()

Unnamed: 0,question,answer,answer_start,is_impossible,context
56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,in the late 1990s,269,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,singing and dancing,207,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,2003,526,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,"Houston, Texas",166,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,late 1990s,276,False,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...


In [28]:
# Persist the flattened table. The index (question ids) is written as the
# first column of the CSV, without a header name.
train_df.to_csv("../data/train_df.csv")

In [29]:
# Number of rows in the flattened table.
len(train_df)

86821

## Loading the DataFrame

In [4]:
# Reload the saved table. No index column is specified, so the rows get a
# default integer index and the saved question ids come back as an ordinary
# column (pandas names a header-less column "Unnamed: 0").
train_df = pd.read_csv("../data/train_df.csv")

In [6]:
# Show the full context passage of the first row, using a single label-based
# lookup (row label 0, column "context").
train_df.loc[0, "context"]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'