# KorQuAD

In [2]:
import os
import datasets
from datasets import (
    Dataset,
    DatasetDict, 
    load_from_disk, 
    load_dataset,
    concatenate_datasets,
)
from datasets import Sequence, Value, Features

import pandas as pd

In [2]:
# Column accumulators: one entry per question, kept in lockstep.
answers, contexts, ids, questions, titles = [], [], [], [], []

path = "./data/korquad/KorQuAD_v1.0_dev.json"
dev = pd.read_json(path)

# Flatten the nested layout (data -> paragraphs -> qas) into the parallel lists.
for article in dev["data"].tolist():
    paragraph_list = article["paragraphs"]
    article_title = article["title"]
    for para in paragraph_list:
        qa_list = para["qas"]
        passage = para["context"]
        for qa in qa_list:
            # Gather every answer of this question into one dict of lists.
            answer_record = {
                "text": [ans["text"] for ans in qa["answers"]],
                "answer_start": [ans["answer_start"] for ans in qa["answers"]],
            }
            ids.append(qa["id"])
            questions.append(qa["question"])
            answers.append(answer_record)
            contexts.append(passage)
            titles.append(article_title)

In [3]:
# Display the length of every column list; they are appended in lockstep,
# so the numbers should all match.
tuple(len(column) for column in (answers, contexts, ids, questions, titles))

(5774, 5774, 5774, 5774, 5774)

In [4]:
# Build the dev DataFrame directly from the column lists (one row per question).
korquad_dev = pd.DataFrame(
    {
        "answers": answers,
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
    }
)
korquad_dev.shape

(5774, 5)

In [None]:
# Reset the column accumulators before reading the train split.
answers, contexts, ids, questions, titles = [], [], [], [], []

path = "./data/korquad/KorQuAD_v1.0_train.json"
train = pd.read_json(path)

# Flatten the nested layout (data -> paragraphs -> qas) into the parallel lists.
for article in train["data"].tolist():
    paragraph_list = article["paragraphs"]
    article_title = article["title"]
    for para in paragraph_list:
        qa_list = para["qas"]
        passage = para["context"]
        for qa in qa_list:
            # Gather every answer of this question into one dict of lists.
            answer_record = {
                "text": [ans["text"] for ans in qa["answers"]],
                "answer_start": [ans["answer_start"] for ans in qa["answers"]],
            }
            ids.append(qa["id"])
            questions.append(qa["question"])
            answers.append(answer_record)
            contexts.append(passage)
            titles.append(article_title)

In [6]:
# Build the train DataFrame directly from the column lists (one row per question).
korquad_train = pd.DataFrame(
    {
        "answers": answers,
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
    }
)
korquad_train.shape

(60407, 5)

In [7]:
# Explicit schema for the KorQuAD frames: "answers" is a variable-length
# sequence of (text, answer_start) pairs, every other column is a string.
# `length=-1` and `id=None` are the library defaults, so they are left implicit.
features = Features(
    {
        "answers": Sequence(
            feature={
                "text": Value("string"),
                "answer_start": Value("int32"),
            }
        ),
        **{
            column: Value("string")
            for column in ("context", "id", "question", "title")
        },
    }
)

In [8]:
# Convert both pandas frames into `Dataset` objects using the explicit schema.
# Each name is rebound from the DataFrame to the resulting Dataset, so re-run
# the DataFrame cells first before executing this cell a second time.
korquad_train = Dataset.from_pandas(df=korquad_train, features=features)
korquad_dev = Dataset.from_pandas(df=korquad_dev, features=features)

In [11]:
# Bundle the two splits under their split names and write them to disk.
korquad_splits = DatasetDict(
    {
        "train": korquad_train,
        "validation": korquad_dev,
    }
)
korquad_splits.save_to_disk("./data/korquad/")

# AI Hub Data

In [3]:
# Column accumulators for the AI Hub "normal" file; `classtypes` is extra
# compared with the KorQuAD cells.
answers, contexts, ids, questions, titles, classtypes = [], [], [], [], [], []

path = "./data/aihub/ko_nia_normal_squad_all.json"
normal = pd.read_json(path)

# Flatten the nested layout (data -> paragraphs -> qas) into the parallel lists.
for article in normal["data"].tolist():
    paragraph_list = article["paragraphs"]
    article_title = article["title"]
    for para in paragraph_list:
        qa_list = para["qas"]
        passage = para["context"]
        for qa in qa_list:
            # Gather every answer of this question into one dict of lists.
            answer_record = {
                "text": [ans["text"] for ans in qa["answers"]],
                "answer_start": [ans["answer_start"] for ans in qa["answers"]],
            }
            ids.append(qa["id"])
            questions.append(qa["question"])
            answers.append(answer_record)
            contexts.append(passage)
            titles.append(article_title)
            classtypes.append(qa["classtype"])

In [4]:
# Schema for the "normal" frame: the answers sequence plus five string columns
# (the KorQuAD columns and "classtype").
# `length=-1` and `id=None` are the library defaults, so they are left implicit.
features = Features(
    {
        "answers": Sequence(
            feature={
                "text": Value("string"),
                "answer_start": Value("int32"),
            }
        ),
        **{
            column: Value("string")
            for column in ("context", "id", "question", "title", "classtype")
        },
    }
)

In [5]:
# Replace the raw JSON frame bound to `normal` with the flattened
# one-row-per-question DataFrame.
normal = pd.DataFrame(
    {
        "answers": answers,
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
        "classtype": classtypes,
    }
)
normal.shape

(243425, 6)

In [6]:
# Convert the flattened frame into a `Dataset` with the explicit schema.
# `normal` is rebound from DataFrame to Dataset here.
normal = Dataset.from_pandas(df=normal, features=features)

In [7]:
# Load the raw "noanswer" JSON file.
# NOTE(review): the following cell reads this same file again, so this load
# is redundant when the notebook is run top to bottom.
path = "./data/aihub/ko_nia_noanswer_squad_all.json"
noanswer = pd.read_json(path_or_buf=path)

In [8]:
# Column accumulators for the "noanswer" file. Unlike the other AI Hub cells
# in this notebook, no answer spans are collected here and `answers` is left
# untouched.
contexts, ids, questions, titles, classtypes = [], [], [], [], []

path = "./data/aihub/ko_nia_noanswer_squad_all.json"
noanswer = pd.read_json(path)

# Flatten the nested layout (data -> paragraphs -> qas) into the parallel lists.
for article in noanswer["data"].tolist():
    paragraph_list = article["paragraphs"]
    article_title = article["title"]
    for para in paragraph_list:
        qa_list = para["qas"]
        passage = para["context"]
        for qa in qa_list:
            ids.append(qa["id"])
            questions.append(qa["question"])
            contexts.append(passage)
            titles.append(article_title)
            classtypes.append(qa["classtype"])

In [9]:
# Replace the raw JSON frame bound to `noanswer` with the flattened DataFrame.
# This split has no "answers" column.
noanswer = pd.DataFrame(
    {
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
        "classtype": classtypes,
    }
)
noanswer.shape

(100244, 5)

In [10]:
# Schema for the "noanswer" frame: five plain string columns and no answers
# sequence. `id=None` is the library default, so it is left implicit.
features = Features(
    {
        column: Value("string")
        for column in ("context", "id", "question", "title", "classtype")
    }
)

# `noanswer` is rebound from DataFrame to Dataset here.
noanswer = Dataset.from_pandas(df=noanswer, features=features)

In [11]:
# Column accumulators for the "clue" file; `clues` holds the per-question
# clue spans alongside the usual answer spans.
answers, clues, contexts, ids = [], [], [], []
questions, titles, classtypes = [], [], []

path = "./data/aihub/ko_nia_clue0529_squad_all.json"
clue = pd.read_json(path)

# Flatten the nested layout (data -> paragraphs -> qas) into the parallel lists.
for article in clue["data"].tolist():
    paragraph_list = article["paragraphs"]
    article_title = article["title"]
    for para in paragraph_list:
        qa_list = para["qas"]
        passage = para["context"]
        for qa in qa_list:
            # Gather every answer of this question into one dict of lists.
            answer_record = {
                "text": [ans["text"] for ans in qa["answers"]],
                "answer_start": [ans["answer_start"] for ans in qa["answers"]],
            }
            # Same shape for the clue spans stored under qa["clue"].
            clue_record = {
                "clue_text": [hint["clue_text"] for hint in qa["clue"]],
                "clue_start": [hint["clue_start"] for hint in qa["clue"]],
            }
            ids.append(qa["id"])
            questions.append(qa["question"])
            answers.append(answer_record)
            clues.append(clue_record)
            contexts.append(passage)
            titles.append(article_title)
            classtypes.append(qa["classtype"])

In [12]:
# Replace the raw JSON frame bound to `clue` with the flattened
# one-row-per-question DataFrame.
clue = pd.DataFrame(
    {
        "answers": answers,
        "clue": clues,
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
        "classtype": classtypes,
    }
)
clue.shape

(96663, 7)

In [13]:
# Schema for the "clue" frame: two variable-length sequences (answers and
# clue spans) followed by five string columns.
# `length=-1` and `id=None` are the library defaults, so they are left implicit.
features = Features(
    {
        "answers": Sequence(
            feature={
                "text": Value("string"),
                "answer_start": Value("int32"),
            }
        ),
        "clue": Sequence(
            feature={
                "clue_text": Value("string"),
                "clue_start": Value("int32"),
            }
        ),
        **{
            column: Value("string")
            for column in ("context", "id", "question", "title", "classtype")
        },
    }
)

# `clue` is rebound from DataFrame to Dataset here.
clue = Dataset.from_pandas(df=clue, features=features)

In [14]:
# Bundle the three AI Hub datasets under their names and write them to disk.
aihub_splits = DatasetDict(
    {
        "normal": normal,
        "noanswer": noanswer,
        "clue": clue,
    }
)
aihub_splits.save_to_disk("./data/aihub/")

# AI Hub Book

In [80]:
# Column accumulators for the book train file; `is_impossibles` holds the
# per-question "is_impossible" value.
answers, contexts, ids, questions, titles, is_impossibles = [], [], [], [], [], []

path = "./data/aihub_book/train.json"
book_train = pd.read_json(path)

# Flatten the nested layout (data -> paragraphs -> qas) into the parallel lists.
for article in book_train["data"].tolist():
    paragraph_list = article["paragraphs"]
    article_title = article["title"]
    for para in paragraph_list:
        qa_list = para["qas"]
        passage = para["context"]
        for qa in qa_list:
            # Gather every answer of this question into one dict of lists.
            answer_record = {
                "text": [ans["text"] for ans in qa["answers"]],
                "answer_start": [ans["answer_start"] for ans in qa["answers"]],
            }
            # Ids are stringified so they fit the string-typed "id" column.
            ids.append(str(qa["id"]))
            questions.append(qa["question"])
            answers.append(answer_record)
            contexts.append(passage)
            titles.append(article_title)
            is_impossibles.append(qa["is_impossible"])

In [81]:
# Replace the raw JSON frame bound to `book_train` with the flattened
# one-row-per-question DataFrame.
book_train = pd.DataFrame(
    {
        "answers": answers,
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
        "is_impossible": is_impossibles,
    }
)
book_train.shape

(900000, 6)

In [84]:
# Schema for the book frames: the answers sequence, four string columns and a
# boolean "is_impossible" column.
# `length=-1` and `id=None` are the library defaults, so they are left implicit.
features = Features(
    {
        "answers": Sequence(
            feature={
                "text": Value("string"),
                "answer_start": Value("int32"),
            }
        ),
        **{
            column: Value("string")
            for column in ("context", "id", "question", "title")
        },
        "is_impossible": Value("bool"),
    }
)

# `book_train` is rebound from DataFrame to Dataset here.
book_train = Dataset.from_pandas(df=book_train, features=features)

In [88]:
# Reset the column accumulators before reading the book validation file.
answers, contexts, ids, questions, titles, is_impossibles = [], [], [], [], [], []

path = "./data/aihub_book/valid.json"
book_valid = pd.read_json(path)

# Flatten the nested layout (data -> paragraphs -> qas) into the parallel lists.
for article in book_valid["data"].tolist():
    paragraph_list = article["paragraphs"]
    article_title = article["title"]
    for para in paragraph_list:
        qa_list = para["qas"]
        passage = para["context"]
        for qa in qa_list:
            # Gather every answer of this question into one dict of lists.
            answer_record = {
                "text": [ans["text"] for ans in qa["answers"]],
                "answer_start": [ans["answer_start"] for ans in qa["answers"]],
            }
            # Ids are stringified so they fit the string-typed "id" column.
            ids.append(str(qa["id"]))
            questions.append(qa["question"])
            answers.append(answer_record)
            contexts.append(passage)
            titles.append(article_title)
            is_impossibles.append(qa["is_impossible"])

In [89]:
# Replace the raw JSON frame bound to `book_valid` with the flattened
# one-row-per-question DataFrame.
book_valid = pd.DataFrame(
    {
        "answers": answers,
        "context": contexts,
        "id": ids,
        "question": questions,
        "title": titles,
        "is_impossible": is_impossibles,
    }
)
book_valid.shape

(50000, 6)

In [90]:
# Schema for the book frames: the answers sequence, four string columns and a
# boolean "is_impossible" column.
# `length=-1` and `id=None` are the library defaults, so they are left implicit.
features = Features(
    {
        "answers": Sequence(
            feature={
                "text": Value("string"),
                "answer_start": Value("int32"),
            }
        ),
        **{
            column: Value("string")
            for column in ("context", "id", "question", "title")
        },
        "is_impossible": Value("bool"),
    }
)

# `book_valid` is rebound from DataFrame to Dataset here.
book_valid = Dataset.from_pandas(df=book_valid, features=features)

In [91]:
# Bundle the two book splits under their split names and write them to disk.
book_splits = DatasetDict(
    {
        "train": book_train,
        "validation": book_valid,
    }
)
book_splits.save_to_disk("./data/aihub_book/")