# Question Answering Data Preparation

In [1]:
import random
import sys

import pandas

sys.path.append('../..')
from aips import *
from IPython.display import HTML,display

# Obtain the search engine handle and the "outdoors" collection that the
# search helpers below query.
# NOTE(review): get_engine is presumably supplied by `from aips import *` — confirm.
engine = get_engine()
outdoors_collection = engine.get_collection("outdoors")

NOTE: This notebook depends upon the Outdoors dataset. If you have any issues, please rerun the [Setting up the Outdoors Dataset](../ch13/1.setting-up-the-outdoors-dataset.ipynb) notebook.

## Listing 14.4

In [2]:
def get_questions():
    """Return outdoors questions whose title begins with a question word.

    One search is issued per question word ("who", "what", "when", "where",
    "why", "how") against the title field, limited to 10000 documents and
    filtered on accepted_answer_id (presumably keeping only questions that
    have an accepted answer — verify against the engine's filter syntax).
    From each result set only documents whose lower-cased title starts with
    that word are kept.

    Returns:
        list of documents carrying the requested fields (id, url,
        owner_user_id, title, accepted_answer_id), grouped by question word
        in the order listed above.
    """
    matching_questions = []
    for question_word in ("who", "what", "when", "where", "why", "how"):
        search_args = {
            "query": question_word,
            "query_fields": ["title"],
            "return_fields": ["id", "url", "owner_user_id",
                              "title", "accepted_answer_id"],
            "filters": [("accepted_answer_id", "*")],
            "limit": 10000,
        }
        response = outdoors_collection.search(**search_args)
        for doc in response["docs"]:
            # The search matches the word anywhere in the title; keep only
            # titles that actually start with it.
            if doc["title"].lower().startswith(question_word):
                matching_questions.append(doc)
    return matching_questions

## Listing 14.5

In [3]:
def get_answers_from_questions(questions, batch_size=500):
    """Fetch the body of each question's accepted answer, in batches.

    Args:
        questions: iterable of question documents; each must provide an
            "accepted_answer_id" entry.
        batch_size: maximum number of answer ids sent in one search
            request. Must be at least 1.

    Returns:
        dict mapping int(answer id) -> answer body for every document the
        searches returned. Ids that were not returned are simply absent.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    # De-duplicate so each answer is requested once; sort so the batches are
    # identical from run to run.
    answer_ids = sorted({str(q["accepted_answer_id"]) for q in questions})
    answers = {}
    # Stepping with range() avoids math.ceil(), and with it a dependency on
    # `math`, which this notebook does not import explicitly.
    for start in range(0, len(answer_ids), batch_size):
        ids = answer_ids[start:start + batch_size]
        request = {"query": "(" + " ".join(ids) + ")",
                   "query_fields": "id",
                   "limit": batch_size,
                   "filters": [("post_type", "answer")],
                   "order_by": [("score", "desc")]}
        docs = outdoors_collection.search(**request)["docs"]
        # Keys are converted to int; callers look answers up by the
        # question's accepted_answer_id.
        answers.update({int(d["id"]): d["body"] for d in docs})
    return answers
    
def get_context_dataframe(questions):
    """Pair every question with the body of its accepted answer.

    Args:
        questions: sequence of question documents providing "id", "url",
            "title" and "accepted_answer_id". It is traversed more than
            once, so pass a list rather than a one-shot iterator.

    Returns:
        pandas.DataFrame with the columns id, question, context, url (in
        that order), one row per question in input order. "context" holds
        the accepted answer's body, or the literal string "Not found" when
        no answer was retrieved for that question.
    """
    answers = get_answers_from_questions(questions)
    # `answers` is keyed by int(answer id).
    # NOTE(review): this assumes accepted_answer_id is already an int — confirm.
    return pandas.DataFrame({
        "id": [q["id"] for q in questions],
        "question": [q["title"] for q in questions],
        "context": [answers.get(q["accepted_answer_id"], "Not found")
                    for q in questions],
        "url": [q["url"] for q in questions],
    })

In [4]:
# Collect the questions, attach each one's accepted answer as its context,
# and preview the first five rows.
questions = get_questions()
contexts = get_context_dataframe(questions)
display(contexts[0:5])

Unnamed: 0,id,question,context,url
0,4410,Who places the anchors that rock climbers use?,There are two distinct styles of free rock cli...,https://outdoors.stackexchange.com/questions/4410
1,5347,"Who places the bolts on rock climbing routes, ...",What you're talking about is Sport climbing. G...,https://outdoors.stackexchange.com/questions/5347
2,20662,Who gets the bill if you activate a PLB to hel...,"Almost always the victim gets the bill, but as...",https://outdoors.stackexchange.com/questions/2...
3,7623,What knot is this one? What are its purposes?,Slip knot It's undoubtably a slip knot that's ...,https://outdoors.stackexchange.com/questions/7623
4,11587,"What sort of crane, and what sort of snake?","To answer the snake part of it, looking at som...",https://outdoors.stackexchange.com/questions/1...


In [5]:
# Persist the question/context pairs (index column omitted).
contexts.to_csv("data/outdoors/qa-seed-contexts.csv", index=False)

## Listing 14.6

In [6]:
import torch

def get_processor_device():
    """Return the device index to run on: 0 if CUDA is available, else -1."""
    if torch.cuda.is_available():
        return 0
    return -1

def display_guesses(guesses):
    """Render the first ten guesses as an HTML table without the index column."""
    first_ten = pandas.DataFrame(guesses[0:10])
    table_markup = first_ten.to_html(index=False)
    display(HTML(table_markup))

In [7]:
from transformers import pipeline 
import tqdm

# Model identifier passed as both `model` and `tokenizer` to the
# question-answering pipeline in answer_questions.
model_name = "deepset/roberta-base-squad2"
# Device index for the pipeline: 0 when CUDA is available, otherwise -1.
device = get_processor_device()

def answer_questions(contexts, k=10):
    """Run the question-answering pipeline over the first k rows.

    Args:
        contexts: DataFrame with "question" and "context" columns.
        k: number of leading rows to process. If the frame has fewer than
            k rows, all of them are processed.

    Returns:
        list with one pipeline result per processed row, in row order.
    """
    nlp = pipeline("question-answering", model=model_name,
                   tokenizer=model_name, device=device)
    subset = contexts[0:k]
    guesses = []
    # Report the number of rows actually iterated; passing k directly would
    # leave the progress bar short of 100% when the frame has fewer rows.
    for _, row in tqdm.tqdm(subset.iterrows(), total=len(subset)):
        result = nlp({"question": row["question"],
                      "context": row["context"]})
        guesses.append(result)
    return guesses

In [8]:
# Answer every question in the frame, not just the default first 10.
# The recorded run below took roughly 13 minutes for 1663 rows.
guesses = answer_questions(contexts, k=len(contexts))

100%|██████████| 1663/1663 [12:54<00:00,  2.15it/s]


In [9]:
# Show the first ten guesses as an HTML table.
display_guesses(guesses)

score,start,end,answer
0.278927,474,516,a local enthusiast or group of enthusiasts
0.200849,81,117,the person who is creating the climb
0.018632,14,24,the victim
0.222317,29,38,slip knot
0.000551,1255,1262,aquatic
0.374998,15,40,a high-tech treasure hunt
0.046053,171,249,camping in a location where electric/water/sewer connections are not available
0.247008,227,265,the traditional longbow made from wood
0.480407,408,473,shoes intended to closely approximate barefoot running conditions
0.563754,192,232,"a tube of lightweight, stretchy material"


In [10]:
# Attach each row's guess as a new "answers" column (this modifies
# `contexts` in place) and save the result for manual review.
contexts["answers"] = guesses
contexts.to_csv("data/outdoors/qa-squad2-guesses.csv", index=False)

## Listing 14.7

### **Manually labeling data**
*The above CSV file (data/outdoors/qa-squad2-guesses.csv) is used as a raw first pass at attempting to answer the questions.  This is then used with human-in-the-loop manual correction and labeling of the data.  There is no Python code that can do this for you.  The data MUST be labeled by an intelligent person with an understanding of the domain.  All further listings will use the 'golden set' - the manually corrected answer file, and not the guesses that were generated above.*

In [11]:
from datasets import Dataset, DatasetDict
# Seeds Python's built-in `random` module only. pandas' DataFrame.sample
# draws from NumPy's random state, so this call alone does not make a
# pandas shuffle reproducible.
random.seed(0)

def get_training_data(filename, seed=0):
    """Build shuffled train/test/validation datasets from the golden answers.

    Args:
        filename: path to a CSV with at least the columns id, url,
            question, context, class and gold. "gold" holds one or more
            answer strings separated by "|".
        seed: random_state used for the shuffle, so the splits are the same
            on every run. pandas' sample() does not use Python's `random`
            module, so seeding that module has no effect here.

    Returns:
        DatasetDict with "train", "test" and "validation" datasets. The
        first 75% of the shuffled rows form the training set; the remainder
        is divided by the arithmetic below, which works out to roughly 20%
        test and 5% validation of the total.
    """
    golden_answers = pandas.read_csv(filename)
    # Keep only rows that have a class label. A `!= None` comparison is
    # evaluated element-wise and is True for every row (missing values
    # included), so it filters nothing; notna() performs the real check.
    golden_answers = golden_answers[golden_answers["class"].notna()]
    qa_data = []
    for _, row in golden_answers.iterrows():
        answers = row["gold"].split("|")
        starts = [row["context"].find(a) for a in answers]
        # Skip rows where any gold answer is not found verbatim in the
        # context: there is no valid answer_start for it.
        if -1 in starts:
            continue
        row["title"] = row["question"]
        row["answers"] = {"text": answers, "answer_start": starts}
        qa_data.append(row)
    columns = ["id", "url", "title", "question", "context", "answers"]
    df = pandas.DataFrame(qa_data, columns=columns).sample(
        frac=1, random_state=seed)
    train_split = int(len(df) * 0.75)
    eval_split = (int((len(df) - train_split) / 1.25) +
                  train_split - 1)
    train_dataset = Dataset.from_pandas(df[:train_split])
    test_dataset = Dataset.from_pandas(df[train_split:eval_split])
    validation_dataset = Dataset.from_pandas(df[eval_split:])
    return DatasetDict({"train": train_dataset, "test": test_dataset,
                        "validation": validation_dataset})

# This golden answers file was labeled by me (Max Irwin).
# It took about 2-3 hours to label 200 question/answer rows.
# Doing so will give you a deeper appreciation for the difficulty of the NLP task.
# I *highly* encourage you to label even more documents, and re-run the fine-tuning tasks coming up.
datadict = get_training_data("data/outdoors/outdoors_golden_answers.csv")
# Despite its name, this is the directory for the training DatasetDict
# (it is the argument of the save_to_disk call below), not a model.
model_path = "data/question-answering/question-answering-training-set"

# Uncomment to write the train/test/validation splits to model_path.
#datadict.save_to_disk(model_path)

datadict

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 1243
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 331
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 84
    })
})

Up next: [Question Answering LLM Fine-tuning](3.question-answering-fine-tuning.ipynb)