# Question Answering Data Preparation

In [1]:
import html
import json
import pickle
import random
import sys

import pandas

sys.path.append('../..')
from aips import *

# Obtain the engine handle (get_engine presumably comes from the `aips` star
# import above -- confirm) and open the "outdoors" collection, which
# get_questions and get_context_dataframe search.
engine = get_engine()
outdoors_collection = engine.get_collection("outdoors")

NOTE: This notebook depends upon the Outdoors dataset. If you have any issues, please rerun the [Setting up the Outdoors Dataset](../ch13/1.setting-up-the-outdoors-dataset.ipynb) notebook.

## Listing 14.4

In [2]:
def get_questions():
    """Collect answered questions whose title opens with a question word.

    For each of "who", "what", "when", "where", "why" and "how", searches the
    ``title`` field of ``outdoors_collection`` for that word, filtered to
    documents with a populated ``accepted_answer_id``, and keeps only the hits
    whose title begins with the word (compared case-insensitively).

    Returns:
        list: The retained documents as returned by the search (the request
        asks for ``id``, ``url``, ``owner_user_id``, ``title`` and
        ``accepted_answer_id``), grouped in question-word order.
    """
    question_types = ["who", "what", "when", "where", "why", "how"]
    questions = []
    # Named `question_type`, not `type`, so the builtin is not shadowed.
    for question_type in question_types:
        request = {
            "query": question_type,
            "query_fields": ["title"],
            "return_fields": ["id", "url", "owner_user_id", "title", "accepted_answer_id"],
            "filters": [("accepted_answer_id", "*")],
            "limit": 10000
        }
        docs = outdoors_collection.search(**request)["docs"]
        # The search matches the word anywhere in the title; only titles that
        # start with the question word are kept.
        questions.extend(doc for doc in docs
                         if doc["title"].lower().startswith(question_type))
    return questions

## Listing 14.5

In [3]:
def get_context_dataframe(questions):
    """Pair each question with the body of its accepted answer.

    For every question, looks up the document in ``outdoors_collection`` whose
    ``id`` equals the question's ``accepted_answer_id`` and uses that
    document's ``body`` as the question's context.

    Args:
        questions: Iterable of dicts with ``id``, ``url``, ``title`` and
            ``accepted_answer_id`` keys (the shape produced by get_questions).

    Returns:
        pandas.DataFrame: Columns ``id``, ``question``, ``context`` and
        ``url``, one row per question whose accepted answer was found.
        Questions whose accepted answer is not returned by the search are
        skipped.
    """
    contexts = {"id": [], "question": [], "context": [], "url": []}
    for question in questions:
        request = {"query": "*",
                   "query_fields": ["body"],
                   "filters": [("id", question["accepted_answer_id"])],
                   "limit": 1,
                   "order_by": [("score", "desc")]}
        docs = outdoors_collection.search(**request)["docs"]
        if not docs:
            # The accepted answer is not in the collection; skip the question
            # rather than failing the whole run with an IndexError on docs[0].
            continue
        contexts["id"].append(question["id"])
        contexts["url"].append(question["url"])
        contexts["question"].append(question["title"])
        contexts["context"].append(docs[0]["body"])
    return pandas.DataFrame(contexts)

In [4]:
# Fetch the seed questions, then attach each one's accepted answer as context.
questions = get_questions()
contexts = get_context_dataframe(questions)
# Preview only the first ten rows.
contexts.head(10)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Persist the question/context pairs (without the DataFrame index) for reuse.
seed_contexts_path = "../../data/outdoors/question-answer-seed-contexts.csv"
contexts.to_csv(seed_contexts_path, index=False)

## Listing 14.6

In [None]:
import torch

def get_processor_device():
    """Return 0 when torch reports CUDA as available, otherwise -1."""
    if torch.cuda.is_available():
        return 0
    return -1

In [None]:
from transformers import pipeline
import tqdm

# Pipeline task and model checkpoint used by answer_questions below.
pipeline_type = "question-answering"
model_name = "deepset/roberta-base-squad2"

# Device index handed to the pipeline: whatever get_processor_device() reports.
device = get_processor_device()

def answer_questions(contexts, k=10):
    """Run the question-answering pipeline over the first ``k`` rows.

    Builds a pipeline from the module-level ``pipeline_type``, ``model_name``
    and ``device`` settings and asks it to answer each row's ``question``
    against that row's ``context``.

    Args:
        contexts: DataFrame with ``question`` and ``context`` columns.
        k: Maximum number of leading rows to process (default 10).

    Returns:
        list: One pipeline result per processed row, in row order.
    """
    nlp = pipeline(pipeline_type, model=model_name,
                   tokenizer=model_name, device=device)
    rows = contexts[0:k]
    guesses = []
    # Size the progress bar by the rows actually sliced, not by k, so it
    # reaches 100% when k is larger than the number of available rows.
    for _, row in tqdm.tqdm(rows.iterrows(), total=len(rows)):
        result = nlp({"question": row["question"], "context": row["context"]})
        guesses.append(result)
    return guesses

In [None]:
# Answer every question, store the model's guesses alongside the contexts,
# and preview the first ten results.
guesses = answer_questions(contexts, k=len(contexts))
contexts["answers"] = guesses
guesses[:10]

  6%|▋         | 105/1663 [00:23<06:10,  4.21it/s]

In [None]:
# Save the contexts together with the model's guesses (index column omitted).
squad2_guesses_path = "../../data/outdoors/question-answer-squad2-guesses.csv"
contexts.to_csv(squad2_guesses_path, index=False)

## Listing 14.7

### **Manually labeling data**
*The CSV file saved above (`../../data/outdoors/question-answer-squad2-guesses.csv`) is a raw first pass at answering the questions. It is then corrected and labeled manually, with a human in the loop. No Python code can do this for you: the data MUST be labeled by an intelligent person with an understanding of the domain. All further listings use the 'golden set' (the manually corrected answer file), not the guesses that were generated above.*

In [None]:
from datasets import Dataset, DatasetDict
random.seed(0)

def get_training_data(filename, seed=0):
    """Build train/test/validation datasets from the manually labeled CSV.

    A row is kept only when its ``class`` value is 0 or greater (blank classes
    are treated as -2 and therefore dropped) and every ``|``-separated answer
    in its ``gold`` column occurs verbatim in its ``context``. Rows whose
    ``gold`` or ``context`` is not a string (for example a blank cell read as
    NaN) are skipped as well. The kept rows are shuffled and split in order
    into train, test and validation.

    Args:
        filename: Path or file-like object handed to ``pandas.read_csv``. The
            file must provide ``id``, ``url``, ``question``, ``context``,
            ``gold`` and ``class`` columns.
        seed: ``random_state`` for the shuffle, so the splits are identical on
            every run. Python's ``random.seed`` has no effect on
            ``DataFrame.sample``; pass ``None`` for an unseeded shuffle.

    Returns:
        DatasetDict: ``"train"``, ``"test"`` and ``"validation"`` datasets.
        Each row has ``id``, ``url``, ``title``, ``question``, ``context`` and
        ``answers`` (a dict with ``text`` and ``answer_start`` lists).
    """
    golden_answers = pandas.read_csv(filename)
    golden_answers["class"] = golden_answers["class"].fillna(-2).astype(int)
    validated = golden_answers[golden_answers["class"] > -1]
    table = {"id": [], "url": [], "title": [], "question": [], "context": [], "answers": []}

    for _, row in validated.iterrows():
        gold, context = row["gold"], row["context"]
        if not isinstance(gold, str) or not isinstance(context, str):
            # A blank cell is read as NaN (a float); there is nothing to locate.
            continue
        answers = gold.split("|")
        starts = [context.find(answer) for answer in answers]
        if any(start < 0 for start in starts):
            # At least one gold answer is not a literal substring of the context.
            continue
        table["id"].append(row["id"])
        table["url"].append(row["url"])
        # The question text is stored under both "title" and "question".
        table["title"].append(row["question"])
        table["question"].append(row["question"])
        table["context"].append(context)
        table["answers"].append({"text": answers, "answer_start": starts})

    # Seeded shuffle: pandas draws from its own random state, not `random`.
    df = pandas.DataFrame(table).sample(frac=1, random_state=seed)
    # First 75% of rows train; of the remainder, int(rest / 1.25) - 1 rows go
    # to test and whatever is left goes to validation.
    train_split = int(len(df) * 0.75)
    eval_split = int((len(df) - train_split) / 1.25) + train_split - 1
    train_dataset = Dataset.from_pandas(df[:train_split])
    test_dataset = Dataset.from_pandas(df[train_split:eval_split])
    validation_dataset = Dataset.from_pandas(df[eval_split:])
    return DatasetDict({"train": train_dataset, "test": test_dataset, "validation": validation_dataset})

#This golden answers file was labeled by me (Max Irwin).
#It took about 2-3 hours to label 200 question/answer rows
#Doing so will give you a deeper appreciation for the difficulty of the NLP task.
#I *highly* encourage you to label even more documents, and re-run the fine-tuning tasks coming up.
golden_answers_path = "../../data/outdoors/outdoors_golden_answers_20210130.csv"
training_set_dir = "../../data/outdoors/question-answering-training-set"

# Build the splits from the hand-labeled file, save them, and show the summary.
data = get_training_data(golden_answers_path)
data.save_to_disk(training_set_dir)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 125
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 10
    })
})

Up next: [Question Answering LLM Fine-tuning](3.question-answering-GPU-fine-tuning.ipynb)