# Question Answering Data Preparation

In [1]:
import sys
import html
import pandas
import pickle
import json
sys.path.append('..')
from aips import *
path = "../data/outdoors/"
engine = get_engine()
outdoors_collection = engine.get_collection("outdoors")

NOTE: This notebook depends upon the Outdoors dataset. If you have any issues, please rerun the [Setting up the Outdoors Dataset](../ch13/1.setting-up-the-outdoors-dataset.ipynb) notebook.

## Listing 14.4

In [2]:
def get_questions():
    question_types = ["who", "what", "when", "where", "why", "how"]
    questions = []
    for type in question_types:
        request = {
            "query": type,
            "fields": ["id", "url", "owner_user_id", "title", "accepted_answer_id"],
            "params": {
              "qf": ["title"],
              "fq": ["accepted_answer_id:[* TO *]"],
              "defType": "edismax",
              "rows": 10000
            }
        }
        docs = engine.docs_from_response(outdoors_collection.search(request))
        questions += [document for document in docs
                      if document["title"].lower().startswith(type)]  #Only titles starting with a question type
    return questions

## Listing 14.5

In [3]:
def get_context_dataframe(questions):
    contexts = {"id": [], "question":[], "context":[], "url": []}
    for question in questions:
        request = {
            "query": "*:*",
            "fields": ["body"],
            "params": {
              "fq": ["id:" + str(question["accepted_answer_id"])],
              "defType": "edismax",
              "rows": 1,
              "sort": "score desc"
            }
        }
        docs = engine.docs_from_response(outdoors_collection.search(request))
        contexts["id"].append(question["id"])
        contexts["url"].append(question["url"])
        contexts["question"].append(question["title"]),
        contexts["context"].append(docs[0]["body"])
    return pandas.DataFrame(contexts)

In [4]:
questions = get_questions()
contexts = get_context_dataframe(questions)
contexts[0:10]

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
#mkdir ../data/outdoors
contexts.to_csv("../data/outdoors/question-answer-seed-contexts.csv", index=False)
#contexts = pandas.read_csv(path+'data/question-answer-seed-contexts.csv')

## Listing 14.6

In [None]:
def get_processor_type():
    return 0 if torch.cuda.is_available() else -1

In [None]:
from transformers import pipeline
import tqdm

model_name = "deepset/roberta-base-squad2"
pipeline_type = "question-answering"

device = get_processor_type()

def answer_questions(contexts, k=10):
    nlp = pipeline(pipeline_type, model=model_name,tokenizer=model_name, device=device)
    guesses = []
    for _, row in tqdm.tqdm(contexts[0:k].iterrows(), total=k):
        result = nlp({"question": row["question"], "context": row["context"]})
        guesses.append(result)
    return guesses

In [None]:
guesses = answer_questions(contexts, k=len(contexts))
contexts["answers"] = guesses
print(guesses[0:10])
"""
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1662/1662 [43:19<00:00,  1.56s/it] 

[{'score': 0.2776225209236145, 'start': 474, 'end': 517, 'answer': 'a local enthusiast or group of enthusiasts.'}, {'score': 0.1954791247844696, 'start': 81, 'end': 118, 'answer': 'the person who is creating the climb.'}, {'score': 0.024139929562807083, 'start': 14, 'end': 24, 'answer': 'the victim'}, {'score': 0.3299234211444855, 'start': 29, 'end': 38, 'answer': 'slip knot'}, {'score': 0.0005422658286988735, 'start': 1255, 'end': 1262, 'answer': 'aquatic'}, {'score': 0.37733304500579834, 'start': 15, 'end': 41, 'answer': 'a high-tech treasure hunt.'}, {'score': 0.5653401613235474, 'start': 192, 'end': 233, 'answer': 'a tube of lightweight, stretchy material.'}, {'score': 0.10057392716407776, 'start': 125, 'end': 155, 'answer': 'the cheapest one of the three,'}, {'score': 0.7781326174736023, 'start': 68, 'end': 77, 'answer': 'blocking.'}, {'score': 0.2520507276058197, 'start': 227, 'end': 266, 'answer': 'the traditional longbow made from wood,'}]
"""

100%|██████████| 1663/1663 [09:29<00:00,  2.92it/s]

[{'score': 0.2789272964000702, 'start': 474, 'end': 516, 'answer': 'a local enthusiast or group of enthusiasts'}, {'score': 0.20084792375564575, 'start': 81, 'end': 117, 'answer': 'the person who is creating the climb'}, {'score': 0.018631743267178535, 'start': 14, 'end': 24, 'answer': 'the victim'}, {'score': 0.22231647372245789, 'start': 29, 'end': 38, 'answer': 'slip knot'}, {'score': 0.0005512409843504429, 'start': 1255, 'end': 1262, 'answer': 'aquatic'}, {'score': 0.3749975562095642, 'start': 15, 'end': 40, 'answer': 'a high-tech treasure hunt'}, {'score': 0.5637544989585876, 'start': 192, 'end': 232, 'answer': 'a tube of lightweight, stretchy material'}, {'score': 0.11091530323028564, 'start': 125, 'end': 154, 'answer': 'the cheapest one of the three'}, {'score': 0.805173933506012, 'start': 68, 'end': 76, 'answer': 'blocking'}, {'score': 0.24700817465782166, 'start': 227, 'end': 265, 'answer': 'the traditional longbow made from wood'}]





"\nSome weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n100%|██████████| 1662/1662 [43:19<00:00,  1.56s/it] \n\n[{'score': 0.2776225209236145, 'start': 474, 'end': 517, 'answer': 'a local enthusiast or group of enthusiasts.'}, {'score': 0.1954791247844696, 'start': 81, 'end': 118, 'answer': 'the person who is creating the climb.'}, {'score': 0.024139929562807083, 'start': 14, 'end': 24, 'answer': 'the victim'}, {'score': 0.3299234211444855, 'start': 29, 'end': 38, 'answer': 'slip knot'}, {'score': 0.0005422658286988735, 'start': 1255, 'end': 1262, 'answer': 'aquatic'}, {'score': 0.37733304500579834, 'start': 15, 'end': 41, 'answer': 'a high-tech treasure hunt.'}, {'score': 0.5653401613235474, 'start': 192, 'end': 233, 'answer': 'a tube

In [None]:
contexts.to_csv("../data/outdoors/question-answer-squad2-guesses.csv", index=False)

## Listing 14.7

### ***Manually labeling data**
*The above csv file (../data/question-answer-squad2-guesses.csv) is used as a raw first pass at attempting to answer the questions.  This is then used with human-in-the-loop manual correction and labelling of the data.  There is no python code that can do this for you.  The data MUST be labelled by an intelligent person with an understanding of the domain.  All further listings will use the 'golden set' - the manually corrected answer file, and not the guesses that were generated above.*

In [None]:
from datasets import Dataset, DatasetDict

def get_training_data(filename):
    golden_answers = pandas.read_csv(filename)
    golden_answers["class"] = golden_answers["class"].fillna(-2).astype(int)
    validated = golden_answers[golden_answers["class"] > -1]    
    table = {"id": [], "url": [], "title": [], "question": [], "context": [], "answers": []}
    
    for _, row in validated.iterrows():
        answers = row["gold"].split("|")
        starts = []
        missing = False
        for answer in answers:
            result = row["context"].find(answer)
            starts.append(result)
            if(result < 0):
                missing = True
        if not missing:
            table["id"].append(row["id"])
            table["url"].append(row["url"])
            table["title"].append(row["question"])
            table["question"].append(row["question"])
            table["context"].append(row["context"])
            table["answers"].append({"text": answers, "answer_start": starts})
            
    df = pandas.DataFrame(table).sample(frac=1)
    train_split = int(len(df) * 0.75)
    eval_split = int((len(df) - train_split) / 1.25) + train_split - 1
    train_dataset = Dataset.from_pandas(df[:train_split])
    test_dataset = Dataset.from_pandas(df[train_split:eval_split])
    validation_dataset = Dataset.from_pandas(df[eval_split:])
    return DatasetDict({"train": train_dataset, "test": test_dataset, "validation": validation_dataset})

#This golden answers file was labeled by me (Max Irwin).
#It took about 2-3 hours to label 200 question/answer rows
#Doing so will give you a deeper appreciation for the difficulty of the NLP task.
#I *highly* encourage you to label even more documents, and re-run the fine-tuning tasks coming up.
data = get_training_data(path + "outdoors_golden_answers_20210130.csv")
data.save_to_disk(path + "question-answering-training-set")
data

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 125
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 10
    })
})

Up next: [Question Answering LLM Fine-tuning](3.question-answering-GPU-fine-tuning.ipynb)