In [1]:
import sys
import html
import pandas
import pickle
import json
sys.path.append('..')
from aips import *

path = "../data/outdoors/"
outdoors_collection="outdoors"

NOTE: This notebook depends upon the Outdoors dataset. If you have any issues, please rerun the [Setting up the Outdoors Dataset](../ch13/1.ch13-setting-up-the-outdoors-dataset.ipynb) notebook.

## Listing 14.4

In [2]:
def getQuestions():
    qtypes = ['Who','What','When','Where','Why','How']
    questions = []
    for qt in qtypes:
        lq = len(qt)
        request = {
            "query": qt,
            "fields": ["id", "url", "owner_user_id", "title", "accepted_answer_id"],
            "params": {
              "qf": ["title"],
              "fq": ["accepted_answer_id:[* TO *]"],
              "defType": "edismax",
              "rows":10000
            }
        }
        docs = requests.post(solr_url + outdoors_collection + "/select", json=request).json()["response"]["docs"]
        questions += [doc for doc in docs if doc['title'][0:lq]==qt] #Only titles starting with a question type
    return questions

## Listing 14.5

In [3]:
def getContextDataFrame(questions):
    contexts={"id":[],"question":[],"context":[],"url":[]}
    for question in questions:
        request = {
            "query": "*:*",
            "fields": ["body"],
            "params": {
              "fq": ["id:"+str(question["accepted_answer_id"])],
              "defType": "edismax",
              "rows":1,
              "sort":"score desc"
            }
        }
        docs = requests.post(solr_url + outdoors_collection + "/select", json=request).json()["response"]["docs"]
        contexts["id"].append(question["id"])
        contexts["url"].append(question["url"])
        contexts["question"].append(question["title"]),
        contexts["context"].append(docs[0]["body"])
    return pandas.DataFrame(contexts)

In [4]:
questions = getQuestions()
contexts = getContextDataFrame(questions)
contexts[0:10]

Unnamed: 0,id,question,context,url
0,4410,Who places the anchors that rock climbers use?,There are two distinct styles of free rock cli...,https://outdoors.stackexchange.com/questions/4410
1,5347,"Who places the bolts on rock climbing routes, ...",What you're talking about is Sport climbing. G...,https://outdoors.stackexchange.com/questions/5347
2,20662,Who gets the bill if you activate a PLB to hel...,"Almost always the victim gets the bill, but as...",https://outdoors.stackexchange.com/questions/2...
3,7623,What knot is this one? What are its purposes?,Slip knot It's undoubtably a slip knot that's ...,https://outdoors.stackexchange.com/questions/7623
4,11587,"What sort of crane, and what sort of snake?","To answer the snake part of it, looking at som...",https://outdoors.stackexchange.com/questions/1...
5,1660,What is Geocaching?,"In short, it's a high-tech treasure hunt. geoc...",https://outdoors.stackexchange.com/questions/1660
6,913,What is a buff?,To be honest I was dubious about getting somet...,https://outdoors.stackexchange.com/questions/913
7,4904,What Rope to purchase?,"Short answer: For your first rope, none of the...",https://outdoors.stackexchange.com/questions/4904
8,6397,What is a bloquers?,"I can only assume, that it derives from bloque...",https://outdoors.stackexchange.com/questions/6397
9,10173,What is a longbow?,The problem with the longbow discussion is tha...,https://outdoors.stackexchange.com/questions/1...


In [5]:
contexts.to_csv(path+'question-answer-seed-contexts.csv',index=False)
#contexts = pandas.read_csv(path+'data/question-answer-seed-contexts.csv')

## Listing 14.6

In [6]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import tqdm

model_name = 'deepset/roberta-base-squad2'
pipeline_type = 'question-answering'

device=-1 #CPU
#device=0 #<-- Uncomment to use GPU, if you are running in Google Colab

def answerQuestions(contexts,k=10):
    nlp = pipeline(pipeline_type, model=model_name, tokenizer=model_name, device=device)
    guesses = []
    for idx,row in tqdm.tqdm(contexts[0:k].iterrows(),total=k):
        result = nlp({"question":row["question"],"context":row["context"]})
        guesses.append(result)
    return guesses

In [7]:
guesses = answerQuestions(contexts,k=len(contexts))
contexts["answers"] = guesses
print(guesses[0:10])
"""
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1662/1662 [43:19<00:00,  1.56s/it] 

[{'score': 0.2776225209236145, 'start': 474, 'end': 517, 'answer': 'a local enthusiast or group of enthusiasts.'}, {'score': 0.1954791247844696, 'start': 81, 'end': 118, 'answer': 'the person who is creating the climb.'}, {'score': 0.024139929562807083, 'start': 14, 'end': 24, 'answer': 'the victim'}, {'score': 0.3299234211444855, 'start': 29, 'end': 38, 'answer': 'slip knot'}, {'score': 0.0005422658286988735, 'start': 1255, 'end': 1262, 'answer': 'aquatic'}, {'score': 0.37733304500579834, 'start': 15, 'end': 41, 'answer': 'a high-tech treasure hunt.'}, {'score': 0.5653401613235474, 'start': 192, 'end': 233, 'answer': 'a tube of lightweight, stretchy material.'}, {'score': 0.10057392716407776, 'start': 125, 'end': 155, 'answer': 'the cheapest one of the three,'}, {'score': 0.7781326174736023, 'start': 68, 'end': 77, 'answer': 'blocking.'}, {'score': 0.2520507276058197, 'start': 227, 'end': 266, 'answer': 'the traditional longbow made from wood,'}]
"""

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

  fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
100%|██████████| 1662/1662 [09:04<00:00,  3.05it/s]

[{'score': 0.2789272665977478, 'start': 474, 'end': 516, 'answer': 'a local enthusiast or group of enthusiasts'}, {'score': 0.2008480429649353, 'start': 81, 'end': 117, 'answer': 'the person who is creating the climb'}, {'score': 0.018631737679243088, 'start': 14, 'end': 24, 'answer': 'the victim'}, {'score': 0.22231696546077728, 'start': 29, 'end': 38, 'answer': 'slip knot'}, {'score': 0.0005512479692697525, 'start': 1255, 'end': 1262, 'answer': 'aquatic'}, {'score': 0.3749971091747284, 'start': 15, 'end': 40, 'answer': 'a high-tech treasure hunt'}, {'score': 0.5637548565864563, 'start': 192, 'end': 232, 'answer': 'a tube of lightweight, stretchy material'}, {'score': 0.1109151840209961, 'start': 125, 'end': 154, 'answer': 'the cheapest one of the three'}, {'score': 0.8051744699478149, 'start': 68, 'end': 76, 'answer': 'blocking'}, {'score': 0.24700796604156494, 'start': 227, 'end': 265, 'answer': 'the traditional longbow made from wood'}]





"\nSome weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n100%|██████████| 1662/1662 [43:19<00:00,  1.56s/it] \n\n[{'score': 0.2776225209236145, 'start': 474, 'end': 517, 'answer': 'a local enthusiast or group of enthusiasts.'}, {'score': 0.1954791247844696, 'start': 81, 'end': 118, 'answer': 'the person who is creating the climb.'}, {'score': 0.024139929562807083, 'start': 14, 'end': 24, 'answer': 'the victim'}, {'score': 0.3299234211444855, 'start': 29, 'end': 38, 'answer': 'slip knot'}, {'score': 0.0005422658286988735, 'start': 1255, 'end': 1262, 'answer': 'aquatic'}, {'score': 0.37733304500579834, 'start': 15, 'end': 41, 'answer': 'a high-tech treasure hunt.'}, {'score': 0.5653401613235474, 'start': 192, 'end': 233, 'answer': 'a tube

In [8]:
contexts.to_csv(path+'question-answer-squad2-guesses.csv',index=False)

## Listing 14.7

### ***Manually labeling data**
*The above csv file (data/question-answer-squad2-guesses.csv) is used as a raw first pass at attempting to answer the questions.  This is then used with human-in-the-loop manual correction and labelling of the data.  There is no python code that can do this for you.  The data MUST be labelled by an intelligent person with an understanding of the domain.  All further listings will use the 'golden set' - the manually corrected answer file, and not the guesses that were generated above.*

In [9]:
import datasets

def getTrainingData(filename):
    golden_answers = pandas.read_csv(filename)
    golden_answers["class"] = golden_answers["class"].fillna(-2).astype(int)
    validated=golden_answers[golden_answers["class"]>-1]
    
    table={"id":[],"url":[],"title":[],"question":[],"context":[],"answers":[]}
    
    for idx,row in validated.iterrows():
        answers = row["gold"].split('|')
        starts = []
        notfound = False
        for i in range(len(answers)):
            found = row["context"].find(answers[i])
            starts.append(found)
            if(found<0):
                notfound = True
        if not notfound:
            table["id"].append(row["id"])
            table["url"].append(row["url"])
            table["title"].append(row["question"])
            table["question"].append(row["question"])
            table["context"].append(row["context"])
            table["answers"].append({
                "text":answers,
                "answer_start":starts
            })
    df = pandas.DataFrame(table).sample(frac=1)
    train_split = int(len(df)*0.75)
    eval_split = int((len(df) - train_split)/1.25) + train_split - 1
    train_dataset = datasets.Dataset.from_pandas(df[:train_split])
    test_dataset = datasets.Dataset.from_pandas(df[train_split:eval_split])
    validation_dataset = datasets.Dataset.from_pandas(df[eval_split:])
    datadict = datasets.DatasetDict({'train':train_dataset,'test':test_dataset,'validation':validation_dataset})
    return datadict

#This golden answers file was labeled by me (Max Irwin).
#It took about 2-3 hours to label 200 question/answer rows
#Doing so will give you a deeper appreciation for the difficulty of the NLP task.
#I *highly* encourage you to label even more documents, and re-run the fine-tuning tasks coming up.
datadict = getTrainingData(path+'outdoors_golden_answers_20210130.csv')
datadict.save_to_disk(path+'question-answering-training-set')
datadict

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 125
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 10
    })
})