In [1]:
import sys
import html
import pandas
import pickle
import json
import spacy
import warnings
from IPython.core.display import display,HTML
# Silence warnings: some operations warn repeatedly inside a loop.
# NOTE(review): 'ignore' hides every occurrence, including the first one;
# the 'once' action would keep the first warning visible if that is wanted.
warnings.filterwarnings('ignore')
# Add the parent directory to the import path (presumably where the `aips` package lives).
sys.path.append('..')
# NOTE(review): this star import presumably supplies `requests` and `SOLR_URL`,
# which later cells use without importing them anywhere else — confirm.
from aips import *
# Name of the search collection queried by the retriever.
outdoors_collection="outdoors"
# NOTE(review): `path` is not referenced in the visible cells.
path = "./"

In [2]:
# Load the small English spaCy model used to part-of-speech tag questions.
nlp = spacy.load('en_core_web_sm')
# Drop the named-entity recognizer from the pipeline.
nlp.remove_pipe('ner')
# Append spaCy's built-in 'merge_noun_chunks' component, which merges each noun
# chunk into a single token, so a token's text may span several words.
# NOTE(review): create_pipe + add_pipe(component) is the spaCy v2 calling
# convention; spaCy v3 expects nlp.add_pipe('merge_noun_chunks') — confirm the pinned version.
merge_nps = nlp.create_pipe('merge_noun_chunks')
nlp.add_pipe(merge_nps)
# Lower-case determiner words that getQueryFromQuestion strips from queries.
determiners = 'all an another any both del each either every half la many much nary neither no some such that the them these this those'.split(' ')
def getQueryFromQuestion(question):
    """Reduce a natural-language question to a keyword query string.

    The question is parsed with the module-level spaCy pipeline ``nlp`` and
    only tokens tagged NOUN or VERB are kept. If no token qualifies, the whole
    question is used instead. Words listed in ``determiners`` are then removed
    (case-sensitive, whole-word match).

    Args:
        question: The question text.

    Returns:
        The remaining words joined by single spaces.
    """
    doc = nlp(question)
    terms = [tok.text for tok in doc if tok.pos_ in ('NOUN', 'VERB')]
    if not terms:
        # Nothing was tagged as a noun or verb: fall back to the raw question.
        terms = [question]
    # The pipeline merges noun chunks, so one token can hold several words
    # (determiner included). Re-split on whitespace and filter word by word;
    # replacing ' the ' with '' would glue the neighbouring words together
    # and would miss a determiner at the start or end of the query.
    words = ' '.join(terms).split()
    kept = [word for word in words if word not in determiners]
    # If every word was a determiner, keep them rather than return an empty query.
    return ' '.join(kept or words)

## Listing 14.16

In [3]:
def retriever(question):
    """Fetch candidate answer passages for a question from the search index.

    The question is condensed to a keyword query with getQueryFromQuestion and
    posted to the select handler of ``outdoors_collection``. The request
    searches the ``body`` field with edismax, applies the filter
    ``post_type_id:2`` and asks for at most 5 documents.

    Args:
        question: The question text.

    Returns:
        A pandas DataFrame with one row per returned document and the columns
        ``id``, ``question`` (the input question repeated), ``context`` (the
        document's ``body`` field) and ``url``.
    """
    search_request = {
        "query": getQueryFromQuestion(question),
        "fields": ["id","url","body"],
        "params": {
            "qf": ["body"],
            "fq": ["post_type_id:2"],
            "defType": "edismax",
            "rows": 5
        }
    }
    select_url = f"{SOLR_URL}/{outdoors_collection}/select"
    response = requests.post(select_url, json=search_request)
    hits = response.json()["response"]["docs"]
    # Pull the three stored fields out of every hit in one pass.
    extracted = [(hit["id"], hit["url"], hit["body"]) for hit in hits]
    return pandas.DataFrame({
        "id": [doc_id for doc_id, _, _ in extracted],
        "question": [question for _ in extracted],
        "context": [body for _, _, body in extracted],
        "url": [url for _, url, _ in extracted],
    })

In [4]:
# Try the retriever on one question and keep the resulting contexts frame.
example_contexts = retriever('What are minimalist shoes?')
# retriever requests 5 rows, so this slice displays every returned hit.
example_contexts[0:10]

Unnamed: 0,id,question,context,url
0,18376,What are minimalist shoes?,"Minimalist shoes or ""barefoot"" shoes are shoes...",https://outdoors.stackexchange.com/questions/1...
1,18370,What are minimalist shoes?,There was actually a project done on the defin...,https://outdoors.stackexchange.com/questions/1...
2,16427,What are minimalist shoes?,"One summer job, I needed shoes to walk on a ro...",https://outdoors.stackexchange.com/questions/1...
3,18375,What are minimalist shoes?,The answer to this question will vary on your ...,https://outdoors.stackexchange.com/questions/1...
4,13540,What are minimalist shoes?,"Barefoot Shoes Also known as minimalist shoes,...",https://outdoors.stackexchange.com/questions/1...


## Listing 14.17

In [5]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import tqdm

# Local directory from which both the question-answering model and its tokenizer are loaded.
# NOTE(review): the name suggests roberta-base-squad2 fine-tuned on the outdoors data — confirm.
model_name = '../data/outdoors/roberta-base-squad2-outdoors'

# Device index handed to the transformers pipeline: -1 selects the CPU.
device=-1 #CPU
#device=0 #<-- Uncomment to use GPU, if you are running in Google Colab

# Question-answering pipeline called by reader() once per retrieved context.
qa_nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device=device)

def reader(contexts):
    """Run the question-answering pipeline over every retrieved context.

    Args:
        contexts: DataFrame with ``question``, ``context``, ``id`` and ``url``
            columns, as produced by retriever.

    Returns:
        A list with one entry per row, in row order: the result returned by
        ``qa_nlp`` for that row, with the row's ``id`` and ``url`` added to it.
    """
    def answer_row(row):
        # Ask the model to extract an answer span from this row's context.
        prediction = qa_nlp({"question":row["question"],"context":row["context"]})
        # Tag the prediction with the document it came from.
        prediction["id"] = row["id"]
        prediction["url"] = row["url"]
        return prediction

    # Wrap the row iterator in a progress bar sized to the number of contexts.
    progress = tqdm.tqdm(contexts.iterrows(), total=len(contexts))
    return [answer_row(row) for _, row in progress]

## Listing 14.18

In [6]:
def reranker(answers):
    """Order answers from highest to lowest ``score``.

    Args:
        answers: Iterable of dicts, each with a ``score`` entry.

    Returns:
        A new list sorted by descending score; answers with equal scores keep
        their original relative order and the input is left untouched.
    """
    def score_of(answer):
        return answer['score']

    ranked = list(answers)
    ranked.sort(key=score_of, reverse=True)
    return ranked

## Listing 14.19

In [7]:
import urllib.parse
def ask(question):
    """Answer a question end to end and render the ranked answers as HTML.

    Runs retriever -> reader -> reranker, then displays a heading that links
    to a Stack Exchange search for the question, followed by one line per
    answer: a link to the source post labelled with its id, the extracted
    answer text in bold, and the model score in parentheses.

    Args:
        question: The question text.

    Returns:
        None; the output is rendered with IPython's display.
    """
    documents = retriever(question)
    answers = reader(documents)
    reranked = reranker(answers)

    stackexchange_search_url = 'https://outdoors.stackexchange.com/search?q='+urllib.parse.quote(question)
    # The question and the extracted answers are free text (the answers come
    # straight from post bodies), so everything is HTML-escaped before it is
    # spliced into markup; otherwise a stray '<' or '&' would break or inject HTML.
    display(HTML(
        '<h1><a href="' + html.escape(stackexchange_search_url) + '" target="_blank">'
        + html.escape(question) + '</a></h1>'
    ))
    for answer in reranked:
        # str() keeps the concatenation working if the id is not a string.
        display(HTML(
            '<a href="' + html.escape(str(answer["url"])) + '" target="_blank">'
            + html.escape(str(answer["id"])) + '</a>&nbsp;<strong>'
            + html.escape(str(answer["answer"])) + '</strong><em>('
            + str(answer["score"]) + ')</em>'
        ))

In [8]:
# Run the full pipeline for this question; ask() renders the ranked answers as HTML.
ask('What is the best mosquito repellant?')

100%|██████████| 5/5 [00:00<00:00,  5.41it/s]


In [9]:
# Run the full pipeline for this question; ask() renders the ranked answers as HTML.
ask('What is the best waterproof boot?')

100%|██████████| 5/5 [00:01<00:00,  3.88it/s]


In [10]:
# Run the full pipeline for this question; ask() renders the ranked answers as HTML.
ask('How many people fit inside a two-person tent?')

100%|██████████| 5/5 [00:00<00:00,  6.10it/s]


In [11]:
# Run the full pipeline for this question; ask() renders the ranked answers as HTML.
ask('What hiking boots work with crampons?')

100%|██████████| 5/5 [00:02<00:00,  2.11it/s]


In [12]:
# Run the full pipeline for this question; ask() renders the ranked answers as HTML.
ask('How far can one person hike in one day?')

100%|██████████| 5/5 [00:01<00:00,  2.55it/s]


In [13]:
# Run the full pipeline for this question; ask() renders the ranked answers as HTML.
ask('How much water does a person need each day?')

100%|██████████| 5/5 [00:01<00:00,  4.17it/s]


In [14]:
# Run the full pipeline for this question; ask() renders the ranked answers as HTML.
ask('What materials make good walking poles?')

100%|██████████| 5/5 [00:02<00:00,  2.23it/s]
