## Question Answering Demo Application

In [5]:
import sys
import html
import pandas
import pickle
import json
import spacy
import warnings
from IPython.display import display,HTML
# Silence all warnings: some operations warn repeatedly inside a loop and would
# otherwise flood the notebook output.
warnings.filterwarnings('ignore')
# Put the parent directory on the import path (presumably where `aips` lives — confirm).
sys.path.append("..")
from aips import *

# Search-engine handle and the collection that the retriever queries.
# NOTE(review): get_engine is not defined in this notebook; it presumably comes
# from the `aips` star import — confirm.
engine = get_engine()
outdoors_collection = engine.get_collection("outdoors")
# Directory prefix used later to locate the question-answering model.
path = "data/outdoors/"

In [6]:
# spaCy pipeline used by get_query_from_question for part-of-speech tags.
nlp = spacy.load("en_core_web_sm")
# Named-entity recognition is not used anywhere in this notebook, so drop it.
nlp.remove_pipe("ner")
# Merge every noun chunk into a single token so that multi-word noun phrases
# are kept together when tokens are selected by part of speech.
nlp.add_pipe("merge_noun_chunks")
# Words that get_query_from_question strips from the generated query.
determiners = "all an another any both del each either every half la many much nary neither no some such that the them these this those".split(" ")
def get_query_from_question(question):
    """Reduce a natural-language question to a keyword query.

    Keeps only the tokens that spaCy tags as nouns or verbs (noun chunks arrive
    as single multi-word tokens because of the ``merge_noun_chunks`` pipe),
    falls back to the whole question when no such token exists, and then drops
    determiners.

    Args:
        question: The question text.

    Returns:
        A space-separated keyword string. Determiners are removed wherever they
        occur -- including as the first or last word -- and regardless of case.
        If removing them would leave nothing, the unfiltered keywords are
        returned instead so the query is never empty because of the filter.
    """
    keywords = [tok.text for tok in nlp(question) if tok.pos_ in ("NOUN", "VERB")]
    if not keywords:
        keywords = [question]

    # Filter word by word rather than replacing " <determiner> " substrings:
    # a merged noun chunk such as "the best mosquito repellent" starts with its
    # determiner, which a space-delimited substring match can never hit.
    words = " ".join(keywords).split()
    blocked = {d.lower() for d in determiners}
    kept = [word for word in words if word.lower() not in blocked]
    return " ".join(kept or words)

## Listing 14.15

In [7]:
def retriever(question):
    """Fetch candidate contexts for a question from the outdoors collection.

    The question is turned into a keyword query, searched against the ``body``
    field with an edismax request limited to 5 rows and filtered to
    ``post_type_id:2`` (presumably answer posts -- confirm against the index).

    Args:
        question: The question text.

    Returns:
        A pandas DataFrame with the columns ``id``, ``question``, ``context``
        and ``url``; one row per returned document, with ``question`` repeated
        on every row and ``context`` holding the document body.
    """
    search_request = {
        "query": get_query_from_question(question),
        "fields": ["id", "url", "body"],
        "params": {
            "qf": ["body"],
            "fq": ["post_type_id:2"],
            "defType": "edismax",
            "rows": 5,
        },
    }
    response = outdoors_collection.search(search_request)

    # One pass over the hits, collected as (id, url, question, body) tuples.
    hits = [(hit["id"], hit["url"], question, hit["body"])
            for hit in engine.docs_from_response(response)]
    if hits:
        ids, urls, questions, bodies = (list(column) for column in zip(*hits))
    else:
        ids, urls, questions, bodies = [], [], [], []

    return pandas.DataFrame({"id": ids,
                             "question": questions,
                             "context": bodies,
                             "url": urls})

In [8]:
# Try the retriever on a sample question and preview the result. The request
# asks for at most 5 rows, so the slice below shows everything returned.
example_contexts = retriever("What is the best mosquito repellent?")
example_contexts[0:10]

Unnamed: 0,id,question,context,url
0,16459,What is the best mosquito repellent?,"Screens are the best solution, and the only so...",https://outdoors.stackexchange.com/questions/1...
1,1116,What is the best mosquito repellent?,According to one study of one species of mosqu...,https://outdoors.stackexchange.com/questions/1116
2,765,What is the best mosquito repellent?,Physical barriers are my deterrent of choice. ...,https://outdoors.stackexchange.com/questions/765
3,1332,What is the best mosquito repellent?,"CO 2 traps, according to the NIH , fare better...",https://outdoors.stackexchange.com/questions/1332
4,4311,What is the best mosquito repellent?,"Anecdotally - yes. I've heard it is possible, ...",https://outdoors.stackexchange.com/questions/4311


## Listing 14.16

In [14]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import tqdm

# Location of the question-answering model, resolved relative to `path`.
# NOTE(review): presumably a checkpoint produced earlier in this chapter —
# confirm it exists locally before running this cell.
model_name = path + "roberta-base-squad2-outdoors"

# Device index handed to the pipeline below.
device = -1 #CPU
#device=0 #<-- Uncomment to use GPU, if you are running in Google Colab

# Question-answering pipeline; the same location supplies both the model and
# the tokenizer.
qa_nlp = pipeline("question-answering", model=model_name,
                  tokenizer=model_name, device=device)

def reader(contexts):
    """Run the question-answering pipeline over every retrieved context.

    Args:
        contexts: DataFrame with ``question``, ``context``, ``id`` and ``url``
            columns, as produced by ``retriever``.

    Returns:
        A list with one entry per row, in row order: the pipeline's result for
        that row with the row's ``id`` and ``url`` added to it.
    """
    predictions = []
    progress = tqdm.tqdm(contexts.iterrows(), total=len(contexts))
    for _index, record in progress:
        qa_input = {"question": record["question"],
                    "context": record["context"]}
        prediction = qa_nlp(qa_input)
        # Carry the source document's identity along with the extracted answer.
        for key in ("id", "url"):
            prediction[key] = record[key]
        predictions.append(prediction)
    return predictions

## Listing 14.17

In [15]:
def reranker(answers):
    """Order answers from highest to lowest score.

    Args:
        answers: Iterable of dicts, each carrying a ``score`` entry.

    Returns:
        A new list sorted by ``score`` in descending order. Entries with equal
        scores keep their input order, and the input itself is not modified.
    """
    def score_of(candidate):
        return candidate["score"]

    ranked = list(answers)
    ranked.sort(key=score_of, reverse=True)
    return ranked

## Listing 14.18

In [16]:
import urllib.parse
def ask(question):
    """Answer a question end to end and render the ranked answers as HTML.

    Runs ``retriever`` -> ``reader`` -> ``reranker`` and displays a heading
    that links to a Stack Exchange search for the question, followed by one
    line per answer: a link to the source post (labelled with its id), the
    extracted answer text, and the score in parentheses.

    Args:
        question: The question text.

    Returns:
        None. The output is rendered with ``IPython.display.display``.
    """
    # Imported locally so that escaping does not depend on the notebook-level
    # name `html`, which a star import may rebind.
    from html import escape

    documents = retriever(question)
    answers = reader(documents)
    reranked = reranker(answers)

    stackexchange_search_url = ("https://outdoors.stackexchange.com/search?q="
                                + urllib.parse.quote(question))
    # The question and the answer fields are free text (user input and indexed
    # post bodies), so everything is HTML-escaped before being put in markup.
    heading = (f'<h1><a href="{escape(stackexchange_search_url)}" target=_blank>'
               f'{escape(question)}</a></h1>')
    display(HTML(heading))

    for answer in reranked:
        link = escape(str(answer["url"]))
        # str() so that a non-string id does not break the rendering.
        label = escape(str(answer["id"]))
        text = escape(str(answer["answer"]))
        score = escape(str(answer["score"]))
        display(HTML(f'<a href="{link}" target=_blank>{label}</a>&nbsp;'
                     f'<strong>{text}</strong><em>({score})</em>'))

In [17]:
# First end-to-end run. Note the spelling "repellant" here, versus "repellent"
# in the retriever example earlier in the notebook.
ask("What is the best mosquito repellant?")

100%|██████████| 5/5 [00:01<00:00,  3.61it/s]


In [None]:
# Another "what is the best ..." question, on a different topic.
ask("What is the best waterproof boot?")

100%|██████████| 5/5 [00:01<00:00,  3.57it/s]


In [None]:
# A "how many" question.
ask("How many people fit inside a two-person tent?")

100%|██████████| 5/5 [00:00<00:00,  5.40it/s]


In [None]:
# A question relating two kinds of gear.
ask("What hiking boots work with crampons?")

100%|██████████| 5/5 [00:02<00:00,  1.81it/s]


In [None]:
# A "how far" question.
ask("How far can one person hike in one day?")

100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


In [None]:
# A "how much" question.
ask("How much water does a person need each day?")

100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


In [None]:
# A question about materials.
ask("What materials make good walking poles?")

100%|██████████| 5/5 [00:02<00:00,  1.80it/s]


Up next: [Chapter 15 - Foundation Models and Emerging Search Paradigms](../ch15/1.llm-exploration.ipynb)