In [1]:
import sys
import html
import pandas
import pickle
import json
import spacy
import warnings
from IPython.core.display import display,HTML
# Silence all Python warnings: some operations warn repeatedly inside a loop.
# Note that 'ignore' hides every occurrence; the 'once' action would still show the first one.
warnings.filterwarnings('ignore')
# Put the parent directory on the import path (presumably where the `aips` module imported next lives).
sys.path.append('..')
from aips import *

In [2]:
# Name of the Solr collection that getCandidateContexts queries (appended to solr_url).
outdoors_collection="outdoors"
# Base path for local files; not referenced by the code visible in this part of the notebook.
path = "./"

In [3]:
# Load spaCy's large English model; getQueryFromQuestion uses it to parse questions.
nlp = spacy.load('en_core_web_lg')
# Drop the named-entity recognizer: only dependency labels (tok.dep_) are read from the parse.
nlp.remove_pipe('ner')
# Add spaCy's built-in 'merge_noun_chunks' component so each noun phrase is
# retokenized into a single token.
# NOTE(review): create_pipe() followed by add_pipe(component) is the spaCy v2 calling
# convention; spaCy v3 expects nlp.add_pipe('merge_noun_chunks') — confirm the pinned version.
merge_nps = nlp.create_pipe('merge_noun_chunks')
nlp.add_pipe(merge_nps)
# Words that getQueryFromQuestion strips out of the query text it builds.
determiners = 'all an another any both del each either every half la many much nary neither no some such that the them these this those'.split(' ')
def getQueryFromQuestion(question):
    """Reduce a natural-language question to a keyword query string.

    The question is parsed with the module-level spaCy pipeline ``nlp`` and the
    text of every token whose dependency label is ``nsubj`` is kept. When the
    parse yields no such token, the whole question is used instead. Words
    listed in the module-level ``determiners`` are then removed.

    Determiners are removed only when they stand alone as a word. The previous
    substring replacement also deleted those letters from inside other words
    (for example "repellant" became "repellt" and "weather" became "wear").
    Matching stays case-sensitive, as before.

    Args:
        question: The question text.

    Returns:
        str: The remaining text with runs of whitespace collapsed to single
        spaces. May be empty if nothing but determiners was selected.
    """
    import re  # local import: `re` is not among the notebook's explicit imports

    subjects = [tok.text for tok in nlp(question) if tok.dep_ == 'nsubj']
    query = ' '.join(subjects) if subjects else question

    # A determiner only counts when it is not glued to a letter, digit,
    # hyphen or apostrophe on either side ("the" matches, "weather",
    # "half-dome" and "that's" do not).
    alternatives = '|'.join(re.escape(d) for d in determiners)
    pattern = r"(?<![\w'-])(?:" + alternatives + r")(?![\w'-])"
    query = re.sub(pattern, '', query)

    # Removing words leaves stray gaps behind; normalise them.
    return ' '.join(query.split())

In [4]:
def getCandidateContexts(question):
    """Search the outdoors collection for documents that may answer `question`.

    The question is first condensed with getQueryFromQuestion, then posted to
    the collection's ``/select`` endpoint as an edismax query over the ``body``
    field, filtered to ``post_type_id:2`` and limited to five rows.

    Args:
        question: The question text.

    Returns:
        pandas.DataFrame: One row per returned document, with the columns
        ``id``, ``question`` (the question repeated on every row),
        ``context`` (the document's ``body``) and ``url``.
    """
    search_request = {
        "query": getQueryFromQuestion(question),
        "fields": ["id","url","body"],
        "params": {
          "qf": ["body"],
          "fq": ["post_type_id:2"],
          "defType": "edismax",
          "rows":5
        }
    }
    endpoint = solr_url + outdoors_collection + "/select"
    response = requests.post(endpoint, json=search_request)
    hits = response.json()["response"]["docs"]

    # Build the frame column by column, keeping the id/question/context/url order.
    return pandas.DataFrame({
        "id": [hit["id"] for hit in hits],
        "question": [question for _ in hits],
        "context": [hit["body"] for hit in hits],
        "url": [hit["url"] for hit in hits],
    })

In [5]:
# Fetch the candidate contexts for a sample question and preview up to the first ten rows.
contexts = getCandidateContexts('What is the best mosquito repellant?')
contexts[0:10]

Unnamed: 0,id,question,context,url
0,16459,What is the best mosquito repellant?,"Screens are the best solution, and the only so...",https://outdoors.stackexchange.com/questions/1...
1,1116,What is the best mosquito repellant?,According to one study of one species of mosqu...,https://outdoors.stackexchange.com/questions/1116
2,765,What is the best mosquito repellant?,Physical barriers are my deterrent of choice. ...,https://outdoors.stackexchange.com/questions/765
3,1332,What is the best mosquito repellant?,"CO 2 traps, according to the NIH , fare better...",https://outdoors.stackexchange.com/questions/1332
4,4311,What is the best mosquito repellant?,"Anecdotally - yes. I've heard it is possible, ...",https://outdoors.stackexchange.com/questions/4311


In [6]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import tqdm

# Model identifier handed to the pipeline; answerQuestion also passes it as the tokenizer name.
model_name = 'deepset/roberta-base-squad2'
# Task name passed as the first argument to transformers' pipeline().
pipeline_type = 'question-answering'

# Device index handed to the pipeline: -1 runs on the CPU, 0 selects a GPU.
device=-1 #CPU
#device=0 #<-- Uncomment to use GPU, if you are running in Google Colab

def answerQuestion(question):
    """Extract and rank answers to `question` from retrieved contexts.

    Candidate contexts are fetched with getCandidateContexts, then a
    question-answering pipeline (``model_name`` on ``device``) is run over
    each of them.

    The contexts retrieved for *this* question are the ones that get read.
    Previously the result was stored under a misspelled name and the loop
    iterated the notebook-level ``contexts`` variable instead, so the answers
    belonged to whichever question had been searched last.

    Args:
        question: The question text.

    Returns:
        list[dict]: One pipeline result per candidate context, each extended
        with the source document's ``id`` and ``url``, sorted by ``score`` in
        descending order. Empty when no context was retrieved.
    """
    # Retrieve first so a failing search surfaces before the model is loaded.
    candidates = getCandidateContexts(question)
    # NOTE: the pipeline is rebuilt on every call, as in the original code.
    qa_nlp = pipeline(pipeline_type, model=model_name, tokenizer=model_name, device=device)
    answers = []
    for idx, row in tqdm.tqdm(candidates.iterrows(), total=len(candidates)):
        result = qa_nlp({"question": row["question"], "context": row["context"]})
        # Keep the provenance next to the extracted span.
        result["id"] = row["id"]
        result["url"] = row["url"]
        answers.append(result)
    return sorted(answers, key=lambda k: k['score'], reverse=True)

In [9]:
question = 'What is the best mosquito repellant?'
answers = answerQuestion(question)
# The question, ids, URLs and extracted answer spans are plain text taken from
# retrieved documents, so each one is HTML-escaped before being placed into
# markup; an unescaped '<', '&' or '"' would break or alter the rendered page.
# str() guards the concatenation in case an id or url is not a string.
display(HTML('<h1>' + html.escape(question) + '</h1>'))
for answer in answers:
    link_target = html.escape(str(answer["url"]))
    link_label = html.escape(str(answer["id"]))
    answer_text = html.escape(str(answer["answer"]))
    display(HTML('<a href="' + link_target + '">' + link_label + '</a>&nbsp;<strong>' + answer_text + '</strong><em>(' + str(answer["score"]) + ')</em>'))

Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
