## QA on Wikipedia with Dense Passage Retrieval

#### document_store

In [1]:
# Silence library logging. This runs before the haystack imports, matching the
# original statement order. logging.disable(level) sets a single process-wide
# threshold and each call replaces the previous one, so one WARNING call is
# sufficient: it suppresses WARNING and everything below it (including INFO).
import logging
logging.disable(logging.WARNING)

from haystack import Finder
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.database.elasticsearch import ElasticsearchDocumentStore

# Connection settings for the Elasticsearch instance and the index to query.
HOST = 'localhost'
PORT = 9200
INDEX_NAME = 'wikipedia_en'

# Empty username/password: the store is created without credentials.
document_store = ElasticsearchDocumentStore(host=HOST, port=PORT, username="", password="", index=INDEX_NAME)

#### retriever (ES)

In [2]:
# Elasticsearch-backed retriever over the document store created above; it is
# used below to fetch a first, larger pool of candidate documents.
from haystack.retriever.sparse import ElasticsearchRetriever
retriever_es = ElasticsearchRetriever(document_store=document_store)

In [3]:
# Size of the candidate pool fetched from Elasticsearch; the dense step further
# down re-ranks these candidates and keeps only a few of them.
ES_TOP_K = 100
question = 'Who is obama?'
# `docs` is indexed by position in later cells, so its order matters.
docs = retriever_es.retrieve(question, top_k=ES_TOP_K)

In [6]:
# Display the second retrieved document as a sanity check of the ES results.
docs[1]

Document(id='DYVLUHQBdihk5qAl5ZFu', text='Public image of Barack Obama\n\nBarack Obama, who was elected as the 44th President of the United States, has elicited a number of public perceptions regarding his personality and background. As the first African-American President of the United States, his race and culture played a prominent role, both positively and negatively. His relative youth (47 when elected) has alternately resulted in his being praised for his freshness and criticized for his inexperience. His temperament and demeanor have been praised for perceived unflappability, but criticized for a perception of lacking emotional attachment.\n\nObama, who is of biracial background, is regarded and self-identifies as African-American. His father was a black Kenyan from the Luo ethnic group and his mother was white of European descent, mainly of English lineage. Obama, who grew to adulthood after the Civil Rights Movement, had early life experiences that differed from most African Am

#### retriever (dense) - used only as an embedder

In [7]:
from haystack.retriever.dense import DensePassageRetriever

# The dense retriever is only used below through embed_queries / embed_passages;
# its own retrieve path is not called in this notebook.
retriever_dense = DensePassageRetriever(
    document_store=document_store,
    embedding_model='../models/dpr/multi_hf_bert_base.cp',
    use_gpu=True,
    batch_size=16,
    do_lower_case=True,
)

In [43]:
# Embed the single question; embed_queries takes a list, so wrap it.
q_vecs = retriever_dense.embed_queries([question])
print('#questions: ', len(q_vecs))

#questions:  1


In [9]:
# Pull the raw text out of every ES candidate, keeping the same order as `docs`
# so that positions in `p_vecs` line up with positions in `docs`.
passages = [doc.text for doc in docs]
print('#passages: ', len(passages))

# One embedding per passage.
p_vecs = retriever_dense.embed_passages(passages)

#passages:  100


In [24]:
import sys
import numpy as np
from os.path import join
# Put `../common` on the import path so the local `utils` module resolves.
sys.path.insert(1, join('..', 'common'))
from utils import get_faiss_gpu_index

# FAISS operates on C-contiguous float32 matrices; convert explicitly rather
# than relying on whatever dtype/layout the embedder happened to return.
p_vecs_array = np.ascontiguousarray(p_vecs, dtype=np.float32)
assert p_vecs_array.ndim == 2, "expected one embedding vector per passage"

# Take the dimensionality from the embeddings themselves instead of the
# previously hard-coded 768, so a different embedding model cannot silently
# mismatch the index.
faiss_index = get_faiss_gpu_index(d=p_vecs_array.shape[1])
faiss_index.add(p_vecs_array)

In [45]:
# Number of passages kept after dense re-ranking of the ES candidates.
DENSE_TOP_K = 3

# FAISS expects a C-contiguous float32 query matrix, one row per query.
# D holds the scores/distances and I the row positions of the best matches;
# the positions in I refer to the order in which passages were added, i.e.
# the order of `docs`.
D, I = faiss_index.search(np.ascontiguousarray(q_vecs, dtype=np.float32), DENSE_TOP_K)
print(I)

[[ 9 36  3]]


In [49]:
# Show the best dense match. The position is read from the search result
# instead of a number copied by hand from the printed output, so the cell stays
# correct when the question or the index contents change.
docs[I[0][0]].text

'Barack Obama (disambiguation)\n\nBarack Obama (born 1961) is an American attorney and politician who served as the 44th President of the United States from 2009 to 2017. \n\nBarack Obama may also refer to :\n\n\n'

#### reader (electra-base-squad2)

In [50]:
# Local directory holding the reader checkpoint; the same path supplies both the
# model weights and the tokenizer.
READER_DiR = "../models/electra-base-squad2"

# NOTE(review): use_gpu=0 presumably selects GPU device 0 rather than disabling
# the GPU - confirm against the TransformersReader documentation.
reader = TransformersReader(
    model=READER_DiR,
    tokenizer=READER_DiR,
    use_gpu=0,
)

In [51]:
# Map the dense search hits for the (single) query back to their documents.
# FAISS pads the result with -1 when it finds fewer than the requested number
# of neighbours; skip those, because docs[-1] would silently pick the last
# document instead of failing.
candidate_docs = [docs[i] for i in I[0] if i >= 0]

# Run extractive QA over the re-ranked candidates and keep the single best answer.
prediction = reader.predict(question=question, documents=candidate_docs, top_k=1)

In [52]:
# Display the raw prediction dict (answer text, context, offsets, probability
# and the metadata of the document the answer was extracted from).
prediction

{'question': 'Who is obama?',
 'answers': [{'answer': 'U.S. president Barack Obama.',
   'context': 'f the paternal grandfather of U.S. president Barack Obama. She is known for short as Sar',
   'offset_start': 152,
   'offset_end': 180,
   'probability': 0.9632195578607494,
   'score': None,
   'document_id': 'laFWUHQBdihk5qAldKdY',
   'meta': {'id': '16335015',
    'url': 'https://en.wikipedia.org/wiki?curid=16335015',
    'name': 'Sarah Onyango Obama'}}]}