## Task: Question Answering for NTT Healthcare






In [1]:
from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

In [2]:
# Connect to Elasticsearch

from haystack.database.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

09/13/2020 18:06:52 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:200 request:0.152s]
09/13/2020 18:06:52 - INFO - elasticsearch -   PUT http://localhost:9200/label [status:200 request:0.153s]


## Cleaning documents and inserting in elasticsearch





In [4]:
import os
doc_dir = "../data"
print(os.getcwd())

# Convert files to dicts
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# We now have a list of dictionaries that we can write to our document store.
# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.
# The default format here is: {"name": "<some-document-name>, "text": "<the-actual-text>"}
# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and
# can be accessed later for filtering or shown in the responses of the Finder)

# Let's have a look at the first 3 entries:
print(dicts[:3])

# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(dicts)
print("==============FINISHED TO INTRODUCE DOCUMENTS=================")

/home/teo/haystack/tutorials
[{'text': "Most eukaryotic genes are interrupted by intronic sequences that must be removed from pre-messenger RNAs by the splicing machinery. These introns are typically spliced out in a sequential order, resulting in the production of a linear mRNA. However, the splicing machinery can also ''backsplice'' and join a splice donor to an upstream splice acceptor (e.g., join the end of exon 2 to the beginning of exon 2), thereby generating a circular RNA with covalently linked ends ( Figure 1 ). Thousands of genes can generate circular RNAs that accumulate in the cytoplasm, but most rarely do so because backsplicing is far less efficient (1%) than canonical splicing (reviewed in Wilusz, 2018) . Nevertheless, some circular RNAs accumulate to high levels and sequester microRNAs or RNA binding proteins or, alternatively, serve as templates for translation. Most other individual circular RNAs are expressed at exceedingly low levels, so it has remained unclear what

09/13/2020 18:08:26 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.262s]
09/13/2020 18:08:28 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.077s]
09/13/2020 18:08:31 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.030s]
09/13/2020 18:08:33 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.109s]
09/13/2020 18:08:35 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.097s]
09/13/2020 18:08:38 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.108s]
09/13/2020 18:08:40 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.086s]
09/13/2020 18:08:42 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:2.112s]




## Initialize Retriever


In [4]:
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

In [5]:
# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store.

# from haystack.retriever.sparse import TfidfRetriever
# retriever = TfidfRetriever(document_store=document_store)

## Initialize model to search question in elasticsearch passages

In [6]:
# Load a  local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
print("=========teminado FARMReader")

09/11/2020 13:07:13 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
09/11/2020 13:07:13 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
09/11/2020 13:07:29 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
09/11/2020 13:07:30 - INFO - farm.infer -   Got ya 3 parallel workers to do inference ...
09/11/2020 13:07:30 - INFO - farm.infer -    0    0    0 
09/11/2020 13:07:30 - INFO - farm.infer -   /w\  /w\  /w\
09/11/2020 13:07:30 - INFO - farm.infer -   /'\  / \  /'\
09/11/2020 13:07:30 - INFO - farm.infer -       




In [7]:
# Alternative:
# reader = TransformersReader(model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

### Take the passages and find the best answer/passage

In [8]:
finder = Finder(reader, retriever)

## Question and Answer

In [19]:
# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers. 
prediction = finder.get_answers(question="does microsoft help to stop the covid??", top_k_retriever=10, top_k_reader=5)

09/11/2020 13:53:46 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.062s]
09/11/2020 13:53:47 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever
09/11/2020 13:53:47 - INFO - haystack.finder -   Reader is looking for detailed answer in 170592 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:12<00:00, 12.76s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:27<00:00, 27.68s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:08<00:00,  8.05s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:13<00:00, 13.48s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:27<00:00, 27.85s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:49<00:00, 24.96s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:20<00:00, 20.06s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:10<00:00, 10.78s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:30<00:00, 30.86s/ Batches]
Inferencing Samples: 100%

In [20]:
print_answers(prediction, details="minimal")

[   {   'answer': 'we do not suggest this for small virtual meetings as it '
                  'stagnates collaboration',
        'context': 'ntended interruptions. Importantly, we do not suggest this '
                   'for small virtual meetings as it stagnates collaboration. '
                   'Third, encourage participants to '},
    {   'answer': 'More data will likely reduce the confidence intervals for '
                  'the frequencies of each incubation day (Figure 1) , giving '
                  'a clearer picture of the actual frequency distribution of '
                  'all incubation periods',
        'context': ' More data will likely reduce the confidence intervals for '
                   'the frequencies of each incubation day (Figure 1) , giving '
                   'a clearer picture of the actual frequency distribution of '
                   'all incubation periods'},
    {   'answer': 'timely city lockdown prevented a subsequent outbreak burst',
        'c