In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
torch.cuda.is_available()

False

In [3]:
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader
from haystack.pipelines import ExtractiveQAPipeline

#haystack contains a search system for retrieval and QA across documents.
#designed for large documents, but pipeline also works for single document QA 

INFO - haystack.document_stores.base -  Numba not found, replacing njit() with no-op implementation. Enable it with 'pip install numba'.
INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [4]:
# In-Memory Document Store
from haystack.document_stores import InMemoryDocumentStore
document_store = InMemoryDocumentStore() #to enable documents stored in local memory

INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0


In [5]:
doc_dir = "testdata"

dicts = convert_files_to_dicts(dir_path=doc_dir, split_paragraphs=True)

## if there are multiple documents for database, this puts all relevant docs into dictionary

INFO - haystack.utils.preprocessing -  Converting testdata\23919537.txt


In [6]:
dicts

[{'content': "TECHNICAL ADVANCE Open Access Tear fluid proteomics multimarkers for diabetic retinopathy screening Zsolt Torok1,6*, Tunde Peto2, Eva Csosz3, Edit Tukacs1,6, Agnes Molnar4, Zsuzsanna Maros Szabo1,6, Andras Berta5,7, Jozsef Tozser3,7, Andras Hajdu1, Valeria Nagy5, Balint Domokos6 and Adrienne Csutak5,7 Abstract Background: The aim of the project was to develop a novel method for diabetic retinopathy screening based on the examination of tear fluid biomarker changes. In order to evaluate the usability of protein biomarkers for pre  screening purposes several different approaches were used, including machine learning algorithms. Methods: All persons involved in the study had diabetes. Diabetic retinopathy (DR) was diagnosed by capturing 7  field fundus images, evaluated by two independent ophthalmologists. 165 eyes were examined (from 119 patients), 55 were diagnosed healthy and 110 images showed signs of DR. Tear samples were taken from all eyes and state  of the art nano H

In [7]:
##now writes the dict to haystack document store

document_store.write_documents(dicts)

In [8]:
# An in-memory TfidfRetriever based on Pandas dataframes
# retrievers narrow down Reader scope to smaller text units
# see haystack documentation -> other retrievers

from haystack.nodes import TfidfRetriever

retriever = TfidfRetriever(document_store=document_store)

INFO - haystack.nodes.retriever.sparse -  Found 1 candidate paragraphs from 1 docs in DB


In [9]:
# Reader scans text returned by retriever and extracts k-best answers
# Load a fine-tuned  model (e.g. RoBERTa QA = "deepset/roberta-base-squad2")
# alternatives (Reader): TransformersReader (leveraging the pipeline of the Transformers package)
# alternatives (Models): e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
# can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible"
# alternatively, QA models on model hub (https://huggingface.co/models)
#sota: ahotrod/albert_xxlargev1_squad2_512
#dmis-lab/biobert-large-cased-v1.1-squad
#

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.modeling.infer -  Got ya 15 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
INFO - haystack.modeling.infer -  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /|\  /w\  /w\  /w\  /w\  /w\  /w\  /|\
INFO - h

In [10]:
# Alternative example:
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

In [11]:
#sets pipeline to contain retriever and reader

pipe = ExtractiveQAPipeline(reader, retriever)

In [15]:
q1="what was the study objective?"
q2="what disease is being studied?"
q3="how many patient data samples were included in this study?"
q4="how much data was used to train the model?"
q5="what country was the study conducted in?"
q6="what organisation did the data come from?"
q7="what city did the data come from?"
q8="was the model tested on a separate dataset?"
q9="what type of data is used in this study?"

In [16]:
# Number of candidates the reader and retriever return
# Higher top_k for retriever = better accuracy (but slower)
qlist = [q1, q2, q3, q4, q5, q6, q7, q8, q9]
plist = qlist.copy() #keep same length
l = len(qlist)

for i in range(0,l):
    plist[i] = pipe.run(
            query=qlist[i], params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
        )

#p1 = pipe.run(
#    query=q1, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
#)

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.81s/ Batches]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.65s/ Batches]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.58s/ Batches]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.69s/ Batches]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.67s/ Batches]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|████████████████████████████████████████████████████████

In [22]:
for i in range(0,l):
    print_answers(plist[i], details='minimum')


Query: what was the study objective?
Answers:
[   {   'answer': 'to ensure the objectiv  ity of the assessment',
        'context': 'non diseased groups. In our pilot study, we intended to '
                   'ensure the objectiv  ity of the assessment by using the '
                   'following 6 different machine learning'},
    {   'answer': 'to develop a novel method for diabetic retinopathy '
                  'screening based on the examination of tear fluid biomarker '
                  'changes',
        'context': ' the project was to develop a novel method for diabetic '
                   'retinopathy screening based on the examination of tear '
                   'fluid biomarker changes. In order to ev'},
    {   'answer': 'to describe a pilot study, conducted as a first attempt to '
                  'examine the use of tear fluid proteomics for DR pre '
                  'screening',
        'context': ' this paper is to describe a pilot study, conducted as a '
      