In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
# Report whether PyTorch can see a CUDA device; the readers created later in this notebook request use_gpu=True.
torch.cuda.is_available()

True

In [3]:
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.pipelines import ExtractiveQAPipeline

# Haystack provides a search system for retrieval and question answering (QA) across documents.
# It is designed for large document collections, but the pipeline also works for single-document QA.

INFO - haystack.document_stores.base -  Numba not found, replacing njit() with no-op implementation. Enable it with 'pip install numba'.
INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [4]:
# In-Memory Document Store
from haystack.document_stores import InMemoryDocumentStore

In [5]:
# Load the abstracts table; the first CSV column is used as the DataFrame index (index_col=0).
fulldf = pd.read_csv('data/included_abstracts.csv', index_col=0)

## extract abstracts for annotation

In [6]:
#annotatedf = pd.DataFrame({'document_identifier' : range(0,len(fulldf))})

In [7]:
#annotatedf['document_text'] = fulldf['abstract']

In [8]:
#sampled = annotatedf.sample(300)

In [9]:
#sampled.to_csv('data/sample_abstracts.csv')

## import docs and load into document store

In [43]:
# Take the 'abstract' column of the full table (the result is a pandas Series, not a plain list).
abstract_list = fulldf['abstract']
# Use each abstract's position (0, 1, 2, ...) as its stand-in title.
title_list = list(range(len(abstract_list)))

In [44]:
# Pick one abstract, together with its title, to run through the QA pipeline.
n = 55
# NOTE(review): abstract_list[n] indexes a Series (label lookup when the index is
# integer-valued) while title_list[n] is positional — confirm the CSV index is 0..N-1.
test, title = abstract_list[n], title_list[n]

In [45]:
# Wrap the selected abstract in a document dict: text under 'content', title under 'meta' -> 'name'.
test_dict = {'content': test, 'meta': {'name': title}}

In [46]:
# Display the document dict to check its content and metadata.
test_dict

{'content': "Heart failure (HF) is a major cause of mortality. Accurately monitoring HF progress and adjusting therapies are critical for improving patient outcomes. An experienced cardiologist can make accurate HF stage diagnoses based on combination of symptoms, signs, and lab results from the electronic health records (EHR) of a patient, without directly measuring heart function. We examined whether machine learning models, more specifically the XGBoost model, can accurately predict patient stage based on EHR, and we further applied the SHapley Additive exPlanations (SHAP) framework to identify informative features and their interpretations. Our results indicate that based on structured data from EHR, our models could predict patients' ejection fraction (EF) scores with moderate accuracy. SHAP analyses identified informative features and revealed potential clinical subtypes of HF. Our findings provide insights on how to design computing systems to accurately monitor disease progress

In [47]:
# Clean the document text and split it into passages for the reader.
# With split_by="word", split_length is counted in words; the recorded run
# shows this abstract coming back as a single passage.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=2000,
    split_respect_sentence_boundary=True,
)

# Pass the documents as a list so the reported input count is derived from
# what is actually processed instead of being hard-coded to 1.
docs_in = [test_dict]
docs_proc = preprocessor.process(docs_in)
print(f"n_docs_input: {len(docs_in)}\nn_docs_output: {len(docs_proc)}")

n_docs_input: 1
n_docs_output: 1


In [48]:
# Inspect the preprocessed passages returned by the PreProcessor.
docs_proc

[{'content': "Heart failure (HF) is a major cause of mortality. Accurately monitoring HF progress and adjusting therapies are critical for improving patient outcomes. An experienced cardiologist can make accurate HF stage diagnoses based on combination of symptoms, signs, and lab results from the electronic health records (EHR) of a patient, without directly measuring heart function. We examined whether machine learning models, more specifically the XGBoost model, can accurately predict patient stage based on EHR, and we further applied the SHapley Additive exPlanations (SHAP) framework to identify informative features and their interpretations. Our results indicate that based on structured data from EHR, our models could predict patients' ejection fraction (EF) scores with moderate accuracy. SHAP analyses identified informative features and revealed potential clinical subtypes of HF. Our findings provide insights on how to design computing systems to accurately monitor disease progres

In [49]:
# Create the document store that the retriever will search.
document_store = InMemoryDocumentStore() # documents are held in local memory

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


In [70]:
# Called without arguments; presumably removes every document from the store so it can be reloaded.
# NOTE(review): the execution counts show this cell (In [70]) ran after the write cell (In [68])
# and before the retriever cell (In [71]), whose log reports 0 docs — re-run the write cell after this one.
document_store.delete_documents()

In [68]:
# Write the preprocessed documents (the docs_proc list) to the Haystack document store.

document_store.write_documents(docs_proc)

## load pipeline components

In [71]:
# An in-memory TfidfRetriever based on pandas DataFrames.
# Retrievers narrow the Reader's scope down to smaller text units.
# See the Haystack documentation for other retriever types.
# NOTE(review): the log recorded under this cell reports 0 docs in the store —
# make sure documents have been written to document_store before this cell runs.

from haystack.nodes import TfidfRetriever

retriever = TfidfRetriever(document_store=document_store)

INFO - haystack.nodes.retriever.sparse -  Found 0 candidate paragraphs from 0 docs in DB
INFO - haystack.nodes.retriever.sparse -  Found 0 candidate paragraphs from 0 docs in DB


In [52]:
# The Reader scans the texts returned by the retriever and extracts the k best answers.
# Here it loads a fine-tuned QA model: RoBERTa ("deepset/roberta-base-squad2").
# Alternative reader: TransformersReader (uses the pipeline of the Transformers package).
# Alternative models: "distilbert-base-uncased-distilled-squad" (fast) or
#   "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy).
# no_ans_boost adjusts how readily the model returns "no answer possible";
#   higher values mean the model prefers "no answer possible".
# More QA models are available on the model hub (https://huggingface.co/models), e.g.:
#   ahotrod/albert_xxlargev1_squad2_512 (noted as SOTA)
#   dmis-lab/biobert-large-cased-v1.1-squad
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, use_confidence_scores=True)
# Build the extractive QA pipeline from the reader and the retriever.
pipe = ExtractiveQAPipeline(reader, retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 15 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
INFO - haystack.modeling.infer -  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /|\  /w\  /w\  /w\  /w\  /w\  /w\  /|\
INFO -

## pipeline with fine-tuned model

In [53]:
# Second pipeline: the reader loads the model stored under "pubmed_tuned"
# (the recorded log reports it was found locally); it reuses the same retriever as `pipe`.
tuned_reader = FARMReader(model_name_or_path="pubmed_tuned", use_gpu=True, use_confidence_scores=True)
tuned_pipe = ExtractiveQAPipeline(tuned_reader, retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Model found locally at pubmed_tuned
INFO - haystack.modeling.model.language_model -  Loaded pubmed_tuned
INFO - haystack.modeling.model.adaptive_model -  Found files for loading 1 prediction heads
INFO - haystack.modeling.model.prediction_head -  Loading prediction head from pubmed_tuned\prediction_head_0.bin
INFO - haystack.modeling.data_handler.processor -  Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for using the default task or add a custom task later via processor.add_task()
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modelin

## Questions

In [54]:
# Study topic and aim
q1="what disease is being studied?"
q2="What is the objective of the study?"
# Dataset size and data type
q3="how many patient data samples were included in this study?"
q4="what modality of data is used in this study?"
# Data provenance (country, hospital, database, organisation)
q5="what country was the study conducted in?"
q6="what hospital did the data come from?"
q7="What existing database did the data come from?"
q8="What organisation did the data come from?"
# Disabled questions on model performance and results
# (numbering skips q9 and uses q12 twice — renumber before enabling)
#q10="how does the model perform relative to a human?"
#q11="how does the model perform in prospective testing"
#q12="what were the results of the study?"
#q12="what was the area under the curve (AUC) value?"

## question answering (fine-tuned model)

In [62]:
# Ask every question of the fine-tuned pipeline.
# top_k is the number of candidates the retriever and the reader each return;
# a higher retriever top_k gives better accuracy but is slower.
qlist = [q1, q2, q3, q4, q5, q6, q7, q8]

# One prediction per question, in question order. Building the list directly
# means plist never holds placeholder question strings; a fresh params dict is
# created for every call, exactly as in the original loop.
plist = [
    tuned_pipe.run(
        query=question,
        params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    )
    for question in qlist
]
l = len(qlist)  # kept: the printing cell iterates with range(0, l)

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.70 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.16 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.42 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.66 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.97 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|████████████████████████████████████████████████████████

In [63]:
# Print every prediction held in plist at 'medium' detail, in question order.
for prediction in plist:
    print_answers(prediction, details='medium')


Query: what disease is being studied?
Answers:
[   {   'answer': 'Heart failure (HF)',
        'context': 'Heart failure (HF) is a major cause of mortality. '
                   'Accurately monitoring HF progress and adjusting therapies '
                   'are critical for improving patient outcome',
        'score': 0.6165068447589874}]

Query: What is the objective of the study?
Answers:
[   {   'answer': 'Accurately monitoring HF progress and adjusting therapies '
                  'are critical for improving patient outcomes. An experienced '
                  'cardiologist can make accurate HF stage diagnoses based on '
                  'combination of symptoms, signs, and lab results from the '
                  'electronic health records (EHR) of a patient, without '
                  'directly measuring heart function. We examined whether '
                  'machine learning models, more specifically the XGBoost '
                  'model, can accurately predict patient st

## question answering (original model)

In [57]:
# Ask every question of the original (not fine-tuned) pipeline.
# top_k is the number of candidates the retriever and the reader each return;
# a higher retriever top_k gives better accuracy but is slower.
qlist = [q1, q2, q3, q4, q5, q6, q7, q8]

# One prediction per question, in question order. Building the list directly
# means plist never holds placeholder question strings; a fresh params dict is
# created for every call, exactly as in the original loop.
# Note: this rebinds plist, replacing the fine-tuned model's predictions.
plist = [
    pipe.run(
        query=question,
        params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    )
    for question in qlist
]
l = len(qlist)  # kept: the printing cell iterates with range(0, l)

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.53 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.85 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.70 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.27 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.92 Batches/s]
  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|████████████████████████████████████████████████████████

In [58]:
# Print every prediction held in plist at 'medium' detail, in question order.
for prediction in plist:
    print_answers(prediction, details='medium')


Query: what disease is being studied?
Answers:
[   {   'answer': 'Heart failure',
        'context': 'Heart failure (HF) is a major cause of mortality. '
                   'Accurately monitoring HF progress and adjusting therapies '
                   'are critical for improving patient outcome',
        'score': 0.3558764010667801}]

Query: What is the objective of the study?
Answers:
[   {   'answer': 'Our findings provide insights on how to design computing '
                  'systems to accurately monitor disease progression of HF '
                  'patients',
        'context': 'ubtypes of HF. Our findings provide insights on how to '
                   'design computing systems to accurately monitor disease '
                   'progression of HF patients through conti',
        'score': 0.1781071275472641}]

Query: how many patient data samples were included in this study?
Answers:
[   {   'answer': 'EHR data',
        'context': 'e insights on how to design computing syste