In [1]:
import pandas as pd
import numpy as np
import time
import re
from tqdm.notebook import tqdm

In [2]:
import torch
# Report whether PyTorch can see a CUDA-capable GPU in this environment.
torch.cuda.is_available()

False

In [3]:
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.pipelines import ExtractiveQAPipeline

# Haystack provides a search system for retrieval and question answering (QA) across documents.
# It is designed for large document collections, but the pipeline also works for single-document QA.

INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [4]:
# In-Memory Document Store
from haystack.document_stores import InMemoryDocumentStore

In [5]:
# Retrieve from store
from haystack.nodes import TfidfRetriever

# import file containing all abstracts

In [None]:
# Read every included abstract and parse the publication date column.
all_abstracts = pd.read_csv('data/included_abstracts.csv', index_col=0)
all_abstracts['article_date'] = pd.to_datetime(all_abstracts['article_date'])

# Keep only articles dated strictly after 2012-01-01 and strictly before
# 2022-01-01, then renumber the rows from 0.
article_dates = all_abstracts['article_date']
after_start = article_dates > '2012-01-01'
before_end = article_dates < '2022-01-01'
decade_df = all_abstracts[after_start & before_end].reset_index(drop=True)

# Number of abstracts in the selected window.
len(decade_df)

In [None]:
# Columns of the filtered frame used when building documents.
abstract_list, pmid_list = decade_df['abstract'], decade_df['pmid']

# One integer "title" per abstract (0 .. n-1), used as the document name.
title_list = list(range(len(abstract_list)))

In [None]:
# Preview the first entries of the abstract column.
abstract_list.head()

In [None]:
# Number of abstracts that will be processed.
len(abstract_list)

In [None]:
#n = 55
#test = abstract_list[n]
#title = title_list[n]
#
#test_dict = {'content': test, 'meta': {'name': title}}
#
#test_dict

# set up question bank

In [None]:
# Questions currently put to every abstract, in the order their answer columns
# are created.
qlist = [
    "what disease is being studied?",
    "how many patient data samples were included in this study?",
    "what modality of data is used in this study?",
    "What existing database did the data come from?",
    "What organisation did the data come from?",
]

# Individual names for the active questions (numbering follows the full bank).
q1, q3, q4, q7, q8 = qlist

# Questions from the bank that are currently disabled:
#   q2  "What is the objective of the study?"
#   q5  "what country was the study conducted in?"
#   q6  "what hospital did the data come from?"
#   q10 "how does the model perform relative to a human?"
#   q11 "how does the model perform in prospective testing"
#   q12 "what were the results of the study?"
#   q12 "what was the area under the curve (AUC) value?"

# initialising pre-processor module

In [None]:
# Cleaning and splitting options handed to the Haystack PreProcessor.
preprocessor_settings = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=2000,
    split_respect_sentence_boundary=True,
)
preprocessor = PreProcessor(**preprocessor_settings)

# In-memory store that holds the documents of the abstract being queried.
document_store = InMemoryDocumentStore()

# hugging face model

In [None]:
# Reader scans text returned by retriever and extracts k-best answers
# Load a fine-tuned  model (e.g. RoBERTa QA = "deepset/roberta-base-squad2")
# alternatives (Reader): TransformersReader (leveraging the pipeline of the Transformers package)
# alternatives (Models): e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
# can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible"
# alternatively, QA models on model hub (https://huggingface.co/models)
#sota: ahotrod/albert_xxlargev1_squad2_512
#dmis-lab/biobert-large-cased-v1.1-squad
#
#reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, use_confidence_scores=True)
#sets pipeline to contain retriever and reader
#pipe = ExtractiveQAPipeline(reader, retriever)

# question answer pipeline

In [None]:
##create dataframe to hold results

# Start from the identifying columns of each abstract.
base_columns = ['pmid', 'title', 'abstract']
resultsdf = decade_df[base_columns].copy()

# Add one empty object-typed column per question; the column name is the
# question text itself.
for q in qlist:
    resultsdf[q] = pd.Series(dtype='object')

# Show the row count and a short preview of the empty results table.
print(len(resultsdf))
resultsdf.head(2)

In [None]:
# Optional: uncomment to run on only the first 50 rows.
#testdf = resultsdf[0:50]
# Work on a copy so resultsdf itself stays as the untouched, empty template.
testdf = resultsdf.copy()
testdf.head()

In [None]:
# Build the retrieval and reading components of the QA pipeline.
# In top-to-bottom order nothing has been written to document_store yet; the
# processing loop further down creates a fresh retriever for every abstract.
retriever = TfidfRetriever(document_store=document_store)
# Reader loaded from "pubmed_tuned" (presumably a locally fine-tuned model — verify).
# NOTE(review): the CUDA check near the top of this notebook printed False while
# use_gpu=True is requested here — confirm the reader falls back to CPU as intended.
tuned_reader = FARMReader(model_name_or_path="pubmed_tuned", use_gpu=True, use_confidence_scores=True)
# Chain retriever and reader into one extractive QA pipeline.
tuned_pipe = ExtractiveQAPipeline(tuned_reader, retriever)

In [None]:
# Run every question in qlist against every abstract, one abstract at a time,
# and store the top answer per question in testdf.
start = time.time()

# (row label, error description) for every abstract that could not be processed.
# Failures are recorded instead of silently discarded, so empty answer cells in
# testdf can be traced back to their cause.
failed_rows = []

# total= lets tqdm show a real progress bar; iterrows() alone has no length.
for i, row in tqdm(testdf.iterrows(), total=len(testdf)):
    try:
        # Wrap the current abstract as a single document and pre-process it.
        abstract_dict = {'content': abstract_list[i], 'meta': {'name': title_list[i]}}
        docs_proc = preprocessor.process(abstract_dict)

        # Replace the contents of the document store with the current abstract only.
        document_store.delete_documents()
        document_store.write_documents(docs_proc)

        # Rebuild retriever and pipeline on the refreshed document store.
        retriever = TfidfRetriever(document_store=document_store)
        tuned_pipe = ExtractiveQAPipeline(tuned_reader, retriever)

        # One pipeline result per question, in qlist order.
        plist = [
            tuned_pipe.run(
                query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
            )
            for question in qlist
        ]

        # Row contents: identifying columns followed by the top answer per question.
        temp_list = [row['pmid'], row['title'], row['abstract']]
        temp_list.extend(result['answers'][0] for result in plist)

        testdf.loc[i] = temp_list

    except Exception as exc:
        # Best effort: skip this abstract but remember why. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the loop.
        failed_rows.append((i, repr(exc)))

#measure time
end = time.time()
print(end - start)

if failed_rows:
    print(f"{len(failed_rows)} of {len(testdf)} abstracts failed; see failed_rows for details")

In [None]:
# Persist the raw QA results so the cleaning steps can be re-run without
# repeating the QA loop.
testdf.to_csv('output/abstracts_interim.csv')

# clean data

In [6]:
# Reload the interim results from disk; the first CSV column is used as the index.
testdf = pd.read_csv('output/abstracts_interim.csv', index_col=0)

In [7]:
# Work on a copy in which every cell has been converted with str(), so the
# string parsing below can be applied to every answer column.
# NOTE(review): missing values become the literal text 'nan' after this conversion.
cleandf = testdf.copy().applymap(str)
cleandf.head()

Unnamed: 0,pmid,title,abstract,what disease is being studied?,how many patient data samples were included in this study?,what modality of data is used in this study?,What existing database did the data come from?,What organisation did the data come from?
0,35309968,mTeeth: Identifying Brushing Teeth Surfaces Using Wrist-Worn Inertial Sensors.,Ensuring that all the teeth surfaces are adequately covered during daily bru...,"<Answer: answer='oral diseases', score=0.30402445793151855, context='quately...","<Answer: answer='114', score=0.4147607386112213, context='tion. We annotate ...","<Answer: answer='inertial sensors', score=0.5593151301145554, context='tooth...",<Answer: answer='wrist-worn inertial sensor dataset collected from the natur...,"<Answer: answer='the natural environment', score=0.098945751786232, context=..."
1,35330785,Development of a Machine Learning Algorithm for Prediction of Complications ...,Reverse total shoulder arthroplasty (rTSA) offers tremendous promise for the...,"<Answer: answer='Reverse total shoulder arthroplasty', score=0.3776613026857...","<Answer: answer='2799', score=0.4943038523197174, context=' scores. The key ...",<Answer: answer='Office of Statewide Health Planning and Development databas...,<Answer: answer='Office of Statewide Health Planning and Development databas...,"<Answer: answer='Office of Statewide Health Planning and Development', score..."
2,35330977,ECG Restitution Analysis and Machine Learning to Detect Paroxysmal Atrial Fi...,Atrial fibrillation is the most frequent arrhythmia in both equine and human...,"<Answer: answer='paroxysmal atrial fibrillation', score=0.3911014795303345, ...","<Answer: answer='control and horses with PAF', score=0.11062590032815933, co...","<Answer: answer='normal sinus-rhythm ECGs', score=0.3160859942436218, contex...","<Answer: answer='normal sinus-rhythm ECGs', score=0.07146770507097244, conte...","<Answer: answer='horses with PAF', score=0.004044723929837346, context='ordi..."
3,35330920,Segmenting Thoracic Cavities with Neoplastic Lesions: A Head-to-head Benchma...,Automatic segmentation of thoracic cavity structures in computer tomography ...,"<Answer: answer='neoplastic disease', score=0.4418141692876816, context='per...","<Answer: answer='402', score=0.7768610417842865, context='uated using a data...","<Answer: answer='CT images', score=0.5029252022504807, context='-processing ...","<Answer: answer='402 cancer patients', score=0.757216066122055, context='ing...","<Answer: answer='402 cancer patients', score=0.19367507100105286, context='i..."
4,30814403,Combined machine learning and functional magnetic resonance imaging allows i...,Hypoxia exposure during high-altitude expedition cause psychomotor impairmen...,"<Answer: answer='psychomotor impairment', score=0.34731435775756836, context...","<Answer: answer='69', score=0.7202786207199097, context='utilized to predict...","<Answer: answer='Rs-fMRI', score=0.3410738706588745, context='calculated, wh...","<Answer: answer='Shaanxi-Tibet immigrant cohort', score=0.850153923034668, c...","<Answer: answer='Shaanxi-Tibet immigrant cohort', score=0.6297355592250824, ..."


In [8]:
## create lists for answer/score pairs
# One independent, empty list per (topic, score/answer) pair. The generator
# yields a fresh list for every name, so no two names share the same object.
(
    disease_score, disease_answer,
    question_score, question_answer,
    sample_score, sample_answer,
    modality_score, modality_answer,
    country_score, country_answer,
    hospital_score, hospital_answer,
    database_score, database_answer,
    organisation_score, organisation_answer,
) = ([] for _ in range(16))

#categories = [disease_score, disease_answer, sample_score, sample_answer, modality_score, modality_answer,
#              country_score, country_answer, hospital_score, hospital_answer, database_score, database_answer,
#              organisation_score, organisation_answer]


In [9]:
## start/end strings for scores and answers
# Text that immediately precedes / follows the score inside a stringified answer.
score_start, score_end = 'score=', ', context'

# Text that immediately precedes / follows the answer inside a stringified answer.
answer_start, answer_end = 'answer=', ', score'

# Usage, for a stringified answer s:
#   s.split(score_start)[1].split(score_end)[0]    -> score text
#   s.split(answer_start)[1].split(answer_end)[0]  -> answer text (still quoted)

In [10]:
def _between(text, start, end):
    """Return the part of `text` that follows the first `start` marker, cut at the next `end` marker.

    Returns None when `start` does not occur in `text`, instead of raising
    IndexError the way a bare ``text.split(start)[1]`` does.
    """
    pieces = text.split(start)
    if len(pieces) < 2:
        return None
    return pieces[1].split(end)[0]


# (question column in cleandf, answer list, score list) for every active question.
# To parse a further question column, add a tuple here.
_parse_targets = [
    ('what disease is being studied?', disease_answer, disease_score),
    ('how many patient data samples were included in this study?', sample_answer, sample_score),
    ('what modality of data is used in this study?', modality_answer, modality_score),
    ('What existing database did the data come from?', database_answer, database_score),
    ('What organisation did the data come from?', organisation_answer, organisation_score),
]

# Empty the target lists first so that re-running this cell does not append a
# second copy of every row (which would no longer match the length of cleandf).
for _, answers, scores in _parse_targets:
    answers.clear()
    scores.clear()

# Pull the score and the (still quoted) answer text out of every stringified
# answer. Cells without the markers — e.g. the text 'nan' left behind for rows
# that received no answer — yield None rather than stopping the loop.
for i, row in cleandf.iterrows():
    for column, answers, scores in _parse_targets:
        cell = row[column]
        scores.append(_between(cell, score_start, score_end))
        answers.append(_between(cell, answer_start, answer_end))

In [11]:
# Start the scored table from the identifying columns only; the parsed
# answer/score columns are added to it afterwards.
scoredf = cleandf[['pmid', 'title', 'abstract']].copy()

In [12]:
# Parsed results per topic, in the order their columns are added to scoredf.
# (The objective, country and hospital questions are currently disabled.)
parsed_columns = (
    ('disease', disease_answer, disease_score),
    ('sample', sample_answer, sample_score),
    ('modality', modality_answer, modality_score),
    ('database', database_answer, database_score),
    ('organisation', organisation_answer, organisation_score),
)

for topic, answers, scores in parsed_columns:
    answer_col = topic + '_answer'
    scoredf[answer_col] = answers
    # Drop the first and last character, i.e. the quotes that surround the
    # answer text in the stringified answer.
    scoredf[answer_col] = scoredf[answer_col].str[1:-1]
    scoredf[topic + '_score'] = scores

In [13]:
# Preview the first five rows of the scored table.
scoredf.head(5)

Unnamed: 0,pmid,title,abstract,disease_answer,disease_score,sample_answer,sample_score,modality_answer,modality_score,database_answer,database_score,organisation_answer,organisation_score
0,35309968,mTeeth: Identifying Brushing Teeth Surfaces Using Wrist-Worn Inertial Sensors.,Ensuring that all the teeth surfaces are adequately covered during daily bru...,oral diseases,0.3040244579315185,114,0.4147607386112213,inertial sensors,0.5593151301145554,wrist-worn inertial sensor dataset collected from the natural environment,0.1690878719091415,the natural environment,0.098945751786232
1,35330785,Development of a Machine Learning Algorithm for Prediction of Complications ...,Reverse total shoulder arthroplasty (rTSA) offers tremendous promise for the...,Reverse total shoulder arthroplasty,0.3776613026857376,2799,0.4943038523197174,Office of Statewide Health Planning and Development database,0.3635886907577514,Office of Statewide Health Planning and Development database,0.6894474029541016,Office of Statewide Health Planning and Development,0.5047770738601685
2,35330977,ECG Restitution Analysis and Machine Learning to Detect Paroxysmal Atrial Fi...,Atrial fibrillation is the most frequent arrhythmia in both equine and human...,paroxysmal atrial fibrillation,0.3911014795303345,control and horses with PAF,0.1106259003281593,normal sinus-rhythm ECGs,0.3160859942436218,normal sinus-rhythm ECGs,0.0714677050709724,horses with PAF,0.0040447239298373
3,35330920,Segmenting Thoracic Cavities with Neoplastic Lesions: A Head-to-head Benchma...,Automatic segmentation of thoracic cavity structures in computer tomography ...,neoplastic disease,0.4418141692876816,402,0.7768610417842865,CT images,0.5029252022504807,402 cancer patients,0.757216066122055,402 cancer patients,0.1936750710010528
4,30814403,Combined machine learning and functional magnetic resonance imaging allows i...,Hypoxia exposure during high-altitude expedition cause psychomotor impairmen...,psychomotor impairment,0.3473143577575683,69,0.7202786207199097,Rs-fMRI,0.3410738706588745,Shaanxi-Tibet immigrant cohort,0.850153923034668,Shaanxi-Tibet immigrant cohort,0.6297355592250824


In [14]:
# Write the table of parsed answers and scores to disk.
scoredf.to_csv('output/abstracts_scored.csv')