In [2]:
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
import torch
# Confirm a CUDA device is visible; the reader created later in this notebook
# is constructed with use_gpu=True.
torch.cuda.is_available()

True

In [4]:
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.pipelines import ExtractiveQAPipeline

#haystack contains a search system for retrieval and QA across documents.
#designed for large documents, but pipeline also works for single document QA 

INFO - haystack.document_stores.base -  Numba not found, replacing njit() with no-op implementation. Enable it with 'pip install numba'.
INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [5]:
# In-Memory Document Store
from haystack.document_stores import InMemoryDocumentStore

In [6]:
# Retrieve from store
from haystack.nodes import TfidfRetriever

# import file containing all abstracts

In [12]:
# Load the abstracts table; the first CSV column is used as the index.
basedf = pd.read_csv(filepath_or_buffer='data/decade_abstracts.csv', index_col=0)
basedf.head()

Unnamed: 0,pmid,doi,title,abstract
23767,35011970,10.3390/jcm11010229,An Application of Machine Learning That Uses the Magnetic Resonance Imaging ...,This retrospective single-center study included patients diagnosed with epit...
34412,34972109,10.1371/journal.pone.0261698,DAVS-NET: Dense Aggregation Vessel Segmentation Network for retinal vasculat...,"In this era, deep learning-based medical image analysis has become a reliabl..."
29263,34971557,10.1371/journal.pone.0260600,Optical coherence tomography for identification of malignant pulmonary nodul...,To explore the feasibility of using random forest (RF) machine learning algo...
26000,35005197,10.1002/dad2.12264,Deep learning improves utility of tau PET in the study of Alzheimer's disease.,Positron emission tomography (PET) imaging targeting neurofibrillary tau tan...
39158,34969093,10.1093/cercor/bhab474,Predicting Superagers by Machine Learning Classification Based on the Functi...,Superagers are defined as older adults who have youthful memory performance ...


In [21]:
# Load the methods table; the first CSV column is used as the index.
# NOTE(review): the saved output of this cell is a warning that points at this
# line -- presumably a pandas DtypeWarning about mixed column types. Confirm,
# and consider passing an explicit `dtype=` for the affected column(s).
methods = pd.read_csv('data/05_methods_combined.csv', index_col=0)

  methods = pd.read_csv('data/05_methods_combined.csv', index_col=0)


In [23]:
# Keep only the join key and the text column (stored under the name '0'),
# and give the text column a descriptive name.
methodsdf = methods.loc[:, ['pmid', '0']].rename(columns={'0': 'methods'})
methodsdf.head()

Unnamed: 0,pmid,methods
0,35005676,methods we enrolled patients from the university of washington healthcare sy...
1,34998166,"materials and methods in this paper, a novel tmsf-net is proposed to compens..."
2,34979435,materials and methods deep learning architectures are formed by a sequential...
3,35045383,dataset the cnn model used in this manuscript and was trained on the patch c...
4,34979213,methods this retrospective study was conducted under a hippa-compliant irb p...


In [24]:
# Left join on pmid: every abstract row is kept, and 'methods' is NaN where
# no methods text matched.
fulldf = pd.merge(basedf, methodsdf, how='left', on='pmid')

In [25]:
# Drop papers without methods text and renumber the rows 0..n-1; the QA loop
# later uses the row label to index the parallel Python lists.
fulldf_m = fulldf.dropna(subset=['methods']).reset_index(drop=True)

In [26]:
# Number of papers that have a methods section
fulldf_m.shape[0]

27252

In [27]:
# Parallel per-paper lists, all in fulldf_m row order.
methods_list = fulldf_m['methods'].to_list()
pmid_list = fulldf_m['pmid'].to_list()

# Placeholder document names: the row position of each methods text.
title_list = [*range(len(methods_list))]

In [28]:
# Preview the merged table (abstract columns plus the 'methods' text).
fulldf_m.head()

Unnamed: 0,pmid,doi,title,abstract,methods
0,35011970,10.3390/jcm11010229,An Application of Machine Learning That Uses the Magnetic Resonance Imaging ...,This retrospective single-center study included patients diagnosed with epit...,"2. methods this retrospective, single-centre study included patients who und..."
1,34972109,10.1371/journal.pone.0261698,DAVS-NET: Dense Aggregation Vessel Segmentation Network for retinal vasculat...,"In this era, deep learning-based medical image analysis has become a reliabl...","3 proposed methodology in this work, we propose davs-net architecture for ro..."
2,34971557,10.1371/journal.pone.0260600,Optical coherence tomography for identification of malignant pulmonary nodul...,To explore the feasibility of using random forest (RF) machine learning algo...,methods to explore the feasibility of using random forest (rf) machine learn...
3,35005197,10.1002/dad2.12264,Deep learning improves utility of tau PET in the study of Alzheimer's disease.,Positron emission tomography (PET) imaging targeting neurofibrillary tau tan...,methods research in context 1. systematic review: the authors reviewed the l...
4,34969093,10.1093/cercor/bhab474,Predicting Superagers by Machine Learning Classification Based on the Functi...,Superagers are defined as older adults who have youthful memory performance ...,['materials and methods community-dwelling volunteers aged 60\xa0years or ol...


In [29]:
# Inspect one raw methods entry. The saved output shows that this entry is a
# stringified Python list (it starts with "['") rather than plain text.
methods_list[4]

"['materials and methods community-dwelling volunteers aged 60\\xa0years or older were recruited from the gangseo center for dementia or yangcheon dementia center, two public facilities for dementia prevention in seoul. from march to august in 2018, a total of 134 older adults agreed to participate in this study. a neurologist evaluated eligibility using the following inclusion criteria: aged 60\\xa0years or older, able to read and write, scored >\\u2009\\u20091.5 standard deviation (sd) of the mean of age and education-matched norm on the korean version of mini-mental state examination (k-mmse) ( ), and with typical cognitive function defined as scoring higher than 1 sd (16th percentile) of the demographically matched norm on the tests of memory, attention, language, and visuospatial and frontal executive functions in the seoul neuropsychological screening battery-ii (snsb-ii) ( ). we excluded individuals with any of the following characteristics: 1) suspected or diagnosed with mild c

In [None]:
# The two counts should match: one methods text per row of fulldf_m.
for collection in (methods_list, fulldf_m):
    print(len(collection))

In [None]:
#test match
#n = 9
#
#test = methods_list[n]
#pmid = pmid_list[n]
#test_dict = {'content': test, 'meta': {'name': pmid}}
#
#test_dict

# set up question bank

In [None]:
# Question bank. Only the uncommented questions are asked; the commented-out
# ones are earlier candidates kept for reference.
# NOTE: each question string is also used verbatim as a column name in the
# results table and is looked up by its exact text in the parsing loop further
# down, so editing a question here requires the same edit there.
#q1="what disease is being studied?"
#q2="What is the objective of the study?"
###
q3="how many patient data samples were included in this study?"
#q4="what modality of data is used in this study?"
###
#q5="what country was the study conducted in?"
#q6="what hospital did the data come from?"
q7="What existing data source did the data come from?"
q8="What location did the data come from?"
###
#q10="how does the model perform relative to a human?"
#q11="how does the model perform in prospective testing"
#q12="what were the results of the study?"
#q12="what was the area under the curve (AUC) value?"
# Questions actually run by the pipeline, in output-column order.
qlist = [q3, q7, q8]
#qlist = [q1, q2, q3, q4, q5, q6, q7, q8]

# initialising pre-processor module

In [None]:
# In-memory store; the QA loop empties and refills it for every paper.
document_store = InMemoryDocumentStore()

# Cleaning and chunking options for the methods text: split on words into
# pieces of up to 2000, without cutting through sentences.
preprocessor_settings = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=2000,
    split_respect_sentence_boundary=True,
)
preprocessor = PreProcessor(**preprocessor_settings)

# hugging face model

In [None]:
# Reader scans text returned by retriever and extracts k-best answers
# Load a fine-tuned  model (e.g. RoBERTa QA = "deepset/roberta-base-squad2")
# alternatives (Reader): TransformersReader (leveraging the pipeline of the Transformers package)
# alternatives (Models): e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
# can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible"
# alternatively, QA models on model hub (https://huggingface.co/models)
#sota: ahotrod/albert_xxlargev1_squad2_512
#dmis-lab/biobert-large-cased-v1.1-squad
#
#reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, use_confidence_scores=True)
#sets pipeline to contain retriever and reader
#pipe = ExtractiveQAPipeline(reader, retriever)

# question answer pipeline

In [None]:
# Results table: the identifying columns from fulldf_m plus one empty,
# object-typed column per question (the question text is the column name).
resultsdf = fulldf_m.loc[:, ['pmid', 'title', 'methods']].copy()
resultsdf = resultsdf.assign(
    **{question: pd.Series(dtype='object') for question in qlist}
)

print(len(resultsdf))
resultsdf.head()

In [None]:
# Work on a copy so resultsdf keeps its empty answer columns.
# For a quick trial run, slice first instead, e.g. resultsdf[0:50].
qa_df = resultsdf.copy(deep=True)

In [None]:
# Reader: model loaded from the local "pubmed_tuned_methods" path/name, on GPU,
# reporting confidence scores.
tuned_reader = FARMReader(
    model_name_or_path="pubmed_tuned_methods",
    use_gpu=True,
    use_confidence_scores=True,
)

# Retriever over the (currently empty) document store, chained with the reader.
retriever = TfidfRetriever(document_store=document_store)
tuned_pipe = ExtractiveQAPipeline(reader=tuned_reader, retriever=retriever)

In [None]:
# Ask every question in `qlist` about each paper's methods text, one paper at
# a time, and store the top answer per question in the matching row of qa_df.
#
# A row is written only when every question produced an answer; otherwise the
# row keeps its empty answer columns and the failure is recorded in
# `failed_rows`. Only `Exception` is caught, so interrupting the kernel
# (KeyboardInterrupt) stops the loop instead of being swallowed.
start = time.time()

# Identical for every query, so build it once.
run_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}

# (row label, error description) for each paper that could not be processed.
failed_rows = []

for i, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
    try:
        # Wrap the methods text as a document dict and clean/split it.
        methods_dict = {'content': methods_list[i], 'meta': {'name': title_list[i]}}
        docs_proc = preprocessor.process(methods_dict)

        # The store only ever holds the current paper: drop the previous
        # paper's chunks before writing the new ones.
        document_store.delete_documents()
        document_store.write_documents(docs_proc)

        # Rebuild retriever and pipeline for the new store contents
        # (presumably so the TF-IDF index reflects the new documents --
        # confirm against the haystack version in use).
        retriever = TfidfRetriever(document_store=document_store)
        tuned_pipe = ExtractiveQAPipeline(tuned_reader, retriever)

        # Row values in qa_df column order: identifiers first, then one top
        # answer per question.
        temp_list = [row['pmid'], row['title'], row['methods']]
        for question in qlist:
            prediction = tuned_pipe.run(query=question, params=run_params)
            answers = prediction['answers']
            if not answers:
                raise LookupError(f"no answer returned for question: {question!r}")
            temp_list.append(answers[0])

        qa_df.loc[i] = temp_list

    except Exception as err:
        # Best effort: keep going, but remember what failed and why.
        failed_rows.append((i, repr(err)))

#measure time
end = time.time()
print(end - start)

if failed_rows:
    print(f"{len(failed_rows)} of {len(qa_df)} rows could not be processed; "
          f"first few: {failed_rows[:5]}")

In [None]:
# Persist the raw QA results right after the long-running loop. Create the
# target directory first so a missing folder cannot cost the whole run.
Path('output').mkdir(parents=True, exist_ok=True)
qa_df.to_csv('output/methods_interim.csv')

# clean data

In [None]:
# Stringify every cell so the answer objects can be parsed with plain string
# operations. applymap returns a new frame, so qa_df itself is left untouched.
cleandf = qa_df.applymap(str)
cleandf

In [None]:
## create lists for answer/score pairs
disease_score = []
disease_answer = []
question_score = []
question_answer = []
sample_score = []
sample_answer = []
modality_score = []
modality_answer = []
country_score = []
country_answer = []
hospital_score = []
hospital_answer = []
database_score = []
database_answer = []
organisation_score = []
organisation_answer = []

#categories = [disease_score, disease_answer, sample_score, sample_answer, modality_score, modality_answer,
#              country_score, country_answer, hospital_score, hospital_answer, database_score, database_answer,
#              organisation_score, organisation_answer]


In [None]:
# Spot-check cell: one stringified answer (row 34, sixth column) used to try
# out the marker strings below.
s = cleandf.iloc[34, 5]

# Marker strings that bracket the score value in a stringified answer.
score_start = 'score='
score_end = ', context'
score_text = s.split(score_start)[1].split(score_end)[0]
print(score_text)

# Marker strings that bracket the answer text in a stringified answer.
answer_start = 'answer='
answer_end = ', score'
answer_text = s.split(answer_start)[1].split(answer_end)[0]
print(answer_text)

In [None]:
def _between(text, start_marker, end_marker):
    """Return the part of `text` after the first `start_marker` and before the next `end_marker`.

    If `end_marker` does not follow, everything after `start_marker` is returned.
    Raises IndexError when `start_marker` does not occur in `text`.
    """
    return text.split(start_marker)[1].split(end_marker)[0]


# (question column in cleandf, answer accumulator, score accumulator).
# To parse another question, uncomment its entry here (and ask it in qlist).
parse_targets = [
    #('what disease is being studied?', disease_answer, disease_score),
    ('how many patient data samples were included in this study?', sample_answer, sample_score),
    #('What is the objective of the study?', question_answer, question_score),
    #('what modality of data is used in this study?', modality_answer, modality_score),
    #('what country was the study conducted in?', country_answer, country_score),
    #('what hospital did the data come from?', hospital_answer, hospital_score),
    ("What existing data source did the data come from?", database_answer, database_score),
    ("What location did the data come from?", organisation_answer, organisation_score),
]

# Start from empty accumulators so re-running this cell does not append a
# second copy of every row.
for _, answer_values, score_values in parse_targets:
    answer_values.clear()
    score_values.clear()

for i, row in tqdm(cleandf.iterrows(), total=len(cleandf)):
    try:
        # Parse the whole row before touching any accumulator, so a failure
        # part-way through cannot leave the lists with different lengths.
        parsed = [
            (_between(row[column], answer_start, answer_end),
             _between(row[column], score_start, score_end))
            for column, _, _ in parse_targets
        ]
    except (IndexError, AttributeError, TypeError):
        # The cell does not look like a stringified answer (e.g. the 'nan' of
        # an unanswered row): mark every field of this row as missing.
        # A missing column (KeyError) is deliberately not caught -- it means
        # parse_targets and the table's columns disagree.
        parsed = [('nan', 'nan')] * len(parse_targets)

    for (_, answer_values, score_values), (answer_text, score_text) in zip(parse_targets, parsed):
        answer_values.append(answer_text)
        score_values.append(score_text)

In [None]:
# Start the scored table from the identifying columns only.
scoredf = cleandf.loc[:, ['pmid', 'title', 'methods']].copy()

In [None]:
# (column prefix, parsed answers, parsed scores), in output-column order.
# To add another question's columns, uncomment its entry.
score_columns = [
    #('disease', disease_answer, disease_score),
    ('sample', sample_answer, sample_score),
    #('question', question_answer, question_score),
    #('modality', modality_answer, modality_score),
    #('country', country_answer, country_score),
    #('hospital', hospital_answer, hospital_score),
    ('database', database_answer, database_score),
    ('organisation', organisation_answer, organisation_score),
]

for prefix, answers, scores in score_columns:
    answer_col = f'{prefix}_answer'
    scoredf[answer_col] = answers
    # Trim one character from each end of the parsed answer (presumably the
    # quote characters of the stringified answer -- confirm), but leave the
    # 'nan' placeholder of unparsed rows intact: slicing it would turn it
    # into the bogus answer 'a'.
    is_placeholder = scoredf[answer_col] == 'nan'
    scoredf[answer_col] = scoredf[answer_col].where(
        is_placeholder, scoredf[answer_col].str[1:-1]
    )
    scoredf[f'{prefix}_score'] = scores

In [None]:
# Preview the scored table: identifiers plus an answer/score pair per question.
scoredf.head(5)

In [None]:
# Write the final answer/score table; create the target directory first so
# the export cannot fail on a missing folder.
Path('output').mkdir(parents=True, exist_ok=True)
scoredf.to_csv('output/methods_scored.csv')