In [None]:
#!pip install transformers

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import torch
from transformers import BertTokenizer
from transformers import BertForQuestionAnswering
#from question_answering import load_pretrained_qa_model
#from question_answering import answer_question

In [2]:
#path information
task='task1'# or 'task2' # specify task
#root_path='/repo1/code/autoreview/'
root_path='./'
data_path=root_path+'data/'+task+'/'
save_path=root_path+'results/'+task+'/'
submission_path='./'
ranking_file_name='pseudo_label.pkl' #'ranking.tsv'
question_file_name='questions_structured.csv'
literature_file_name='metadata_hypercoagulable.tsv'
answer_confidence_threshold=1
top_k=300 # top_k articles
col='abstract'

In [3]:
def load_pretrained_qa_model(model_str=None, use_cuda=True):
    if model_str is None:
        model_str = 'bert-large-uncased-whole-word-masking-finetuned-squad'
        device = torch.device('cuda' if torch.cuda.is_available() and use_cuda else 'cpu')
    
    tokenizer = BertTokenizer.from_pretrained(model_str)
    model = BertForQuestionAnswering.from_pretrained(model_str).to(device)

    model.eval()
    return tokenizer, model

def answer_question(question, document, model, tokenizer):
    device = model.device
    
    encoded = tokenizer.encode_plus(question, document, return_tensors='pt', max_length=512)
    start_scores, end_scores = model(encoded['input_ids'].to(device),
                                     token_type_ids=encoded['token_type_ids'].to(device))

    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'].squeeze())
    ans_start, ans_end = torch.argmax(start_scores), torch.argmax(end_scores)
    
    ans_tokens = tokens[ans_start: ans_end+1]
    if '[SEP]' in ans_tokens:
        ans_tokens = ans_tokens[ans_tokens.index('[SEP]')+1:]
    ans = tokenizer.convert_tokens_to_string(ans_tokens)
    ans = ans.replace(' - ', '-').replace('[CLS]', '')
    ans_score = start_scores.max() + end_scores.max()

    return ans, ans_score.item()

def ask_all_questions(abstract, ncord_uid):
    answers = []
    for question in questions['question'].values:
        ans, score= answer_question(question, abstract, model=model, tokenizer=tokenizer)
        if ans !='':
            answers.append((ncord_uid, question, ans, score))
    if len(answers) == 0:
        return None
    return answers

Load QA models

In [5]:
tokenizer, model = load_pretrained_qa_model()

Prepare ranked articles

In [4]:
ranking=pickle.load(open(save_path+ranking_file_name, 'rb'))
#ranking=ranking.drop(columns=['Unnamed: 0'],axis=1).reset_index(drop=True)
ranking=ranking.reset_index(drop=True)
ranking=ranking.loc[ranking['label']==1]
ranking_top_k=ranking.iloc[:top_k,:]

Ask questions and get answers

In [6]:
questions=pd.read_csv(data_path+question_file_name, header=None, names=['type','question'], )

In [7]:
_answers_all=ranking_top_k.apply(lambda row: ask_all_questions(row[col], row['ncord_uid']),axis=1)


In [8]:
answers_all = [item for list in _answers_all.dropna().values for item in list]
answers_all=pd.DataFrame(answers_all, columns=['ncord_uid', 'question', 'answer', 'score'])
answers_all=answers_all.loc[answers_all['score']>answer_confidence_threshold]
answers_all_pivot=answers_all.pivot(index='ncord_uid',columns='question', values='answer')

In [9]:
index_category_question=pd.MultiIndex.from_frame(questions, names=['category','question'])
answers_all_pivot.set_axis(index_category_question, axis=1)

  


Load article's meta file

In [10]:
literature=pd.read_csv(data_path+literature_file_name, sep='\t', index_col='ncord_uid').drop(columns='Unnamed: 0')

In [11]:
metadata=['published','journal', 'title', 'abstract','cord_uid']
idx=[('meta', col) for col in metadata]
idx=pd.MultiIndex.from_tuples(idx)
literature_multiindex=literature[metadata]
literature_multiindex.set_axis(idx, axis=1)

  """


Merge the meta information with extracted information

In [12]:
summary=pd.merge(answers_all_pivot, literature_multiindex, how='left',left_index=True, right_index=True )

In [15]:
summary.to_csv(save_path+'summary.csv')

In [13]:
summary

category,study type,study type,study type,study type,Therapeutic method,Therapeutic method,Therapeutic method,Therapeutic method,Therapeutic method,Therapeutic method,...,Endpoint,Endpoint,Endpoint,Endpoint,Endpoint,meta,meta,meta,meta,meta
question,What was the type of the study?,"Was it a Prospective, Retrospective Studies?",Was it a Randomized Controlled Trial?,Was it a case control or Cohort Study?,What therapeutic method was utilized?,"Was the patient on antiviral, antiInflammatory , antibactrial treatment?",What was the treatment of thrombosis?,Where any anticoagulation used?,What was the dose of the treatment?,What were all the interventions?,...,Which Covid-19 patients are at a higher risk of developing thrombosis risk?,What drug is recommended for the patients with mild symptoms to prevent thrombotic implication?,What drug is recommend for patient at hospital?,What is the best drug/treatment with patients who have a stroke background ?,What is the best drug/treatment with patients who have background of coagulation?,published,journal,title,abstract,cord_uid
ncord_uid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,.,,. this prospective cohort,,,,...,,,,,,2015-01-01 00:00:00.000000000,Stroke,Early Recurrence and Cerebral Bleeding in Pati...,AND PURPOSE The best time for administering a...,7e4niv98
94,"objective to provide an update to the "" surviv...",,,,,,,,,,...,,"objective to provide an update to the "" surviv...",,,,2013-01-01 00:00:00.000000000,Critical care medicine,Surviving sepsis campaign: international guide...,"OBJECTIVE To provide an update to the ""Survivi...",1uwyrst6
259,,,,,,,,,,,...,,,4133,,,2010-01-01 00:00:00.000000000,Journal of clinical gastroenterology,Hemorrhagic complications of percutaneous radi...,Although radiofrequency ablation (RFA) is wid...,o692zuj6
297,,,,,this was a retrospective multicenter study,,retrospective multicenter study,,,,...,,,,,,2017-01-01 00:00:00.000000000,AJNR. American journal of neuroradiology,Embolization of Intracranial Dural Arterioveno...,AND PURPOSE The introduction of liquid emboli...,mqg4pp8e
314,,,,fifty-two,prospective observational study,,,,patients admitted to critical care following c...,,...,"acetylsalicylic acid [ asa ] , low molecular w...",,,,,2020-03-24 00:00:00.000000000,Cardiology journal,Filter life span in postoperative cardiovascul...,Regional citrate anticoagulation (RCA) is the...,stmu1j0j
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7600,,,,117,"a prospective , multicentre series was conduct...",,,,,,...,,,117,,,2013-01-01 00:00:00.000000000,European radiology,Endovascular treatment of brain arteriovenous ...,OBJECTIVES To evaluate the safety and efficacy...,981lsckw
7602,199,,,,,"randomized , open-label , blinded end point tr...",importance although retrievable inferior vena ...,,,,...,,,,,,2015-01-01 00:00:00.000000000,JAMA,Effect of a retrievable inferior vena cava fil...,IMPORTANCE Although retrievable inferior vena ...,otd3j630
7655,,,,,prospective parallel-group randomized study,,,,,,...,,differences between the treatment group and th...,,,,2019-01-01 00:00:00.000000000,Journal of thrombosis and haemostasis : JTH,Intraoperative infusion of noradrenaline impro...,Essentials Strategies to improve platelet func...,3ozef29x
7713,,,,2162,prospective assessment of dual antiplatelet th...,,,,,,...,,,,,,2015-01-01,Circulation. Cardiovascular interventions,"Proton Pump Inhibitors, Platelet Reactivity, a...",Certain proton pump inhibitors (PPIs) interfe...,0uugfnn2
