In [None]:
#!pip install transformers

In [88]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import torch
from transformers import BertTokenizer
from transformers import BertForQuestionAnswering
#from question_answering import load_pretrained_qa_model
#from question_answering import answer_question

In [83]:
#path information
task='task1'# or 'task2' # specify task
#root_path='/repo1/code/autoreview/'
root_path='./'
data_path=root_path+'data/'+task+'/'
save_path=root_path+'results/'+task+'/'
submission_path='./'
ranking_file_name='round_2_ab.pkl' #'ranking.tsv'
question_file_name='questions_structured.csv'
literature_file_name='metadata_hypercoagulable.tsv'
answer_confidence_threshold=1
top_k=300 # top_k articles

In [3]:
def load_pretrained_qa_model(model_str=None, use_cuda=True):
    if model_str is None:
        model_str = 'bert-large-uncased-whole-word-masking-finetuned-squad'
        device = torch.device('cuda' if torch.cuda.is_available() and use_cuda else 'cpu')
    
    tokenizer = BertTokenizer.from_pretrained(model_str)
    model = BertForQuestionAnswering.from_pretrained(model_str).to(device)

    model.eval()
    return tokenizer, model

def answer_question(question, document, model, tokenizer):
    device = model.device
    
    encoded = tokenizer.encode_plus(question, document, return_tensors='pt', max_length=512)
    start_scores, end_scores = model(encoded['input_ids'].to(device),
                                     token_type_ids=encoded['token_type_ids'].to(device))

    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'].squeeze())
    ans_start, ans_end = torch.argmax(start_scores), torch.argmax(end_scores)
    
    ans_tokens = tokens[ans_start: ans_end+1]
    if '[SEP]' in ans_tokens:
        ans_tokens = ans_tokens[ans_tokens.index('[SEP]')+1:]
    ans = tokenizer.convert_tokens_to_string(ans_tokens)
    ans = ans.replace(' - ', '-').replace('[CLS]', '')
    ans_score = start_scores.max() + end_scores.max()

    return ans, ans_score.item()

def ask_all_questions(abstract, ncord_uid):
    answers = []
    for question in questions['question'].values:
        ans, score= answer_question(question, abstract, model=model, tokenizer=tokenizer)
        if ans !='':
            answers.append((ncord_uid, question, ans, score))
    if len(answers) == 0:
        return None
    return answers

Load QA models

In [4]:
tokenizer, model = load_pretrained_qa_model()

Prepare ranked articles

In [7]:
ranking=pickle.load(open(save_path+ranking_file_name, 'rb'))
#ranking=ranking.drop(columns=['Unnamed: 0'],axis=1).reset_index(drop=True)
ranking=ranking.reset_index(drop=True)
ranking=ranking.loc[ranking['label']==1]
ranking_top_k=ranking.iloc[:top_k,:]

Ask questions and get answers

In [8]:
questions=pd.read_csv(data_path+question_file_name, header=None, names=['type','question'], )

In [32]:
_answers_all=ranking_top_k.apply(lambda row: ask_all_questions(row['sentence'], row['ncord_uid']),axis=1)


TypeError: 'NoneType' object is not iterable

In [39]:
answers_all = [item for list in _answers_all.dropna().values for item in list]
answers_all=pd.DataFrame(answers_all, columns=['ncord_uid', 'question', 'answer', 'score'])
answers_all=answers_all.loc[answers_all['score']>answer_confidence_threshold]
answers_all_pivot=answers_all.pivot(index='ncord_uid',columns='question', values='answer')

In [None]:
index_category_question=pd.MultiIndex.from_frame(questions, names=['category','question'])
answers_multi_index=answers_all_pivot.set_axis(index_category_question, axis=1)

Load article's meta file

In [49]:
literature=pd.read_csv(data_path+literature_file_name, sep='\t', index_col='ncord_uid').drop(columns='Unnamed: 0')

In [80]:
metadata=['published','journal', 'title', 'abstract','cord_uid']
idx=[('meta', col) for col in metadata]
idx=pd.MultiIndex.from_tuples(idx)
literature=literature[metadata].set_axis(idx, axis=1)

In [82]:
summary=pd.merge(answers_multi_index, literature, how='left',left_index=True, right_index=True )


In [84]:
summary

category,study type,study type,study type,study type,study type,study type,study type,Therapeutic method,Therapeutic method,Therapeutic method,...,Endpoint,Endpoint,Endpoint,Endpoint,Endpoint,meta,meta,meta,meta,meta
question,Was it a Meta-Analysis study?,Was it a Systematic Review study?,Was it a Randomized Controlled Trial?,Was it a Cohort Study?,Was it a Case-control Study?,Was it a Cross-sectional study?,Was it a Case Reports and Series study?,What therapeutic method was utilized?,What was the treatment of thrombosis?,What was the effect of heparin?,...,What drug is recommended for the patients with mild symptoms to prevent thrombotic implication?,What drug is recommend for non-ICU patient at hospital?,What drug or treatment strategy is recommended for ICU patient?,What is the best drug/treatment with patients who have a stroke background ?,What is the best drug/treatment with patients who have background of coagulation?,published,journal,title,abstract,cord_uid
ncord_uid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
12,,,,,,81,,,,,...,,,81,,,2020-01-01 00:00:00.000000000,J. thromb. haemost,Inaccurate conclusions by Tang and colleagues,I read with interest the study by Tang and col...,1m6ipibb
66,,,,,,,,,,,...,,,,,,2010-01-01 00:00:00.000000000,Acta obstetricia et gynecologica Scandinavica,Treatment of concomitant prolapse and stress u...,OBJECTIVE Evaluate the efficacy of a transobtu...,45g493vr
161,,,,,,,,,,,...,,,,,,2013-01-01 00:00:00.000000000,"Surgical laparoscopy, endoscopy & percutaneous...",A prospective randomized comparison of single-...,This prospective randomized study aimed to ev...,rtzqaoib
225,,,the epidemiological history and clinical chara...,,,the epidemiological history and clinical chara...,,,,,...,,the other patients ' condition was effectively...,7,,,2018-01-01 00:00:00.000000000,BJU international,Contemporary management of adult-acquired buri...,OBJECTIVES To describe our buried penis repair...,7h8dmzwf
372,,"importance in december 2019 , a novel coronavi...",,,"importance in december 2019 , a novel coronavi...","importance in december 2019 , a novel coronavi...",,,retrospective case series of 1591 consecutive ...,,...,,,"importance in december 2019 , a novel coronavi...","importance in december 2019 , a novel coronavi...",,2016-01-01 00:00:00.000000000,Arquivos brasileiros de cirurgia digestiva : A...,LAPAROSCOPIC RESECTION OF GASTROINTESTINAL STR...,Gastrointestinal mesenchymal or stromal tumor...,glxtxywf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,,,,"12 , 331 patients were referred to the emergen...",,56 more patients isolated with suspected / pot...,,,,,...,,the emergency department in trondheim has prep...,,,,,,,,
8467,,,,,5-week,,,deep venous thrombosis,,,...,deep venous,,objective to evaluate inpatient outcomes among...,,,,,,,
8575,,,,,,,,,,,...,,,,,,,,,,
8650,,,,,,,,,,,...,,,,,,,,,,


In [92]:
summary.to_csv(submission_path+'summary.csv')

ImportError: cannot import name 'UnicodeWriter' from 'pandas.io.common' (/opt/conda/lib/python3.7/site-packages/pandas/io/common.py)

In [91]:
pickle.dump(summary, open('summary.pkl', 'wb'))