In [None]:
#!pip install transformers

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import torch
from transformers import BertTokenizer
from transformers import BertForQuestionAnswering
#from question_answering import load_pretrained_qa_model
#from question_answering import answer_question

In [13]:
# --- Run configuration -------------------------------------------------------
# Which task's data/results folders to use ('task1' or 'task2').
task = 'task1'

# Project root; every data/result path below is derived from it.
# (A previous setup used '/repo1/code/autoreview/' as the root.)
root_path = './'
data_path = f'{root_path}data/{task}/'
save_path = f'{root_path}results/{task}/'
submission_path = './'

# Input files. The ranking can also be supplied as 'ranking.tsv'.
ranking_file_name = 'pseudo_label.pkl'
question_file_name = 'questions_structured.csv'
literature_file_name = 'metadata_hypercoagulable.tsv'

# Answers whose score is not strictly above this value are discarded.
answer_confidence_threshold = 1
# Number of top-ranked articles that are queried.
top_k = 300
# Article column whose text is fed to the QA model.
col = 'abstract'

In [3]:
def load_pretrained_qa_model(model_str=None, use_cuda=True):
    """Load a BERT question-answering model and its tokenizer.

    Args:
        model_str: Name or path passed to ``from_pretrained``. When None, the
            SQuAD-finetuned whole-word-masking BERT-large checkpoint is used.
        use_cuda: If True and CUDA is available the model is moved to the GPU;
            otherwise it stays on the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is already in eval mode.
    """
    if model_str is None:
        model_str = 'bert-large-uncased-whole-word-masking-finetuned-squad'

    # The device must be resolved for every call, not only when the default
    # checkpoint is used; otherwise a custom model_str leaves `device` unbound.
    device = torch.device('cuda' if torch.cuda.is_available() and use_cuda else 'cpu')

    tokenizer = BertTokenizer.from_pretrained(model_str)
    model = BertForQuestionAnswering.from_pretrained(model_str).to(device)

    # Inference only: switch off dropout and other training-time behaviour.
    model.eval()
    return tokenizer, model

def answer_question(question, document, model, tokenizer):
    """Extract the most likely answer span for `question` from `document`.

    Args:
        question: Question text.
        document: Text to search for the answer (e.g. an abstract).
        model: Question-answering model exposing ``.device`` and returning
            start/end logits as its first two outputs.
        tokenizer: Tokenizer matching `model`.

    Returns:
        ``(answer, score)`` where `answer` is the decoded span (possibly an
        empty string) and `score` is the sum of the highest start logit and
        the highest end logit, as a Python float.
    """
    device = model.device

    # Request truncation explicitly so the 512-token cap is actually enforced
    # instead of relying on the tokenizer's fallback behaviour.
    encoded = tokenizer.encode_plus(question, document, return_tensors='pt',
                                    max_length=512, truncation=True)
    input_ids = encoded['input_ids']

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(input_ids.to(device),
                        token_type_ids=encoded['token_type_ids'].to(device))
    # Positional access works for both plain tuples and dict-like model
    # outputs; tuple-unpacking a dict-like output would yield its key names.
    start_scores, end_scores = outputs[0], outputs[1]

    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).tolist())
    ans_start = int(torch.argmax(start_scores))
    ans_end = int(torch.argmax(end_scores))

    # An end index before the start index yields an empty span (empty answer).
    ans_tokens = tokens[ans_start: ans_end + 1]
    if '[SEP]' in ans_tokens:
        # Keep only what follows the first separator inside the span.
        ans_tokens = ans_tokens[ans_tokens.index('[SEP]') + 1:]
    ans = tokenizer.convert_tokens_to_string(ans_tokens)
    # Strip so removing '[CLS]' does not leave stray leading whitespace.
    ans = ans.replace(' - ', '-').replace('[CLS]', '').strip()
    ans_score = start_scores.max() + end_scores.max()

    return ans, ans_score.item()

def ask_all_questions(abstract, ncord_uid):
    """Ask every configured question against one article's text.

    Relies on the notebook-level `questions` table and on the loaded
    `model` / `tokenizer`.

    Args:
        abstract: Article text to query.
        ncord_uid: Identifier of the article, copied into every result row.

    Returns:
        A list of ``(ncord_uid, question, answer, score)`` tuples for the
        questions that produced a non-empty answer, or None when there is
        nothing to report (no usable text, or no non-empty answer).
    """
    # Missing text (NaN/None coming from the DataFrame) or blank text cannot
    # be tokenized meaningfully; skip the article instead of failing mid-run.
    if not isinstance(abstract, str) or not abstract.strip():
        return None

    answers = []
    for question in questions['question'].values:
        ans, score = answer_question(question, abstract, model=model, tokenizer=tokenizer)
        if ans != '':
            answers.append((ncord_uid, question, ans, score))

    # Callers drop None entries, so report "no answers" as None, not [].
    return answers or None

Load the QA model and tokenizer

In [4]:
# Load the default QA checkpoint once; `tokenizer` and `model` are then used
# as notebook-level globals by ask_all_questions.
tokenizer, model = load_pretrained_qa_model()

Prepare ranked articles

In [5]:
# Load the ranked articles (expected to be a pandas DataFrame with a 'label'
# column). The context manager guarantees the file handle is closed.
# SECURITY NOTE: unpickling can execute arbitrary code; only load ranking
# files produced by a trusted pipeline.
with open(save_path + ranking_file_name, 'rb') as ranking_file:
    ranking = pickle.load(ranking_file)

ranking = ranking.reset_index(drop=True)
# Keep only the rows labelled 1, then take the first `top_k` of them in the
# order they appear in the ranking.
ranking = ranking.loc[ranking['label'] == 1]
ranking_top_k = ranking.iloc[:top_k, :]

Ask questions and get answers

In [10]:
# The questions file has no header row: first column is the question type,
# second column is the question text.
questions = pd.read_csv(
    data_path + question_file_name,
    header=None,
    names=['type', 'question'],
)

In [14]:
# Run every question against each selected article (one row at a time).
# Each element is a list of answer tuples, or None for articles without any.
_answers_all = ranking_top_k.apply(
    lambda article: ask_all_questions(article[col], article['ncord_uid']),
    axis=1,
)


In [15]:
# Flatten the per-article answer lists (articles without answers are dropped)
# into one long table with a row per (article, question) answer.
answers_all = pd.DataFrame(
    [record
     for article_answers in _answers_all.dropna().values
     for record in article_answers],
    columns=['ncord_uid', 'question', 'answer', 'score'],
)

# Keep only answers scoring strictly above the confidence threshold.
is_confident = answers_all['score'] > answer_confidence_threshold
answers_all = answers_all.loc[is_confident]

# Wide layout: one row per article, one column per question.
answers_all_pivot = answers_all.pivot(
    index='ncord_uid',
    columns='question',
    values='answer',
)

In [16]:
# Build the (category, question) column labels from the pivot's ACTUAL
# columns. The pivot only contains questions that received an answer above
# the threshold, in pivot order, so labelling positionally from the questions
# file could raise a length mismatch or attach the wrong label to a column.
_category_by_question = (
    questions.drop_duplicates('question').set_index('question')['type']
)
index_category_question = pd.MultiIndex.from_arrays(
    [
        _category_by_question.reindex(answers_all_pivot.columns).to_numpy(),
        answers_all_pivot.columns.to_numpy(),
    ],
    names=['category', 'question'],
)

# NOTE(review): no `inplace` argument is passed and the result is not
# assigned, so on current pandas this only displays a relabelled copy;
# `answers_all_pivot` itself keeps its flat question columns. If the
# two-level header is meant to persist, assign it here and for the metadata
# frame together, because the later merge needs matching column depths.
answers_all_pivot.set_axis(index_category_question, axis=1)

  


Load the articles' metadata file

In [30]:
# Article metadata: tab-separated, indexed by 'ncord_uid'. The stray
# 'Unnamed: 0' column stored in the file is dropped.
literature = pd.read_csv(
    data_path + literature_file_name,
    sep='\t',
    index_col='ncord_uid',
)
literature = literature.drop(columns='Unnamed: 0')

In [31]:
# Metadata columns carried into the summary, grouped under a 'meta' header.
metadata = ['published', 'journal', 'title', 'abstract', 'cord_uid']
idx = pd.MultiIndex.from_tuples([('meta', field) for field in metadata])

literature_multiindex = literature[metadata]

# NOTE(review): no `inplace` argument is passed and the result is not
# assigned, so on current pandas this only displays the relabelled view;
# `literature_multiindex` keeps its flat column names.
literature_multiindex.set_axis(idx, axis=1)

  """


Merge the meta information with extracted information

In [32]:
# Left join on the index of both frames: every answered article is kept and
# gains its metadata columns where the identifier matches.
summary = answers_all_pivot.merge(
    literature_multiindex,
    how='left',
    left_index=True,
    right_index=True,
)

In [34]:
# Write the merged answers and metadata table to the submission folder.
summary.to_csv(f'{submission_path}summary.csv')