In [None]:
#!pip install transformers

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import torch
from transformers import BertTokenizer
from transformers import BertForQuestionAnswering
#from question_answering import load_pretrained_qa_model
#from question_answering import answer_question

In [6]:
# --- Paths and run configuration ---
# Task whose data/ and results/ sub-folders are used: 'task1' or 'task2'.
task = 'task1'
# Project root from which the other locations are derived
# (alternative used previously: '/repo1/code/autoreview/').
root_path = './'
data_path = f'{root_path}data/{task}/'
save_path = f'{root_path}results/{task}/'
# Pickled ranking table, loaded from save_path (alternative: 'ranking.tsv').
ranking_file_name = 'round_2_ab.pkl'
question_file_name = 'questions_structured.csv'
# literature_path = data_path + 'CORD-19-research-challenge/'  # path to save retrieved articles abstract
# sentence_file_name = 'hypercoagulable_sentences.tsv'

# Answers are kept only when their score is strictly greater than this value.
answer_confidence_threshold = 1


In [3]:
def load_pretrained_qa_model(model_str=None, use_cuda=True):
    """Load a BERT question-answering model and its tokenizer, ready for inference.

    Args:
        model_str: Checkpoint name or path handed to ``from_pretrained``. When
            ``None``, ``'bert-large-uncased-whole-word-masking-finetuned-squad'``
            is used.
        use_cuda: If true and CUDA is available the model is moved to the GPU;
            otherwise it is placed on the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple. The model has been moved to the selected
        device and switched to evaluation mode.
    """
    if model_str is None:
        model_str = 'bert-large-uncased-whole-word-masking-finetuned-squad'

    # The device must be resolved outside the default-checkpoint branch so that
    # a caller-supplied model_str also has a device to move the model to.
    device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')

    tokenizer = BertTokenizer.from_pretrained(model_str)
    model = BertForQuestionAnswering.from_pretrained(model_str).to(device)

    model.eval()
    return tokenizer, model

def answer_question(question, document, model, tokenizer):
    """Extract the most likely answer span for ``question`` from ``document``.

    Args:
        question: Question text, encoded as the first segment.
        document: Text to search for the answer, encoded as the second segment.
        model: Question-answering model exposing ``.device`` and returning start
            and end scores as the first two elements of its output.
        tokenizer: Tokenizer providing ``encode_plus``, ``convert_ids_to_tokens``
            and ``convert_tokens_to_string``.

    Returns:
        ``(answer, score)`` where ``answer`` is the decoded span (possibly the
        empty string) and ``score`` is the sum of the highest start score and
        the highest end score, as a Python float.
    """
    device = model.device

    encoded = tokenizer.encode_plus(question, document, return_tensors='pt', max_length=512)

    # Inference only: disable autograd so no graph/activations are retained.
    with torch.no_grad():
        outputs = model(encoded['input_ids'].to(device),
                        token_type_ids=encoded['token_type_ids'].to(device))
    # Index instead of unpacking: this works both for a plain tuple return and
    # for dict-like model outputs, whose iteration yields keys, not tensors.
    start_scores, end_scores = outputs[0], outputs[1]

    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0].tolist())
    ans_start = int(torch.argmax(start_scores))
    ans_end = int(torch.argmax(end_scores))

    # An end position before the start position yields an empty span.
    ans_tokens = tokens[ans_start: ans_end + 1]
    if '[SEP]' in ans_tokens:
        # The span crossed a separator; keep only what follows the first one.
        ans_tokens = ans_tokens[ans_tokens.index('[SEP]') + 1:]
    ans = tokenizer.convert_tokens_to_string(ans_tokens)
    # Removing '[CLS]' can leave stray surrounding whitespace, hence the strip.
    ans = ans.replace(' - ', '-').replace('[CLS]', '').strip()
    ans_score = start_scores.max() + end_scores.max()

    return ans, ans_score.item()

def ask_all_questions(abstract, ncord_uid):
    """Ask every question in the notebook-level ``questions`` table about one text.

    Relies on the globals ``questions``, ``model`` and ``tokenizer`` being
    defined before the call.

    Args:
        abstract: Text passed to ``answer_question`` as the document.
        ncord_uid: Identifier copied into each result tuple.

    Returns:
        A list of ``(ncord_uid, question, answer, score)`` tuples, one per
        question with a non-empty answer, or ``None`` when no question
        produced an answer.
    """
    collected = []
    for query in questions['question'].values:
        reply, confidence = answer_question(query, abstract, model=model, tokenizer=tokenizer)
        if reply == '':
            # Empty answers are not recorded.
            continue
        collected.append((ncord_uid, query, reply, confidence))
    # All answers are returned; callers decide how to select among them.
    return collected if collected else None

In [4]:
# Instantiate the tokenizer and QA model with the loader's defaults; these two
# globals are read by ask_all_questions.
tokenizer, model = load_pretrained_qa_model()

In [7]:
# NOTE(review): unpickling can execute arbitrary code; only load ranking files
# that come from a trusted source.
# The context manager closes the file handle once the table has been loaded.
with open(save_path + ranking_file_name, 'rb') as ranking_file:
    ranking = pickle.load(ranking_file)
#ranking=ranking.drop(columns=['Unnamed: 0'],axis=1).reset_index(drop=True)
ranking = ranking.reset_index(drop=True)
# Keep only the rows whose 'label' column equals 1.
ranking = ranking.loc[ranking['label'] == 1]

In [8]:
# Read the question table without a header row; column names are set explicitly.
questions = pd.read_csv(
    data_path + question_file_name,
    header=None,
    names=['type', 'question'],
)

In [9]:
# Preview the first rows of the question table.
questions.head()

Unnamed: 0,type,question
0,study type,Was it a Meta-Analysis study?
1,study type,Was it a Systematic Review study?
2,study type,Was it a Randomized Controlled Trial?
3,study type,Was it a Cohort Study?
4,study type,Was it a Case-control Study?


In [None]:
#answers_all_=ask_all_questions(ranking.loc[0,'abstract'], 12)

In [None]:
# For every ranked row, ask all questions about its sentence. Each element is
# either a list of (ncord_uid, question, answer, score) tuples or None when no
# question produced an answer for that row.
_answers_all = ranking.apply(
    lambda row: ask_all_questions(row['sentence'], row['ncord_uid']),
    axis=1,
)
# Flatten into one list of tuples. Rows that returned None are skipped, since
# iterating over None would raise TypeError. The loop variable is deliberately
# not called `list`, to avoid shadowing the builtin.
answers_all = [
    answer
    for row_answers in _answers_all.values
    if row_answers is not None
    for answer in row_answers
]

In [None]:
# Tabulate the flattened answer tuples, one row per recorded answer.
answers_all = pd.DataFrame(
    answers_all,
    columns=['ncord_uid', 'question', 'answer', 'score'],
)

In [None]:
# Display the answer table (before the confidence filter is applied).
answers_all

In [None]:
# Drop answers whose score does not strictly exceed the configured threshold.
answers_all = answers_all.loc[answers_all['score'].gt(answer_confidence_threshold)]

In [None]:
# DataFrame.pivot raises ValueError when an (index, column) pair occurs more
# than once, which would happen if several ranked rows share an ncord_uid and
# answer the same question. Keep only the highest-scoring answer per
# (ncord_uid, question) pair before reshaping; with no duplicates the result is
# the same as a plain pivot.
answers_all_pivot = (
    answers_all
    # mergesort is stable, so equal scores keep their original order.
    .sort_values('score', ascending=False, kind='mergesort')
    .drop_duplicates(subset=['ncord_uid', 'question'], keep='first')
    .pivot(index='ncord_uid', columns='question', values='answer')
)

In [None]:
# Display the pivoted table: one row per ncord_uid, one column per question.
answers_all_pivot

In [None]:
# Write the pivoted answers to 'summary.csv' under save_path.
answers_all_pivot.to_csv(save_path+'summary.csv')