In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm


from question_answering import load_pretrained_qa_model
from question_answering import answer_question

In [2]:
# Load the tokenizer/model pair once for the whole notebook.
# ask_all_possible_questions reads both names as module-level globals and
# forwards them to answer_question on every call.
tokenizer, model = load_pretrained_qa_model()

In [3]:
# Questions that are currently enabled. Q1 ('Can %s treat COVID-19') is
# switched off; to re-enable it, add 'Q1' at the front of this list and put its
# template at the front of question_templates.
_question_ids = ['Q2', 'Q3', 'Q7']

# One %-style template per enabled question, in the same order as
# _question_ids; %s is replaced by a keyword from that question's value list.
question_templates = [
    'What are the potential complication of %s in vaccine recipients',  # Q2
    'What are the animal models for %s',  # Q3
    'What are the animal models for %s',  # Q7: the question and value list are very similar to Q3
]

# All per-question paths follow one naming scheme, so they are derived from
# the question id rather than spelled out one by one.
sentence_files = [
    '../results/%s/ALLRanking_TASK4_%s_1000.csv' % (qid, qid)
    for qid in _question_ids
]

keyword_files = ['../results/%s/valuelist.txt' % qid for qid in _question_ids]

output_files = [
    '../results/%s/%s_answers_top1000.csv' % (qid, qid.lower())
    for qid in _question_ids
]

In [4]:
def ask_all_possible_questions(question_template, keyword_list, sentence):
    """Ask one question per keyword about `sentence` and keep the best answer.

    Each keyword is substituted into `question_template` (a %-style format
    string) and the resulting question is passed to `answer_question` together
    with `sentence` and the module-level `model` and `tokenizer`. Answers equal
    to the empty string are discarded.

    Returns:
        A pair (answer, keyword) for the highest-scoring remaining answer, or
        ('', '') when no keyword produced a non-empty answer.
    """
    candidates = []
    for kw in keyword_list:
        reply, confidence = answer_question(
            question_template % kw, sentence, model, tokenizer
        )
        if reply == '':
            continue
        candidates.append((reply, confidence, kw))

    # Guard clause: nothing usable came back for any keyword.
    if not candidates:
        return '', ''

    # np.argmax returns the first index of the maximum, so ties go to the
    # keyword that appears earliest in keyword_list.
    best = np.argmax([s for _, s, _ in candidates])
    top_reply, _, top_kw = candidates[best]
    return top_reply, top_kw

In [5]:
def answer_all_possible_questions(question_template, sentence_file, keyword_file, output_file):
    """Answer a templated question for every sentence in a CSV and save the result.

    Args:
        question_template: %-style format string with one %s slot for a keyword.
        sentence_file: path to a CSV file; columns 1-5 are read and must
            include a 'sent' column holding the sentence text.
        keyword_file: path to a text file of comma-separated keywords.
        output_file: path the augmented table is written to as CSV.

    Returns:
        The sentences DataFrame with two added columns: 'answer' (best answer
        per sentence) and 'keyword' (the keyword whose question produced it).
        Both are '' for sentences where no keyword yielded an answer.
    """
    # Column 0 of the input is skipped -- presumably a saved row index;
    # confirm against the ranking files if the layout changes.
    sentences_df = pd.read_csv(sentence_file, usecols=range(1, 6))

    with open(keyword_file, 'r') as f:
        raw_keywords = f.read().split(',')
    # Trim whitespace/newlines around each keyword and drop empty entries
    # (trailing comma, empty file) so no question is asked with a blank or
    # padded keyword.
    keyword_list = [kw.strip() for kw in raw_keywords if kw.strip()]

    answers = [
        ask_all_possible_questions(question_template, keyword_list, sent)
        for sent in tqdm(sentences_df['sent'].tolist())
    ]

    # Split the (answer, keyword) pairs with comprehensions rather than
    # zip(*answers): unpacking zip(*[]) raises ValueError when the sentence
    # file has no rows.
    ans_lst = [ans for ans, _ in answers]
    val_lst = [kw for _, kw in answers]
    sentences_df = sentences_df.assign(answer=ans_lst, keyword=val_lst)

    sentences_df.to_csv(output_file)

    return sentences_df

In [6]:
# Run the QA pipeline once per enabled question. Results are appended as they
# finish, so frames from completed questions remain available if a later
# question fails or the run is interrupted.
answers_to_all = []
for template, sentence_path, keyword_path, output_path in zip(
    question_templates, sentence_files, keyword_files, output_files
):
    answers_to_all.append(
        answer_all_possible_questions(template, sentence_path, keyword_path, output_path)
    )

100%|██████████| 1000/1000 [03:22<00:00,  4.93it/s]
100%|██████████| 1000/1000 [03:59<00:00,  4.17it/s]
100%|██████████| 1000/1000 [04:01<00:00,  4.14it/s]


In [8]:
# Show the answers for the first enabled question. The list built by the
# driver loop is named `answers_to_all`; the previous `answer_to_all` is not
# defined in this notebook and raises NameError on a fresh kernel.
answers_to_all[0]

Unnamed: 0,pid,newpid,category,sent,newscore,answer,keyword
0,3qdjmb2j,8652,ADE,"In addition, both vaccines also induced insert...",0.180748,t cell response,disease enhancement
1,ynef1d1t,8058,ADE,Type I interferon is important in anti-viral r...,0.180612,anti-viral,interferon
2,mbvwh2ky,17361,ADE,"Importantly, attenuation of the dORF3-5 mutant...",0.180191,"disrupted cell processes , augmented interfero...",interferon
3,zq58ot3c,18412,ADE,"Furthermore, T-cell proliferative responses an...",0.179922,t-cell proliferative responses and increased p...,immune priming
4,cwfujgya,29285,ADE,By contrast there may be a lack of type 1 inte...,0.179907,lack of type 1 interferon response,antibody dependent enhancement
...,...,...,...,...,...,...,...
995,gq9age1b,15673,ADE,"Immunization of adult mice with UV-V, with or ...",0.086258,extensive eosinophil infiltration in the lungs,immune priming
996,y4dc91nw,34104,ADE,"Here, using a high-throughput yeast two-hybrid...",0.086255,inhibited its activity,nonneutralizing antibody
997,nl8nzz16,17580,ADE,These include host and pathogen factors as wel...,0.086254,host and pathogen factors,nonneutralizing antibody
998,skaom5z7,25435,ADE,The mucosa is the largest immune organ of the ...,0.086252,immune,disease enhancement
