In [35]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

## Load data from the previous step

In [46]:
question_list = pd.read_csv('task4_question1_question_keyword.csv', usecols=[2, 3]).drop_duplicates()
questions = question_list.set_index('category').to_dict()['question']

In [48]:
sentence_candidates = pd.read_csv('ALLRanking_TASK4_Q1_TOP_1000.csv')

In [50]:
sentence_candidates['question'] = sentence_candidates['category'].map(questions)

In [68]:
sentence_candidates

Unnamed: 0.1,Unnamed: 0,pid,newpid,category,sent,newscore,question
0,180179,cszqykpu,246,antihiv,"Interestingly, lopinavir, ritonavir, and darun...",0.808423,Can anti HIV drug cure COVID-2019
1,180181,qdt90c22,31,antihiv,Our results showed that several HIV inhibitors...,0.806379,Can anti HIV drug cure COVID-2019
2,180173,vbgf50os,192,antihiv,"Ritonavir, lopinavir and darunavir were then d...",0.805869,Can anti HIV drug cure COVID-2019
3,180189,vbgf50os,190,antihiv,"Three anti-HIV drugs, ritonavir, lopinavir and...",0.805594,Can anti HIV drug cure COVID-2019
4,180176,qmg58cgr,128,antihiv,The focused drug repurposing of known approved...,0.803729,Can anti HIV drug cure COVID-2019
...,...,...,...,...,...,...,...
7995,7265,7wfdwou8,11864,receptor,It has been known that SARS coronavirus (SARS_...,0.164469,What Kinds of receptor are involved in COVID-...
7996,1021,ebwxryai,12866,receptor,Different forms of lung injury can activate se...,0.164468,What Kinds of receptor are involved in COVID-...
7997,6439,ck0jpp7e,24472,receptor,"Within this mutant gene, we defined a G-rich s...",0.164465,What Kinds of receptor are involved in COVID-...
7998,6412,nn9gj0z1,3892,receptor,A variety of perturbants were employed to char...,0.164459,What Kinds of receptor are involved in COVID-...


## Load model and generate answer

In [5]:
qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [60]:
def reconstructText(tokens, start=0, stop=-1):
    tokens = tokens[start: stop]
    if '[SEP]' in tokens:
        sepind = tokens.index('[SEP]')
        tokens = tokens[sepind+1:]
    txt = ' '.join(tokens)
    txt = txt.replace(' ##', '')
    txt = txt.replace('##', '')
    txt = txt.strip()
    txt = " ".join(txt.split())
    txt = txt.replace(' .', '.')
    txt = txt.replace('( ', '(')
    txt = txt.replace(' )', ')')
    txt = txt.replace(' - ', '-')
    txt_list = txt.split(' , ')
    txt = ''
    nTxtL = len(txt_list)
    if nTxtL == 1:
        return txt_list[0]
    newList =[]
    for i,t in enumerate(txt_list):
        if i < nTxtL -1:
            if t[-1].isdigit() and txt_list[i+1][0].isdigit():
                newList += [t,',']
            else:
                newList += [t, ', ']
        else:
            newList += [t]
    return ''.join(newList)

def bert_squad_qa(question, document, model, tokenizer):
    input_ids = tokenizer.encode(question, document)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    n_ids = len(segment_ids)

    start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    start_scores = start_scores[:,1:-1]
    end_scores = end_scores[:,1:-1]
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    ans = reconstructText(tokens, answer_start, answer_end+2)
    if ans.startswith('. ') or ans.startswith(', '):
        ans = ans[2:]
    ans_score = start_scores[0,answer_start].item()+end_scores[0,answer_end].item()
    return ans, ans_score

In [54]:
sentence_candidates_examples = sentence_candidates.iloc[:100, :]

In [61]:
ans_and_scores = sentence_candidates_examples.apply(lambda x: bert_squad_qa(x['question'], x['sent'], qa_model, qa_tokenizer), axis=1)

In [63]:
sentence_candidates_examples['answer'] = ans_and_scores.apply(lambda x: x[0])
sentence_candidates_examples['answer_score'] = ans_and_scores.apply(lambda x: x[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [67]:
for _, row in sentence_candidates_examples.iterrows():
    print('Question:', row['question'])
    print('Sentence:', row['sent'])
    print('Answer: ', row['answer'])
    print('\n\n')

Question: Can anti HIV drug  cure COVID-2019 
Sentence: Interestingly, lopinavir, ritonavir, and darunavir are all designed to target viral proteinases.
Answer:  lopinavir



Question: Can anti HIV drug  cure COVID-2019 
Sentence: Our results showed that several HIV inhibitors such as lopinavir, ritonavir, and saquinavir produce strong interaction with the active site of SARS-CoV-2 main protease.
Answer:  our results showed that several hiv inhibitors such as lopinavir, ritonavir, and saquinavir produce strong interaction with the active site of sars-cov-2 main protease



Question: Can anti HIV drug  cure COVID-2019 
Sentence: Ritonavir, lopinavir and darunavir were then docked to the models, respectively, followed by energy minimization of the protease-drug complexes.
Answer:  by energy minimization of the protease-drug complexes



Question: Can anti HIV drug  cure COVID-2019 
Sentence: Three anti-HIV drugs, ritonavir, lopinavir and darunavir, might have therapeutic effect on corona