In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [2]:
# Download the CoQA training set (JSON) and preview the first rows
COQA_TRAIN_URL = 'http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json'
coqa = pd.read_json(COQA_TRAIN_URL)
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [3]:
# Drop the "version" column; the later cells only read the "data" records.
# DataFrame.drop with errors="ignore" (instead of `del`) keeps this cell safe to
# re-run: `del coqa["version"]` raises KeyError once the column is already gone.
coqa = coqa.drop(columns=["version"], errors="ignore")

In [4]:
# Column names of the flattened question/answer table
cols = ["text", "question", "answer"]

# One [story, question, answer] record per question of every conversation.
# The answer is looked up at the same position as its question.
comp_list = [
    [
        conversation["story"],
        asked["input_text"],
        conversation["answers"][turn]["input_text"],
    ]
    for conversation in coqa["data"]
    for turn, asked in enumerate(conversation["questions"])
]
new_df = pd.DataFrame(comp_list, columns=cols)

# Persist the flattened table so later cells can reload it from disk
new_df.to_csv("CoQA_data.csv", index=False)

In [5]:
# Reload the flattened question/answer table from disk and preview the first rows
data = pd.read_csv(filepath_or_buffer="CoQA_data.csv")
data.head(n=5)

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [7]:
# Load the BERT question-answering model and its tokenizer from Hugging Face.
# Both are created from the same checkpoint name, so it is spelled out once.
SQUAD_BERT_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizer.from_pretrained(SQUAD_BERT_CHECKPOINT)
model = BertForQuestionAnswering.from_pretrained(SQUAD_BERT_CHECKPOINT)

In [8]:
# Pick one random (question, text) pair from the CoQA table.
# NOTE: no seed is set, so a different row is drawn on every run.
random_num = np.random.randint(0, len(data))

# np.random.randint returns a 0-based position, so read the row positionally
# with .iloc. Plain data["question"][n] is a label lookup and only works while
# the frame still carries its default 0..n-1 index.
question = data["question"].iloc[random_num]
text = data["text"].iloc[random_num]

In [9]:
# Encode the question and the text together as a single sequence pair.
# The model only has a fixed number of position embeddings, so the pair is
# capped at that length; "only_second" trims the text and never the question.
# Without the cap, a long story makes the forward pass fail.
input_ids = tokenizer.encode(
    question,
    text,
    truncation="only_second",
    max_length=model.config.max_position_embeddings,
)
print("The input has a total of {} tokens.".format(len(input_ids)))

The input has a total of 392 tokens.


In [10]:
# Show every token of the encoded question/text pair next to its vocabulary id.
# The loop variable is called token_id rather than `id`: at cell level a
# variable named `id` overwrites the builtin id() for the rest of the session.
tokens = tokenizer.convert_ids_to_tokens(input_ids)
for token, token_id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token, token_id))

[CLS]        101
did        2,106
he         2,002
establish   5,323
his        2,010
framework   7,705
im        10,047
##media   16,969
##tly     14,626
?          1,029
[SEP]        102
augustus  11,668
(          1,006
;          1,025
23         2,603
september   2,244
63         6,191
bc         4,647
–          1,516
19         2,539
august     2,257
14         2,403
ad         4,748
)          1,007
was        2,001
the        1,996
founder    3,910
of         1,997
the        1,996
roman      3,142
pri       26,927
##nc      12,273
##ip      11,514
##ate      3,686
and        1,998
considered   2,641
the        1,996
first      2,034
roman      3,142
emperor    3,750
,          1,010
controlling   9,756
the        1,996
roman      3,142
empire     3,400
from       2,013
27         2,676
bc         4,647
until      2,127
his        2,010
death      2,331
in         1,999
ad         4,748
14         2,403
.          1,012
he         2,002
was        2,001
born       2,141
gaius 

In [11]:
# Position of the first [SEP] token, which closes the question segment
sep_idx = input_ids.index(tokenizer.sep_token_id)
# Segment A (question) covers positions 0..sep_idx inclusive, hence the +1
num_seg_a = sep_idx + 1
# Segment B (text) is everything that remains after segment A
num_seg_b = len(input_ids) - num_seg_a

print("SEP token index: ", sep_idx)
print("Number of tokens in segment A: ", num_seg_a)
print("Number of tokens in segment B: ", num_seg_b)

# Segment ids: 0 up to and including the first [SEP], 1 for every later position
segment_ids = [0 if position < num_seg_a else 1 for position in range(len(input_ids))]

# Every input token must have a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  10
Number of tokens in segment A:  11
Number of tokens in segment B:  381


In [12]:
# Feed the model the token ids (as a batch of one) plus the segment ids that
# tell the question apart from the text.
# This is inference only, so run under torch.no_grad(): no autograd graph is
# recorded, which saves memory and time; the logits themselves are unchanged.
with torch.no_grad():
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

In [13]:
# Positions of the highest start score and the highest end score, converted to
# plain ints so they can be used directly as slice bounds
answer_start = int(torch.argmax(output.start_logits))
answer_end = int(torch.argmax(output.end_logits))

print("\nQuestion:\n{}".format(question.capitalize()))
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
    print("\nAnswer:\n{}.".format(answer.capitalize()))
else:
    # The predicted end lies before the predicted start, so there is no span.
    # `answer` is still assigned here: previously it stayed undefined on this
    # path and the final print raised NameError (or showed a stale value left
    # over from an earlier run of the cell).
    answer = ""
    print("I am unable to find the answer to this question. Can you please ask another question?")


Question:
Did he establish his framework immediatly?

Answer:
It took several years.


In [14]:
def _merge_wordpieces(pieces):
    """Join tokens into one string, gluing '##' continuation pieces onto the previous token."""
    merged = pieces[0]
    for piece in pieces[1:]:
        if piece.startswith("##"):
            merged += piece[2:]
        else:
            merged += " " + piece
    return merged


def question_answer(question, text):
    """Print the model's predicted answer to `question`, taken from `text`.

    Uses the notebook-level `tokenizer` and `model`. The question and text are
    encoded as a pair, the model scores every token as a possible answer start
    and end, and the tokens between the best start and best end are printed.

    Args:
        question: The question to answer.
        text: The passage the answer is extracted from. If the encoded pair is
            longer than the model's position limit, the end of the passage is
            truncated.

    Returns:
        None. The result is printed; when no usable span is found the message
        "Unable to find the answer to your question." is printed instead.
    """
    # Tokenize question and text as a pair, trimming only the text if the pair
    # would exceed the number of positions the model supports
    input_ids = tokenizer.encode(
        question,
        text,
        truncation="only_second",
        max_length=model.config.max_position_embeddings,
    )

    # String version of the token ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment ids: 0 up to and including the first [SEP] (question), 1 afterwards (text)
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)

    # Inference only: no_grad avoids recording an autograd graph
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # Best start/end positions as plain ints so they can be used as slice bounds
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    # Start from the fallback message so `answer` is always defined; previously
    # an end position before the start position left it unassigned and the
    # function raised UnboundLocalError.
    answer = "Unable to find the answer to your question."
    if answer_end >= answer_start:
        candidate = _merge_wordpieces(tokens[answer_start:answer_end+1])
        # A span that begins at [CLS] is treated as "no answer"
        if not candidate.startswith("[CLS]"):
            answer = candidate

    print("\nPredicted answer:\n{}".format(answer.capitalize()))

In [15]:
text = """New York (CNN) -- More than 80 Michael Jackson collectibles -- including the late pop star's famous rhinestone-studded glove from a 1983 performance -- were auctioned off Saturday, reaping a total $2 million. Profits from the auction at the Hard Rock Cafe in New York's Times Square crushed pre-sale expectations of only $120,000 in sales. The highly prized memorabilia, which included items spanning the many stages of Jackson's career, came from more than 30 fans, associates and family members, who contacted Julien's Auctions to sell their gifts and mementos of the singer. Jackson's flashy glove was the big-ticket item of the night, fetching $420,000 from a buyer in Hong Kong, China. Jackson wore the glove at a 1983 performance during \"Motown 25,\" an NBC special where he debuted his revolutionary moonwalk. Fellow Motown star Walter \"Clyde\" Orange of the Commodores, who also performed in the special 26 years ago, said he asked for Jackson's autograph at the time, but Jackson gave him the glove instead. "The legacy that [Jackson] left behind is bigger than life for me,\" Orange said. \"I hope that through that glove people can see what he was trying to say in his music and what he said in his music.\" Orange said he plans to give a portion of the proceeds to charity. Hoffman Ma, who bought the glove on behalf of Ponte 16 Resort in Macau, paid a 25 percent buyer's premium, which was tacked onto all final sales over $50,000. Winners of items less than $50,000 paid a 20 percent premium."""
question = "Where was the Auction held?"
question_answer(question, text)
#original answer from the dataset
# Rows and column are selected in a single .loc call (no chained indexing), and
# an empty match is reported instead of raising IndexError. The first row whose
# question text matches is used.
original_answers = data.loc[data["question"] == question, "answer"]
if original_answers.empty:
    print("Original answer:\n", "(question not found in the dataset)")
else:
    print("Original answer:\n", original_answers.iloc[0])


Predicted answer:
Hard rock cafe in new york ' s times square
Original answer:
 Hard Rock Cafe


In [16]:
# Interactive session: read one text, then answer questions about it until the
# user declines to ask another one.
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")
while True:
    question_answer(question, text)

    # Keep asking until the reply starts with Y or N. The reply is stripped,
    # upper-cased and sliced with [:1], so an empty reply just re-prompts
    # (response[0] used to raise IndexError) and lowercase y/n is accepted.
    choice = ""
    while choice not in ("Y", "N"):
        response = input("\nDo you want to ask another question based on this text (Y/N)? ")
        choice = response.strip()[:1].upper()

    if choice == "N":
        print("\nBye!")
        break

    question = input("\nPlease enter your question: \n")

Please enter your text: 
 Which country grows the most tea? The answer is India. It grows three times as much as China. Which country drinks the most tea? It's neither China nor Japan. It's Great Britain. In the wild, tea plants may be 30 feet tall. But a plant grown for market is pruned. Pruning keeps the plant only three or four feet tall. This is an easy height for tea picking. Only the two top leaves and bud of each new shoot are picked. So to make money, tea plantations must be huge. In general, there are two kinds of tea. Black tea and green tea. Black tea is fermented. In the process, the tea loses nearly all of its healthy qualities. Green tea is steamed right after the leaves are picked. Green tea _ its healthy qualities. For example, it may prevent heart disease. How did we get tea bag? 

Please enter your question: 
What did they do to green tea after picking it?

Predicted answer:
Steamed

Do you want to ask another question based on this text (Y/N)? N

Bye!
