In [None]:
# Note: if you are using Google Colab, go to Runtime -> Change runtime type and select GPU as the hardware accelerator. This will make the notebook run faster.
#github link: https://github.com/sanigam/BERT_QA_Medium


#Install following libraries before first run. For subsequent runs, you may comment these
!pip install transformers
!pip install torch

#Import libraries
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch
import numpy as np



In [None]:
# Load the BERT model that is already fine-tuned on the SQuAD question-answering dataset. This is a 1.3 GB download and may take some time.
# Note that we are using the uncased model, so all answers will be in lower case.

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [None]:
# Load the BERT tokenizer from the same checkpoint name that is used for the model
tokenizer_for_bert = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [None]:
def bert_answering_machine(question, passage, max_len=512):
    ''' Find the answer to a question inside a passage.

        Uses the notebook-level globals `tokenizer_for_bert` and `model`.

        Parameters:
            question: the question text (encoded as the first segment).
            passage:  the text in which the answer is searched (encoded as the second segment).
            max_len:  maximum number of tokens for question + passage + special tokens.
                      Value of max_len can not exceed 512. Longer inputs are truncated by the tokenizer.

        Returns:
            A tuple (answer_start_index, answer_end_index, start_token_score, end_token_score, answer).
            The indices are token positions of the highest start/end scores, the scores are rounded
            to 2 decimal digits, and answer is the text rebuilt from the selected tokens.
            The scores can be used to rank answers if the same question is asked on multiple passages.
            When no usable answer span is found, answer is a fixed "Sorry!..." message.
    '''

    # Tokenize question and passage as a pair. This also adds the special tokens [CLS] and [SEP]
    input_ids = tokenizer_for_bert.encode(question, passage, max_length=max_len, truncation=True)

    # The first [SEP] token closes the question segment. Ask the tokenizer for its id instead of hard-coding it
    sep_index = input_ids.index(tokenizer_for_bert.sep_token_id)
    len_question = sep_index + 1                    # [CLS] + question tokens + first [SEP]
    len_passage = len(input_ids) - len_question     # passage tokens + final [SEP]

    # Segment ids tell BERT which tokens belong to the question (0) and which to the passage (1)
    segment_ids = [0] * len_question + [1] * len_passage

    # Converting token ids to tokens
    tokens = tokenizer_for_bert.convert_ids_to_tokens(input_ids)

    # One forward pass returns both the start scores (index 0) and the end scores (index 1).
    # no_grad avoids building the autograd graph, which is not needed for inference.
    device = getattr(model, 'device', None)  # keeps inputs on the same device as the model, if it exposes one
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids], device=device),
                        token_type_ids=torch.tensor([segment_ids], device=device))

    # Converting score tensors to flat numpy arrays (.cpu() so this also works when the model runs on GPU)
    start_token_scores = outputs[0].detach().cpu().numpy().flatten()
    end_token_scores = outputs[1].detach().cpu().numpy().flatten()

    # Picking start index and end index of the answer based on the highest scores
    answer_start_index = np.argmax(start_token_scores)
    answer_end_index = np.argmax(end_token_scores)

    # Scores of the chosen start and end tokens, rounded to 2 decimal digits
    start_token_score = np.round(start_token_scores[answer_start_index], 2)
    end_token_score = np.round(end_token_scores[answer_end_index], 2)

    # Rebuild the answer text. Tokens starting with ## are continuations of the previous word,
    # so they are glued to it without a space; all other tokens are separated by a space.
    answer = tokens[answer_start_index]
    for i in range(answer_start_index + 1, answer_end_index + 1):
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]

    # Patterns indicating that BERT did not find an answer in the passage for the question asked.
    # A span that starts inside the question segment is not an answer from the passage either.
    no_answer = (
        (answer_start_index == 0)
        or (answer_start_index < len_question)
        or (start_token_score < 0)
        or (answer == '[SEP]')
        or (answer_end_index < answer_start_index)
    )
    if no_answer:
        answer = "Sorry!, I could not find  an answer in the passage."

    return (answer_start_index, answer_end_index, start_token_score, end_token_score, answer)


# Quick test of the function on a short passage. The returned tuple is displayed as the cell output.
bert_answering_machine ("Which state john's friend lives", 'My name is John. I live in San Jose, California. Rob is my friend. He lives in Seattle, Washington')

(32, 32, 6.13, 6.94, 'washington')

In [None]:
# BERT Question-Answer Sample 1
passage="John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis"


print('Passage:\n', passage )
print (f'Length of the passage: {len(passage.split())} words')

# Questions are asked in order; the printed question number is the 1-based position in this list
questions = [
    "Who is John's sister",                     # BERT needs to apply some logic to answer this
    "Which college does John's sister attend",  # BERT needs to answer the intermediate question (Question 1) to answer this
    "Who is the president of UC Davis",         # BERT can not answer this from this passage
]

for number, question in enumerate(questions, start=1):
    print (f'\nQuestion {number}:\n', question)
    # Getting answer from BERT (the answer text is the last element of the returned tuple)
    ans = bert_answering_machine ( question, passage)[-1]
    print('\nAnswer from BERT: ', ans ,  '\n')

Passage:
 John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis
Length of the passage: 34 words

Question 1:
 Who is John's sister

Answer from BERT:  sophia smith 


Question 2:
 Which college does John's sister attend

Answer from BERT:  uc davis 


Question 3:
 Who is the president of UC Davis

Answer from BERT:  Sorry!, I could not find  an answer in the passage. 



In [None]:
# BERT Question-Answer Sample 2

passage = " Apple has told employees it'll provide them with paid time off to vote in the US presidential election on Nov. 3, according to a report. \
Workers, who wish to vote that Tuesday will be given up to four hours of pay, Bloomberg reported Friday citing an internal Apple memo. \
It follows Twitter in June making Election Day a paid holiday for US employees. For retail team members and hourly workers across the company, \
if you are scheduled to work this Election Day, we will be providing up to four hours of paid time off if you need it to get to the polls, \
said Deirdre O'Brien, Apple's senior vice president of retail and people, in the reported memo. Teams can also use this time to volunteer as an \
election worker at one of your local polling stations. Apple didn't immediately respond to a request for comment. \
Since Election Day in the US falls on a Tuesday, it can be difficult for people to find time outside of work to visit a polling place and vote "

print('Passage:\n', passage )
print (f'Length of the passage: {len(passage.split())} words')

# Questions are asked in order; the printed question number is the 1-based position in this list
questions = [
    "On what date we have Election Day",
    "What's the concern discussed here",
    "Who is Senior VP at Apple mentioned in this passage ",
    "How Apple is addressing the issue ",
    "What's the alternate use of paid time off ",
]

for number, question in enumerate(questions, start=1):
    print (f'\nQuestion {number}:\n', question)
    # Getting answer from BERT (the answer text is the last element of the returned tuple)
    ans = bert_answering_machine ( question, passage)[-1]
    print('\nAnswer from BERT: ', ans ,  '\n')



Passage:
  Apple has told employees it'll provide them with paid time off to vote in the US presidential election on Nov. 3, according to a report. Workers, who wish to vote that Tuesday will be given up to four hours of pay, Bloomberg reported Friday citing an internal Apple memo. It follows Twitter in June making Election Day a paid holiday for US employees. For retail team members and hourly workers across the company, if you are scheduled to work this Election Day, we will be providing up to four hours of paid time off if you need it to get to the polls, said Deirdre O'Brien, Apple's senior vice president of retail and people, in the reported memo. Teams can also use this time to volunteer as an election worker at one of your local polling stations. Apple didn't immediately respond to a request for comment. Since Election Day in the US falls on a Tuesday, it can be difficult for people to find time outside of work to visit a polling place and vote 
Length of the passage: 175 words


In [None]:
# Let BERT read a version of my medium article and answer  some technical questions!
# BERT Question-Answer Sample 3

passage= 'BERT, which is an acronym for Bi-directional Encoder Representation from Transformer, is a state of the art language model which can be used for various natural language \
 processing (NLP) tasks. My objective is to introduce BERT at a high level, and enable you to create practical applications using BERT. You need to have basic knowledge of Python \
 as well as a basic idea of machine learning. After going through this post, you should be able to use BERT for Binary or multiclass classification or Regression model or Question \
 Answering Application. BERT brings the transfer learning paradigm into the natural language processing area. Transfer learning means a model developed for a task can be reused as \
 a starting point for another task. BERT is trained on the entirety of Wikipedia text (~2.5 billion words) as well as a book corpus (800 million words). You don’t need to repeat \
 this compute intensive process to make practical use of BERT. For specific tasks such as classification or question answering you just need to do incremental training on a much \
 smaller dataset. This process is called fine tuning. This is like getting a person who is proficient in English and providing them extra guidance on how to identify positive \
 sentiment vs negative sentiment from movie reviews.This is a quick introduction on the BERT pre-training process. For practical uses, you will get a BERT pretrained model and you \
 do not need to perform this step. BERT takes 2 chunks of text, which may include multiple sentences, as inputs (shown in the diagram above). These 2 chunks are referred to as \
 Sentence 1 and Sentence 2 in the diagram above.  Before feeding sentences to BERT, 15% of words are masked. Also, sentence 2 may not always be in continuation of sentence 1. \
 BERT’s pre-training process is like teaching the English language to the BERT model so that it can be used for various tasks which need knowledge of English. This is accomplished \
 by 2 practice tasks given to BERT: 1) Predict masked (hidden) tokens: To illustrate, words “favorite” and “to” are masked in the diagram. BERT will try to predict these masked \
 tokens as part of pre-training. This is similar to the “fill in the blanks” task we may give to a student learning English. While trying to fill the missing words, the student \
 will learn the language. This is referred to as Masked Language Model (MLM) in the natural language processing area. 2) Next sentence prediction: Along with the technique \
 mentioned above, BERT tries to predict if sentence 2 comes right after sentence 1 or not. This provides deeper understanding on sentence dependencies. To use BERT for practical \
 use, we need to fine-tune it for specific tasks. This process finetunes the pre-trained model so that it can perform specific tasks such as text classification, sentiment \
 analysis, question answering. In this process,BERT parameters get adjusted to do the specific task. '

print('Passage:\n', passage )

print (f'Length of the passage: {len(passage.split())} words')


# Questions are asked in order; the printed question number is the 1-based position in this list.
# The question strings are kept exactly as originally written because they are the inputs given to the model.
questions = [
    "What is full form of BERT",
    "What is author's purpose in writing this article ",
    "What is full form of NLP ",
    "What is Transfer Learning ",
    "What is pre-training ",
    "What corpus BERT was pre-trained ",
    "Do we need to pre-train BERT model for general practical applications ",
    "What is fine-tuning ",
    "What is MLM ",
    "Which words are masked in the diagram ",
    "What perentage of words are hidden when feeding text to BERT for pre-trainining ",
    "What is the analogy provided by the author, for masked language modelling used in BERT pre-training ",
    "What is Random Forest alogorithm ",
    "How this post may help you ",
]

for number, question in enumerate(questions, start=1):
    print (f'\nQuestion {number}:\n', question)
    # Getting answer from BERT (the answer text is the last element of the returned tuple)
    ans = bert_answering_machine ( question, passage)[-1]
    print('\nAnswer from BERT: ', ans ,  '\n')



Passage:
 BERT, which is an acronym for Bi-directional Encoder Representation from Transformer, is a state of the art language model which can be used for various natural language  processing (NLP) tasks. My objective is to introduce BERT at a high level, and enable you to create practical applications using BERT. You need to have basic knowledge of Python  as well as a basic idea of machine learning. After going through this post, you should be able to use BERT for Binary or multiclass classification or Regression model or Question  Answering Application. BERT brings the transfer learning paradigm into the natural language processing area. Transfer learning means a model developed for a task can be reused as  a starting point for another task. BERT is trained on the entirety of Wikipedia text (~2.5 billion words) as well as a book corpus (800 million words). You don’t need to repeat  this compute intensive process to make practical use of BERT. For specific tasks such as classificatio