<a href="https://colab.research.google.com/github/supertime1/BERT/blob/main/QA_BERT_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import Dependencies

In [5]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np
import json

# Only let ERROR-level messages through TensorFlow's Python logger so cell output stays readable.
tf.get_logger().setLevel('ERROR')

In [6]:
import sys

# Make the local BERT checkout importable from this notebook.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
_BERT_REPO_DIR = r'C:\Users\57lzhang.US04WW4008\Documents\GitHub\BERT'

# Guard the append so re-running this cell does not pile up duplicate entries.
if _BERT_REPO_DIR not in sys.path:
  sys.path.append(_BERT_REPO_DIR)

# 2. Preprocess the input data

In [7]:
# Load the raw `train-v2.0.json` file for interactive inspection.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
dataset_path = r'C:\Users\57lzhang.US04WW4008\Documents\GitHub\BERT\dataset'
train_ds = os.path.join(dataset_path, 'train-v2.0.json')
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale (e.g. cp1252 on Windows) and non-ASCII text gets mangled.
with open(train_ds, encoding='utf-8') as f:
  data = json.load(f)

In [26]:
def extract_data_label_pairs(dataset_path):
  """Build (question, passage) pairs and answer-span labels from `train-v2.0.json`.

  Args:
    dataset_path: directory that contains `train-v2.0.json`.

  Returns:
    pairs: list of `[question, passage]` string pairs, one per question,
      in file order.
    labels: list of `[start, end]` character offsets into the passage
      (end inclusive), taken from the first listed answer; `[-1, -1]` when
      the question is flagged `is_impossible` or lists no answers.
  """
  pairs = []
  labels = []

  # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
  # platform locale (e.g. cp1252 on Windows) and non-ASCII text gets mangled.
  train_ds = os.path.join(dataset_path, 'train-v2.0.json')
  with open(train_ds, encoding='utf-8') as f:
    data = json.load(f)

  for topic in data['data']:
    for segment in topic['paragraphs']:
      passage = segment['context']
      for q_dict in segment['qas']:
        # Every question yields a pair, answerable or not.
        pairs.append([q_dict['question'], passage])

        # An empty/missing answer list is treated like an impossible question
        # instead of crashing on answers[0].
        answers = q_dict.get('answers') or []
        if q_dict.get('is_impossible', False) or not answers:
          labels.append([-1, -1])
          continue

        # Only the first answer is used; the end index is inclusive.
        first_answer = answers[0]
        answer_start_index = first_answer['answer_start']
        answer_end_index = answer_start_index + len(first_answer['text']) - 1
        labels.append([answer_start_index, answer_end_index])

  return pairs, labels

In [27]:
pairs, labels = extract_data_label_pairs(r'C:\Users\57lzhang.US04WW4008\Documents\GitHub\BERT\dataset')

# [-1, -1] is the sentinel label extract_data_label_pairs assigns to questions without an answer.
num_unanswerable = sum(label == [-1, -1] for label in labels)
num_answerable = len(labels) - num_unanswerable

print(f'There are {num_answerable} Question-Answer pairs,')
print(f'and {num_unanswerable} Questions that have no answers')

There are 86821 Question-Answer pairs,
and 43498 Questions that have no answers


In [28]:
# TF Hub handles. Despite the "bert" naming used by the helpers in this notebook,
# both handles point at ALBERT (base) artifacts: the encoder and the
# preprocessing SavedModel whose `tokenize` / `bert_pack_inputs` functions are
# used by make_bert_preprocess_model.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/albert_en_base/2'
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/albert_en_preprocess/2"

In [36]:
def make_bert_preprocess_model(sentence_features, seq_length=512):
  """Build a Keras model that turns raw string features into packed encoder inputs.

  Args:
    sentence_features: list of names, one per string-valued input feature.
    seq_length: integer length of the packed output sequences.

  Returns:
    A `tf.keras.Model` whose inputs are scalar string tensors (one per entry
    of `sentence_features`, in that order / under those names) and whose
    output is whatever the preprocessing SavedModel's `bert_pack_inputs`
    returns (a dict of tensors).
  """
  # One scalar string placeholder per named feature.
  text_inputs = []
  for feature_name in sentence_features:
    text_inputs.append(
        tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name))

  # The preprocessing SavedModel exposes both the tokenizer and the packer.
  preprocessor = hub.load(tfhub_handle_preprocess)

  # Step 1: tokenize every text input to word pieces.
  tokenize = hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
  tokenized_segments = [tokenize(text_input) for text_input in text_inputs]

  # Step 2: no custom trimming is applied here; the tokenized segments go to
  # the packer as-is and any truncation to seq_length is left to it.

  # Step 3: pack the segments into fixed-length inputs. Special-token ids and
  # the output dict layout are model-dependent, hence loaded from the SavedModel.
  pack = hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments={'seq_length': seq_length},
      name='packer')
  packed_inputs = pack(tokenized_segments)

  return tf.keras.Model(text_inputs, packed_inputs)

In [37]:
# Smoke-test the preprocessing model on the first (question, passage) pair.
# The two input names are placeholders; only their order matters for the list call below.
test_preprocess_model = make_bert_preprocess_model(['my_input1', 'my_input2'])
# Each input is a batch of one string: [question], [passage].
test_text = [np.array([pairs[0][0]]),
             np.array([pairs[0][1]])]

text_preprocessed = test_preprocess_model(test_text)

# Show the raw texts, then the shape and the first 50 entries of each packed tensor.
print('Question       : ', pairs[0][0])
print('Context        : ', pairs[0][1])
print('Keys           : ', list(text_preprocessed.keys()))
print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
print('Word Ids       : ', text_preprocessed['input_word_ids'][0, :50])
print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
print('Input Mask     : ', text_preprocessed['input_mask'][0, :50])
print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
print('Type Ids       : ', text_preprocessed['input_type_ids'][0, :50])

Question       :  When did Beyonce start becoming popular?
Context        :  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Keys           :  ['input_type_ids', 'input_word_ids', 'input_mask']
Shape Word Ids :  (1, 512)
Word Ids       :  tf.Tensor(
[    2    76   144 24809   799  1535   844    60     3 24809 16004  3745
   143  1355     8  1367   815

In [None]:
# Build the preprocessing model ONCE; its two string inputs are the question
# and the passage. (make_bert_preprocess_model expects feature *names*, not
# the texts themselves, and reloads the hub module on every call.)
qa_preprocess_model = make_bert_preprocess_model(['question', 'context'])

# Stream the pairs through the model in batches so the whole corpus is never
# tokenized in a single call. Each dataset element is a dict keyed by the
# model's input names, which the Keras model accepts directly.
PREPROCESS_BATCH_SIZE = 32
train_input = tf.data.Dataset.from_tensor_slices({
    'question': [question for question, _ in pairs],
    'context': [passage for _, passage in pairs],
}).batch(PREPROCESS_BATCH_SIZE).map(qa_preprocess_model)

In [None]:
# Pair every [question, passage] example with its [start, end] label.
train_dataset = tf.data.Dataset.from_tensor_slices(pairs)
train_lb = tf.data.Dataset.from_tensor_slices(labels)
# Zip the example Dataset built above. `train_ds` is bound to the JSON file
# path (a string) by an earlier cell, so it cannot be zipped.
train = tf.data.Dataset.zip((train_dataset, train_lb))

In [6]:
def prepare_training_input(pairs, tokenizer, max_seq_length=512):
  """Run `prepare_bert_input` over every pair that passes the length filter.

  Args:
    pairs: iterable of sequences whose first two items are the question and
      the passage; any further items (e.g. a label) are ignored.
    tokenizer: forwarded unchanged to `prepare_bert_input`.
    max_seq_length: forwarded to `prepare_bert_input`; also the cut-off used
      by the passage filter below.

  Returns:
    List of `prepare_bert_input` results, one per retained pair, in input order.
  """
  prepared = []
  for pair in pairs:
    # Index rather than unpack to a fixed arity: the pairs produced by
    # extract_data_label_pairs have two items, and a three-way unpack of
    # those raises ValueError. Longer records still work.
    question, passage = pair[0], pair[1]

    # NOTE(review): this compares the passage's CHARACTER count against a
    # limit that prepare_bert_input applies to TOKENS — confirm this coarse
    # filter is intended.
    if len(passage) > max_seq_length:
      continue

    prepared.append(
        prepare_bert_input(question, passage, tokenizer, max_seq_length=max_seq_length))

  return prepared

In [None]:
# Short alias for tf.data's AUTOTUNE constant.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Dataset of [question, passage] string pairs.
# NOTE(review): this rebinds `train_ds`, which an earlier cell uses for the JSON file path.
train_ds = tf.data.Dataset.from_tensor_slices(pairs)

In [8]:
# Load the pretrained tokenizer for the checkpoint named below.
# NOTE(review): `AutoTokenizer` is not imported in any cell visible here —
# presumably `from transformers import AutoTokenizer`; confirm and add it to the import cell.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

In [9]:
def prepare_bert_input(question, passage, tokenizer, max_seq_length=384):
    """
    Prepare question and passage for input to BERT.

    The sequence is laid out as ``[CLS] question [SEP] passage``, cut to at
    most ``max_seq_length`` tokens, and right-padded with id 0 so that the
    returned ids and mask always have exactly ``max_seq_length`` entries.

    Args:
        question (string): question string
        passage (string): passage string where answer should lie
        tokenizer (Tokenizer): used for transforming raw string input; must
                               provide tokenize(), convert_tokens_to_ids(),
                               cls_token and sep_token
        max_seq_length (int): length of BERT input

    Returns:
        input_ids (tf.Tensor): tensor of size (1, max_seq_length) which holds
                               ids of tokens in input
        input_mask (list): list of length max_seq_length of 1s and 0s with 1s
                           in indices corresponding to input tokens, 0s in
                           indices corresponding to padding
        tokens (list): the (possibly truncated) string tokens corresponding
                       to the non-padding entries of input_ids
    """
    question_tokens = tokenizer.tokenize(question)
    passage_tokens = tokenizer.tokenize(passage)

    # Layout (before padding): CLS {question_tokens} SEP {passage_tokens}
    tokens = [tokenizer.cls_token] + question_tokens + [tokenizer.sep_token] + passage_tokens

    # Enforce the documented length. Without this an over-long input gets a
    # negative pad count (i.e. no padding) and the returned tensor silently
    # exceeds max_seq_length.
    tokens = tokens[:max_seq_length]

    # Convert tokens into integer IDs; the mask marks each real token with 1.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(tokens)

    # Right-pad both lists with zeros up to max_seq_length.
    padding = [0] * (max_seq_length - len(tokens))
    input_ids = input_ids + padding
    input_mask = input_mask + padding

    # Add a leading batch dimension of size 1 to the ids.
    return tf.expand_dims(tf.convert_to_tensor(input_ids), 0), input_mask, tokens

In [26]:
# Smoke test of prepare_bert_input on a tiny example; max_seq_length=20 is
# small enough that the zero padding is visible in the printed ids and mask.
passage = "My name is Bob."

question = "What is my name?"

input_ids, input_mask, tokens = prepare_bert_input(question, passage, tokenizer, 20)
print("Test Case:\n")
print("Passage: {}".format(passage))
print("Question: {}".format(question))
print()
print("Tokens:")
print(tokens)
print("\nCorresponding input IDs:")
print(input_ids)
print("\nMask:")
print(input_mask)

Test Case:

Passage: My name is Bob.
Question: What is my name?

Tokens:
['[CLS]', 'what', 'is', 'my', 'name', '?', '[SEP]', 'my', 'name', 'is', 'bob', '.']

Corresponding input IDs:
tf.Tensor(
[[ 101 2054 2003 2026 2171 1029  102 2026 2171 2003 3960 1012    0    0
     0    0    0    0    0    0]], shape=(1, 20), dtype=int32)

Mask:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]


# 3. Download a BERT model and fine-tune it

# 4. Evaluate the QA model

In [None]:
def get_span_from_scores(start_scores, end_scores, input_mask, verbose=False):
    """
    Find start and end indices that maximize sum of start score
    and end score, subject to the constraint that start is not after end
    and both are valid according to input_mask.

    Ties are resolved in favour of the first maximal (start, end) pair met
    when scanning start positions, then end positions, in ascending order.

    Args:
        start_scores (sequence): score of each position as answer start,
                                 indexable 1-D sequence of length n
        end_scores (sequence): score of each position as answer end,
                               indexable 1-D sequence of length n
        input_mask (sequence): 1 for valid positions and 0 otherwise
        verbose (bool): when True, print the selected indices and scores

    Returns:
        (max_start_i, max_end_j): indices of the best span, or (-1, -1) when
                                  no position pair is valid
    """
    n = len(start_scores)
    max_start_i = -1
    max_end_j = -1
    # Initialised up front so the verbose report below is safe even when no
    # valid span exists (empty input or an all-zero mask).
    max_start_val = -np.inf
    max_end_val = -np.inf
    max_sum = -np.inf

    # Find i and j that maximize start_scores[i] + end_scores[j]
    # so that i <= j and input_mask[i] == input_mask[j] == 1
    for i in range(n):
        if input_mask[i] != 1:
            continue

        for j in range(i, n):
            if input_mask[j] != 1:
                continue

            span_sum = start_scores[i] + end_scores[j]
            # Strict '>' keeps the earliest maximal span on ties.
            if span_sum > max_sum:
                max_sum = span_sum
                max_start_i = i
                max_end_j = j
                max_start_val = start_scores[i]
                max_end_val = end_scores[j]

    if verbose:
        print(f"max start is at index i={max_start_i} and score {max_start_val}")
        print(f"max end is at index i={max_end_j} and score {max_end_val}")
        print(f"max start + max end sum of scores is {max_sum}")
    return max_start_i, max_end_j

In [None]:
# Check 1: positions 5-7 carry the highest raw scores but are masked out,
# so the best valid span is start=1, end=4.
start_scores = tf.convert_to_tensor([-1, 2, 0.4, -0.3, 0, 8, 10, 12], dtype=float)
end_scores = tf.convert_to_tensor([5, 1, 1, 3, 4, 10, 10, 10], dtype=float)
input_mask = [1, 1, 1, 1, 1, 0, 0, 0]

start, end = get_span_from_scores(start_scores, end_scores, input_mask, verbose=True)

print("Expected: (1, 4) \nReturned: ({}, {})".format(start, end))

In [None]:
# Test 2: among the unmasked positions (0-4) the best span starts and ends
# on the same index (1), exercising the i == j case.

start_scores = tf.convert_to_tensor([0, 2, -1, 0.4, -0.3, 0, 8, 10, 12], dtype=float)
end_scores = tf.convert_to_tensor([0, 5, 1, 1, 3, 4, 10, 10, 10], dtype=float)
input_mask = [1, 1, 1, 1, 1, 0, 0, 0, 0 ]

start, end = get_span_from_scores(start_scores, end_scores, input_mask, verbose=True)

print("Expected: (1, 1) \nReturned: ({}, {})".format(start, end))

In [None]:
# UNQ_C5 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def construct_answer(tokens):
    """
    Turn a list of tokens back into a readable answer string.

    Args:
        tokens: a list of tokens (strings)

    Returns:
        out_string: the tokens joined by single spaces, with every ' ##'
                    removed and surrounding whitespace stripped; if '@' is
                    one of the tokens, all remaining spaces are removed too.
    """
    # Join on spaces, then glue continuation pieces back on by dropping the
    # ' ##' marker, e.g. "play ##ing" -> "playing".
    merged = " ".join(tokens).replace(' ##', '').strip()

    # A standalone '@' token collapses the whole answer into one unbroken
    # string (presumably for e-mail-like answers — confirm).
    return merged.replace(' ', '') if '@' in tokens else merged

In [None]:
# Test: construct_answer on messy tokens, without and with a standalone '@' token.

# Without '@': spaces inside the tokens are kept, only ' ##' and the outer whitespace go.
tmp_tokens_1 = [' ## hello', 'how ', 'are ', 'you?      ']
tmp_out_string_1 = construct_answer(tmp_tokens_1)

print(f"tmp_out_string_1: {tmp_out_string_1}, length {len(tmp_out_string_1)}")


# With '@' as its own token: every space is removed from the result.
tmp_tokens_2 = ['@',' ## hello', 'how ', 'are ', 'you?      ']
tmp_out_string_2 = construct_answer(tmp_tokens_2)
print(f"tmp_out_string_2: {tmp_out_string_2}, length {len(tmp_out_string_2)}")


In [None]:
def get_model_answer(model, question, passage, tokenizer, max_seq_length=384):
    """
    Identify answer in passage for a given question using BERT.

    Args:
        model (Model): pretrained question-answering model; calling it on the
                       input ids must return start scores and end scores as
                       its first two outputs, each of shape [1, max_seq_length]
        question (string): question string
        passage (string): passage string
        tokenizer (Tokenizer): used for preprocessing of input
        max_seq_length (int): length of input for model

    Returns:
        answer (string): answer to input question according to model
    """
    # Tokenize and pad; input_mask marks which positions hold real tokens.
    input_ids, input_mask, tokens = prepare_bert_input(question, passage, tokenizer, max_seq_length)

    # Read the outputs by position instead of tuple-unpacking them: this works
    # for a plain (start, end) tuple and also for mapping-style output objects
    # that support integer indexing, where unpacking would iterate the keys.
    # NOTE(review): input_mask is only used for span selection below; confirm
    # whether the model should also receive it as an attention mask.
    outputs = model(input_ids)

    # Drop the batch dimension: [1, max_seq_length] -> [max_seq_length].
    start_scores = outputs[0][0]
    end_scores = outputs[1][0]

    # Most likely (start, end) among unmasked positions.
    span_start, span_end = get_span_from_scores(start_scores, end_scores, input_mask)

    # span_end is inclusive, hence the +1 on the slice.
    answer_tokens = tokens[span_start:span_end + 1]

    # Merge word pieces and clean up whitespace.
    return construct_answer(answer_tokens)

In [None]:
# End-to-end check of get_model_answer on a short encyclopedic passage.
# NOTE(review): `model` is not defined in any cell visible here — it must be loaded before this cell runs.
# The backslash continuations sit inside the string literal, so the leading
# indentation of each continued line is part of the passage/question text.
passage = "Computational complexity theory is a branch of the theory \
           of computation in theoretical computer science that focuses \
           on classifying computational problems according to their inherent \
           difficulty, and relating those classes to each other. A computational \
           problem is understood to be a task that is in principle amenable to \
           being solved by a computer, which is equivalent to stating that the \
           problem may be solved by mechanical application of mathematical steps, \
           such as an algorithm."

question = "What branch of theoretical computer science deals with broadly \
            classifying computational problems by difficulty and class of relationship?"

print("Output: {}".format(get_model_answer(model, question, passage, tokenizer)))
print("Expected: Computational complexity theory")

In [None]:
# End-to-end check of get_model_answer on a longer passage that includes non-ASCII (Greek) text.
# NOTE(review): `model` is not defined in any cell visible here — it must be loaded before this cell runs.
# The backslash continuations sit inside the string literal, so the leading
# indentation of each continued line is part of the passage text.
passage = "The word pharmacy is derived from its root word pharma which was a term used since \
           the 15th–17th centuries. However, the original Greek roots from pharmakos imply sorcery \
           or even poison. In addition to pharma responsibilities, the pharma offered general medical \
           advice and a range of services that are now performed solely by other specialist practitioners, \
           such as surgery and midwifery. The pharma (as it was referred to) often operated through a \
           retail shop which, in addition to ingredients for medicines, sold tobacco and patent medicines. \
           Often the place that did this was called an apothecary and several languages have this as the \
           dominant term, though their practices are more akin to a modern pharmacy, in English the term \
           apothecary would today be seen as outdated or only approproriate if herbal remedies were on offer \
           to a large extent. The pharmas also used many other herbs not listed. The Greek word Pharmakeia \
           (Greek: φαρμακεία) derives from pharmakon (φάρμακον), meaning 'drug', 'medicine' (or 'poison')."

question = "What word is the word pharmacy taken from?"

print("Output: {}".format(get_model_answer(model, question, passage, tokenizer)))
print("Expected: pharma")

In [None]:
# Ask questions about a clinical note using get_model_answer.
# NOTE(review): `model` is not defined in any cell visible here — it must be loaded before this cell runs.
# The backslash continuations sit inside the string literal, so the leading
# indentation of each continued line is part of the passage text.
passage = "Abnormal echocardiogram findings and followup. Shortness of breath, congestive heart failure, \
           and valvular insufficiency. The patient complains of shortness of breath, which is worsening. \
           The patient underwent an echocardiogram, which shows severe mitral regurgitation and also large \
           pleural effusion. The patient is an 86-year-old female admitted for evaluation of abdominal pain \
           and bloody stools. The patient has colitis and also diverticulitis, undergoing treatment. \
           During the hospitalization, the patient complains of shortness of breath, which is worsening. \
           The patient underwent an echocardiogram, which shows severe mitral regurgitation and also large \
           pleural effusion. This consultation is for further evaluation in this regard. As per the patient, \
           she is an 86-year-old female, has limited activity level. She has been having shortness of breath \
           for many years. She also was told that she has a heart murmur, which was not followed through \
           on a regular basis."

# Candidate questions; only those placed in `questions` below are actually asked.
q1 = "How old is the patient?"
q2 = "Does the patient have any complaints?"
q3 = "What is the reason for this consultation?"
q4 = "What does her echocardiogram show?"
q5 = "What other symptoms does the patient have?"
q6 = "What is the gender of this patient?"

# Currently only q6 is submitted; q1-q5 are defined but unused in this cell.
questions = [q6]

# Print each question (numbered from 1) followed by the model's answer.
for i, q in enumerate(questions):
    print("Question {}: {}".format(i+1, q))
    print()
    print("Answer: {}".format(get_model_answer(model, q, passage, tokenizer)))
    print()
    print()

# 5. Deploy the model on Django