In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# The "Testing the model" section loads the saved model through this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
pip install tokenizers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 9.3MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.9.4


In [None]:
# Mount Google Drive at /content/drive.
# The trained weights and the SavedModel are written under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install tqdm



In [None]:
import json
import os
import re
import string
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from tqdm import tqdm

In [None]:
# Download (and cache via Keras) the SQuAD v1.1 train and dev splits.
train_path = keras.utils.get_file("train.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json")
eval_path = keras.utils.get_file("eval.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json")

# JSON is UTF-8 by specification; being explicit keeps the load independent of
# the platform's default text encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)
with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)

Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json


In [None]:
# Fixed length (in tokens) of every model input; shorter examples are padded
# and longer ones are dropped during preprocessing.
max_seq_length = 384

# The three integer inputs fed to the BERT hub layer.
input_word_ids_model = layers.Input(
    shape=(max_seq_length,),
    dtype=tf.int32,
    name='input_word_ids',
)
input_mask_model = layers.Input(
    shape=(max_seq_length,),
    dtype=tf.int32,
    name='input_mask',
)
input_type_ids_model = layers.Input(
    shape=(max_seq_length,),
    dtype=tf.int32,
    name='input_type_ids',
)

# Pre-trained BERT encoder; trainable so it is fine-tuned together with the QA head.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=True,
)


In [None]:
# Wire the three inputs through BERT. `sequence_output` is what the start/end
# logit layers are attached to later in the notebook.
pooled_output, sequence_output = bert_layer([input_word_ids_model, input_mask_model, input_type_ids_model])

# Casing flag shipped with the hub model.
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

# NOTE: the vocabulary file must already exist at this path; nothing in this
# notebook downloads it.
# NOTE(review): the hub layer presumably also exposes its own vocabulary via
# `bert_layer.resolved_object.vocab_file` — confirm before switching to it.
# The tokenizer follows the model's casing flag instead of a hard-coded True so
# the two cannot drift apart if a cased checkpoint is substituted.
tokenizer = BertWordPieceTokenizer(vocab='/content/bert-base-uncased-vocab.txt', lowercase=bool(do_lower_case))

In [None]:
def preprocess(data):
  """Turn SQuAD-format data into padded BERT inputs plus answer token spans.

  Relies on the notebook-level `tokenizer` and `max_seq_length`.

  Args:
    data: Mapping shaped like the SQuAD JSON, i.e.
      ``data['data'][i]['paragraphs'][j]`` holds a ``'context'`` string and a
      ``'qas'`` list whose entries have a ``'question'`` and, optionally, an
      ``'answers'`` list (only the first answer is used).

  Returns:
    A list with one dict per kept question. Each dict contains:
    ``'context'`` / ``'question'`` (tokenizer encodings),
    ``'raw_context'`` / ``'raw_question'`` (whitespace-normalized strings),
    ``'input_word_ids'``, ``'input_type_ids'``, ``'input_mask'`` (lists of
    length ``max_seq_length``), ``'context_token_to_char'`` (context token
    offsets) and ``'start_token_idx'`` / ``'end_token_idx'`` (answer token
    span, or -1 when the question carries no answer).

    Questions are dropped when the answer span runs past the context, when
    no context token overlaps the answer, or when context + question do not
    fit into ``max_seq_length`` tokens.
  """
  processed_data = []
  for item in tqdm(data['data']):
    for para in item['paragraphs']:
      # Normalize and tokenize the context once per paragraph; the result is
      # identical for every question attached to it.
      context = " ".join(str(para['context']).split())
      encoded_context = tokenizer.encode(context)
      context_ids = encoded_context.ids

      for qas in para['qas']:
        proc_data = {
            'start_token_idx': -1,
            'end_token_idx': -1
        }

        # Questions without a usable first answer (e.g. inference-time input)
        # are kept, with the -1 placeholders left in place. Both fields are
        # reset per question so a value from a previous question can never
        # leak into this one.
        answer = None
        answer_start = None
        answers = qas.get('answers')
        if answers:
          answer = answers[0].get('text')
          answer_start = answers[0].get('answer_start')
          if answer_start is None:
            answer = None

        question = " ".join(str(qas['question']).split())
        encoded_question = tokenizer.encode(question)
        proc_data['context'] = encoded_context
        proc_data['question'] = encoded_question
        proc_data['raw_context'] = context
        proc_data['raw_question'] = question

        if answer is not None:
          # NOTE(review): answer_start comes from the dataset unchanged while
          # the context above has had its whitespace collapsed; if the raw
          # context contains collapsed whitespace before the answer the
          # offsets may drift — confirm.
          answer_end = len(answer) + answer_start

          # answer_end is exclusive, so an answer finishing exactly at the end
          # of the context is still valid; only a span running past it is not.
          if answer_end > len(context):
            continue

          # Indices of the context tokens whose character range overlaps the
          # answer's [answer_start, answer_end) range. Zero-width offsets
          # (special tokens) never overlap.
          ans_token_idx = [
              idx
              for idx, (start, end) in enumerate(encoded_context.offsets)
              if max(start, answer_start) < min(end, answer_end)
          ]

          # Skip if no token covers the answer.
          if len(ans_token_idx) == 0:
            continue

          # First and last token of the answer inside the tokenized context.
          proc_data['start_token_idx'] = ans_token_idx[0]
          proc_data['end_token_idx'] = ans_token_idx[-1]

        # Build input_word_ids, input_type_ids and input_mask. The question's
        # first token is dropped when it is appended after the context.
        question_ids = encoded_question.ids[1:]
        input_ids = context_ids + question_ids
        token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)
        padding_length = max_seq_length - len(input_ids)
        if padding_length > 0:
          input_ids = input_ids + ([0] * padding_length)
          attention_mask = attention_mask + ([0] * padding_length)
          token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:
          # Context + question exceed max_seq_length tokens: skip the example.
          continue

        proc_data['input_word_ids'] = input_ids
        proc_data['input_type_ids'] = token_type_ids
        proc_data['input_mask'] = attention_mask
        proc_data['context_token_to_char'] = encoded_context.offsets

        processed_data.append(proc_data)
  return processed_data

def create_inputs_targets(processed_data):
    """Stack per-example features into model inputs and training targets.

    Args:
        processed_data: Iterable of dicts, each holding at least the keys
            "input_word_ids", "input_type_ids", "input_mask",
            "start_token_idx" and "end_token_idx".

    Returns:
        A pair ``(x, y)`` where ``x`` is the list of arrays
        ``[input_word_ids, input_mask, input_type_ids]`` and ``y`` is the list
        of arrays ``[start_token_idx, end_token_idx]``.
    """
    field_names = (
        "input_word_ids",
        "input_type_ids",
        "input_mask",
        "start_token_idx",
        "end_token_idx",
    )

    # One pass over the examples, pulling out only the fields we need.
    rows = [[record[name] for name in field_names] for record in processed_data]

    # Transpose rows -> one numpy array per field.
    stacked = {
        name: np.array([row[position] for row in rows])
        for position, name in enumerate(field_names)
    }

    x = [stacked[name] for name in ("input_word_ids", "input_mask", "input_type_ids")]
    y = [stacked[name] for name in ("start_token_idx", "end_token_idx")]
    return x, y


In [None]:
#Creating the inputs targets.
processed_data_train = preprocess(raw_train_data)
processed_data_eval = preprocess(raw_eval_data)
x_train, y_train = create_inputs_targets(processed_data_train)
x_eval, y_eval = create_inputs_targets(processed_data_eval)

#Adding the layer.
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = layers.Flatten()(start_logits)
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = layers.Flatten()(end_logits)
start_probs = layers.Activation(keras.activations.softmax)(start_logits)
end_probs = layers.Activation(keras.activations.softmax)(end_logits)
model = keras.Model(inputs=[input_word_ids_model, input_mask_model, input_type_ids_model], outputs=[start_probs, end_probs])
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
optimizer = keras.optimizers.Adam(lr=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
model.compile(optimizer=optimizer, loss=[loss, loss])
model.summary()


100%|██████████| 442/442 [00:55<00:00,  8.03it/s]
100%|██████████| 48/48 [00:06<00:00,  7.39it/s]


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 384)]        0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [None]:
class ValidationCallback(keras.callbacks.Callback):
    """Prints the exact-match score on the evaluation set after every epoch.

    Each evaluation example must expose the attributes read in
    `on_epoch_end`: ``skip``, ``context_token_to_char``, ``context`` and
    ``all_answers``.

    Args:
        x_eval: Model inputs for the evaluation set.
        y_eval: Evaluation targets; ``len(y_eval[0])`` is the denominator of
            the reported score.
        eval_examples: Optional iterable of evaluation examples. When omitted,
            the module-level name ``eval_squad_examples`` is looked up at the
            end of each epoch, as before.
            NOTE(review): that name is not defined in the visible part of this
            notebook — pass `eval_examples` explicitly or define it first.
    """

    # Built once instead of on every call / every character.
    _PUNCTUATION = frozenset(string.punctuation)
    _ARTICLES_RE = re.compile(r"\b(a|an|the)\b", re.UNICODE)

    def normalize_text(self, text):
        """Lowercase, strip punctuation and articles, and collapse whitespace."""
        text = text.lower()
        text = "".join(ch for ch in text if ch not in self._PUNCTUATION)
        text = self._ARTICLES_RE.sub(" ", text)
        text = " ".join(text.split())
        return text

    def __init__(self, x_eval, y_eval, eval_examples=None):
        # Let the Keras base class set up its own state first.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval
        self.eval_examples = eval_examples

    def on_epoch_end(self, epoch, logs=None):
        """Run prediction on the evaluation inputs and print the exact-match score."""
        examples = self.eval_examples
        if examples is None:
            examples = eval_squad_examples

        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        # Predictions are aligned with the examples that were not skipped.
        eval_examples_no_skip = [example for example in examples if not example.skip]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            # A start index beyond the context tokens cannot be mapped to text.
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                # End index beyond the context tokens: take the rest of the context.
                pred_ans = squad_eg.context[pred_char_start:]
            normalized_pred_ans = self.normalize_text(pred_ans)
            normalized_true_ans = [self.normalize_text(ans) for ans in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1

        total = len(self.y_eval[0])
        # Avoid ZeroDivisionError on an empty evaluation set.
        acc = count / total if total else 0.0
        print(f"\nepoch={epoch + 1}, exact match score={acc:.2f}")

In [None]:
# Fine-tune BERT together with the QA head on the training split.
model.fit(
    x_train,
    y_train,
    epochs=2,
    batch_size=8,
)

Epoch 1/2
Epoch 2/2


<bound method Model.fit of <tensorflow.python.keras.engine.functional.Functional object at 0x7f723bf64470>>

In [None]:
# Create the target folder first so saving does not fail when it is missing.
os.makedirs("/content/drive/MyDrive/Bert2", exist_ok=True)
# Persist both the bare weights (HDF5) and the full model.
model.save_weights("/content/drive/MyDrive/Bert2/weights.h5")
model.save("/content/drive/MyDrive/Bert2/mymodel")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Bert2/mymodel/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Bert2/mymodel/assets


# **Testing the model**

In [None]:
# Reload the full model that was saved after training.
# NOTE(review): this reads through the /content/gdrive mount while the save cell
# wrote through /content/drive; presumably both point at the same Drive folder —
# confirm both mount cells have been run.
model = tf.keras.models.load_model('/content/gdrive/MyDrive/Bert2/mymodel')

In [None]:
# Read the passage to question; the context manager closes the file handle.
with open("/content/context.txt", "r") as f:
    cont = f.read()

# Wrap the passage and the question in the SQuAD JSON layout that `preprocess`
# expects. There is no "answers" entry, so the start/end targets stay at -1.
data = {"data":
    [
        {"title": "Project Apollo",
         "paragraphs": [
             {
                 "context": cont,
                 "qas": [
                     {"question": "What makes gpu more efficient than cpu?",
                      "id": "Q1"
                      }
                 ]}]}]}

# NOTE: `preprocess` drops examples whose context + question exceed
# max_seq_length tokens, in which case `processedData` is empty.
processedData = preprocess(data)
x,y = create_inputs_targets(processedData)
# print(x)

100%|██████████| 1/1 [00:00<00:00, 808.46it/s]


In [None]:
# The prediction cell below uses nltk.sent_tokenize; fetch the 'punkt' data
# package for it.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Predict a start and an end distribution over token positions for every
# prepared example and map the most likely positions back to text.
pred_start, pred_end = model.predict(x)
for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
    test_sample = processedData[idx]
    offset = test_sample['context_token_to_char']
    raw_context = test_sample['raw_context']
    start = np.argmax(start)
    end = np.argmax(end)
    pred_ans = None
    # A start index beyond the context tokens cannot be mapped back to text.
    if start >= len(offset):
        continue
    pred_char_start = offset[start][0]

    if end < start:
        # Inconsistent span: fall back to the whole sentence containing the
        # predicted start character. Each sentence is located in the original
        # text so the whitespace between sentences is accounted for; summing
        # sentence lengths alone drifts away from the real character offsets.
        cursor = 0
        for line in nltk.sent_tokenize(raw_context):
            line_start = raw_context.find(line, cursor)
            if line_start == -1:
                line_start = cursor
            line_end = line_start + len(line)
            if pred_char_start < line_end:
                pred_ans = line
                break
            cursor = line_end
    elif end < len(offset):
        pred_ans = raw_context[pred_char_start:offset[end][1]]
    else:
        # End index beyond the context tokens: take the rest of the context.
        pred_ans = raw_context[pred_char_start:]
    print(start, end, len(offset))
    print("Q: " + test_sample['raw_question'])
    # pred_ans stays None when the fallback finds no sentence; print an empty
    # answer instead of failing on str + None.
    print("A: " + (pred_ans if pred_ans is not None else ""))

76 78 141
Q: What makes gpu more efficient than cpu?
A: highly parallel structure
