<a href="https://colab.research.google.com/github/the-SQuAD-squad/QA/blob/huggingface/QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Init { form-width: "25%" }
import os
import random
import math
import numpy as np
import tensorflow as tf
import json
import pandas as pd
import re
import string
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers

!pip install tokenizers
from tokenizers import BertWordPieceTokenizer

# Show full cell contents when printing DataFrames. ``None`` is the supported
# "no truncation" value; the old ``-1`` sentinel was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# fix random seeds
# The same seed is pushed into every RNG this notebook touches (Python hashing,
# `random`, NumPy, TensorFlow).
seed_value = 42 #@param {type:"integer"}

# NOTE(review): PYTHONHASHSEED is set after the interpreter has started, so it
# presumably only influences child processes — confirm.
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

tf.compat.v1.set_random_seed(seed_value)

# Restrict TensorFlow to one intra-op and one inter-op thread and install the
# resulting TF1-style session as the Keras backend session.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

# BERT params
# Fixed input length: shorter inputs are zero-padded to this size and the Keras
# Input layers are declared with this shape.
max_seq_length = 512

# Dataset creation

In [None]:
#@title df creation { form-width: "25%" }

# the official dataset is identical to the provided one
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O training_set.json

# SQuAD ships as UTF-8 JSON; pin the encoding so parsing does not depend on the
# platform's default locale.
with open("training_set.json", "r", encoding="utf-8") as f:
    json_file = json.load(f)
data = json_file["data"]

# Flatten document -> paragraph -> question into one row per question. Only the
# first listed answer of each question is kept.
rows = []
for document in data:
    for par in document['paragraphs']:
        for qas in par['qas']:
            first_answer = qas['answers'][0]
            answer_start = first_answer['answer_start']
            rows.append({
                'id': qas['id'],
                'title': document["title"],
                'passage': par['context'],
                'question': qas['question'],
                # (start, end) character positions in the passage, end exclusive
                'answer_idx': (answer_start,
                               answer_start + len(first_answer['text'])),
                'answer_text': first_answer['text'],
            })

df_original = pd.DataFrame(rows)

In [None]:
#@title clean dataset { form-width: "25%" }

!gcloud config set project feisty-mechanic-221914
!gsutil cp gs://squad_squad/error_IDs.txt ./error_IDs.txt

# error_IDs.txt lists one question id per line. splitlines() plus blank
# filtering keeps the last id even when the file has no trailing newline and
# tolerates CRLF line endings (the previous split("\n")[:-1] did neither).
with open("error_IDs.txt", "r") as f:
    unwanted_id = [line.strip() for line in f.read().splitlines() if line.strip()]

# Index by question id so the unwanted rows can be dropped by label.
df_bert = df_original.set_index('id')
df_bert = df_bert.drop(unwanted_id)
df_bert.head()

In [None]:
!pip install transformers
import transformers

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the "bert-base-uncased" checkpoint, shared by all preprocessing
# and inference cells below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Shown as the cell output. NOTE(review): preprocess_bert requests
# return_offsets_mapping, which presumably requires a fast tokenizer — confirm.
tokenizer.is_fast

In [None]:
def preprocess_bert(text):
    """Tokenize a batch of strings with the notebook-level ``tokenizer``.

    Args:
        text: iterable of strings (list, tuple, pandas Series, ...).

    Returns:
        A list with one dict per input string, in input order. Each dict holds
        the tokenizer's ``input_ids`` under ``'input_ids'`` and its
        ``offset_mapping`` under ``'offsets'``.
    """
    # Materialise the input as a plain list, as the later definitions of this
    # function in the notebook do, so a pandas Series is accepted as well.
    tokenized_text = tokenizer(list(text), return_offsets_mapping=True)

    return [{'input_ids': ids, 'offsets': offsets}
            for ids, offsets in zip(tokenized_text.input_ids,
                                    tokenized_text.offset_mapping)]


# Quick manual check on two short sentences (result shown as the cell output).
preprocess_bert(["hi mum", "how are you"])


In [None]:

from tqdm import tqdm
# Scratch loop: prints 0..9 while wrapped in a tqdm progress bar
# (presumably just to check that tqdm renders in this environment).
for i in tqdm(range(10)):
  print(i)

In [None]:
#@title BERT preprocessing { form-width: "25%" }
from tqdm import tqdm
#vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
#tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=True)

def preprocess_bert(text):
    """Tokenize every string in ``text`` with the notebook-level ``tokenizer``.

    Returns a list with one ``{'input_ids': ..., 'offsets': ...}`` dict per
    input string, in input order.
    """
    encoded = tokenizer(list(text), return_offsets_mapping=True)
    ids_per_text = encoded.input_ids
    offsets_per_text = encoded.offset_mapping

    rows_out = []
    for ids, offsets in zip(ids_per_text, offsets_per_text):
        rows_out.append({'input_ids': ids, 'offsets': offsets})
    return rows_out

def labeling(df):
    """Build padded BERT inputs and answer token spans for every row of ``df``.

    Args:
        df: DataFrame whose 'passage' and 'question' cells are the dicts built
            by ``preprocess_bert`` (keys 'input_ids' and 'offsets') and whose
            'answer_idx' cells are ``(start, end)`` character positions of the
            answer inside the passage, ``end`` exclusive.

    Returns:
        A new DataFrame (the argument is not modified) without the skipped rows
        and with the added columns 'input_word_ids', 'input_type_ids',
        'input_mask', 'context_token_to_char', 'ans_token_start' and
        'ans_token_end'.

    A row is skipped when no passage token overlaps the answer span, or when
    passage plus question need more than ``max_seq_length`` tokens.
    """
    skip = []
    ans_token_start = []
    ans_token_end = []
    input_word_ids = []
    input_type_ids = []
    input_mask = []
    context_token_to_char = []

    # Walk the needed columns in lockstep instead of doing several df.loc
    # lookups per row.
    rows = zip(df.index, df['passage'], df['question'], df['answer_idx'])
    for row_id, tokenized_context, tokenized_question, (ans_start, ans_end) in tqdm(rows, total=len(df)):
        # A token is part of the answer when its [start, end) character span
        # overlaps the answer's span. This only needs the token offsets, so the
        # function no longer reaches into the global ``df_bert`` for the raw
        # passage text. Tokens with an empty span, e.g. (0, 0), never match.
        ans_token_idx = [
            idx
            for idx, (start, end) in enumerate(tokenized_context["offsets"])
            if max(start, ans_start) < min(end, ans_end)
        ]
        if not ans_token_idx:
            skip.append(row_id)
            continue

        # Model input = passage tokens followed by the question tokens minus
        # the question's leading CLS token.
        context_ids = tokenized_context['input_ids']
        question_ids = tokenized_question['input_ids'][1:]
        input_ids = context_ids + question_ids
        token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        if padding_length < 0:
            # Too long for the fixed-size model input.
            skip.append(row_id)
            continue

        # Zero-pad up to max_seq_length (adds nothing when the input already
        # fills it exactly).
        padding = [0] * padding_length
        input_word_ids.append(np.array(input_ids + padding))
        input_type_ids.append(np.array(token_type_ids + padding))
        input_mask.append(np.array(attention_mask + padding))
        context_token_to_char.append(np.array(tokenized_context["offsets"]))
        ans_token_start.append(ans_token_idx[0])
        ans_token_end.append(ans_token_idx[-1])

    df = df.drop(skip)
    df['input_word_ids'] = input_word_ids
    df['input_type_ids'] = input_type_ids
    df['input_mask'] = input_mask
    df['context_token_to_char'] = context_token_to_char
    df['ans_token_start'] = ans_token_start
    df['ans_token_end'] = ans_token_end

    return df

# Work on a copy so the raw text in df_bert stays untouched.
df_bert_preprocessed = df_bert.copy()
# Tokenize the two text columns one after the other, announcing each step.
for text_column, banner in (('passage', "Preprocessing passage..."),
                            ('question', "Preprocessing question...")):
    print(banner)
    df_bert_preprocessed[text_column] = preprocess_bert(df_bert[text_column])
# Build padded model inputs and answer token spans.
print("Building attention masks...")
df_bert_preprocessed = labeling(df_bert_preprocessed)
df_bert_preprocessed.head()

In [None]:
    # Save the preprocessed frame locally.
    # NOTE(review): this writes "df_bert_preprocessed_fast.pkl", while the
    # load/store cell reads "df_bert_preprocessed.pkl" — confirm which file
    # name is intended.
    df_bert_preprocessed.to_pickle("df_bert_preprocessed_fast.pkl")
    !zip df_bert_preprocessed.pkl.zip df_bert_preprocessed_fast.pkl

# Skip preprocessing

In [None]:
#@title load/store { form-width: "25%" }
import pickle
# load=True: fetch the cached preprocessed frame from the GCS bucket.
# load=False: pickle the in-memory frame, zip it and upload it to the bucket.
load = True #@param {type: "boolean"}

if load:
    !gcloud config set project feisty-mechanic-221914
    !gsutil cp gs://squad_squad/df_bert_preprocessed.pkl.zip ./df_bert_preprocessed.pkl.zip
    !unzip -o ./df_bert_preprocessed.pkl.zip
    # NOTE(review): unpickling executes arbitrary code; only load archives from
    # a bucket you trust.
    df_bert_preprocessed = pd.read_pickle("./df_bert_preprocessed.pkl")
else:
    df_bert_preprocessed.to_pickle("df_bert_preprocessed.pkl")
    !zip df_bert_preprocessed.pkl.zip df_bert_preprocessed.pkl

    # Authenticate the Colab user before uploading to the bucket.
    from google.colab import auth
    auth.authenticate_user()
    !gcloud config set project feisty-mechanic-221914
    !gsutil cp ./df_bert_preprocessed.pkl.zip gs://squad_squad/df_bert_preprocessed.pkl.zip

!nvidia-smi

# Model

In [None]:
#@title split { form-width: "25%" }

# Hold out a fraction of the article titles (not of the individual questions)
# so that no article contributes to both training and validation.
split_value = 0.1 #@param {type:"number"}
all_titles = df_bert_preprocessed['title'].unique()
val_dim = int(len(all_titles) * split_value)
val_titles = np.random.choice(all_titles, size=val_dim, replace=False)

is_val_row = df_bert_preprocessed['title'].isin(val_titles)
df_bert_val = df_bert_preprocessed[is_val_row]
df_bert_train = df_bert_preprocessed[~is_val_row]

# Stack the per-row arrays into batch arrays, in the order of the model inputs
# (word ids, mask, type ids) and outputs (start, end).
feature_columns = ("input_word_ids", "input_mask", "input_type_ids")
label_columns = ("ans_token_start", "ans_token_end")

x_train = [np.stack(df_bert_train[name]) for name in feature_columns]
y_train = [np.stack(df_bert_train[name]) for name in label_columns]
x_eval = [np.stack(df_bert_val[name]) for name in feature_columns]
y_eval = [np.stack(df_bert_val[name]) for name in label_columns]

In [None]:
# Pretrained Hugging Face TF BERT encoder used as the model backbone.
# NOTE(review): output_attentions=True is requested, but the model definition
# only reads `.last_hidden_state` — confirm the attentions are needed.
bert_hf_layer = transformers.TFBertModel.from_pretrained("bert-base-uncased", output_attentions=True)

In [None]:
#@title model definition { form-width: "25%" }

# Three fixed-length integer inputs: token ids, attention mask, segment ids.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

# Hugging Face encoder. NOTE(review): the positional list is presumably
# consumed as (input_ids, attention_mask, token_type_ids) — confirm against the
# TFBertModel call signature.
sequence_output = bert_hf_layer([input_word_ids,
                                 input_mask,
                                 input_type_ids]).last_hidden_state

# One bias-free linear score per token for "answer starts here" ...
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = layers.Flatten(name="flatten_start")(start_logits)

# ... and one for "answer ends here".
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = layers.Flatten(name="flatten_end")(end_logits)

# Softmax over the sequence positions turns the scores into distributions.
start_probs = layers.Activation(keras.activations.softmax, name="softmax_start")(start_logits)
end_probs = layers.Activation(keras.activations.softmax, name="softmax_end")(end_logits)

model = keras.Model(inputs=[input_word_ids, input_mask, input_type_ids],
                    outputs=[start_probs, end_probs],
                    name="BERT_QA")

# The model already emits probabilities, hence from_logits=False.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

model.summary(line_length=150)

In [None]:
#@title metrics { form-width: "25%" }

def prec(y_true, y_pred):
    """Return the fraction of samples whose argmax prediction equals the label.

    Args:
        y_true: integer label per sample; any shape holding one value per
            sample, e.g. ``(batch,)`` or ``(batch, 1)``.
        y_pred: per-sample scores; the predicted index is the argmax over the
            last axis.

    Returns:
        Scalar float64 tensor in ``[0, 1]``.
    """
    sampled = tf.argmax(y_pred, axis=-1)
    # Align the labels with the predictions explicitly. An axis-less squeeze
    # would also collapse a batch of size one.
    labels = tf.reshape(tf.cast(y_true, tf.int64), tf.shape(sampled))
    # A mean over the batch replaces dividing by len(sampled), which is not
    # defined for symbolic tensors with an unknown batch size.
    return tf.reduce_mean(tf.cast(tf.equal(labels, sampled), tf.float64))

def dist(y_true, y_pred):
    """Return the mean absolute distance between predicted and true index.

    Args:
        y_true: integer label per sample; any shape holding one value per
            sample, e.g. ``(batch,)`` or ``(batch, 1)``.
        y_pred: per-sample scores; the predicted index is the argmax over the
            last axis.

    Returns:
        Scalar float64 tensor (0 means every prediction hit its label).
    """
    sampled = tf.argmax(y_pred, axis=-1)
    # Align the labels with the predictions explicitly. An axis-less squeeze
    # would also collapse a batch of size one.
    labels = tf.reshape(tf.cast(y_true, tf.int64), tf.shape(sampled))
    # A mean over the batch replaces sum / len(sampled), which is not defined
    # for symbolic tensors with an unknown batch size.
    return tf.reduce_mean(tf.cast(tf.abs(labels - sampled), tf.float64))

In [None]:
#@title train { form-width: "25%" }
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
batch_size = 8
epochs = 200
steps_per_epoch = 20
saveDir = os.path.join(os.getcwd(), 'saved_models')
if not os.path.isdir(saveDir):
    os.makedirs(saveDir)
chkpt = saveDir + '/squad_check.hdf5'

ENABLE_WANDB = True        #@param {type:"boolean"}
wandb_experiment_name = "HF_BERT_FAST"  #@param {type: "string"}
if ENABLE_WANDB:
    !pip install wandb > /dev/null
    !wandb login wandb_api_token
    import wandb
    from wandb.keras import WandbCallback
    wandb.init(project="SQUAD", name=wandb_experiment_name)
    wandb.config.batch_size = batch_size
    wandb.config.epochs = epochs
    
es_cb = EarlyStopping(monitor='val_loss', patience=2,verbose=1, mode='auto')
cp_cb = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, 
                        save_best_only=False, mode='auto', 
                        save_weights_only=True)

callbacks = [es_cb, cp_cb]

if ENABLE_WANDB:
    callbacks.append(WandbCallback(log_batch_frequency=10,
                                   save_weights_only=True))

tf.keras.backend.clear_session()

model.compile(optimizer=optimizer, loss=[loss,loss], metrics=[prec,dist])
history = model.fit(x_train, y_train, epochs=epochs,callbacks=callbacks, 
                    validation_data=(x_eval,y_eval),batch_size=batch_size)#,
                    #steps_per_epoch = steps_per_epoch)



# Evaluation

In [None]:
#@title download best weights
!wget https://wandb.ai/veri/SQUAD/runs/1t78812w/files/model-best.h5
# Load the weights file fetched by the wget line above into the existing model.
model.load_weights("model-best.h5")

In [None]:
import matplotlib.pyplot as plt

# Predict on the validation inputs; the two outputs are the start and end
# distributions, whose argmax gives the predicted token indices.
predictions = model.predict(x_eval)
sampled_start, sampled_end = (np.argmax(probs, axis=-1) for probs in predictions[:2])

# Scatter true (start, end) pairs as dots and predicted pairs as stars.
plt.figure(figsize=(30,30))
true_start, true_end = y_eval[0], y_eval[1]
plt.plot(true_start, true_end, ".")
plt.plot(sampled_start, sampled_end, "*")

In [None]:
!gsutil cp gs://squad_squad/evaluate.py ./evaluate.py
!chmod +x ./evaluate.py

In [None]:
#@title BERT preprocessing { form-width: "25%" }
from tqdm import tqdm
#vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
#tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=True)



# Rebuild the preprocessed training frame from the raw text in df_bert.
df_bert_preprocessed = df_bert.copy()
# Tokenize the two text columns one after the other, announcing each step.
for text_column, banner in (('passage', "Preprocessing passage..."),
                            ('question', "Preprocessing question...")):
    print(banner)
    df_bert_preprocessed[text_column] = preprocess_bert(df_bert[text_column])
# Build padded model inputs and answer token spans.
print("Building attention masks...")
df_bert_preprocessed = labeling(df_bert_preprocessed)
df_bert_preprocessed.head()

In [None]:
# preprocess dev set

!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O test_set.json

# SQuAD ships as UTF-8 JSON; pin the encoding so parsing does not depend on the
# platform's default locale.
with open("test_set.json", "r", encoding="utf-8") as f:
    json_file = json.load(f)
data = json_file["data"]

# Flatten document -> paragraph -> question into one row per question. Only the
# first listed answer of each question is kept.
rows = []
for document in data:
    for par in document['paragraphs']:
        for qas in par['qas']:
            first_answer = qas['answers'][0]
            answer_start = first_answer['answer_start']
            rows.append({
                'id': qas['id'],
                'title': document["title"],
                'passage': par['context'],
                'question': qas['question'],
                # (start, end) character positions in the passage, end exclusive
                'answer_idx': (answer_start,
                               answer_start + len(first_answer['text'])),
                'answer_text': first_answer['text'],
            })

df_dev = pd.DataFrame(rows)

def preprocess_bert(text):
    """Tokenize every string in ``text`` with the notebook-level ``tokenizer``.

    Returns a list with one ``{'input_ids': ..., 'offsets': ...}`` dict per
    input string, in input order.
    """
    encoded = tokenizer(list(text), return_offsets_mapping=True)
    ids_per_text = encoded.input_ids
    offsets_per_text = encoded.offset_mapping

    rows_out = []
    for ids, offsets in zip(ids_per_text, offsets_per_text):
        rows_out.append({'input_ids': ids, 'offsets': offsets})
    return rows_out

def labeling(df):
    """Build padded BERT inputs and answer token spans for every row of ``df``.

    Args:
        df: DataFrame whose 'passage' and 'question' cells are the dicts built
            by ``preprocess_bert`` (keys 'input_ids' and 'offsets') and whose
            'answer_idx' cells are ``(start, end)`` character positions of the
            answer inside the passage, ``end`` exclusive.

    Returns:
        ``(labeled_df, skip)``: ``labeled_df`` is a new DataFrame (the argument
        is not modified) without the skipped rows and with the added columns
        'input_word_ids', 'input_type_ids', 'input_mask',
        'context_token_to_char', 'ans_token_start' and 'ans_token_end';
        ``skip`` is the list of index labels that were dropped.

    A row is skipped when no passage token overlaps the answer span, or when
    passage plus question need more than ``max_seq_length`` tokens.
    """
    skip = []
    ans_token_start = []
    ans_token_end = []
    input_word_ids = []
    input_type_ids = []
    input_mask = []
    context_token_to_char = []

    # Walk the needed columns in lockstep instead of doing several df.loc
    # lookups per row.
    rows = zip(df.index, df['passage'], df['question'], df['answer_idx'])
    for row_id, tokenized_context, tokenized_question, (ans_start, ans_end) in tqdm(rows, total=len(df)):
        # A token is part of the answer when its [start, end) character span
        # overlaps the answer's span. This only needs the token offsets; the
        # previous version looked the raw passage up in the global training
        # frame ``df_bert``, which does not contain the ids of other datasets.
        # Tokens with an empty span, e.g. (0, 0), never match.
        ans_token_idx = [
            idx
            for idx, (start, end) in enumerate(tokenized_context["offsets"])
            if max(start, ans_start) < min(end, ans_end)
        ]
        if not ans_token_idx:
            skip.append(row_id)
            continue

        # Model input = passage tokens followed by the question tokens minus
        # the question's leading CLS token.
        context_ids = tokenized_context['input_ids']
        question_ids = tokenized_question['input_ids'][1:]
        input_ids = context_ids + question_ids
        token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        if padding_length < 0:
            # Too long for the fixed-size model input.
            skip.append(row_id)
            continue

        # Zero-pad up to max_seq_length (adds nothing when the input already
        # fills it exactly).
        padding = [0] * padding_length
        input_word_ids.append(np.array(input_ids + padding))
        input_type_ids.append(np.array(token_type_ids + padding))
        input_mask.append(np.array(attention_mask + padding))
        context_token_to_char.append(np.array(tokenized_context["offsets"]))
        ans_token_start.append(ans_token_idx[0])
        ans_token_end.append(ans_token_idx[-1])

    df = df.drop(skip)
    df['input_word_ids'] = input_word_ids
    df['input_type_ids'] = input_type_ids
    df['input_mask'] = input_mask
    df['context_token_to_char'] = context_token_to_char
    df['ans_token_start'] = ans_token_start
    df['ans_token_end'] = ans_token_end

    return df, skip

# pre-process passage and question text
# Index the dev rows by question id and keep the raw text in df_dev.
df_dev = df_dev.set_index('id')
df_bert_dev = df_dev.copy()

# Tokenize both text columns, then build model inputs and answer spans;
# `skipped` collects the ids that labeling() dropped.
for text_column in ('passage', 'question'):
    df_bert_dev[text_column] = preprocess_bert(df_dev[text_column])

df_bert_dev, skipped = labeling(df_bert_dev)
df_bert_dev.head(1)

In [None]:
# Stack the per-row arrays into batch arrays: inputs in model order
# (word ids, mask, type ids), targets as (start, end).
x_test = [np.stack(df_bert_dev[name])
          for name in ("input_word_ids", "input_mask", "input_type_ids")]
y_test = [np.stack(df_bert_dev[name])
          for name in ("ans_token_start", "ans_token_end")]

In [None]:
from IPython.display import clear_output

# The Hugging Face `tokenizer` created earlier in the notebook is reused for
# decoding. The previous code rebuilt a BertWordPieceTokenizer from
# `bert_layer`, a name this notebook never defines, and read `.ids` from the
# 'passage' cells, which are plain dicts produced by preprocess_bert.

num_samples = len(df_bert_dev)
predictions = model.predict(x_test)

# Predicted start/end token index per sample. Indexing the two outputs
# separately also works when there is a single sample.
start = np.argmax(predictions[0], axis=-1)
end = np.argmax(predictions[1], axis=-1)

# Questions dropped by labeling() keep the placeholder answer "42", so every
# question id still has an entry in the output.
answers = {str(skipped_id): "42" for skipped_id in skipped}
for ans_id, (question_id, passage) in enumerate(zip(df_bert_dev.index, df_bert_dev['passage'])):
    span_ids = passage['input_ids'][start[ans_id] : end[ans_id] + 1]
    # skip_special_tokens keeps markers such as [CLS]/[SEP] out of the answer.
    predicted_ans = tokenizer.decode(span_ids, skip_special_tokens=True)
    # Double quotes are still stripped so the answers match what the previous
    # hand-assembled writer produced.
    answers[str(question_id)] = predicted_ans.replace('"', "")

# json.dump handles quoting/escaping, so the file is valid JSON for any answer
# text (backslashes, control characters, ...).
with open("dev_predictions.txt", "w", encoding="utf-8") as out:
    json.dump(answers, out)

In [None]:
import matplotlib.pyplot as plt

#predictions = model.predict(x_test)
# Reuse `predictions` from the previous cell: argmax of each output gives the
# predicted start / end token index per sample.
sampled_start, sampled_end = (np.argmax(probs, axis=-1) for probs in predictions[:2])

# Scatter true (start, end) pairs as dots and predicted pairs as stars.
plt.figure(figsize=(30,30))
gold_start, gold_end = y_test[0], y_test[1]
plt.plot(gold_start, gold_end, ".")
plt.plot(sampled_start, sampled_end, "*")

In [None]:
!python3 evaluate.py test_set.json dev_predictions.txt 

In [None]:
# inference with custom context and question

def custom_inference(context, question):
    """Answer ``question`` from ``context`` with the trained ``model``.

    Both texts are whitespace-normalised, tokenized with the notebook-level
    ``tokenizer``, concatenated (context first, then the question without its
    leading CLS token), zero-padded to ``max_seq_length`` and fed to the model.

    Args:
        context: passage to search for the answer (converted with ``str``).
        question: question about the passage (converted with ``str``).

    Returns:
        The decoded text of the context tokens between the predicted start and
        end positions (inclusive); empty when the predicted span is empty.

    Raises:
        ValueError: if context plus question need more than
            ``max_seq_length`` tokens.
    """
    preprocessed_context = " ".join(str(context).split())
    preprocessed_question = " ".join(str(question).split())
    context_ids = tokenizer(preprocessed_context)["input_ids"]
    question_ids = tokenizer(preprocessed_question)["input_ids"][1:]

    input_ids = context_ids + question_ids
    padding_length = max_seq_length - len(input_ids)
    if padding_length < 0:
        # Previously this only printed a message (also when the input fitted
        # exactly) and then fed a wrongly sized input to the model.
        raise ValueError(
            "Error! The input is too long: "
            f"{len(input_ids)} tokens, at most {max_seq_length} are supported"
        )

    # Zero-pad all three sequences up to max_seq_length.
    padding = [0] * padding_length
    token_type_ids = [0] * len(context_ids) + [1] * len(question_ids) + padding
    attention_mask = [1] * len(input_ids) + padding
    input_ids = input_ids + padding

    # Batch of one, in the model's input order: word ids, mask, type ids.
    x = [np.expand_dims(np.array(values), axis=0)
         for values in (input_ids, attention_mask, token_type_ids)]
    predictions = model.predict(x)
    start, end = (int(np.argmax(probs, axis=-1)[0]) for probs in predictions[:2])

    # skip_special_tokens keeps markers such as [CLS]/[SEP] out of the answer.
    predicted_ans = tokenizer.decode(context_ids[start : end + 1],
                                     skip_special_tokens=True)
    return predicted_ans

In [None]:
context = "Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. He has received various accolades for his work, including three Golden Globe Awards and three nominations for Academy Awards. He is one of the highest-paid actors in the world. His films have grossed over $4 billion in North America and over $10.1 billion worldwide, making him one of the highest-grossing box office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action drama film Top Gun (1986). Critical acclaim came with his roles in the drama films The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the drama A Few Good Men (1992), the thriller The Firm (1993), the horror film Interview with the Vampire (1994), and the romance Jerry Maguire (1996). For his role in the latter, he won a Golden Globe Award for Best Actor and received his second Academy Award nomination."
question = "What was the first film Tom Cruise acted in?"


# Run the model on the sample context/question defined above and print the
# decoded answer span.
predicted_answer = custom_inference(context, question)
print(predicted_answer)