<a href="https://colab.research.google.com/github/the-SQuAD-squad/QA/blob/huggingface/QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Project
## Question answering on SQuAD with BERT

Verì, Rambaldi, Serfilippi, Buiani

In [None]:
#@title Init { form-width: "35%" }
import os
import random
import math
import numpy as np
import tensorflow as tf
import json
import pandas as pd
import re
import string
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

!pip install 'Transformers==4.3'
import transformers
from transformers import TFBertModel, TFRobertaModel, TFElectraModel, TFLongformerModel
from transformers import AutoTokenizer
# Show text columns in full when displaying DataFrames. ``None`` means
# "no truncation"; the former spelling ``-1`` is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# fix random seeds
seed_value = 42 #@param {type:"integer"}

os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

tf.compat.v1.set_random_seed(seed_value)

# Single-threaded op execution (presumably to reduce run-to-run variation).
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

# BERT params

huggingface_pretrained_model = "bert-base-uncased" #@param ["bert-base-uncased", "roberta-base", "google/electra-base-discriminator", "allenai/longformer-base-4096"]

# Huggingface bert and associated tokenizer
# checkpoint name -> (hf model class, input sequence max length)

hf_Models = {"bert-base-uncased": (TFBertModel, 512),
             "roberta-base" : (TFRobertaModel, 512),
             "google/electra-base-discriminator" : (TFElectraModel, 512),
             "allenai/longformer-base-4096" : (TFLongformerModel, 1024)}

TFHFModel, max_seq_length = hf_Models[huggingface_pretrained_model]

# actual bert model
bert_hf_layer = TFHFModel.from_pretrained(huggingface_pretrained_model)

# actual tokenizer
tokenizer = AutoTokenizer.from_pretrained(huggingface_pretrained_model)

print("\n\n")
print(f"{bert_hf_layer.name} selected with max input length of {max_seq_length}")

[K     |████████████████████████████████| 1.9MB 7.0MB/s 
[K     |████████████████████████████████| 890kB 36.1MB/s 
[K     |████████████████████████████████| 3.2MB 49.5MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…





tf_bert_model selected with max input length of 512


# Dataset creation

In [None]:
#@title df creation { form-width: "25%" }

squad_ver = "v1"  #@param ["v1","v2"]

# the official dataset is identical to the provided one
if squad_ver == "v1":
    !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O training_set.json
else:
    !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O training_set.json

with open("training_set.json", "r") as f:
    json_file = json.load(f)
data = json_file["data"]

rows = []
for document in data:
  for par in document['paragraphs']:
    for qas in par['qas']:

      if len(qas['answers']) == 0: #no answer
          ans_start = -1
          ans_end = -1
          ans_text = ""
      else:
          ans_start = qas['answers'][0]['answer_start']
          ans_end = ans_start + len(qas['answers'][0]['text'])
          ans_text = qas['answers'][0]['text']
          
      rows.append({
        'id' : qas['id'],
        'title': document["title"],
        'passage': par['context'],
        'question' : qas['question'],
        'answer_idx' : (ans_start, ans_end),
        'answer_text' : ans_text
      })

df_original = pd.DataFrame(rows)

--2021-03-12 21:26:17--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.110.153, 185.199.108.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘training_set.json’


2021-03-12 21:26:18 (91.7 MB/s) - ‘training_set.json’ saved [42123633/42123633]



In [None]:
#@title clean dataset { form-width: "25%" }
if squad_ver == "v1":
    !gcloud config set project feisty-mechanic-221914
    !gsutil cp gs://squad_squad/error_IDs.txt ./error_IDs.txt

    with open("error_IDs.txt", "r") as f:
        unwanted_id = f.read()

    unwanted_id = unwanted_id.split("\n")[:-1]
    df_bert = df_original.set_index('id')
    df_bert = df_bert.drop(unwanted_id)
    df_bert.head()

In [None]:
def preprocess_bert(text):
    """Tokenize a batch of strings with the notebook-level ``tokenizer``.

    Parameters
    ----------
    text : iterable of str
        The strings to tokenize. The iterable is materialised into a list
        first, as the later definitions of this function do, so that a
        pandas Series can be passed directly.

    Returns
    -------
    list of dict
        One dict per input string with the keys 'input_ids' (the token ids)
        and 'offsets' (the offset mapping returned by the tokenizer).
    """
    texts = list(text)
    tokenized_text = tokenizer(texts, return_offsets_mapping=True)

    rows_out  = [{'input_ids': tokenized_text.input_ids[i],
                  'offsets': tokenized_text.offset_mapping[i]} for i in range(len(texts))]

    return rows_out

[{'input_ids': [0, 3592, 8562, 2],
  'offsets': [(0, 0), (0, 2), (3, 6), (0, 0)]},
 {'input_ids': [0, 3592, 4252, 2],
  'offsets': [(0, 0), (0, 2), (3, 6), (0, 0)]}]

In [None]:
#@title BERT preprocessing { form-width: "1%" }

def preprocess_bert(text):
    """Tokenize every string in ``text`` with the notebook-level ``tokenizer``.

    Returns a list with one dict per input string, holding the token ids
    under 'input_ids' and the tokenizer's offset mapping under 'offsets'.
    """
    encoded = tokenizer(list(text), return_offsets_mapping=True)

    rows_out = []
    for position in range(len(text)):
        rows_out.append({
            'input_ids': encoded.input_ids[position],
            'offsets': encoded.offset_mapping[position],
        })
    return rows_out

def labeling(df):
    """Build the padded model inputs and the answer-span labels for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'passage' and 'question' cells hold the dicts produced by
        ``preprocess_bert`` (keys 'input_ids' and 'offsets') and whose
        'answer_idx' cells hold the ``(start, end)`` character span of the
        answer inside the raw passage, or ``(-1, -1)`` for "no answer".

    Returns
    -------
    pandas.DataFrame
        ``df`` without the skipped rows and with the extra columns
        'input_word_ids', 'input_type_ids', 'input_mask',
        'context_token_to_char', 'ans_token_start' and 'ans_token_end'.
        A row is skipped when no passage token overlaps the answer span or
        when passage + question exceed ``max_seq_length`` tokens.
    """
    skip = []
    ans_token_start = []
    ans_token_end = []
    input_word_ids = []
    input_type_ids = []
    input_mask = []
    context_token_to_char = []

    # Iterate over the columns directly instead of repeated df.loc[...] lookups.
    examples = zip(df.index, df['passage'], df['question'], df['answer_idx'])
    for row_id, tokenized_context, tokenized_question, answer_idx in tqdm(examples, total=len(df)):
        ans_char_start, ans_char_end = answer_idx

        if (ans_char_start, ans_char_end) == (-1, -1):
            # No answer: both labels point at the first token of the input.
            ans_token_idx = [0]
        else:
            # A token belongs to the answer when its character span
            # [start, end) intersects the answer span. Tokens with an empty
            # span such as (0, 0) never match. Comparing the intervals needs
            # neither the raw passage text nor a per-character mask.
            ans_token_idx = [
                idx
                for idx, (start, end) in enumerate(tokenized_context["offsets"])
                if max(start, ans_char_start) < min(end, ans_char_end)
            ]
            if len(ans_token_idx) == 0:
                skip.append(row_id)
                continue

        # Input layout: passage tokens followed by the question tokens without
        # the question's own leading special token.
        context_ids = tokenized_context['input_ids']
        question_ids = tokenized_question['input_ids'][1:]
        input_ids = context_ids + question_ids
        token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        if padding_length < 0:
            # Too long for the model: drop the example.
            skip.append(row_id)
            continue

        # Pad up to max_seq_length (a no-op when the input already fits exactly).
        input_ids = input_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        input_word_ids.append(np.array(input_ids))
        input_type_ids.append(np.array(token_type_ids))
        input_mask.append(np.array(attention_mask))
        context_token_to_char.append(np.array(tokenized_context["offsets"]))
        ans_token_start.append(ans_token_idx[0])
        ans_token_end.append(ans_token_idx[-1])

    df = df.drop(skip)
    df['input_word_ids'] = input_word_ids
    df['input_type_ids'] = input_type_ids
    df['input_mask'] = input_mask
    df['context_token_to_char'] = context_token_to_char
    df['ans_token_start'] = ans_token_start
    df['ans_token_end'] = ans_token_end

    return df

# Work on a copy so that the raw text stays available in df_bert.
df_bert_preprocessed = df_bert.copy()
# pre-process passage and question text
for column, banner in (('passage', "Preprocessing passage..."),
                       ('question', "Preprocessing question...")):
    print(banner)
    df_bert_preprocessed[column] = preprocess_bert(df_bert[column])
print("Building attention masks...")
df_bert_preprocessed = labeling(df_bert_preprocessed)


Preprocessing passage...
Preprocessing question...
Building attention masks...


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




# Model

In [None]:
#@title split { form-width: "25%" }

split_value = 0.1 #@param {type:"number"} 


def to_model_arrays(df):
    """Stack the per-row arrays of ``df`` into the (inputs, targets) pair used for fitting.

    ``inputs`` is [word ids, mask, type ids] and ``targets`` is
    [answer start token, answer end token].
    """
    inputs = [np.stack(df[column])
              for column in ("input_word_ids", "input_mask", "input_type_ids")]
    targets = [np.stack(df[column])
               for column in ("ans_token_start", "ans_token_end")]
    return inputs, targets


# Split by article title so that no article contributes to both sets.
val_dim = int(len(df_bert_preprocessed['title'].unique()) * split_value)
val_titles = np.random.choice(df_bert_preprocessed['title'].unique(), size=val_dim, replace=False)

in_validation = df_bert_preprocessed['title'].isin(val_titles)
df_bert_val = df_bert_preprocessed[in_validation]
df_bert_train = df_bert_preprocessed[~in_validation]

x_train, y_train = to_model_arrays(df_bert_train)
x_eval, y_eval = to_model_arrays(df_bert_val)

In [None]:
#@title model definition { form-width: "25%" }
def build_model(bert_hf_layer):
    """Put a start/end span-prediction head on top of a Hugging Face encoder.

    Parameters
    ----------
    bert_hf_layer
        Model called with ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` whose result exposes ``last_hidden_state``.

    Returns
    -------
    keras.Model
        Model named "BERT_QA" taking [input_word_ids, input_mask,
        input_type_ids], each of length ``max_seq_length``, and returning
        [start_probs, end_probs], one softmax distribution over the input
        positions each. The model is compiled with the loss and optimizer
        defined below; callers may re-compile it, e.g. to add metrics.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

    # Hugging Face encoder: one hidden vector per input position.
    sequence_output = bert_hf_layer(input_ids=input_word_ids, attention_mask=input_mask,
                                    token_type_ids=input_type_ids).last_hidden_state

    # One scalar logit per position, flattened to (batch, max_seq_length).
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
    start_logits = layers.Flatten(name="flatten_start")(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
    end_logits = layers.Flatten(name="flatten_end")(end_logits)

    start_probs = layers.Activation(keras.activations.softmax, name="softmax_start")(start_logits)
    end_probs = layers.Activation(keras.activations.softmax, name="softmax_end")(end_logits)

    model = keras.Model(inputs=[input_word_ids, input_mask, input_type_ids],
                        outputs=[start_probs, end_probs],
                        name="BERT_QA")

    # The outputs are already probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    # Previously loss and optimizer were created and then discarded, so the
    # returned model could not be trained without redefining them elsewhere.
    model.compile(optimizer=optimizer, loss=[loss, loss])

    model.summary(line_length=150)

    return model

In [None]:
#@title metrics { form-width: "25%" }

def accuracy(y_true, y_pred):
    """Fraction of samples in the batch whose arg-max position equals the target.

    ``y_pred`` holds one score per position for every sample; ``y_true`` holds
    the target position of every sample. Returns a scalar tensor.
    """
    sampled = tf.argmax(y_pred, axis=-1)
    target = tf.reshape(tf.cast(y_true, tf.int64), [-1])
    # tf.shape works on symbolic tensors too, whereas Python len() is only
    # defined for them when autograph rewrites the call.
    batch = tf.shape(sampled, out_type=tf.int64)[0]
    return 1 - tf.math.count_nonzero(target - sampled) / batch

def dist(y_true, y_pred):
    """Mean absolute distance, in positions, between arg-max prediction and target.

    ``y_pred`` holds one score per position for every sample; ``y_true`` holds
    the target position of every sample. Returns a scalar tensor.
    """
    sampled = tf.argmax(y_pred, axis=-1)
    target = tf.reshape(tf.cast(y_true, tf.int64), [-1])
    # tf.shape works on symbolic tensors too, whereas Python len() is only
    # defined for them when autograph rewrites the call.
    batch = tf.shape(sampled, out_type=tf.int64)[0]
    return tf.reduce_sum(tf.abs(target - sampled)) / batch

In [None]:
#@title train { form-width: "25%" }
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
batch_size = 8
epochs = 200
steps_per_epoch = 20
saveDir = os.path.join(os.getcwd(), 'saved_models')
if not os.path.isdir(saveDir):
    os.makedirs(saveDir)
chkpt = saveDir + '/squad_check.hdf5'

ENABLE_WANDB = False        #@param {type:"boolean"}
wandb_experiment_name = "HF_RoBERTa_SQuAD_V2.0" #@param {type: "string"}
if ENABLE_WANDB:
    !pip install wandb > /dev/null
    !wandb login
    import wandb
    from wandb.keras import WandbCallback
    wandb.init(project="SQUAD", name=wandb_experiment_name)
    wandb.config.batch_size = batch_size
    wandb.config.epochs = epochs
    
es_cb = EarlyStopping(monitor='val_loss', patience=2,verbose=1, mode='auto')
cp_cb = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, 
                        save_best_only=False, mode='auto', 
                        save_weights_only=True)

callbacks = [es_cb, cp_cb]

if ENABLE_WANDB:
    callbacks.append(WandbCallback(log_batch_frequency=10,
                                   save_weights_only=True))

tf.keras.backend.clear_session()

model.compile(optimizer=optimizer, loss=[loss,loss], metrics=[accuracy,dist])
history = model.fit(x_train, y_train, epochs=epochs,callbacks=callbacks, 
                    validation_data=(x_eval,y_eval),batch_size=batch_size)#,
                    #steps_per_epoch = steps_per_epoch)

Epoch 1/200
  332/14645 [..............................] - ETA: 2:08:28 - loss: 0.8577 - softmax_start_loss: 0.4465 - softmax_end_loss: 0.4112 - softmax_start_prec: 0.8334 - softmax_start_dist: 4.1069 - softmax_end_prec: 0.8613 - softmax_end_dist: 4.3449

# Evaluation

In [None]:
#@title download best weights

huggingface_pretrained_model = "bert-base-uncased" #@param ["bert-base-uncased", "roberta-base", "google/electra-base-discriminator", "allenai/longformer-base-4096"]

# Huggingface bert and associated tokenizer

# checkpoint name -> (hf model class, input sequence max length, URL of the fine-tuned weights)
hf_Models = {"bert-base-uncased": (TFBertModel, 512, "https://api.wandb.ai/files/buio/SQUAD/2a1u1bxu/model-best.h5"),
             "roberta-base" : (TFRobertaModel, 512, "https://api.wandb.ai/files/buio/SQUAD/184b7gum/model-best.h5"),
             "google/electra-base-discriminator" : (TFElectraModel, 512, "https://api.wandb.ai/files/buio/SQUAD/2rab6oli/model-best.h5"),
             "allenai/longformer-base-4096" : (TFLongformerModel, 1024, "not_yet_trained")}

TFHFModel, max_seq_length, weights_path = hf_Models[huggingface_pretrained_model]

# Fail early with a clear message for entries that only carry a placeholder.
if not weights_path.startswith("http"):
    raise ValueError(f"No fine-tuned weights are available for {huggingface_pretrained_model}")

# actual bert model
bert_hf_layer = TFHFModel.from_pretrained(huggingface_pretrained_model)

# actual tokenizer
tokenizer = AutoTokenizer.from_pretrained(huggingface_pretrained_model)

# load bert weights from the Weights & Biases platform
weights_file = "model-best.h5"
# -O overwrites any previous download; without it wget stores a re-download
# as model-best.h5.1 and the stale model-best.h5 would be loaded instead.
if os.system(f"wget -q -O {weights_file} {weights_path}") != 0:
    raise RuntimeError(f"Could not download the weights from {weights_path}")

model = build_model(bert_hf_layer)
model.load_weights(weights_file)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT_QA"
______________________________________________________________________________________________________________________________________________________
Layer (type)                                     Output Shape                     Param #           Connected to                                      
input_word_ids (InputLayer)                      [(None, 512)]                    0                                                                   
______________________________________________________________________________________________________________________________________________________
input_mask (InputLayer)                          [(None, 512)]                    0                                                                   
______________________________________________________________________________________________________________________________________________________
input_type_ids (InputLayer)                      [(None, 512)]               

In [None]:
#@title download evaluation script { form-width: "25%" }

# Copy evaluate.py from the squad_squad GCS bucket into the working directory
# and mark it executable.
!gsutil cp gs://squad_squad/evaluate.py ./evaluate.py
!chmod +x ./evaluate.py

Copying gs://squad_squad/evaluate.py...
/ [1 files][ 10.3 KiB/ 10.3 KiB]                                                
Operation completed over 1 objects/10.3 KiB.                                     


In [None]:
#@title preprocess dev set { form-width: "25%" }
from tqdm.notebook import tqdm
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O test_set.json

with open("test_set.json", "r") as f:
    json_file = json.load(f)
data = json_file["data"]

rows = []
for document in data:
  for par in document['paragraphs']:
    for qas in par['qas']:

      if len(qas['answers']) == 0: #no answer
          ans_start = -1
          ans_end = -1
          ans_text = ""
      else:
          ans_start = qas['answers'][0]['answer_start']
          ans_end = ans_start + len(qas['answers'][0]['text'])
          ans_text = qas['answers'][0]['text']
          
      rows.append({
        'id' : qas['id'],
        'title': document["title"],
        'passage': par['context'],
        'question' : qas['question'],
        'answer_idx' : (ans_start, ans_end),
        'answer_text' : ans_text
      })


df_dev = pd.DataFrame(rows)

def preprocess_bert(text):
    """Tokenize every string in ``text`` with the notebook-level ``tokenizer``.

    Returns a list with one dict per input string, holding the token ids
    under 'input_ids' and the tokenizer's offset mapping under 'offsets'.
    """
    encoded = tokenizer(list(text), return_offsets_mapping=True)

    rows_out = []
    for position in range(len(text)):
        rows_out.append({
            'input_ids': encoded.input_ids[position],
            'offsets': encoded.offset_mapping[position],
        })
    return rows_out

def labeling(df):
    """Build the padded model inputs and the answer-span labels for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'passage' and 'question' cells hold the dicts produced by
        ``preprocess_bert`` (keys 'input_ids' and 'offsets') and whose
        'answer_idx' cells hold the ``(start, end)`` character span of the
        answer inside the raw passage, or ``(-1, -1)`` for "no answer".

    Returns
    -------
    (pandas.DataFrame, list)
        ``df`` without the skipped rows and with the extra columns
        'input_word_ids', 'input_type_ids', 'input_mask',
        'context_token_to_char', 'ans_token_start' and 'ans_token_end',
        followed by the list of skipped index labels. A row is skipped when
        no passage token overlaps the answer span or when passage + question
        exceed ``max_seq_length`` tokens.
    """
    skip = []
    ans_token_start = []
    ans_token_end = []
    input_word_ids = []
    input_type_ids = []
    input_mask = []
    context_token_to_char = []

    # Iterate over the columns directly instead of repeated df.loc[...] lookups.
    examples = zip(df.index, df['passage'], df['question'], df['answer_idx'])
    for row_id, tokenized_context, tokenized_question, answer_idx in tqdm(examples, total=len(df)):
        ans_char_start, ans_char_end = answer_idx

        if (ans_char_start, ans_char_end) == (-1, -1):
            # No answer: both labels point at the first token of the input.
            ans_token_idx = [0]
        else:
            # A token belongs to the answer when its character span
            # [start, end) intersects the answer span. Tokens with an empty
            # span such as (0, 0) never match. Comparing the intervals needs
            # neither the raw passage text nor a per-character mask.
            ans_token_idx = [
                idx
                for idx, (start, end) in enumerate(tokenized_context["offsets"])
                if max(start, ans_char_start) < min(end, ans_char_end)
            ]
            if len(ans_token_idx) == 0:
                skip.append(row_id)
                continue

        # Input layout: passage tokens followed by the question tokens without
        # the question's own leading special token.
        context_ids = tokenized_context['input_ids']
        question_ids = tokenized_question['input_ids'][1:]
        input_ids = context_ids + question_ids
        token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        if padding_length < 0:
            # Too long for the model: drop the example.
            skip.append(row_id)
            continue

        # Pad up to max_seq_length (a no-op when the input already fits exactly).
        input_ids = input_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        input_word_ids.append(np.array(input_ids))
        input_type_ids.append(np.array(token_type_ids))
        input_mask.append(np.array(attention_mask))
        context_token_to_char.append(np.array(tokenized_context["offsets"]))
        ans_token_start.append(ans_token_idx[0])
        ans_token_end.append(ans_token_idx[-1])

    df = df.drop(skip)
    df['input_word_ids'] = input_word_ids
    df['input_type_ids'] = input_type_ids
    df['input_mask'] = input_mask
    df['context_token_to_char'] = context_token_to_char
    df['ans_token_start'] = ans_token_start
    df['ans_token_end'] = ans_token_end

    return df, skip

# pre-process passage and question text
# Index by question id, then tokenize into a copy so df_dev keeps the raw text.
df_dev = df_dev.set_index('id')
df_bert_dev = df_dev.copy()

for column in ('passage', 'question'):
    df_bert_dev[column] = preprocess_bert(df_dev[column])

# `skipped` lists the question ids for which no model input could be built.
df_bert_dev, skipped = labeling(df_bert_dev)
df_bert_dev.head(1)

--2021-03-12 20:48:42--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4854279 (4.6M) [application/json]
Saving to: ‘test_set.json’


2021-03-12 20:48:43 (25.8 MB/s) - ‘test_set.json’ saved [4854279/4854279]



Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors


HBox(children=(FloatProgress(value=0.0, max=10570.0), HTML(value='')))




Unnamed: 0_level_0,title,passage,question,answer_idx,answer_text,input_word_ids,input_type_ids,input_mask,context_token_to_char,ans_token_start,ans_token_end
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
56be4db0acb8001400a502ec,Super_Bowl_50,"{'input_ids': [101, 3565, 4605, 2753, 2001, 2019, 2137, 2374, 2208, 2000, 5646, 1996, 3410, 1997, 1996, 2120, 2374, 2223, 1006, 5088, 1007, 2005, 1996, 2325, 2161, 1012, 1996, 2137, 2374, 3034, 1006, 10511, 1007, 3410, 7573, 14169, 3249, 1996, 2120, 2374, 3034, 1006, 22309, 1007, 3410, 3792, 12915, 2484, 1516, 2184, 2000, 7796, 2037, 2353, 3565, 4605, 2516, 1012, 1996, 2208, 2001, 2209, 2006, 2337, 1021, 1010, 2355, 1010, 2012, 11902, 1005, 1055, 3346, 1999, 1996, 2624, 3799, 3016, 2181, 2012, 4203, 10254, 1010, 2662, 1012, 2004, 2023, 2001, 1996, 12951, 3565, 4605, 1010, 1996, 2223, 13155, 1996, 1000, 3585, 5315, ...], 'offsets': [(0, 0), (0, 5), (6, 10), (11, 13), (14, 17), (18, 20), (21, 29), (30, 38), (39, 43), (44, 46), (47, 56), (57, 60), (61, 69), (70, 72), (73, 76), (77, 85), (86, 94), (95, 101), (102, 103), (103, 106), (106, 107), (108, 111), (112, 115), (116, 120), (121, 127), (127, 128), (129, 132), (133, 141), (142, 150), (151, 161), (162, 163), (163, 166), (166, 167), (168, 176), (177, 183), (184, 191), (192, 200), (201, 204), (205, 213), (214, 222), (223, 233), (234, 235), (235, 238), (238, 239), (240, 248), (249, 257), (258, 266), (267, 269), (269, 270), (270, 272), (273, 275), (276, 280), (281, 286), (287, 292), (293, 298), (299, 303), (304, 309), (309, 310), (311, 314), (315, 319), (320, 323), (324, 330), (331, 333), (334, 342), (343, 344), (344, 345), (346, 350), (350, 351), (352, 354), (355, 359), (359, 360), (360, 361), (362, 369), (370, 372), (373, 376), (377, 380), (381, 390), (391, 394), (395, 399), (400, 402), (403, 408), (409, 414), (414, 415), (416, 426), (426, 427), (428, 430), (431, 435), (436, 439), (440, 443), (444, 448), (449, 454), (455, 459), (459, 460), (461, 464), (465, 471), (472, 482), (483, 486), (487, 488), (488, 494), (495, 506), ...]}","{'input_ids': [101, 2029, 5088, 2136, 3421, 1996, 10511, 2012, 3565, 4605, 2753, 1029, 102], 'offsets': [(0, 0), (0, 5), (6, 9), (10, 14), (15, 26), 
(27, 30), (31, 34), (35, 37), (38, 43), (44, 48), (49, 51), (51, 52), (0, 0)]}","(177, 191)",Denver Broncos,"[101, 3565, 4605, 2753, 2001, 2019, 2137, 2374, 2208, 2000, 5646, 1996, 3410, 1997, 1996, 2120, 2374, 2223, 1006, 5088, 1007, 2005, 1996, 2325, 2161, 1012, 1996, 2137, 2374, 3034, 1006, 10511, 1007, 3410, 7573, 14169, 3249, 1996, 2120, 2374, 3034, 1006, 22309, 1007, 3410, 3792, 12915, 2484, 1516, 2184, 2000, 7796, 2037, 2353, 3565, 4605, 2516, 1012, 1996, 2208, 2001, 2209, 2006, 2337, 1021, 1010, 2355, 1010, 2012, 11902, 1005, 1055, 3346, 1999, 1996, 2624, 3799, 3016, 2181, 2012, 4203, 10254, 1010, 2662, 1012, 2004, 2023, 2001, 1996, 12951, 3565, 4605, 1010, 1996, 2223, 13155, 1996, 1000, 3585, 5315, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]","[[0, 0], [0, 5], [6, 10], [11, 13], [14, 17], [18, 20], [21, 29], [30, 38], [39, 43], [44, 46], [47, 56], [57, 60], [61, 69], [70, 72], [73, 76], [77, 85], [86, 94], [95, 101], [102, 103], [103, 106], [106, 107], [108, 111], [112, 115], [116, 120], [121, 127], [127, 128], [129, 132], [133, 141], [142, 150], [151, 161], [162, 163], [163, 166], [166, 167], [168, 176], [177, 183], [184, 191], [192, 200], [201, 204], [205, 213], [214, 222], [223, 233], [234, 235], [235, 238], [238, 239], [240, 248], [249, 257], [258, 266], [267, 269], [269, 270], [270, 272], [273, 275], [276, 280], [281, 286], [287, 292], [293, 298], [299, 303], [304, 309], [309, 310], 
[311, 314], [315, 319], [320, 323], [324, 330], [331, 333], [334, 342], [343, 344], [344, 345], [346, 350], [350, 351], [352, 354], [355, 359], [359, 360], [360, 361], [362, 369], [370, 372], [373, 376], [377, 380], [381, 390], [391, 394], [395, 399], [400, 402], [403, 408], [409, 414], [414, 415], [416, 426], [426, 427], [428, 430], [431, 435], [436, 439], [440, 443], [444, 448], [449, 454], [455, 459], [459, 460], [461, 464], [465, 471], [472, 482], [483, 486], [487, 488], [488, 494], [495, 506], ...]",34,35


In [None]:
#@title save predictions { form-width: "25%" }
x_test = [np.stack(df_bert_dev["input_word_ids"]),
          np.stack(df_bert_dev["input_mask"]),
          np.stack(df_bert_dev["input_type_ids"])]

predictions = model.predict(x_test, verbose=1)

num_samples = len(predictions[0])

# Most likely start/end position per sample. Taking the arg-max of each output
# separately also works when there is a single sample, where squeezing the
# stacked result would collapse the sample axis.
start = np.argmax(predictions[0], axis=-1)
end = np.argmax(predictions[1], axis=-1)

# Questions that could not be fed to the model get a placeholder answer.
answers = {skipped_id: "42" for skipped_id in skipped}

for question_id, tokenized_passage, ans_start, ans_end in zip(
        df_bert_dev.index, df_bert_dev['passage'], start, end):
    if ans_end == 0:
        # no answer
        answers[question_id] = ""
    else:
        # extract answer text by decoding the predicted span of passage tokens
        span_ids = tokenized_passage["input_ids"][ans_start : ans_end + 1]
        answers[question_id] = tokenizer.decode(span_ids)

# json.dump handles quoting, escaping and separators, so the file is valid JSON
# for any answer text and for any number of predictions (including none).
with open("dev_predictions.txt", "w") as out:
    json.dump(answers, out)

In [None]:
#@title evaluate { form-width: "25%" }

evaluation = !python3 evaluate.py test_set.json dev_predictions.txt
print(evaluation)
if ENABLE_WANDB:
    wandb.log({"Evaluate": wandb.Html("<pre>"+str(evaluation)+"<pre>", inject=False)})

['{', '  "exact": 70.45411542100284,', '  "f1": 80.28784915480817,', '  "total": 10570,', '  "HasAns_exact": 70.45411542100284,', '  "HasAns_f1": 80.28784915480817,', '  "HasAns_total": 10570', '}']


In [None]:
# precedence plot

import matplotlib.pyplot as plt

# Use a dedicated name so the dev-set `predictions` are not overwritten.
eval_predictions = model.predict(x_eval, verbose=1)
sampled_start = np.argmax(eval_predictions[0], axis=-1)
sampled_end = np.argmax(eval_predictions[1], axis=-1)

# Scatter of (start, end) token positions: labels versus model predictions.
fig, ax = plt.subplots(figsize=(30, 30))
ax.plot(y_eval[0], y_eval[1], ".", label="ground truth")
ax.plot(sampled_start, sampled_end, "*", label="predicted")
ax.set_xlabel("answer start (token index)")
ax.set_ylabel("answer end (token index)")
ax.set_title("Answer spans on the validation set")
ax.legend();

In [None]:
#@title custom inference { form-width: "25%" }

def custom_inference(context, question):
    """Answer ``question`` about ``context`` with the notebook-level ``model``.

    Parameters
    ----------
    context, question
        Converted with ``str`` and whitespace-normalised before tokenization.

    Returns
    -------
    str
        The decoded context tokens between the predicted start and end
        positions (inclusive), or "" when the predicted end position is 0,
        the convention the prediction-saving cell uses for "no answer".

    Raises
    ------
    ValueError
        If context and question together exceed ``max_seq_length`` tokens.
    """
    preprocessed_context = " ".join(str(context).split())
    preprocessed_question = " ".join(str(question).split())
    tokenized_context = tokenizer(preprocessed_context)
    tokenized_question = tokenizer(preprocessed_question)

    # Same layout as in training: context tokens, then the question tokens
    # without the question's own leading special token.
    context_ids = tokenized_context["input_ids"]
    question_ids = tokenized_question["input_ids"][1:]
    input_ids = context_ids + question_ids
    token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
    attention_mask = [1] * len(input_ids)

    padding_length = max_seq_length - len(input_ids)
    if padding_length < 0:
        # Previously this only printed a message (also for inputs of exactly
        # max_seq_length tokens, which are valid) and then went on to predict.
        raise ValueError(
            f"Error! The input is too long: {len(input_ids)} tokens, "
            f"at most {max_seq_length} are supported")

    input_ids = input_ids + ([0] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    # Add the batch axis expected by the model.
    x = [np.expand_dims(np.array(input_ids), axis=0),
         np.expand_dims(np.array(attention_mask), axis=0),
         np.expand_dims(np.array(token_type_ids), axis=0)]
    predictions = model.predict(x)

    start = int(np.argmax(predictions[0], axis=-1)[0])
    end = int(np.argmax(predictions[1], axis=-1)[0])
    if end == 0:
        return ""

    predicted_ans = tokenizer.decode(context_ids[start : end+1])
    return predicted_ans

In [None]:
# Example: ask a free-form question about a hand-written passage.
context = "Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. He has received various accolades for his work, including three Golden Globe Awards and three nominations for Academy Awards. He is one of the highest-paid actors in the world. His films have grossed over $4 billion in North America and over $10.1 billion worldwide, making him one of the highest-grossing box office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action drama film Top Gun (1986). Critical acclaim came with his roles in the drama films The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the drama A Few Good Men (1992), the thriller The Firm (1993), the horror film Interview with the Vampire (1994), and the romance Jerry Maguire (1996). For his role in the latter, he won a Golden Globe Award for Best Actor and received his second Academy Award nomination."
question = "What was the first film Tom Cruise acted in?"


# custom_inference returns the predicted answer as a string.
predicted_answer = custom_inference(context, question)
print(predicted_answer)