<a href="https://colab.research.google.com/github/the-SQuAD-squad/IR-QA/blob/regression/QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Init { form-width: "25%" }
import os
import random
import math
import numpy as np
import tensorflow as tf
import json
import pandas as pd

# Show the full content of every cell when displaying DataFrames.
# `None` is the supported "no limit" value: the legacy `-1` has been deprecated
# since pandas 1.0 and is no longer accepted by recent pandas releases.
pd.set_option('display.max_colwidth', None)

# fix random seeds
seed_value = 42 #@param {type:"integer"}

# seed every source of randomness used by the notebook (hashing, `random`, NumPy, TensorFlow)
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

tf.compat.v1.set_random_seed(seed_value)

# single-threaded op execution, then register the session as the Keras backend session
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)


In [None]:
#@title df creation { form-width: "25%" }

# dataset is copyed to public git repo for fast access within colab 
!wget 'https://raw.githubusercontent.com/the-SQuAD-squad/data/main/SQuAD/squad1.1.zip'
!unzip -o squad1.1.zip

with open("training_set.json", "r") as f:
    json_file = json.load(f)
data = json_file["data"]

rows = []
for document in data:
  for par in document['paragraphs']:
    for qas in par['qas']:
      rows.append({
        'id' : qas['id'],
        'title': document["title"],
        'passage': par['context'],
        'question' : qas['question'],
        'answer_idx' : (qas['answers'][0]['answer_start'], 
                    qas['answers'][0]['answer_start'] + len(qas['answers'][0]['text'])),
        'answer_text' : qas['answers'][0]['text']
      })

df_original = pd.DataFrame(rows)

In [None]:
# Display the raw DataFrame built above (one row per question, first answer only).
df_original

In [None]:
import nltk
import re 
import math

def preprocess_text(text):
    """
    Given an iterable containing sentences, pre-process each sentence.

    Each sentence goes through four steps, in this order:
        1. newlines are replaced by a space
        2. every symbol of the list below is surrounded by spaces
        3. a number immediately followed by lowercase letters is split in two
        4. leading/trailing whitespace is removed and whitespace runs are collapsed

    :param: 
        - text: list of text to be pre-processed (Iterable)
    :return:
        - text: pre-processed text (List)
    """

    # patterns are compiled once per call instead of being looked up for every line
    newline = re.compile(r"\n")

    # we don't remove symbols, but just put a space before and after them. We did this because we noticed that Glove contains an embedding also for
    # them, so, in this way, we are able to split these symbols from the text when computing sentence tokens
    symbols = re.compile(r"([(.;:!\'ˈ~?,\"(\[\])\\\/\-–\t```<>_#$€@%*+—°′″“”×’^₤₹‘])")

    # we noticed that in the text sometimes we find numbers and the following word merged together (ex: 1980february),
    # so we put a space between the number and the word (only lowercase letters are matched)
    number_then_word = re.compile(r"(\d+)([a-z]+)")

    # raw string: a plain '\s' is an invalid escape sequence and triggers a warning on modern Python
    repeated_whitespace = re.compile(r"\s{2,}")

    processed = []
    for line in text:
        line = newline.sub(" ", line)
        line = symbols.sub(r' \1 ', line)
        line = number_then_word.sub(r'\1 \2', line)
        # replacing more than one consecutive blank spaces with only one of them
        processed.append(repeated_whitespace.sub(' ', line.strip()))

    return processed


# Work on a copy, so that the processed text can always be compared with the original data
df = df_original.copy()

# pre-process every textual column, always starting from the untouched original text
for column in ('passage', 'question', 'answer_text'):
    df[column] = preprocess_text(df_original[column])

In [None]:
import random as rand
# Comparing Original and Pre-Processed text on three randomly drawn rows
for _ in range(3):
    row = rand.randint(0,1000)
    for column in ('passage', 'question'):
        print('ORIGINAL AND PREPROCESSED {0}:'.format(column.upper()))
        print(df_original.iloc[row][column])
        print(df.iloc[row][column])
        print()


In [None]:
# For every row, locate the (pre-processed) answer inside the (pre-processed) passage at token level.
# Rows where the answer cannot be located reliably are flagged for removal and get a (-1, -1) span.
unwanted_rows = set()   # this set will contain the indices of rows containing errors (thus we will remove these rows from the dataframe)
unwanted_id = set()    # this set will contain the IDs of rows containing errors 
word_idx = []   # will contain the start and end token indices of each answer in the corresponding passage
impossible_count = 0

for i in df_original.index:

    # extracting one answer and the corresponding passage, both split into whitespace-separated tokens
    answer = np.array(df["answer_text"][i].split())
    passage = np.array(df["passage"][i].split())

    l = len(answer)
    idx = []    # this list will contain the start and end indices of the answer occurrence in the corresponding passage (these could be more than 1! ex: we find the answer "rome" in 2 distinct positions in the same passage)
    counts = []     # counts will contain how many (non-whitespace) characters there are before the answer start in the passage
    char_count = 0
    for j in range(len(passage)-l+1):   # for each token in the passage, check if the answer starts from that token
        if (answer == passage[j:j+l]).all():
            idx.append((j, j+l))    # j is the start_index and j+l is the (exclusive) end_index of the answer in the passage
            counts.append(char_count)    # char_count is the count of characters before the start of the answer
        char_count += len(passage[j])   # tokens contain no whitespace, so this accumulates non-whitespace characters only
    if len(counts) == 0:    # no answer found in the passage. Probably there is an error in the dataset (for instance the answer is "7", but in the text it is written like "seven")
        unwanted_rows.add(i)
        unwanted_id.add(df_original["id"][i])
        word_idx.append((-1, -1))   # stating that there was an error
        impossible_count+=1

        # printing original question, answer and passage (by using the answer id to find the right dataframe row)
        print(str(df_original["question"][df_original["id"] == df["id"][i]]))
        print(str(df_original['passage'][df_original['id'] == df['id'][i]]))
        print(str(df_original['answer_text'][df_original['id'] == df['id'][i]]))
        # the original answer span with 3 characters of context on each side
        # NOTE(review): if the answer starts within the first 3 characters the slice start becomes negative — confirm whether that matters here
        print("answer extracted from: {0}".format((df_original['passage'][i][df_original["answer_idx"][i][0]-3:df_original["answer_idx"][i][1]+3])))
        print()

    else:   # answer found in the passage
        # if more than one answer correspondence was found in the passage, we take the one whose start index is nearer the start index given in the dataset 
        # (these 2 starting indices do not match perfectly because we are working on the preprocessed text, so our starting index is a little bit different from the original)
        n_spaces_original = df_original["passage"][i][:df_original["answer_idx"][i][0]].count(" ")    # counting how many spaces there are in the original passage before the answer
        n_newline_original = df_original["passage"][i][:df_original["answer_idx"][i][0]].count("\n")    # counting how many newline characters there are in the original passage before the answer
        # distance between each candidate's character offset and the dataset offset, both measured without spaces/newlines
        s = np.abs(np.array(counts)-(df["answer_idx"][i][0]-n_spaces_original-n_newline_original))

        if (0 not in s) and len(s)>1:   # in this case the answer was found in the passage, but the start index specified in the dataset is not the right one (it indicates a wrong occurrence of the answer)
            unwanted_rows.add(i)
            unwanted_id.add(df_original["id"][i])
            word_idx.append((-1, -1))   # stating that there was an error
            impossible_count+=1

            print(str(df_original["question"][df_original["id"]==df["id"][i]]))
            print(str(df_original['passage'][df_original['id'] == df['id'][i]]))
            print(str(df_original['answer_text'][df_original['id'] == df['id'][i]]))
            print("answer extracted from: {0}".format((df_original['passage'][i][df_original["answer_idx"][i][0]-3:df_original["answer_idx"][i][1]+3])))
            print()
        else:
            # a single candidate is accepted as-is; among several, the one with an exact offset match (distance 0) is chosen
            word_idx.append(idx[np.argmin(s)])
            # print(df_original["answer_text"][i])
            # print(passage[idx[np.argmin(s)][0]:idx[np.argmin(s)][1]])

print("The number of rows that we will remove from the dataframe (because contain errors) are {0}".format(impossible_count))

In [None]:
# adding a new column to the dataframe containing the word indices of the answer in the split passage
df["word_idx_answer"] = word_idx

# lowercase every textual column
for text_column in ("passage", "question", "answer_text"):
    df[text_column] = df[text_column].str.lower()
df

In [None]:
import numpy as np

def build_vocabulary(text):
    """
    Builds the word vocabulary of a list of words, together with the two mappings between words and indices.

    :param: 
        - text: list of words from which we want to build the vocabularies (List)
    :return:
        - idx2word: index to word mapping (Dict)
        - word2idx: word to index mapping (Dict)
        - set_vocab: sorted list of the unique terms, preceded by the '<PAD>' token
    """

    # duplicates are removed and the remaining words sorted; the padding token always comes first
    unique_words = sorted(set(text))
    set_vocab = ['<PAD>'] + unique_words

    # both mappings are filled in one pass; the padding token gets index 0
    word2idx = {}
    idx2word = {}
    for position, token in enumerate(set_vocab):
        word2idx[token] = position
        idx2word[position] = token

    return idx2word,word2idx,set_vocab

# Collect every passage and question, then split the whole corpus into single words
all_sentences = np.concatenate((df['passage'],df['question']))
text = ' '.join(all_sentences).split(' ')
# Displaying first 100 words
print(text[:100])
# build the vocabulary and both word <-> index mappings
idx_to_word, word_to_idx, word_listing = build_vocabulary(text)

print(f'[Debug] Index -> Word vocabulary size: {len(idx_to_word)}')
print(f'[Debug] Word -> Index vocabulary size: {len(word_to_idx)}')
first_words = [(idx_to_word[idx], idx) for idx in np.arange(100)]
print('[Debug] Some words: {}'.format(first_words))

In [None]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type, embedding_dimension=50):
    """
    Downloads (through gensim) the pre-trained word embedding model that was requested.

    :params:
        - model_type: name of the word embedding model to load ('word2vec' or 'glove', case-insensitive)
        - embedding_dimension: size of the embedding space to consider (only used for glove)

    :return:
        - pre-trained word embedding model (gensim KeyedVectors object)

    :raises:
        - AttributeError: if model_type is neither word2vec nor glove
        - ValueError: re-raised from gensim when the resulting model name is not valid
    """

    requested = model_type.strip().lower()

    # Map the requested model onto the name gensim knows it by
    if requested == 'word2vec':
        download_path = "word2vec-google-news-300"
    elif requested == 'glove':
        download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    else:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove")

    # Download (or load from the local gensim cache)
    try:
        return gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e


def check_OOV_terms(embedding_model, word_listing):
    """
    Finds the out-of-vocabulary terms, i.e. the words of the dataset specific vocabulary
    that the pre-trained embedding model does not contain.

    :params:
        - embedding_model: pre-trained word embedding model (gensim wrapper)
        - word_listing: dataset specific vocabulary (list)

    :return:
        - list of OOV terms, in the same order as they appear in word_listing
    """
    return [word for word in word_listing if word not in embedding_model]


def build_embedding_matrix_w_random(embedding_model, embedding_dimension, word_to_idx, oov_terms):
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model.
    Out-of-vocabulary words get a random vector drawn uniformly from [0, 1).

    :params: 
        - embedding_model: pre-trained word embedding model (gensim wrapper)
        - embedding_dimension: size of the embedding space, used for the random OOV vectors (int)
        - word_to_idx: vocabulary map (word -> index) (dict); rows follow its iteration order
        - oov_terms: list of OOV terms (list)

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    # membership is tested once per vocabulary word: a set makes each test O(1),
    # whereas scanning the OOV list made the whole loop quadratic
    oov_lookup = set(oov_terms)

    embedding_matrix = []
    for word in word_to_idx:
        if word in oov_lookup:
            embedding_matrix.append(np.random.rand(embedding_dimension))
        else:
            embedding_matrix.append(embedding_model[word])
    return np.array(embedding_matrix)

# our final tests were run with Glove vectors of dimension 100
embedding_model_type = "Glove"
embedding_dimension = 100
embedding_model = load_embedding_model(model_type=embedding_model_type,
                                       embedding_dimension=embedding_dimension)

In [None]:
# how many words of our vocabulary are unknown to the embedding model?
oov_terms = check_OOV_terms(embedding_model, word_listing)
oov_percentage = len(oov_terms)/len(word_listing)*100

print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_terms), oov_percentage))

# one row per vocabulary word: pre-trained vector when available, random vector otherwise
embedding_matrix = build_embedding_matrix_w_random(embedding_model, embedding_dimension, word_to_idx, oov_terms)

print(f"Embedding matrix shape: {embedding_matrix.shape}")

In [None]:
# Dump every out-of-vocabulary term: this was useful to understand if we could improve pre-processing
print(oov_terms)

In [None]:
# Questions of at most 10 characters are treated as errors: show them and flag their rows for removal
for i, question in enumerate(df['question']):
    if len(question) > 10:
        continue

    print(df_original.iloc[i]) 
    print(df_original['passage'][i])
    print()
    unwanted_rows.add(i)
    unwanted_id.add(df["id"][i])

In [None]:
# ERROR CHECK before removing rows: show (up to) 5 randomly chosen rows that were flagged as errors
print("Number of errors found: {0}\n".format(str(len(unwanted_id))))

# random.sample() requires a sequence: sampling from a set was deprecated in Python 3.9 and
# raises TypeError from Python 3.11. Sorting first also makes the seeded draw reproducible
# across runs, and min() avoids a ValueError when fewer than 5 errors were found.
some_error_id = random.sample(sorted(unwanted_id), min(5, len(unwanted_id)))

for error_id in some_error_id:   # not named `id`, which would shadow the builtin for the rest of the notebook
    row_with_error = df_original[df_original["id"] == error_id].to_dict("list")
    passage_with_error = row_with_error["passage"][0]
    answer_start, answer_end = row_with_error["answer_idx"][0]

    print("Question: {0}".format(row_with_error["question"][0]))
    print("Passage: {0}".format(passage_with_error))
    print("Answer: {0}".format(row_with_error["answer_text"][0]))
    # 3 characters of context on each side; the start is clamped because a negative
    # slice start would be counted from the end of the passage
    print("Answer extracted from: {0}".format(passage_with_error[max(0, answer_start-3):answer_end+3]))
    print()

In [None]:
# Creating a txt file containing IDs of rows with errors for tutors.
# The file is opened in write mode so that re-running the cell rewrites it instead of
# appending the same IDs again; IDs are sorted to make the file content deterministic.
with open("error IDs.txt", "w") as f:
    for error_id in sorted(unwanted_id):
        f.write(error_id + "\n")

In [None]:
# Display the pre-processed DataFrame (rows flagged as errors are still present at this point).
df

In [None]:
df_clean = df.drop(list(unwanted_rows))
df_clean = df_clean.reset_index()
df_clean.to_pickle("df_clean.pkl")
!zip df_clean.pkl.zip df_clean.pkl
df_clean

In [None]:
# PADDING
# Every passage (question) is converted to a sequence of word indices and padded at the end
# up to the length of the longest passage (question).
# The maximum length is measured in TOKENS, i.e. on the index sequences themselves:
# measuring it in characters (len() of the longest string) pads every sequence far beyond
# what is needed, since a sequence can never have more tokens than characters.
passages = [[word_to_idx[el] for el in sentence.split()] for sentence in df_clean['passage']]  # passages extraction
MAX_LENGTH_PASSAGE = max(len(tokens) for tokens in passages)
passages_pad = tf.keras.preprocessing.sequence.pad_sequences(passages, maxlen=MAX_LENGTH_PASSAGE, padding='post') # padding passages

questions = [[word_to_idx[el] for el in sentence.split()] for sentence in df_clean['question']]  # questions extraction
MAX_LENGTH_QUESTION = max(len(tokens) for tokens in questions)
questions_pad = tf.keras.preprocessing.sequence.pad_sequences(questions, maxlen=MAX_LENGTH_QUESTION, padding='post') # padding questions

# one padded index array per row
df_clean['passage_pad'] = list(passages_pad)
df_clean['question_pad'] = list(questions_pad)

In [None]:
#@title Skip preprocessing
import pickle
load= True #@param {type: "boolean"}

if load:
    import os
    import random
    import math
    import numpy as np
    import tensorflow as tf
    import json
    import pandas as pd

    pd.set_option('display.max_colwidth', -1)

    # fix random seeds
    seed_value = 42 #@param {type:"integer"}
    os.environ['PYTHONHASHSEED']=str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.compat.v1.set_random_seed(seed_value)
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
    tf.compat.v1.keras.backend.set_session(sess)


    !gcloud config set project feisty-mechanic-221914
    !gsutil cp gs://squad_squad/df_clean.pkl ./df_clean.pkl
    !gsutil cp gs://squad_squad/embedding_matrix.pkl ./embedding_matrix.pkl
    !gsutil cp gs://squad_squad/idx_to_word.pkl ./idx_to_word.pkl

    df_clean = pd.read_pickle("df_clean.pkl")
    with open('embedding_matrix.pkl', 'rb') as handle:
        embedding_matrix = pickle.load(handle)
    with open('idx_to_word.pkl', 'rb') as handle:
        idx_to_word = pickle.load(handle)
else:
    df_clean.to_pickle("df_clean.pkl")
    with open('embedding_matrix.pkl', 'wb') as handle:
        pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('idx_to_word.pkl', 'wb') as handle:
        pickle.dump(idx_to_word, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    from google.colab import auth
    auth.authenticate_user()
    !gcloud config set project feisty-mechanic-221914
    !gsutil cp ./df_clean.pkl gs://squad_squad/df_clean.pkl
    !gsutil cp ./embedding_matrix.pkl gs://squad_squad/embedding_matrix.pkl
    !gsutil cp ./idx_to_word.pkl gs://squad_squad/idx_to_word.pkl

!nvidia-smi

In [None]:
#@title split { form-width: "25%" }

split_value = 0.1 #@param {type:"number"} 

# The split is made on whole documents (titles): a fraction `split_value` of them goes to validation
unique_titles = df_clean['title'].unique()
val_dim = int(len(unique_titles) * split_value)
val_titles = np.random.choice(unique_titles, size=val_dim, replace=False)

# padded lengths, read from the first row
passage_length = len(df_clean['passage_pad'][0])
question_length = len(df_clean['question_pad'][0])

In [None]:
# creating train and val sets: a row belongs to validation when its title was drawn for validation
is_validation = df_clean['title'].isin(val_titles)
df_val = df_clean[is_validation]
df_train = df_clean[~is_validation]

# Model

In [None]:
def build_model():
    """
    Builds the (uncompiled) question answering network and prints its summary.

    Architecture:
        - a frozen embedding layer, shared by passage and question and initialized with `embedding_matrix`
        - one bidirectional LSTM encoder for the passage and one for the question
        - an attention layer using the passage encoding as query and the question encoding as value
        - concatenation of passage encoding and attention output, followed by layer normalization
        - start head: bidirectional LSTM + Dense(1) on every token + softmax over the passage positions
        - second head: bidirectional LSTM (final state only) + Dense(1) + ReLU, i.e. one
          non-negative scalar for each sample (the layer is named "answ_end")

    Reads the module-level names `idx_to_word`, `embedding_matrix` and `passage_length`.

    :return:
        - tf.keras.Model with inputs [passage indices, question indices] and one output of shape
          (batch_size, passage_length, 2): [:, :, 0] is the start distribution, [:, :, 1] is the
          scalar of the second head repeated on every position
    """
    input_size=len(idx_to_word)
    # taken from the matrix itself, so that the layer always agrees with the loaded embeddings
    # (a hard-coded 100 breaks as soon as another embedding dimension is used)
    embedding_dim = embedding_matrix.shape[1]
    encoding_units = 256
    dropout = 0

    def bidirectional_lstm(return_sequences):
        # every recurrent block of the network shares this configuration
        return tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(encoding_units, 
                                kernel_initializer='glorot_uniform',
                                recurrent_initializer='orthogonal',
                                dropout=dropout,
                                stateful=False,
                                return_sequences=return_sequences))

    input_passage = tf.keras.layers.Input(shape=[None])
    input_question = tf.keras.layers.Input(shape=[None])

    # EMBEDDING
    embedding = tf.keras.layers.Embedding(input_size,
                                        embedding_dim,  
                                        weights=[embedding_matrix],
                                        trainable=False,
                                        mask_zero=True
                                        )   # trainable param is False because we use pre-trained Glove embeddings, mask_zero param is True because we have padding
    embedding_passage = embedding(input_passage)
    embedding_question = embedding(input_question)

    # ENCODING passage AND question
    encoding_passage = bidirectional_lstm(return_sequences=True)(embedding_passage)
    encoding_question = bidirectional_lstm(return_sequences=True)(embedding_question)

    # ATTENTION LAYER
    query_value_attention_seq = tf.keras.layers.Attention()([encoding_passage, encoding_question])   # out shape: [batch_size, passage_len, encoding_dim]

    combined = tf.keras.layers.concatenate([encoding_passage, query_value_attention_seq])  # tried add but doesnt work
    combined = tf.keras.layers.LayerNormalization()(combined)

    # START HEAD: one logit per passage token, normalized over the passage
    lstm_start = bidirectional_lstm(return_sequences=True)(combined)
    output_start = tf.keras.layers.Dense(1)(lstm_start)
    logits = tf.squeeze(output_start, axis=[2]) # shape (batch_size, seq_len)
    output_start=tf.keras.layers.Softmax(name="answ_start")(logits)

    # SECOND HEAD: a single non-negative scalar per sample
    lstm_end = bidirectional_lstm(return_sequences=False)(combined)
    output_end = tf.keras.layers.Dense(1)(lstm_end)
    output_end=tf.keras.layers.ReLU(name="answ_end")(output_end)

    # output_start has shape [batch, time steps] while output_end has shape [batch, 1]:
    # the scalar is repeated on every time step so that the two can be stacked in one tensor
    output_end_repeated = tf.keras.layers.RepeatVector(passage_length)(output_end)
    output_end_repeated = tf.squeeze(output_end_repeated, axis=[-1])
    out = tf.stack([output_start, output_end_repeated],axis=-1)

    model = tf.keras.Model([input_passage,input_question], outputs=[out])  

    model.summary()
    return model

In [None]:
#@title metrics {form-width: "10%"}

def prec(y_true, y_pred):
    """Fraction of samples whose arg-max over the last axis of `y_pred` is equal to `y_true`."""
    predicted = tf.argmax(y_pred, axis=-1)
    n_wrong = tf.math.count_nonzero(tf.cast(y_true, tf.int64) - predicted)
    n_samples = tf.cast(len(predicted), tf.int64)
    return 1 - n_wrong / n_samples

def precision_start(y_true, y_pred):
    """Fraction of samples whose most probable start position (channel 0) is the true start index."""
    true_start = y_true[:,0]
    start_scores = y_pred[:,:,0]
    return prec(true_start, start_scores)

def precision_end(y_true, y_pred):
    """
    Fraction of samples whose rounded predicted span length equals the target span length.

    Channel 1 of `y_pred` holds one scalar per sample repeated on every position, so it is read
    at position 0. The previous `tf.gather(..., axis=1)` indexed the time axis with the whole
    batch of start indices, producing a [batch, batch] matrix that was then broadcast against
    the per-sample targets.

    The target (`end - start + 1`) and the rounding are the same ones used by `custom_loss`
    and `exact_match`.
    """
    true_len = tf.cast(y_true[:,1] - y_true[:,0] + 1, tf.int64)
    pred_len = tf.cast(tf.math.round(y_pred[:,0,1]), tf.int64)
    return 1 - tf.math.count_nonzero(true_len - pred_len) / tf.cast(len(y_pred), tf.int64)

def dist(y_true, y_pred):
    """Mean absolute difference between `y_true` and the arg-max over the last axis of `y_pred`."""
    predicted = tf.argmax(y_pred, axis=-1)
    total_distance = tf.reduce_sum(tf.abs(tf.cast(y_true, tf.int64) - predicted))
    n_samples = tf.cast(len(predicted), tf.int64)
    return total_distance / n_samples

def mean_abs_dist_start(y_true_tuple, y_pred_tuple):
    """Mean absolute distance between the true start index and the most probable start position (channel 0)."""
    true_start = y_true_tuple[:,0]
    start_scores = y_pred_tuple[:,:,0]
    return dist(true_start, start_scores)
    
def mean_abs_dist_len(y_true, y_pred):
    """Mean absolute error between the target span length (end - start + 1) and the scalar read from channel 1."""
    target_len = y_true[:,1] - y_true[:,0] +1
    predicted_len = y_pred[:,0,1]   # the scalar is repeated on every position: position 0 is read
    return tf.keras.losses.MeanAbsoluteError()(target_len, predicted_len)

def exact_match(y_true, y_pred):
    """
    Fraction of samples for which both the most probable start position and the rounded
    predicted span length are exactly equal to their targets.
    """
    # error on the start index
    predicted_start = tf.argmax(y_pred[:,:,0], axis=-1)
    start_error = tf.math.abs(tf.cast(y_true[:,0], tf.int64) - predicted_start)
    start_error = tf.cast(start_error, tf.float32)

    # error on the span length (target: end - start + 1)
    predicted_len = tf.cast(tf.math.round(y_pred[:,0,1]), tf.float32)
    len_error = tf.math.abs(y_true[:,1] - y_true[:,0] + 1 - predicted_len)

    # a sample is wrong as soon as one of the two errors is not zero
    n_wrong = tf.math.count_nonzero(tf.cast(start_error + len_error, tf.int64))
    return 1 - n_wrong / tf.cast(len(y_true), tf.int64)


In [None]:
#@title train {form-width: "10%"}
# Loss objects shared by `custom_loss`
cc = tf.keras.losses.SparseCategoricalCrossentropy()   # applied to the start position (channel 0)
huber = tf.keras.losses.Huber()   # applied to the span length (channel 1)
# weights of the two terms in the total loss: α for the start cross-entropy, β for the length loss
α = 1
β = 1
def custom_loss(y_true, y_pred):
    """
    Weighted sum of the two training objectives:
        - sparse categorical cross-entropy between the true start index (y_true[:, 0]) and the
          start distribution (y_pred[:, :, 0])
        - Huber loss between the target span length (end - start + 1) and the scalar read at
          position 0 of channel 1 (y_pred[:, 0, 1])

    The weights are the module-level α and β. (An arg-max of the start distribution used to be
    computed here without ever being used: it has been removed.)
    """
    start_crossentropy = cc(y_true[:,0], y_pred[:,:,0])
    # difference between span lengths
    len_answer_loss = huber(y_true[:,1] - y_true[:,0] + 1, y_pred[:,0,1])

    return α*start_crossentropy + β*len_answer_loss

batch_size = 32
epochs = 1

ENABLE_WANDB = False        #@param {type:"boolean"}
wandb_experiment_name = "still_plaing_with_layer_norm"  #@param {type: "string"}
if ENABLE_WANDB:
    !pip install wandb > /dev/null
    !wandb login wandb_api_token
    import wandb
    from wandb.keras import WandbCallback
    wandb.init(project="SQUAD", name=wandb_experiment_name)
    wandb.config.batch_size = batch_size
    wandb.config.epochs = epochs
    

saveDir = os.path.join(os.getcwd(), 'saved_models')
if not os.path.isdir(saveDir):
    os.makedirs(saveDir)
chkpt = saveDir + '/squad_check.hdf5'

es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=1, mode='auto')
cp_cb = tf.keras.callbacks.ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
if ENABLE_WANDB:
    callbacks = [es_cb, cp_cb, WandbCallback(log_batch_frequency=10)]
else:
    callbacks = [es_cb, cp_cb]

model = build_model()
tf.keras.backend.clear_session()
model.compile(optimizer='adam', 
              loss=custom_loss,
              metrics=[precision_start, mean_abs_dist_len, exact_match])

history = model.fit([np.stack(df_train['passage_pad']), np.stack(df_train['question_pad'])],
                    np.stack(df_train['word_idx_answer'].to_numpy()), epochs=epochs,
                        callbacks=callbacks, validation_data=([np.stack(df_val['passage_pad']), np.stack(df_val['question_pad'])],np.stack(df_val['word_idx_answer'].to_numpy())),
                        batch_size=batch_size)



In [None]:
#@title precedence plot {form-width: "20%"}

# `plt` is used below but matplotlib is not imported anywhere else in the notebook
import matplotlib.pyplot as plt

predictions = model.predict([np.stack(df_val['passage_pad']), np.stack(df_val['question_pad'])])
sampled_start = np.argmax(predictions[:,:,0], axis=-1)

# Channel 1 is not a distribution over positions: it holds the predicted span length
# (trained against end - start + 1 in `custom_loss`), repeated on every position.
# Its arg-max is therefore always 0, so the end index is rebuilt from start and length.
predicted_len = np.rint(predictions[:,0,1]).astype(np.int64)
sampled_end = sampled_start + predicted_len - 1

# true (start, end) pairs as dots, predicted pairs as stars
true_spans = np.stack(df_val['word_idx_answer'])
plt.figure(figsize=(30,30))
plt.plot(true_spans[:,0], true_spans[:,1], ".")
plt.plot(sampled_start, sampled_end,"*")

In [None]:
# Percentage of validation samples whose predicted end index comes before the predicted start index
print("end before start ratio")
precedence_violation = sum(sampled_end - sampled_start < 0) / len(sampled_end) * 100
print(precedence_violation)
if ENABLE_WANDB:
    # the <pre> element is now properly closed (the closing tag used to be a second opening tag)
    wandb.log({"precedence violation": wandb.Html(
        "<pre>precedence violation: "+str(precedence_violation)+" %</pre>", inject=False)})