In [1]:
import pandas as pd
import tensorflow as tf

from official.nlp import optimization as nlp_opt
from official.nlp.bert import tokenization as bert_token

from berts.berts import BertEQAModel
from berts.utils import get_bert_inputs

In [2]:
# TF-Hub handle of the pre-trained encoder (uncased English BERT, L-12 / H-768 / A-12 per the handle name).
bert_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"

# Build the QA model; the hub layer is returned as well because its vocabulary
# assets are read in the next cell to construct a matching tokenizer.
# NOTE(review): return_cls=True presumably adds the has-answer output that is
# trained with 'binary_crossentropy' further down -- confirm in berts.berts.
model, bert_layer = BertEQAModel(bert_url, return_cls=True)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_words_seq (InputLayer)    [(None, None)]       0                                            
__________________________________________________________________________________________________
input_attention_mask (InputLaye [(None, None)]       0                                            
__________________________________________________________________________________________________
input_segment_mask (InputLayer) [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_words_seq[0][0]            
                                                                 input_attention_mask[0

In [3]:
# load vocabulary (must be same as pre-trained bert)
# Both the vocabulary file and the lower-casing flag are read from the loaded hub
# module so the tokenizer is configured from the same assets as the encoder.
hub_module = bert_layer.resolved_object
vocab_file = hub_module.vocab_file.asset_path.numpy()
to_lower_case = hub_module.do_lower_case.numpy()
bert_tokenizer = bert_token.FullTokenizer(vocab_file, to_lower_case)
print('vocabulary size:', len(bert_tokenizer.vocab))

vocabulary size: 30522


In [4]:
# use preprocessed SQuAD2.0 partial data
# na_filter=False keeps empty fields as empty strings instead of NaN;
# the 'aid' column is dropped because nothing below uses it.
df_ans = (
    pd.read_csv('data/SQuAD/convert_ans_pos/train_answer_pos.csv', na_filter=False)
    .drop(columns=['aid'])
)
df_ans.head()

Unnamed: 0,cid,qid,answer,ans_start,ans_end
0,1,56be85543aeaaa14008c9063,in the late 1990s,66,69
1,1,56be85543aeaaa14008c9065,singing and dancing,54,56
2,1,56be85543aeaaa14008c9066,2003,127,127
3,1,56bf6b0f3aeaaa14008c9601,"houston , texas",46,48
4,1,56bf6b0f3aeaaa14008c9602,late 1990s,68,69


In [5]:
# Tokenized questions, keyed by (cid, qid); na_filter=False keeps empty fields as ''.
df_ques = pd.read_csv(
    'data/SQuAD/preprocess/train_question_tokenized.csv',
    na_filter=False,
)
df_ques.head()

Unnamed: 0,cid,qid,question,hasAnswer
0,1,56be85543aeaaa14008c9063,when did beyonce start becoming popular ?,1
1,1,56be85543aeaaa14008c9065,what areas did beyonce compete in when she was...,1
2,1,56be85543aeaaa14008c9066,when did beyonce leave destiny ' s child and b...,1
3,1,56bf6b0f3aeaaa14008c9601,in what city and state did beyonce grow up ?,1
4,1,56bf6b0f3aeaaa14008c9602,in which decade did beyonce become famous ?,1


In [6]:
# Tokenized context paragraphs, keyed by cid; na_filter=False keeps empty fields as ''.
df_context = pd.read_csv(
    'data/SQuAD/preprocess/train_context_tokenized.csv',
    na_filter=False,
)
df_context.head()

Unnamed: 0,cid,context
0,1,beyonce gi ##selle knowles - carter ( / bi ##ː...
1,2,following the di ##sb ##and ##ment of destiny ...
2,3,"a self - described "" modern - day feminist "" ,..."
3,4,beyonce gi ##selle knowles was born in houston...
4,5,beyonce attended st . mary ' s elementary scho...


In [7]:
# join context, question, answer data together
# Answers are the left side: each answer row picks up its question via (cid, qid)
# and then its context paragraph via cid. Inner joins drop unmatched rows.
df_qa = (
    df_ans
    .join(df_ques.set_index(['cid', 'qid']), on=['cid', 'qid'], how='inner')
    .join(df_context.set_index('cid'), on='cid', how='inner')
)
df_qa.head()

Unnamed: 0,cid,qid,answer,ans_start,ans_end,question,hasAnswer,context
0,1,56be85543aeaaa14008c9063,in the late 1990s,66,69,when did beyonce start becoming popular ?,1,beyonce gi ##selle knowles - carter ( / bi ##ː...
1,1,56be85543aeaaa14008c9065,singing and dancing,54,56,what areas did beyonce compete in when she was...,1,beyonce gi ##selle knowles - carter ( / bi ##ː...
2,1,56be85543aeaaa14008c9066,2003,127,127,when did beyonce leave destiny ' s child and b...,1,beyonce gi ##selle knowles - carter ( / bi ##ː...
3,1,56bf6b0f3aeaaa14008c9601,"houston , texas",46,48,in what city and state did beyonce grow up ?,1,beyonce gi ##selle knowles - carter ( / bi ##ː...
4,1,56bf6b0f3aeaaa14008c9602,late 1990s,68,69,in which decade did beyonce become famous ?,1,beyonce gi ##selle knowles - carter ( / bi ##ː...


In [8]:
# The CSVs store pre-tokenized text as space-separated strings; turn each cell
# into a list of tokens so lengths can be counted per token.
for col in ('question', 'context'):
    df_qa[col] = df_qa[col].map(lambda text: text.split(" "))
print('original data size:', len(df_qa))

original data size: 130319


In [9]:
# only use partial data set (question tokens + context tokens < MAX_QA_TOKENS),
# b/c my GPU memory is not big enough for all data
MAX_QA_TOKENS = 300

# Series.str.len() returns the length of each token list, so the mask is computed
# column-wise instead of calling a Python lambda once per row.
qa_token_count = df_qa['question'].str.len() + df_qa['context'].str.len()
df_qa = df_qa[qa_token_count < MAX_QA_TOKENS]
print('using data size:', len(df_qa))

using data size: 124975


In [10]:
# shuffle data
# A fixed seed makes the row order -- and therefore the train/validation split made
# from it and every index-based inspection below -- reproducible across kernel restarts.
SHUFFLE_SEED = 42
df_qa = df_qa.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [11]:
# separate data set into training (80%) and validation (20%) data sets
# The last val_size rows of the shuffled frame form the validation set; everything
# before them is used for training.
val_size = int(len(df_qa) * 0.2)

# prepare bert input data
input_words, input_mask, input_seg = get_bert_inputs(bert_tokenizer,
                                                     df_qa['question'],
                                                     df_qa['context'],
                                                     tokenized=True)
# Per-row offset added to the context-relative answer positions so that they index
# into the combined question + context input sequence. Multiplying by hasAnswer
# makes the offset 0 for rows without an answer.
# NOTE(review): outputs printed later in this notebook show the question/context
# token lists ending in "[SEP]", which suggests get_bert_inputs appends to these
# lists in place. If so, len(s) already counts [SEP] here and the "+ 1" effectively
# accounts for the leading [CLS]; this line must then stay AFTER the
# get_bert_inputs call -- confirm against berts.utils.
# NOTE(review): assumes ans_start/ans_end are 0 when hasAnswer == 0, so such labels
# stay 0 (the value qa_loss / qa_accuracy treat as "no answer") -- TODO confirm.
question_len = [(len(s) + 1) * df_qa['hasAnswer'][i] for i, s in enumerate(df_qa['question'])] # + 1: <SEP>

# Training portion: all rows except the last val_size.
train_input_words, train_input_mask, train_input_seg = input_words[:-val_size], input_mask[:-val_size], input_seg[:-val_size]
train_label_cls = tf.constant(df_qa['hasAnswer'][:-val_size])
train_label_start = tf.constant(df_qa['ans_start'][:-val_size]) + question_len[:-val_size]
train_label_end = tf.constant(df_qa['ans_end'][:-val_size]) + question_len[:-val_size]
print('training data shape:', train_input_words.shape, train_input_mask.shape, train_input_seg.shape)
print('training label shape:', train_label_cls.shape, train_label_start.shape, train_label_end.shape)

# Validation portion: the last val_size rows, sliced the same way as the inputs so
# that validation index i corresponds to df_qa row len(df_qa) - val_size + i.
valid_input_words, valid_input_mask, valid_input_seg = input_words[-val_size:], input_mask[-val_size:], input_seg[-val_size:]
valid_label_cls = tf.constant(df_qa['hasAnswer'][-val_size:])
valid_label_start = tf.constant(df_qa['ans_start'][-val_size:]) + question_len[-val_size:]
valid_label_end = tf.constant(df_qa['ans_end'][-val_size:]) + question_len[-val_size:]
print('validation data shape:', valid_input_words.shape, valid_input_mask.shape, valid_input_seg.shape)
print('validation label shape:', valid_label_cls.shape, valid_label_start.shape, valid_label_end.shape)

training data shape: (99980, 302) (99980, 302) (99980, 302)
training label shape: (99980,) (99980,) (99980,)
validation data shape: (24995, 302) (24995, 302) (24995, 302)
validation label shape: (24995,) (24995,) (24995,)


In [12]:
def qa_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over answerable examples only.

    A label of 0 marks "no answer"; such examples contribute nothing to the
    numerator and are not counted in the denominator.

    Args:
        y_true: integer position labels, shape (None, 1).
        y_pred: per-position probabilities (the loss is computed with
            from_logits=False).

    Returns:
        Scalar tensor: summed loss of answerable examples divided by their
        count (plus 1e-7 so an all-"no answer" batch does not divide by zero).
    """
    per_example = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=False)
    # 1.0 where an answer exists, 0.0 where y_true == 0 (no answer).
    has_answer = tf.cast(tf.math.not_equal(y_true, 0), per_example.dtype)
    # per_example is (None,) while has_answer is (None, 1): drop the trailing axis.
    masked = per_example * tf.squeeze(has_answer, axis=1)
    return tf.math.reduce_sum(masked) / (tf.math.reduce_sum(has_answer) + 1e-7)

def qa_accuracy(y_true, y_pred):
    """Accuracy metric shared by the position outputs and the has-answer output.

    When y_pred has a single column it is scored with plain binary accuracy.
    Otherwise it is scored with sparse categorical accuracy, averaged only
    over examples whose label is non-zero (0 marks "no answer").

    Args:
        y_true: labels, shape (None, 1).
        y_pred: model output for one head.

    Returns:
        Scalar tensor with the (masked) mean accuracy of the batch.
    """
    if y_pred.shape[1] == 1:
        return tf.math.reduce_mean(tf.keras.metrics.binary_accuracy(y_true, y_pred))

    hits = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    # 1.0 where an answer exists, 0.0 where y_true == 0 (no answer).
    has_answer = tf.cast(tf.math.not_equal(y_true, 0), hits.dtype)
    masked_hits = hits * tf.squeeze(has_answer, axis=1)
    return tf.math.reduce_sum(masked_hits) / (tf.math.reduce_sum(has_answer) + 1e-7)

In [13]:
# Training schedule.
batch_size = 8  # b/c my GPU memory is not big enough for bigger batch size
epochs = 3
train_data_size = len(train_label_cls)
steps_per_epoch = train_data_size // batch_size
num_train_steps = epochs * steps_per_epoch
# Warm up over 10% of the (fractional) total number of steps.
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp_opt.create_optimizer(
    2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=warmup_steps,
)

In [14]:
# One loss (and loss weight) per model output, in the same order as the label list
# passed to fit(): answer start, answer end, has-answer flag.
model.compile(
    optimizer=optimizer,
    loss=[qa_loss, qa_loss, 'binary_crossentropy'],
    loss_weights=[1.4, 1, 0.6],
    metrics=qa_accuracy,
)

In [15]:
# Inputs and labels are grouped up front; label order (start, end, cls) must match
# the loss order given to compile().
train_inputs = [train_input_words, train_input_mask, train_input_seg]
train_labels = [train_label_start, train_label_end, train_label_cls]
valid_inputs = [valid_input_words, valid_input_mask, valid_input_seg]
valid_labels = [valid_label_start, valid_label_end, valid_label_cls]

model.fit(
    train_inputs,
    train_labels,
    validation_data=(valid_inputs, valid_labels),
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1c5a20d7708>

In [16]:
def convert_tokens_to_sentence(tokens):
    """Join WordPiece tokens back into a readable, space-separated string.

    A token starting with the '##' continuation marker is glued to the
    previous token with the marker removed; every other token is preceded by
    a single space, except the very first one.

    Only the two-character '##' marker is removed. Stripping every leading
    '#' would erase a literal '#' token at the start of the sequence and
    would swallow a '#' that is the actual content of a continuation piece
    such as '###'.

    Args:
        tokens: sequence of token strings.

    Returns:
        The reconstructed sentence, or '' for an empty sequence.
    """
    pieces = []
    for i, tok in enumerate(tokens):
        if tok.startswith('##'):
            # Continuation piece: drop exactly the marker, no separating space.
            pieces.append(tok[2:])
        else:
            if i > 0:
                pieces.append(' ')
            pieces.append(tok)
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)

def get_validation_prediction(model, tokenizer, idx):
    """Run the model on one validation example and decode the predicted answer.

    Reads the notebook-level validation tensors (valid_input_words,
    valid_input_mask, valid_input_seg).

    Args:
        model: model whose predict() returns (start, end, cls) outputs.
        tokenizer: tokenizer providing convert_ids_to_tokens().
        idx: position of the example inside the validation set.

    Returns:
        Tuple (cls, ds, de, answer): the has-answer score, the argmax start
        and end positions, and the decoded answer text. The answer is ''
        unless the has-answer score is above 0.5.
    """
    start_out, end_out, cls_out = model.predict(
        [valid_input_words[idx:idx+1], valid_input_mask[idx:idx+1], valid_input_seg[idx:idx+1]])
    start_pos = tf.math.argmax(start_out[0]).numpy()
    end_pos = tf.math.argmax(end_out[0]).numpy()
    has_answer_score = cls_out[0][0]

    if has_answer_score > 0.5:
        # Inclusive span [start_pos, end_pos] of input ids -> tokens -> text.
        span_ids = valid_input_words[idx][start_pos:end_pos+1].numpy()
        answer = convert_tokens_to_sentence(tokenizer.convert_ids_to_tokens(span_ids))
    else:
        answer = ''

    return has_answer_score, start_pos, end_pos, answer

In [17]:
def output_cls(cls):
    """Return 'has answer' when cls is strictly above 0.5, otherwise 'no answer'."""
    return 'has answer' if cls > 0.5 else 'no answer'

def output_result(model, tokenizer, idx):
    """Print context, question, prediction and ground truth for one validation example.

    Reads the notebook-level df_qa, val_size and valid_label_* objects.

    Args:
        model: trained QA model, forwarded to get_validation_prediction().
        tokenizer: tokenizer, forwarded to get_validation_prediction().
        idx: position of the example inside the validation set.
    """
    # The validation set is the tail of df_qa, so shift idx to the frame's row label.
    row = len(df_qa) - val_size + idx

    cls, ds, de, answer = get_validation_prediction(model, tokenizer, idx)

    print("context:")
    print(convert_tokens_to_sentence(df_qa['context'][row]))
    print("question:")
    print(convert_tokens_to_sentence(df_qa['question'][row]))

    print("validation data index '%d' prediction:" % (idx))
    print("\tcls(%f): %s, ds(%d), de(%d), answer: %s" % (cls, output_cls(cls), ds, de, answer))

    print("ground true data index '%d':" % (idx))
    true_cls = valid_label_cls[idx].numpy()
    true_start = valid_label_start[idx].numpy()
    true_end = valid_label_end[idx].numpy()
    true_answer = convert_tokens_to_sentence(df_qa['answer'][row].split(" "))
    print("\tcls(%d): %s, ds(%d), de(%d), answer: %s" %
          (true_cls, output_cls(true_cls), true_start, true_end, true_answer))

In [18]:
# Compare the model's prediction with the ground truth for validation example 1000.
output_result(model, bert_tokenizer, 1000)

context:
synthetic crude oil , also known as syncrude , is the output from a bitumen upgrader facility used in connection with oil sand production in canada . bituminous sands are mined using enormous ( 100 ton capacity ) power shovels and loaded into even larger ( 400 ton capacity ) dump trucks for movement to an upgrading facility . the process used to extract the bitumen from the sand is a hot water process originally developed by dr . karl clark of the university of alberta during the 1920s . after extraction from the sand , the bitumen is fed into a bitumen upgrader which converts it into a light crude oil equivalent . this synthetic substance is fluid enough to be transferred through conventional oil pipelines and can be fed into conventional oil refineries without any further treatment . by 2015 canadian bitumen upgraders were producing over 1 million barrels ( 160×10 ^ 3 m3 ) per day of synthetic crude oil , of which 75 % was exported to oil refineries in the united states . [S

In [19]:
# Compare the model's prediction with the ground truth for validation example 1100.
output_result(model, bert_tokenizer, 1100)

context:
in january 1989 , madonna signed an endorsement deal with soft - drink manufacturer , pepsi . in one of her pepsi commercials , she debuted her song " like a prayer " . the corresponding music video featured many catholic symbols such as stigmata and cross burning , and a dream of making love to a saint , leading the vatican to condemn the video . religious groups sought to ban the commercial and boycott pepsi products . pepsi revoked the commercial and canceled her sponsorship contract . the song was included on madonna ' s fourth studio album , like a prayer , which was co - written and co - produced by patrick leonard and stephen bray . madonna received positive feedback for the album , with rolling stone writing that it was " as close to art as pop music gets " . like a prayer peaked at number one on the billboard 200 and sold 15 million copies worldwide , with 4 million copies sold in the u . s . alone . six singles were released from the album , including " like a prayer

In [20]:
# Compare the model's prediction with the ground truth for validation example 1110.
output_result(model, bert_tokenizer, 1110)

context:
despite being eliminated earlier in the season , chris daughtry ( as lead of the band daughtry ) became the most successful recording artist from this season . other contestants , such as hicks , mcphee , bucky covington , mandisa , kellie pickler , and elliott yamin have had varying levels of success . [SEP]
question:
what is the name if the band that has chris daughtry as its lead singer ? [SEP]
validation data index '1110' prediction:
	cls(0.985497): has answer, ds(37), de(40), answer: band daughtry
ground true data index '1110':
	cls(1): has answer, ds(29), de(31), answer: daughtry
