## Imports

In [1]:
import os
from pathlib import Path

import numpy as np

# Set before the TensorFlow import below so the setting is already in place when
# TensorFlow loads. NOTE(review): level "3" is presumably meant to silence
# TensorFlow's native (C++) log output — confirm against the TensorFlow docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf

# Python-side TensorFlow logger: emit messages at INFO level and above.
tf.get_logger().setLevel("INFO")


In [2]:
# Create a MirroredStrategy; the log output of this cell reports a single GPU replica.
# NOTE(review): none of the cells visible in this notebook enter
# `mirrored_strategy.scope()`, so the models appear to be built and compiled
# outside the strategy — confirm whether that is intended.
mirrored_strategy = tf.distribute.MirroredStrategy()


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


## Load in the appropriate datasets and labels

In [3]:
import models.data_load as data_load

# load_train() returns seven values; unpack them in the order it provides them.
(
    train_input_ids,
    train_token_type_ids,
    train_mask,
    train_impossible,
    train_start_positions,
    train_end_positions,
    qas_id,
) = data_load.load_train()

# Stack start and end positions as two rows, then transpose so the start
# position is column 0 and the end position is column 1.
train_labels = np.transpose(
    np.vstack((train_start_positions, train_end_positions))
)
print(train_labels.shape)


(131911, 2)


### Learned and average pooling alone

In [4]:
from layers.learned_pooler import get_learned_pooling_model

# Build the learned-pooling model and print its layer summary.
# Per the printed summary, each example is a (386, 1024, 25) input and the
# LearnedPooler layer reduces it to (386, 1024) using 26 parameters.
learned_pooling_model = get_learned_pooling_model()
learned_pooling_model.summary()


Model: "learned_pooler"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 386, 1024,   0           []                               
                                25)]                                                              
                                                                                                  
 learned_pooler (LearnedPooler)  (None, 386, 1024)   26          ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 386, 2)       2050        ['learned_pooler[0][0]']         
                                                                                                  
 tf.split (TFOpLambda)          [(None, 386, 1),     0           ['dense[0][0]']     

In [5]:
from layers.average_pooler import get_average_pooler

# Build the average-pooling baseline and print its layer summary.
# Per the printed summary, the (386, 1024, 25) input is reduced to (386, 1024)
# by tf.math.reduce_mean, which contributes no parameters.
average_pooler_model = get_average_pooler()
average_pooler_model.summary()


Model: "average_pooler"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 386, 1024,   0           []                               
                                25)]                                                              
                                                                                                  
 tf.math.reduce_mean (TFOpLambd  (None, 386, 1024)   0           ['input_2[0][0]']                
 a)                                                                                               
                                                                                                  
 dense_1 (Dense)                (None, 386, 2)       2050        ['tf.math.reduce_mean[0][0]']    
                                                                                     

### Example training procedure

In [6]:
# Build the BERT-large (uncased) QA model and load the weights stored in the
# ckpt_0004 training checkpoint. `bert_model` is later handed to the embedding
# generator that produces the inputs for the pooling models.


import models.bert_large_uncased as bert_large_uncased

bert_model = bert_large_uncased.create_bert_qa_model(optimizer="adam")
bert_model.load_weights("./results/bert-large-uncased/training_checkpoints/ckpt_0004.ckpt")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f29bb7eee50>

### Model compilation

Every model is compiled with two output heads: the first predicts the span start position and the second predicts the span end position. No activation is applied because the heads come directly from splitting the output tensor, so they are raw logits and the loss is configured with `from_logits=True`.

In [7]:
def compile_model(model):
    """Compile ``model`` in place for two-headed span prediction.

    The same sparse categorical cross-entropy instance (computed on logits) is
    used for both outputs, the optimizer is Adam with its default settings, and
    accuracy is tracked as a metric.

    Args:
        model: Object exposing a Keras-style ``compile`` method.

    Returns:
        None. The model is modified by its own ``compile`` call.
    """
    span_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    model.compile(
        optimizer="adam",
        loss=[span_loss] * 2,  # one entry per output head, same loss object
        metrics=["accuracy"],  # monitored during training
    )


# Gather the list of models to fit.
# Since fitting is typically faster than loading the data, it is beneficial to
# fit many models at once on each loaded batch.
model_list = [average_pooler_model, learned_pooling_model]

# Compile every model with the shared loss/optimizer/metric configuration.
for model_current in model_list:
    compile_model(model_current)


In [10]:
import models.bert_embedding_parser as bert_embedding_parser

# Generator of training batches produced with `bert_model`; each batch is
# indexed below as (inputs, labels).
gen = bert_embedding_parser.load_bert_embeddings(bert_model, batch_size=4)

i = 0  # number of batches consumed so far

# Upper bound on the number of batches to train on; can be any number.
# NOTE(review): 8248 was pre-calculated as one full pass over the training data
# at a batch size of 16, but the generator above is created with batch_size=4,
# so this presumably stops after roughly a quarter of the data — confirm which
# of the two is intended.
max_batches = 8248


for batch in gen:
    # Read in the batch of data from the generator.
    X, Y = batch[0], batch[1]

    # Fit every model once on this batch so that a single (slow) data load is
    # shared by all of them.
    for model_current in model_list:
        model_current.fit(X, Y, epochs=1)

    # Increment the batch counter.
    i += 1

    # Drop every name that refers to this batch. Deleting `batch` alone frees
    # nothing, because X and Y still reference its contents; removing all three
    # lets the arrays be reclaimed before the generator builds the next batch
    # (provided nothing else holds a reference to them).
    del X, Y, batch

    # Stop once the requested number of batches has been processed.
    if i >= max_batches:
        break




ResourceExhaustedError: Graph execution error:

SameWorkerRecvDone unable to allocate output tensor. Key: /job:localhost/replica:0/task:0/device:CPU:0;4bf2625991a584e8;/job:localhost/replica:0/task:0/device:GPU:0;edge_25_IteratorGetNext;0:0
	 [[{{node IteratorGetNext/_6}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_37289]

In [None]:
# Save weights
weights_dir = "weights"

# Create the output directory up front so the save does not depend on it
# having been created by hand; exist_ok keeps re-running this cell harmless.
os.makedirs(weights_dir, exist_ok=True)

# Write one HDF5 weights file per model, named after the model.
for m in model_list:
    m.save_weights(weights_dir + "/%s.h5" % m.name)


# TODO: print out the 25 weights for each model (before and after fine-tuning)
