## Imports

In [19]:
import os
import data.data_load as data_load
from pathlib import Path

import numpy as np

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf


In [2]:
# Create a MirroredStrategy over the visible devices (the log output below
# lists the replicas that were picked up).
# NOTE(review): `mirrored_strategy` is not referenced again in the visible
# cells, so no model is built inside `mirrored_strategy.scope()` — confirm
# whether that is intended.
mirrored_strategy = tf.distribute.MirroredStrategy()


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


## Load in the appropriate datasets and labels

In [7]:
# Load the dev and train splits. Each loader returns a 7-tuple that is
# unpacked into split-prefixed names so neither split overwrites the other.

(
    dev_input_ids,
    dev_token_type_ids,
    dev_mask,
    dev_impossible,
    dev_start_positions,
    dev_end_positions,
    dev_qas_id,
) = data_load.load_dev()

(
    train_input_ids,
    train_token_type_ids,
    train_mask,
    train_impossible,
    train_start_positions,
    train_end_positions,
    train_qas_id,
) = data_load.load_train()

# Backward compatibility: this cell used to bind the seventh value of both
# loaders to `qas_id`, so the train value was the one left under that name.
qas_id = train_qas_id

# Stack the two position vectors as rows, then transpose: one row per example,
# column 0 = start position, column 1 = end position.
train_labels = np.vstack([train_start_positions, train_end_positions]).T
print(train_labels.shape)


(131911, 2)


In [13]:
# Peek at the first five (start, end) label rows as a quick sanity check.
print(train_labels[:5])


[[ 75  78]
 [ 68  70]
 [143 143]
 [ 58  60]
 [ 78  79]]


### Learned and average pooling alone

In [14]:
class LearnedPooler(tf.keras.layers.Layer):
    """Implementation of learned pooler reported by Tenney 2019
    Original paper: https://arxiv.org/abs/1905.05950

    A trainable vector with one entry per element of the last input axis is
    softmax-normalised and used to take a weighted sum over that axis (the
    axis is kept with size 1). The result is scaled by a trainable scalar
    ``t``.

    Args:
        units: Accepted for interface compatibility. Only ``1`` is supported;
            any other value triggers a warning and ``1`` is used instead.
        **kwargs: Standard ``tf.keras.layers.Layer`` keyword arguments
            (e.g. ``name``, ``dtype``), forwarded to the base class.
    """

    def __init__(self, units=1, **kwargs):
        super().__init__(**kwargs)

        # Will only work currently with units = 1. Previously any other value
        # was dropped silently; keep the same effective behaviour but say so.
        if units != 1:
            import warnings

            warnings.warn(
                "LearnedPooler only supports units=1; got units=%r, using 1." % (units,)
            )
        self.units = 1

    def build(self, input_shape):
        """Create the mixing weights (one per last-axis element) and the scale."""
        self.w = self.add_weight(
            shape=(input_shape[-1],),
            trainable=True,
            initializer="random_normal",
            name="weights",
        )
        # `(1,)` is a real 1-tuple; the previous `(1)` was just the int 1.
        self.t = self.add_weight(
            shape=(1,), trainable=True, initializer="ones", name="t"
        )

    def call(self, inputs):
        """Return the softmax-weighted sum over the last axis, scaled by ``t``.

        The reduced axis is kept, so the output has the input's shape with
        the last dimension replaced by 1.
        """
        w = tf.nn.softmax(self.w)
        return tf.reduce_sum(tf.multiply(inputs, w), axis=-1, keepdims=True) * self.t

    def get_config(self):
        """Return the layer config so the layer can be re-created from it."""
        config = super().get_config()
        config.update({"units": self.units})
        return config


In [15]:
def learned_pooling(input_shape=(386, 1024, 25)):
    """Build the two-headed model that pools the last input axis with ``LearnedPooler``.

    Pipeline: input -> ``LearnedPooler`` (last axis reduced to size 1) ->
    ``Dense(2)`` on the last axis -> split into two tensors -> squeeze the
    split axis.

    Args:
        input_shape: Per-example input shape (without the batch axis).
            Defaults to ``(386, 1024, 25)``, the shape this notebook was
            written for (presumably tokens x hidden size x layers — TODO confirm).

    Returns:
        An uncompiled ``tf.keras.Model`` with outputs ``[start, end]``.

    NOTE(review): with the default input each head has shape
    ``(batch, 386, 1024)``, because only the last axis is pooled. Confirm this
    is the shape the sparse cross-entropy targets are expected to match.
    """
    input_layer = tf.keras.layers.Input(shape=input_shape, dtype=tf.float32)
    pooled = LearnedPooler()(input_layer)
    logits = tf.keras.layers.Dense(2)(pooled)

    # Split the two Dense units into separate heads and drop the size-1 axis.
    start, end = tf.split(logits, 2, axis=-1)
    start = tf.squeeze(start, axis=-1)
    end = tf.squeeze(end, axis=-1)

    return tf.keras.Model(inputs=input_layer, outputs=[start, end])


# Build a throwaway instance only to print its architecture; the model itself
# is not kept.
learned_pooling().summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 386, 1024,   0           []                               
                                25)]                                                              
                                                                                                  
 learned_pooler (LearnedPooler)  (None, 386, 1024, 1  26         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 dense (Dense)                  (None, 386, 1024, 2  4           ['learned_pooler[0][0]']         
                                )                                                             

In [16]:
def average_pooling(input_shape=(386, 1024, 25)):
    """Build the two-headed model that pools the last input axis with a plain mean.

    Pipeline: input -> ``tf.reduce_mean`` over the last axis (kept with
    size 1) -> ``Dense(2)`` on the last axis -> split into two tensors ->
    squeeze the split axis.

    Args:
        input_shape: Per-example input shape (without the batch axis).
            Defaults to ``(386, 1024, 25)``, the shape this notebook was
            written for (presumably tokens x hidden size x layers — TODO confirm).

    Returns:
        An uncompiled ``tf.keras.Model`` with outputs ``[start, end]``.

    NOTE(review): with the default input each head has shape
    ``(batch, 386, 1024)``, because only the last axis is pooled. Confirm this
    is the shape the sparse cross-entropy targets are expected to match.
    """
    input_layer = tf.keras.layers.Input(shape=input_shape, dtype=tf.float32)
    pooled = tf.reduce_mean(input_layer, axis=-1, keepdims=True)
    logits = tf.keras.layers.Dense(2)(pooled)

    # Split the two Dense units into separate heads and drop the size-1 axis.
    start, end = tf.split(logits, 2, axis=-1)
    start = tf.squeeze(start, axis=-1)
    end = tf.squeeze(end, axis=-1)

    return tf.keras.Model(inputs=input_layer, outputs=[start, end])


# Build a throwaway instance only to print its architecture; the model itself
# is not kept.
average_pooling().summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 386, 1024,   0           []                               
                                25)]                                                              
                                                                                                  
 tf.math.reduce_mean (TFOpLambd  (None, 386, 1024, 1  0          ['input_2[0][0]']                
 a)                             )                                                                 
                                                                                                  
 dense_1 (Dense)                (None, 386, 1024, 2  4           ['tf.math.reduce_mean[0][0]']    
                                )                                                           

                                1)]                                                               
                                                                                                  
 tf.compat.v1.squeeze_2 (TFOpLa  (None, 386, 1024)   0           ['tf.split_1[0][0]']             
 mbda)                                                                                            
                                                                                                  
 tf.compat.v1.squeeze_3 (TFOpLa  (None, 386, 1024)   0           ['tf.split_1[0][1]']             
 mbda)                                                                                            
                                                                                                  
Total params: 4
Trainable params: 4
Non-trainable params: 0
__________________________________________________________________________________________________


### Model compilation

All models are compiled as two-headed models: the first head predicts the span start position and the second head predicts the span end position. No activation is applied to the heads, which come directly from splitting a tensor, so the loss is configured with `from_logits=True`.

In [17]:
def compile_model(model):
    """Compile ``model`` in place for two-headed span prediction.

    Both outputs share one sparse categorical cross-entropy loss that takes
    raw logits. The optimizer is Adam with default settings and accuracy is
    tracked as a metric. Nothing is returned.
    """
    span_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Same loss object for each of the two heads; accuracy is monitored during training.
    model.compile(
        optimizer="adam",
        loss=[span_loss] * 2,
        metrics=["accuracy"],
    )


### Example training procedure

In [None]:
import data.bert_embedding_parser as bert_embedding_parser


In [42]:
# Start generator with training labels, pointing to data directory with embeddings.
# Per the original note, each embedding file holds a batch of 8 examples; keep
# that number in one place so `indices` and `file_size` cannot drift apart.
FILE_SIZE = 8
gen = bert_embedding_parser.load_bert_embeddings(
    train_labels,
    batch_size=16,
    indices=list(range(len(train_labels) // FILE_SIZE)),  # one index per file
    file_size=FILE_SIZE,
    training_cycles=1,
    truncate_data=None,
)


In [43]:
# Gather a list of models to fit.
# Since fitting is typically faster than data load, it is beneficial to fit many models at once.
# The builders must be called here: the training and saving cells use `.fit`,
# `.save_weights` and `.name`, which exist on model instances, not on the
# builder functions themselves.
model_list = [learned_pooling(), average_pooling()]

# Models must be compiled before `.fit` can be called on them.
for model_to_compile in model_list:
    compile_model(model_to_compile)


In [32]:
# Can be any number; this is pre-calculated based on the amount of training
# data used; 8248 goes through entire dataset at batch size of 16
max_batches = 8248

i = 0  # number of batches processed so far (stays 0 if the generator is empty)
for i, batch in enumerate(gen, start=1):
    # Each item from the generator is indexed as (inputs, targets)
    X, Y = batch[0], batch[1]

    # Run a single pass over this batch for every model in the list
    for model_current in model_list:
        model_current.fit(X, Y, epochs=1)

    # Stop once the requested number of batches has been processed
    if i == max_batches:
        break
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [64]:
# Save weights: one HDF5 file per model, named after the model's Keras name.
weights_dir = "weights"

# Create the target directory up front so saving also works on a fresh
# checkout where it does not exist yet.
os.makedirs(weights_dir, exist_ok=True)

for m in model_list:
    m.save_weights(os.path.join(weights_dir, "%s.h5" % m.name))


# TODO: print out the 25 weights for each model (before and after fine-tuning)
