In [1]:
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
# https://github.com/tensorflow/tensorflow/issues/33721 ''

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
import transformers
import seaborn as sns
import time
from csv import QUOTE_NONE
from ipywidgets import IntProgress

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [4]:
train_df = pd.read_csv('../sts-train.csv', sep='\t', usecols=[4, 5, 6], header=None, quoting=QUOTE_NONE, names=['score', 'sentence1', 'sentence2'])
validation_df = pd.read_csv('../sts-dev.csv', sep='\t', usecols=[4, 5, 6], header=None, quoting=QUOTE_NONE, names=['score', 'sentence1', 'sentence2'])
test_df = pd.read_csv('../sts-test.csv', sep='\t', usecols=[4, 5, 6], header=None, quoting=QUOTE_NONE, names=['score', 'sentence1', 'sentence2'])

In [5]:
train_df = train_df.dropna()
validation_df = validation_df.dropna()
test_df = test_df.dropna()

In [6]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 25

In [7]:
class AlbertSemanticDataGenerator(tf.keras.utils.Sequence):
    def __init__(
        self,
        sentence_pairs,
        scores,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.scores = scores
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.AlbertTokenizer.from_pretrained(
            "albert-base-v2", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            truncation=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            scores = np.array(self.scores[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], scores
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [8]:
input_ids = tf.keras.layers.Input(
    shape=(max_length,), dtype=tf.int32, name="input_ids"
)
# Attention masks indicates to the model which tokens should be attended to.
attention_masks = tf.keras.layers.Input(
    shape=(max_length,), dtype=tf.int32, name="attention_masks"
)
# Token type ids are binary masks identifying different sequences in the model.
token_type_ids = tf.keras.layers.Input(
    shape=(max_length,), dtype=tf.int32, name="token_type_ids"
)
# Loading pretrained BERT model.
bert_model = transformers.TFAlbertModel.from_pretrained("albert-base-v2")
# Freeze the BERT model to reuse the pretrained features without modifying them.
bert_model.trainable = False

bert_outputs = bert_model(
    input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
)
last_hidden_state = bert_outputs[0]

# Applying hybrid pooling approach to bi_lstm sequence output.
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
dense_layer_1 = tf.keras.layers.Dense(384, activation="relu", name="dense_old")(avg_pool)
dropout = tf.keras.layers.Dropout(0.4)(dense_layer_1)
output = tf.keras.layers.Dense(1)(dropout)
model = tf.keras.models.Model(
    inputs=[input_ids, attention_masks, token_type_ids], outputs=output
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="mse",
    metrics=[
        tf.keras.metrics.RootMeanSquaredError()
    ],
)

model.summary()

Some layers from the model checkpoint at albert-base-v2 were not used when initializing TFAlbertModel: ['predictions']
- This IS expected if you are initializing TFAlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFAlbertModel were initialized from the model checkpoint at albert-base-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(N

In [9]:
model.load_weights('./best-albert-03.h5')

In [10]:
train_data = AlbertSemanticDataGenerator(
    train_df[["sentence1", "sentence2"]].values.astype("str"),
    train_df['score'].values,
    batch_size=batch_size,
    shuffle=True,
)

valid_data = AlbertSemanticDataGenerator(
    validation_df[["sentence1", "sentence2"]].values.astype("str"),
    validation_df['score'].values,
    batch_size=batch_size,
    shuffle=False,
)

test_data = AlbertSemanticDataGenerator(
    test_df[["sentence1", "sentence2"]].values.astype("str"),
    test_df['score'].values,
    batch_size=batch_size,
    shuffle=False,
)

# Fine-tuning

In [11]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_albert_model (TFAlbertModel) TFBaseModelOutputWit 11683584    input_ids[0][0]                  
                                                                 attention_masks[0][0]        

In [12]:
def model_builder(hp):
    model.layers[-5].trainable = True
        
    dense_layer_old = model.layers[-3].output

    hp_dropout = hp.Float('dropout', 0, 0.9, step=0.1, default=0.5)
    dropout = tf.keras.layers.Dropout(hp_dropout)(dense_layer_old)

    output_layer = tf.keras.layers.Dense(1)(dropout)

    model_regression = tf.keras.models.Model(inputs=model.input, outputs=output_layer)

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-5, 1e-6, 1e-7])
    model_regression.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss="mse",
        metrics=[
            tf.keras.metrics.RootMeanSquaredError()
        ],
    )
    return model_regression

In [13]:
tuner = kt.Hyperband(model_builder,
                     objective='val_loss',
                     max_epochs=25,
                     factor=3,
                     directory='my_dir_finetune_03',
                     project_name='finetuning')

In [14]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [15]:
class TimeHistory(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.times = []

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

In [16]:
time_callback = TimeHistory()

In [17]:
history = tuner.search(
    train_data, 
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
    callbacks=[time_callback, stop_early]
)

Trial 30 Complete [00h 36m 21s]
val_loss: 0.7309430241584778

Best val_loss So Far: 0.45457425713539124
Total elapsed time: 04h 47m 16s
INFO:tensorflow:Oracle triggered exit


In [18]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print('Learning rate: ' + str(best_hps.get('learning_rate')))
print('Dropout rate: ' + str(best_hps.get('dropout')))

Learning rate: 1e-05
Dropout rate: 0.30000000000000004


In [19]:
tuner.results_summary(num_trials=10)

Results summary
Results in my_dir_finetune_03/finetuning
Showing 10 best trials
Objective(name='val_loss', direction='min')
Trial summary
Hyperparameters:
dropout: 0.30000000000000004
learning_rate: 1e-05
tuner/epochs: 25
tuner/initial_epoch: 9
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: 9ebf357a4ff831adfd1d18d5224bec6d
Score: 0.45457425713539124
Trial summary
Hyperparameters:
dropout: 0.4
learning_rate: 1e-05
tuner/epochs: 25
tuner/initial_epoch: 9
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 1d69d1476c31bdf75ffca0954a49969d
Score: 0.46180522441864014
Trial summary
Hyperparameters:
dropout: 0.6000000000000001
learning_rate: 1e-05
tuner/epochs: 25
tuner/initial_epoch: 9
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: de6f92a3433efde1f9bd6d71da54df77
Score: 0.4631533920764923
Trial summary
Hyperparameters:
dropout: 0.0
learning_rate: 1e-05
tuner/epochs: 25
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0
Score: 0.4636526107788086
Trial summary
Hyperparameters:
dropout: 0.2


In [20]:
best_model = tuner.get_best_models(1)[0]

In [21]:
best_model.save_weights('./best_albert_fine_tuned-03.h5')

In [22]:
best_model.evaluate(test_data, verbose=1)



[0.5581870675086975, 0.7471191883087158]