In [1]:
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
# https://github.com/tensorflow/tensorflow/issues/33721 

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import seaborn as sns
import time
from csv import QUOTE_NONE
from ipywidgets import IntProgress

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [4]:
train_df = pd.read_csv('../sts-train.csv', sep='\t', usecols=[4, 5, 6], header=None, quoting=QUOTE_NONE, names=['score', 'sentence1', 'sentence2'])
validation_df = pd.read_csv('../sts-dev.csv', sep='\t', usecols=[4, 5, 6], header=None, quoting=QUOTE_NONE, names=['score', 'sentence1', 'sentence2'])
test_df = pd.read_csv('../sts-test.csv', sep='\t', usecols=[4, 5, 6], header=None, quoting=QUOTE_NONE, names=['score', 'sentence1', 'sentence2'])

In [5]:
train_df.head()

Unnamed: 0,score,sentence1,sentence2
0,5.0,A plane is taking off.,An air plane is taking off.
1,3.8,A man is playing a large flute.,A man is playing a flute.
2,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,2.6,Three men are playing chess.,Two men are playing chess.
4,4.25,A man is playing the cello.,A man seated is playing the cello.


In [6]:
train_df = train_df.dropna()
validation_df = validation_df.dropna()
test_df = test_df.dropna()

In [7]:
train_df.count()

score        5749
sentence1    5749
sentence2    5749
dtype: int64

In [8]:
validation_df.count()

score        1500
sentence1    1500
sentence2    1500
dtype: int64

In [9]:
test_df.count()

score        1379
sentence1    1379
sentence2    1379
dtype: int64

# Data Generator

In [10]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 25

In [11]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    def __init__(
        self,
        sentence_pairs,
        scores,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.scores = scores
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            truncation=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            scores = np.array(self.scores[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], scores
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

# Setup Model Architecture

In [12]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    bert_outputs = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    last_hidden_state = bert_outputs[0]
    
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
    dense_layer = tf.keras.layers.Dense(32, activation="relu")(avg_pool)
    dropout = tf.keras.layers.Dropout(0.5)(dense_layer)
    output_layer = tf.keras.layers.Dense(1)(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output_layer
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="mse",
        metrics=[
            tf.keras.metrics.RootMeanSquaredError()
        ],
    )


print(f"Strategy: {strategy}")
model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to 

# Convert Data

In [13]:
train_data = BertSemanticDataGenerator(
    train_df[["sentence1", "sentence2"]].values.astype("str"),
    train_df['score'].values,
    batch_size=batch_size,
    shuffle=True,
)

valid_data = BertSemanticDataGenerator(
    validation_df[["sentence1", "sentence2"]].values.astype("str"),
    validation_df['score'].values,
    batch_size=batch_size,
    shuffle=False,
)

test_data = BertSemanticDataGenerator(
    test_df[["sentence1", "sentence2"]].values.astype("str"),
    test_df['score'].values,
    batch_size=batch_size,
    shuffle=False,
)

# Train Model without Fine-tuning

In [14]:
class TimeHistory(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.times = []

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

In [15]:
time_callback = TimeHistory()

In [16]:
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
    callbacks = [time_callback]
)

Epoch 1/25




INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [17]:
hist_df = pd.DataFrame(history.history) 
hist_csv_file = 'fine-tuning-bert-history-01.csv'
with open(hist_csv_file, mode='w') as f:
    hist_df.to_csv(f)

In [18]:
print(time_callback.times)

[47.49032378196716, 36.466702461242676, 36.95522451400757, 37.00839424133301, 37.008485555648804, 36.95213198661804, 36.99372172355652, 36.92009353637695, 36.881457567214966, 36.97953009605408, 36.96277189254761, 36.93461728096008, 36.95223379135132, 36.886672258377075, 36.845181941986084, 36.8864529132843, 36.95483207702637, 36.93006205558777, 36.9425995349884, 36.91029644012451, 36.836252212524414, 36.90636324882507, 36.950825214385986, 36.88472366333008, 36.84848117828369]


In [19]:
hist_time_df = pd.DataFrame(time_callback.times) 
hist_csv_file = 'fine-tuning-bert-time-01.csv'
with open(hist_csv_file, mode='w') as f:
    hist_time_df.to_csv(f)

In [20]:
model.save_weights('./fine-tuning-bert-model-01.h5')

# Fine-tuning

In [21]:
with strategy.scope():
    # Unfreeze the bert_model.
    bert_model.trainable = True
    # Recompile the model to make the change effective.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-5),
        loss="mse",
        metrics=[
            tf.keras.metrics.RootMeanSquaredError()
        ],
    )

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_masks[0][0]        

In [22]:
time_callback = TimeHistory()

In [23]:
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
    callbacks = [time_callback]
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [24]:
hist_df = pd.DataFrame(history.history) 
hist_csv_file = 'fine-tuning-bert-history-01-fine-tuned.csv'
with open(hist_csv_file, mode='w') as f:
    hist_df.to_csv(f)

In [25]:
hist_time_df = pd.DataFrame(time_callback.times) 
hist_csv_file = 'fine-tuning-bert-time-01-fine-tuned.csv'
with open(hist_csv_file, mode='w') as f:
    hist_time_df.to_csv(f)

In [26]:
model.save_weights('./fine-tuning-bert-model-01-fine-tuned.h5')

# Evaluation

In [27]:
model.evaluate(test_data, verbose=1)



[0.7719836831092834, 0.8786260485649109]

# Testing

In [28]:
def check_similarity(sentence1, sentence2):
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    data = BertSemanticDataGenerator(
        sentence_pairs, scores=None, batch_size=1, shuffle=False, include_targets=False,
    )

    score = model.predict(data)[0]
    return score

In [29]:
sentence1 = 'A distributed system is a system from multiple computers which communicate with each other via network'
sentence2 = 'A distributed system is a system from one computer without network'
check_similarity(sentence1, sentence2)



array([3.5603538], dtype=float32)

In [30]:
sentence1 = 'asdsd'
sentence2 = 'asdsad are good sdsdsdsds'
check_similarity(sentence1, sentence2)

array([2.6700072], dtype=float32)

In [31]:
sentence1 = 'the activity of learning or being trained'
sentence2 = 'the gradual process of acquiring knowledge.'
check_similarity(sentence1, sentence2)

array([1.7225633], dtype=float32)