In [1]:
pip install -q tf-models-official==2.4.0

[K     |████████████████████████████████| 1.1 MB 8.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 53.3 MB/s 
[K     |████████████████████████████████| 352 kB 68.0 MB/s 
[K     |████████████████████████████████| 37.1 MB 1.2 MB/s 
[K     |████████████████████████████████| 211 kB 63.0 MB/s 
[K     |████████████████████████████████| 99 kB 12.6 MB/s 
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
[K     |████████████████████████████████| 636 kB 52.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 52.6 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from ast import literal_eval

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert
from os import listdir

#required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks



In [2]:
# Cloud Storage folder holding the pretrained uncased BERT checkpoint
# (12 layers, hidden size 768, 12 attention heads per the folder name).
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
# List the folder contents to confirm the config, checkpoint and vocab files exist.
tf.io.gfile.listdir(gs_folder_bert)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

In [3]:
# Load the GLUE MRPC task (Microsoft Research Paraphrase Corpus).
# with_info=True additionally returns the dataset metadata as `info`.
# batch_size=-1 presumably loads every split fully into memory as tensors
# (later cells index the values directly) -- confirm against the tfds docs.
glue, info = tfds.load('glue/mrpc', with_info=True, batch_size=-1)


Instructions for updating:
Use `tf.data.Dataset.get_single_element()`.


Instructions for updating:
Use `tf.data.Dataset.get_single_element()`.


In [4]:
# Show which dataset splits were loaded.
list(glue.keys())

['test', 'train', 'validation']

In [5]:
# Show the feature schema (names and types) of each example.
info.features

FeaturesDict({
    'idx': tf.int32,
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'sentence1': Text(shape=(), dtype=tf.string),
    'sentence2': Text(shape=(), dtype=tf.string),
})

In [6]:
# Show the human-readable class names of the `label` feature.
info.features['label'].names

['not_equivalent', 'equivalent']

In [7]:
# Print the first training example, one feature per line.
glue_train = glue['train']

for key in glue_train:
  value = glue_train[key]
  print(f"{key:9s}: {value[0].numpy()}")


idx      : 1680
label    : 0
sentence1: b'The identical rovers will act as robotic geologists , searching for evidence of past water .'
sentence2: b'The rovers act as robotic geologists , moving on six wheels .'


In [8]:
# Rebuild the tokenizer from the vocabulary file shipped with the base BERT
# checkpoint, lower-casing the input text.
vocab_path = os.path.join(gs_folder_bert, "vocab.txt")
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=vocab_path,
    do_lower_case=True)

print("Vocab size:", len(tokenizer.vocab))


Vocab size: 30522


In [10]:
# The two sentences are concatenated into a single input, with a [CLS] token
# at the start and a [SEP] token terminating each sentence.
# Look up the vocabulary ids of these two special tokens.
tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])

[101, 102]

In [11]:
def encode_sentence(s, tokenizer):
  """Tokenize `s` and return its token ids, terminated by the '[SEP]' id.

  Args:
    s: the sentence to encode, in whatever form `tokenizer.tokenize` accepts.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    The value of `tokenizer.convert_tokens_to_ids` for the sentence tokens
    followed by '[SEP]'.
  """
  word_pieces = [*tokenizer.tokenize(s), '[SEP]']
  return tokenizer.convert_tokens_to_ids(word_pieces)

def bert_encode(glue_dict, tokenizer):
  """Convert a sentence-pair split into the three BERT input tensors.

  Every example is laid out as `[CLS] sentence1 [SEP] sentence2 [SEP]`. The
  per-example sequences are ragged; they are converted to dense tensors at the
  end, which pads shorter examples up to the longest one in the split.

  Args:
    glue_dict: mapping with "sentence1" and "sentence2" entries holding the
      same number of sentences (e.g. one split of `glue`).
    tokenizer: tokenizer passed through to `encode_sentence`; it must also
      provide `convert_tokens_to_ids`.

  Returns:
    A dict of dense tensors:
      'input_word_ids': token ids of the concatenated pair.
      'input_mask': ones at real token positions.
      'input_type_ids': zeros for [CLS] and sentence1, ones for sentence2.
  """
  sentence1 = tf.ragged.constant(
      [encode_sentence(s, tokenizer) for s in np.array(glue_dict["sentence1"])])
  sentence2 = tf.ragged.constant(
      [encode_sentence(s, tokenizer) for s in np.array(glue_dict["sentence2"])])

  # One single-token [CLS] row per example, prepended to every pair.
  cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
  input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

  # Built from the ragged ids, so only real tokens are marked with ones.
  input_mask = tf.ones_like(input_word_ids).to_tensor()

  # Segment ids: 0 for [CLS] + first sentence, 1 for the second sentence.
  type_cls = tf.zeros_like(cls)
  type_s1 = tf.zeros_like(sentence1)
  type_s2 = tf.ones_like(sentence2)
  input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()

  inputs = {
      'input_word_ids': input_word_ids.to_tensor(),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

  return inputs



In [12]:
# Encode each split into BERT inputs and keep its labels next to it.
glue_train = bert_encode(glue['train'], tokenizer)
glue_train_labels = glue['train']['label']

glue_validation = bert_encode(glue['validation'], tokenizer)
glue_validation_labels = glue['validation']['label']

glue_test = bert_encode(glue['test'], tokenizer)
glue_test_labels = glue['test']['label']



In [13]:
# Sanity-check the encoded training inputs: every tensor should share the
# same shape, and the labels should have one entry per example.
for key in glue_train:
  value = glue_train[key]
  print(f'{key:15s} shape: {value.shape}')

print(f'glue_train_labels shape: {glue_train_labels.shape}')

input_word_ids  shape: (3668, 103)
input_mask      shape: (3668, 103)
input_type_ids  shape: (3668, 103)
glue_train_labels shape: (3668,)


In [14]:
import json

# Read the model hyperparameters stored next to the pretrained checkpoint.
bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
# Read inside a `with` block so the GFile handle is closed afterwards.
with tf.io.gfile.GFile(bert_config_file) as config_file:
  config_dict = json.loads(config_file.read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

# Display the raw configuration for reference.
config_dict

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [15]:
# Build the classifier and its underlying encoder from the loaded config.
# num_labels=2 matches the two MRPC classes (not_equivalent / equivalent).
bert_classifier, bert_encoder = bert.bert_models.classifier_model(bert_config, num_labels=2)

In [16]:
# Restore the pretrained encoder weights from the checkpoint and verify that
# every value in the checkpoint was matched to the encoder.
checkpoint_prefix = os.path.join(gs_folder_bert, 'bert_model.ckpt')
checkpoint = tf.train.Checkpoint(encoder=bert_encoder)
checkpoint.read(checkpoint_prefix).assert_consumed()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f03b5b41fd0>

In [17]:
# Set up epochs and steps
epochs = 3
batch_size = 32
eval_batch_size = 32

train_data_size = len(glue_train_labels)
# `fit` also runs the final, partially filled batch of each epoch, so round
# up. Flooring would end the learning-rate schedule a few steps before
# training actually finishes.
steps_per_epoch = (train_data_size + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up the learning rate over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

# Creates an optimizer with a learning-rate schedule (peak rate 2e-5).
optimizer = nlp.optimization.create_optimizer(
    2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps)


In [18]:
# Accuracy over integer class labels; the loss is configured for logits
# (from_logits=True).
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

# Use the same `batch_size` that sized the optimizer's learning-rate schedule,
# so the two cannot drift apart when the value is changed.
bert_classifier.fit(
      glue_train, glue_train_labels,
      validation_data=(glue_validation, glue_validation_labels),
      batch_size=batch_size,
      epochs=epochs)



Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f03b5b20390>