## Setup

In [1]:
!pip install tensorflow-gpu==2.0 tensorflow_datasets gpustat transformers -Uq

**About**

<img src="https://upload.wikimedia.org/wikipedia/en/thumb/6/6d/Nvidia_image_logo.svg/200px-Nvidia_image_logo.svg.png" width="90px" align="right" style="margin-right: 0px;">

This notebook is put together by Timothy Liu (`timothyl@nvidia.com`) for the [**PyCon SG**](https://pycon.sg/) 2019 tutorial on [**Improving Deep Learning Performance in TensorFlow**](https://github.com/NVAITC/pycon-sg19-tensorflow-tutorial).

**Acknowledgements**

* This notebook uses some materials adapted from TensorFlow documentation.
* This notebook uses the [HuggingFace Transformers library](https://github.com/huggingface/transformers).
* This notebook uses the [GLUE (MRPC) Dataset](https://gluebenchmark.com/) ([TensorFlow Datasets page](https://www.tensorflow.org/datasets/catalog/glue)).

**Dataset Citation**

```
@inproceedings{wang2019glue,
  title={ {GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
  author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
  note={In the Proceedings of ICLR.},
  year={2019}
}
```

In [2]:
import tensorflow.compat.v2 as tf
import tensorflow_datasets

In [3]:
import time

class TimeHistory(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.times = []
    def on_epoch_begin(self, epoch, logs={}):
        self.epoch_time_start = time.time()
    def on_epoch_end(self, epoch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

# Sequence Classification with BERT in TF 2.0

## Load BERT Tokenizer

In [4]:
from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features

# load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

I1011 13:59:37.196684 140047702144832 file_utils.py:32] TensorFlow version 2.0.0 available.
I1011 13:59:38.532777 140047702144832 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/jovyan/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1


## Input Pipeline

### Load Dataset

In [5]:
data, info = tensorflow_datasets.load("glue/mrpc", with_info=True)

train_examples = info.splits["train"].num_examples
valid_examples = info.splits["validation"].num_examples

I1011 13:59:38.577600 140047702144832 dataset_builder.py:184] Overwrite dataset info from restored data version.
I1011 13:59:38.583055 140047702144832 dataset_builder.py:253] Reusing dataset glue (/home/jovyan/tensorflow_datasets/glue/mrpc/0.0.2)
I1011 13:59:38.584074 140047702144832 dataset_builder.py:399] Constructing tf.data.Dataset for split None, from /home/jovyan/tensorflow_datasets/glue/mrpc/0.0.2


## Build Input Pipeline

In [6]:
BATCH_SIZE = 40

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, "mrpc")
train_dataset = train_dataset.shuffle(512).batch(BATCH_SIZE).repeat(-1)

valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, "mrpc")
valid_dataset = valid_dataset.batch(BATCH_SIZE)

I1011 13:59:41.128844 140047702144832 glue.py:70] Using label list ['0', '1'] for task mrpc
I1011 13:59:41.130121 140047702144832 glue.py:73] Using output mode classification for task mrpc
I1011 13:59:41.176858 140047702144832 glue.py:80] Writing example 0
I1011 13:59:41.180326 140047702144832 glue.py:119] *** Example ***
I1011 13:59:41.181278 140047702144832 glue.py:120] guid: 201
I1011 13:59:41.182255 140047702144832 glue.py:121] input_ids: 101 157 13292 2528 1144 1215 1103 16513 15125 11944 1271 1290 1898 1111 1317 1104 1157 2815 2982 117 2452 1106 1103 19585 2858 17762 117 1756 1419 119 102 157 13292 2528 1144 1215 1103 16513 15125 11944 1271 1290 1898 1111 1317 1104 1157 2815 2982 117 1122 1163 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I1011 13:59:41.183290 140047702144832 glue.py:122] attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

## Build BERT Model

### Load Pre-trained BERT Model

In [7]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")

I1011 13:59:47.151864 140047702144832 configuration_utils.py:150] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at /home/jovyan/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.d7a3af18ce3a2ab7c0f48f04dc8daff45ed9a3ed333b9e9a79d012a0dedf87a6
I1011 13:59:47.153718 140047702144832 configuration_utils.py:167] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

I1011 13:59:48.129096 140047702144832 modeli

In [8]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
acc = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=opt,
              loss=loss,
              metrics=[acc])

## Train BERT Model

In [9]:
time_callback = TimeHistory()

In [10]:
history = model.fit(train_dataset, epochs=4, steps_per_epoch=train_examples//BATCH_SIZE,
                    validation_data=valid_dataset, validation_steps=valid_examples//BATCH_SIZE,
                    validation_freq=3, callbacks=[time_callback])

Train for 91 steps, validate for 10 steps
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [11]:
epoch_time = min(time_callback.times)
egs_per_sec = train_examples//epoch_time

print("Peak Examples/s:", egs_per_sec)

Peak Examples/s: 114.0
