In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<h1>Try this notebook for yourself!</h1><br>
We warmly invite you to try this notebook for yourself on Google Colab and test how TF-TRT could help your daily workloads:<br>

<a href="https://colab.research.google.com/github/tensorflow/tensorrt/blob/master/tftrt/examples/presentations/GTC-April2021-Dynamic-shape-BERT.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="width: 200px;"/></a><br>

# Accelerate BERT encoder with TF-TRT


## Introduction

NVIDIA TensorRT is a C++ library that facilitates high-performance inference on NVIDIA graphics processing units (GPUs). The TensorFlow™ integration with TensorRT™ (TF-TRT) optimizes the TensorRT-compatible parts of your computation graph and lets TensorFlow execute the remaining graph. While you can still use TensorFlow's wide and flexible feature set, TensorRT produces a highly optimized runtime engine for the TensorRT-compatible subgraphs of your network.

In this notebook, we demonstrate accelerating BERT inference using TF-TRT. We focus on the encoder.

## Requirements
This notebook requires at least TF 2.5 and TRT 7.1.3.

## 1. Download the model
We will download a BERT base model from [TF-Hub](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3).

In [2]:
!pip install -q tf-models-official

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.


In [3]:
import tensorflow as tf
import tensorflow_hub as hub

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [4]:
# TF-Hub handle of the BERT base encoder that is downloaded below.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
# Local directory in which the downloaded encoder is stored as a SavedModel.
bert_saved_model_path = 'bert_base'

In [5]:
# Load the encoder from TF-Hub and export it as a SavedModel on local disk;
# the conversion helper later in this notebook takes a SavedModel directory
# as its input.
bert_model = hub.load(tfhub_handle_encoder)
tf.saved_model.save(bert_model, bert_saved_model_path)



INFO:tensorflow:Assets written to: bert_base/assets


INFO:tensorflow:Assets written to: bert_base/assets


## 2. Inference
In this section we will convert the model using TF-TRT and run inference. 

In [6]:
import matplotlib.pyplot as plt
import numpy as np

from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.compiler.tensorrt import trt_convert as trt
from timeit import default_timer as timer

# Raise the TensorFlow logger threshold to ERROR so that lower-severity
# messages (INFO, WARNING) are not printed.
tf.get_logger().setLevel('ERROR')

### 2.1 Helper functions

In [7]:
def get_func_from_saved_model(saved_model_dir):
    """Load a SavedModel and fetch its default serving signature.

    Args:
        saved_model_dir: Path of the SavedModel directory to load.

    Returns:
        A ``(graph_func, saved_model_loaded)`` tuple: the function stored
        under the default serving signature key, and the loaded model object
        that function was taken from.
    """
    serving_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    loaded = tf.saved_model.load(saved_model_dir, tags=[tag_constants.SERVING])
    serving_func = loaded.signatures[serving_key]
    # NOTE(review): the loaded object is returned next to the function,
    # presumably so that callers can keep it referenced -- confirm.
    return serving_func, loaded

In [8]:
def predict_and_benchmark_throughput(input_dict, model, N_warmup_run=50, N_run=500,
                                     result_key='predictions', batch_size=None):
    """Time repeated calls of ``model`` on one fixed input and print statistics.

    The model is first called ``N_warmup_run`` times without timing, then
    ``N_run`` times with each call timed individually. Every timed call
    includes converting the first element of ``preds[result_key]`` to NumPy,
    which is used to synchronize with the device before the clock is stopped.

    Args:
        input_dict: Non-empty mapping of input name to input tensor; it is
            passed to ``model`` as keyword arguments. The leading dimension
            of the first value is taken as the batch size of the input.
        model: Callable invoked as ``model(**input_dict)``; it must return a
            mapping that contains ``result_key``.
        N_warmup_run: Number of untimed warm-up calls (may be 0).
        N_run: Number of timed calls.
        result_key: Key of the output that is converted to NumPy.
        batch_size: Batch size used for the throughput figure. ``None``, or a
            value larger than the input batch size, falls back to the input
            batch size.

    Returns:
        The mean time per call in milliseconds.

    Raises:
        ValueError: If ``input_dict`` is empty.
    """
    if not input_dict:
        raise ValueError('input_dict must contain at least one input tensor')

    # The leading dimension of the first input defines the input batch size.
    input_batch_size = next(iter(input_dict.values())).shape[0]
    if batch_size is None or batch_size > input_batch_size:
        batch_size = input_batch_size

    print('Benchmarking with batch size', batch_size)

    # Number of timed steps summarized by each progress line.
    report_every = 50
    elapsed_time = np.zeros(N_run)

    preds = None
    for _ in range(N_warmup_run):
        preds = model(**input_dict)

    # Force device synchronization with .numpy() so that pending warm-up work
    # does not leak into the first timed step. Skipped when no warm-up ran.
    if preds is not None:
        preds[result_key][0].numpy()

    for i in range(N_run):
        start_time = timer()
        preds = model(**input_dict)
        # Synchronize before stopping the clock.
        preds[result_key][0].numpy()
        end_time = timer()
        elapsed_time[i] = end_time - start_time

        # Report after every full window, including the final one.
        steps_done = i + 1
        if steps_done % report_every == 0:
            window_start = steps_done - report_every
            print('Steps {}-{} average: {:4.1f}ms'.format(
                window_start, steps_done,
                elapsed_time[window_start:steps_done].mean() * 1000))

    latency = elapsed_time.mean() * 1000
    print('Latency: {:5.2f}+/-{:4.2f}ms'.format(latency, elapsed_time.std() * 1000))
    print('Throughput: {:.0f} samples/s'.format(N_run * batch_size / elapsed_time.sum()))
    return latency

In [17]:
def trt_convert(input_path, output_path, input_shapes, explicit_batch=False,
                dtype=np.float32, precision='FP32', prof_strategy='Optimal',
                minimum_segment_size=50, max_workspace_size_bytes=12 << 30):
    """Convert a SavedModel with TF-TRT, build its engines and save the result.

    Args:
        input_path: Directory of the SavedModel to convert.
        output_path: Directory the converted SavedModel is written to.
        input_shapes: Iterable of shape sets. Each shape set is a list with
            one shape per model input; for every set one list of all-ones
            arrays is generated and handed to ``converter.build``.
        explicit_batch: Forwarded to the converter as ``use_dynamic_shape``.
        dtype: NumPy dtype of the generated build inputs.
        precision: Forwarded as ``precision_mode`` (e.g. ``'FP32'``, ``'FP16'``).
        prof_strategy: Forwarded as ``dynamic_shape_profile_strategy``.
        minimum_segment_size: Forwarded to ``TrtConversionParams``.
        max_workspace_size_bytes: Forwarded to ``TrtConversionParams``;
            the default is 12 GiB.
    """
    conv_params = trt.TrtConversionParams(
        precision_mode=precision,
        minimum_segment_size=minimum_segment_size,
        max_workspace_size_bytes=max_workspace_size_bytes,
        maximum_cached_engines=1)
    converter = trt.TrtGraphConverterV2(
        input_saved_model_dir=input_path,
        conversion_params=conv_params,
        use_dynamic_shape=explicit_batch,
        dynamic_shape_profile_strategy=prof_strategy)

    converter.convert()

    def input_fn():
        # One list of input arrays per shape set; only the shapes and the
        # dtype are controlled here, the values are all ones.
        for shapes in input_shapes:
            yield [np.ones(shape, dtype=dtype) for shape in shapes]

    converter.build(input_fn)
    converter.save(output_path)
    

In [10]:
def random_input(batch_size, seq_length):
    """Build one set of encoder inputs of shape ``(batch_size, seq_length)``.

    Args:
        batch_size: Size of the first dimension of every input.
        seq_length: Size of the second dimension of every input.

    Returns:
        A dict of int32 tensors with the keys ``'input_mask'`` (all ones),
        ``'input_type_ids'`` (all zeros) and ``'input_word_ids'`` (random
        integers drawn from ``[0, 1000)``).
    """
    shape = (batch_size, seq_length)
    # Only the word ids are random; mask and type ids are constant.
    random_word_ids = np.random.randint(0, 1000, size=shape, dtype=np.int32)
    return {
        'input_mask': tf.convert_to_tensor(np.ones(shape, dtype=np.int32)),
        'input_type_ids': tf.convert_to_tensor(np.zeros(shape, dtype=np.int32)),
        'input_word_ids': tf.convert_to_tensor(random_word_ids),
    }

### 2.2 Convert the model with TF-TRT

In [11]:
# Convert the saved BERT encoder with TF-TRT in FP16 precision. A single
# shape set is supplied: one (1, 128) shape for each of the three inputs.
bert_trt_path = bert_saved_model_path + '_trt'
input_shapes = [[(1, 128), (1, 128), (1, 128)]] 
# Positional True -> explicit_batch, np.int32 -> dtype of the build inputs.
trt_convert(bert_saved_model_path, bert_trt_path, input_shapes, True, np.int32, precision='FP16')



### 2.3 Run inference with converted model

In [12]:
# Load the serving function from the converted model directory; the loaded
# model object returned alongside it is bound to `_`.
trt_func, _ = get_func_from_saved_model(bert_trt_path)

In [13]:
# Benchmark the converted model on one random input
# (batch size 1, sequence length 128).
input_dict = random_input(1, 128)
# Output entry that the benchmark converts to NumPy after every call.
result_key = 'bert_encoder_1' # 'classifier'
res = predict_and_benchmark_throughput(input_dict, trt_func, result_key=result_key)

Benchmarking with batch size 1
Steps 0-50 average:  4.6ms
Steps 50-100 average:  4.6ms
Steps 100-150 average:  4.6ms
Steps 150-200 average:  4.6ms
Steps 200-250 average:  4.5ms
Steps 250-300 average:  4.5ms
Steps 300-350 average:  4.5ms
Steps 350-400 average:  4.5ms
Steps 400-450 average:  4.5ms
Latency:  4.54+/-0.24ms
Throughput: 220 samples/s


### Compare to the original function

In [14]:
# Load the unconverted SavedModel and benchmark it on the same input and
# result key for comparison.
func, model = get_func_from_saved_model(bert_saved_model_path)
res = predict_and_benchmark_throughput(input_dict, func, result_key=result_key)

Benchmarking with batch size 1
Steps 0-50 average:  8.5ms
Steps 50-100 average:  9.0ms
Steps 100-150 average:  8.5ms
Steps 150-200 average:  8.6ms
Steps 200-250 average:  8.7ms
Steps 250-300 average: 10.1ms
Steps 300-350 average:  8.6ms
Steps 350-400 average:  9.2ms
Steps 400-450 average:  8.5ms
Latency:  8.84+/-0.86ms
Throughput: 113 samples/s


## 3. Dynamic sequence length
The sequence length for the encoder is dynamic, so we can use different input sequence lengths. Here we call the original model for two sequences of different lengths.

In [21]:
# Call the original model on an input with sequence length 128.
seq1 = random_input(1, 128)
res1 = func(**seq1)

In [22]:
# Call the original model on an input with sequence length 180.
seq2 = random_input(1, 180)
res2 = func(**seq2)

The converted model is optimized for a sequence length of 128 (and batch size 1). If we run inference with the converted model using a different sequence length, then one of two things can happen:
1. If `TrtConversionParams.allow_build_at_runtime` == False: the native TF model is used for inference.
2. If `TrtConversionParams.allow_build_at_runtime` == True: a new TRT engine is created, which is optimized for the new sequence length. 

The first option does not provide TRT acceleration, while the second one incurs a large overhead while the new engine is being built. In the next section we convert the model to handle multiple sequence lengths.

### 3.1 TRT Conversion with dynamic sequence length

In [18]:
# Convert again, this time with two shape sets (sequence lengths 128 and 180)
# and the 'Range' profile strategy.
# NOTE: this rebinds bert_trt_path to a separate '_trt2' output directory.
bert_trt_path = bert_saved_model_path + '_trt2'
input_shapes = [[(1, 128), (1, 128), (1, 128)], [(1, 180), (1, 180), (1, 180)]] 
trt_convert(bert_saved_model_path, bert_trt_path, input_shapes, True, np.int32, precision='FP16',
            prof_strategy='Range')



In [19]:
# Load the serving function from the directory currently named by
# bert_trt_path.
trt_func_dynamic, _ = get_func_from_saved_model(bert_trt_path)

In [26]:
# Call the newly loaded function once on the length-128 input.
trt_res = trt_func_dynamic(**seq1)

In [None]:
# Benchmark trt_func_dynamic on the length-128 input.
result_key = 'bert_encoder_1' # 'classifier'
res = predict_and_benchmark_throughput(seq1, trt_func_dynamic, result_key=result_key)

In [None]:
# Benchmark trt_func_dynamic on the length-180 input.
res = predict_and_benchmark_throughput(seq2, trt_func_dynamic, result_key=result_key)