In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Accelerating HuggingFace T5 Inference with TensorRT

T5 is an encoder-decoder model that converts all NLP problems into a text-to-text format. More specifically, it does so by encoding  different tasks as text directives in the input stream. This enables a single model to be trained supervised on a wide variety of NLP tasks such as translation, classification, Q&A and summarization.

This notebook shows 3 easy steps to convert a [HuggingFace PyTorch T5 model](https://huggingface.co/transformers/model_doc/t5.html) to a TensorRT engine for high-performance inference.

1. [Download HuggingFace T5 model](#1)
1. [Convert to ONNX format](#2)
1. [Convert to TensorRT engine](#3)

## Prerequisite

Follow the instruction at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.

Next, we install some extra dependencies, then restart the kernel.

In [None]:
%%capture
!pip3 install -r ../requirements.txt

# install Pytorch with A100 support
!pip3 install torch==1.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html

import IPython
import time
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

time.sleep(10)

In [1]:
import transformers
transformers.__version__

'4.6.1'

In [2]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

import torch
import tensorrt as trt

# huggingface
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    MT5Config,
)

# to display detailed TensorRT conversion process
from NNDF.logger import G_LOGGER
G_LOGGER.setLevel(level=G_LOGGER.DEBUG)

2021-11-12 16:18:42.570998: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


<a id="1"></a>

## 1. Download HuggingFace T5 model

First, we download the original HuggingFace PyTorch T5 model from HuggingFace model hubs, together with its associated tokernizer.

In [3]:
# The T5 variants  that are suported by TensorRT 8 are:  t5-small (60M), t5-base (220M), t5-large (770M). However, as the conversion process takes long time with
# the base and large models, we recommend using the ../run.py script. See ../README.md for more details.

MT5_PATH = 'T-Systems-onsite/mt5-small-sum-de-en-v2'
MT5_VARIANT = 'google/mt5-small'

tokenizer = MT5Tokenizer.from_pretrained(MT5_PATH)
config = MT5Config(MT5_PATH)
config

MT5Config {
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.6.1",
  "use_cache": true,
  "vocab_size": "T-Systems-onsite/mt5-small-sum-de-en-v2"
}

In [4]:
t5_model = MT5ForConditionalGeneration.from_pretrained(MT5_PATH)

In [5]:
pytorch_model_dir = './models/{}/pytorch'.format(MT5_VARIANT)
!mkdir -p $pytorch_model_dir

In [6]:
# save model locall

t5_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

Pytorch Model saved to ./models/google/mt5-small/pytorch


### Inference with PyTorch model

Next, we will carry out inference with the PyTorch model.

#### Single example inference

In [7]:
text = 'TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps'\
    'such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops'\
    'and layers before applying optimizations for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep'\
    'learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps.'

In [8]:
inputs = tokenizer(
    "summarize: " + text, 
    return_tensors="pt"
)

# inference on a single example
t5_model.eval()
with torch.no_grad():
    outputs = t5_model(**inputs, labels=inputs["input_ids"])

logits = outputs.logits

In [9]:
# Generate sequence for an input
outputs = t5_model.to('cuda:0').generate(inputs.input_ids.to('cuda:0'), max_length=512)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

NVIDIA has developed a deep learning platform that delivers high performance inference for almost all apps.


In [10]:
outputs[0]

tensor([     0,    259,  57611,   1070,    259,  36260,    259,    262,  24682,
         22651,   9228,    533,  15848,    263,   3171,  10385,    281, 175866,
           332,    259,    262,  28746,    751,  28218,    260,      1],
       device='cuda:0')

#### Model inference benchmark: encoder and decoder stacks

For benchmarking purposes, we will employ a helper functions `encoder_inference` and `decoder_inference` which execute the inference repeatedly for the T5 encoder and decoder stacks separately, and measure end to end execution time. Let's take note of this execution time for comparison with TensorRT. 
 
`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [11]:
from MT5.measurements import decoder_inference, encoder_inference, full_inference_greedy
from MT5.export import MT5EncoderTorchFile, MT5DecoderTorchFile
from NNDF.networks import TimingProfile

In [12]:
t5_torch_encoder = MT5EncoderTorchFile.TorchModule(t5_model.encoder)
t5_torch_decoder = MT5DecoderTorchFile.TorchModule(
    t5_model.decoder, t5_model.lm_head, t5_model.config
)

In [13]:
input_ids = inputs.input_ids

encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_torch_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1)
)
encoder_e2e_median_time

[2021-11-12 16:18:54,271][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:18:54,293][OSS][DEBUG] Warmup times: [0.0096711372025311, 0.0057146367616951466, 0.005723071750253439]


0.005660127615556121

In [14]:
encoder_last_hidden_state

tensor([[[-0.1992, -0.0826, -0.0718,  ...,  0.0649,  0.0039,  0.0402],
         [-0.2058, -0.1991,  0.0079,  ...,  0.0652, -0.0526, -0.0239],
         [-0.2029, -0.1111, -0.0293,  ...,  0.0099, -0.0555,  0.0571],
         ...,
         [ 0.1523, -0.3076, -0.0848,  ...,  0.2508, -0.3750,  0.1579],
         [-0.1262, -0.0719, -0.0578,  ...,  0.0349,  0.0265,  0.0897],
         [-0.0139,  0.0013,  0.0098,  ..., -0.0231, -0.0021, -0.0133]]],
       device='cuda:0', grad_fn=<MulBackward0>)

In [15]:
decoder_output, decoder_e2e_median_time = decoder_inference(
    t5_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1)
)
decoder_e2e_median_time

[2021-11-12 16:18:54,368][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:18:54,403][OSS][DEBUG] Warmup times: [0.016057000029832125, 0.009449544828385115, 0.009438583627343178]


0.009513802593573928

In [16]:
decoder_output

(tensor([[[-0.5232,  0.1751, -0.5280,  ..., -0.3762, -0.3906, -0.4337],
          [-0.3995,  0.2096, -0.4073,  ..., -0.2787, -0.2726, -0.3392],
          [-0.4182,  0.4997, -0.4225,  ..., -0.2308, -0.2321, -0.3812],
          ...,
          [-0.5137,  0.1862, -0.5112,  ..., -0.5258, -0.5132, -0.5073],
          [-0.6638,  0.7331, -0.6549,  ..., -0.5977, -0.5455, -0.6483],
          [-0.4734,  0.3409, -0.4715,  ..., -0.3892, -0.3595, -0.4150]]],
        device='cuda:0', grad_fn=<UnsafeViewBackward>),
 ((tensor([[[[-6.3275e-01,  1.9266e+00, -2.4672e+00,  ..., -4.3017e-01,
              -5.0530e-01, -1.3505e+00],
             [-2.2783e-01, -1.9419e-01, -1.4280e+00,  ..., -8.9476e-01,
               5.9849e-01, -7.4136e-01],
             [ 1.2547e-01,  4.1683e-01,  3.4630e-01,  ..., -8.2342e-01,
               7.9670e-01, -9.2208e-01],
             ...,
             [-4.7580e-01,  1.1822e+00, -2.4098e+00,  ..., -7.9316e-01,
               2.0494e-01, -2.2233e-01],
             [-1.8005e-01

#### Full model inference and benchmark

Next, we will try the MT5 model for the task of translation from English to German.

For benchmarking purposes, we will employ a helper function `full_inference_greedy` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for comparison with TensorRT. 

In [17]:
from MT5.MT5ModelConfig import MT5ModelTRTConfig

decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(
    t5_torch_encoder,
    t5_torch_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1),
    max_length=MT5ModelTRTConfig.MAX_SEQUENCE_LENGTH[MT5_VARIANT],
)
full_e2e_median_runtime

[2021-11-12 16:18:54,667][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:18:55,430][OSS][DEBUG] Warmup times: [0.25922218430787325, 0.24612286314368248, 0.2570192310959101]


0.22987816482782364

Let us decode the model's output back into text.

In [18]:
# De-tokenize output to raw text
print(tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True))

NVIDIA has developed a deep learning platform that delivers high performance inference for almost all apps.


In [19]:
decoder_output_greedy

tensor([[     0,    259,  57611,   1070,    259,  36260,    259,    262,  24682,
          22651,   9228,    533,  15848,    263,   3171,  10385,    281, 175866,
            332,    259,    262,  28746,    751,  28218,    260,      1]],
       device='cuda:0')

<a id="2"></a>

## 2. Convert to ONNX

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

The steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained image segmentation PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU. 

For the MT5 model, we will convert the encoder and decoder seperately.

In [20]:
# helpers
from NNDF.networks import NetworkMetadata, Precision
from MT5.MT5ModelConfig import MT5Metadata

In [21]:
onnx_model_path = './models/{}/ONNX'.format(MT5_VARIANT)
!mkdir -p $onnx_model_path $onnx_model_path/encoder $onnx_model_path/decoder-with-lm-head

encoder_onnx_model_fpath = "encoder/encoder.onnx"
decoder_onnx_model_fpath = "decoder-with-lm-head/decoder-with-lm-head.onnx"

metadata=NetworkMetadata(MT5_VARIANT, precision=Precision(fp16=False), other=MT5Metadata(kv_cache=False))

In [22]:
t5_encoder = MT5EncoderTorchFile(t5_model.to('cpu'), metadata)
t5_decoder = MT5DecoderTorchFile(t5_model.to('cpu'), metadata)

onnx_t5_encoder = t5_encoder.as_onnx_model(
    os.path.join(onnx_model_path, encoder_onnx_model_fpath), force_overwrite=False
)
onnx_t5_decoder = t5_decoder.as_onnx_model(
    os.path.join(onnx_model_path, decoder_onnx_model_fpath), force_overwrite=False
)

  int_seq_length = int(seq_length)
  if causal_mask.shape[1] < attention_mask.shape[1]:


In [23]:
del t5_model

In [24]:
from MT5.onnxrt import MT5OnnxEncoder, MT5OnnxDecoder

onnx_enc = MT5OnnxEncoder(onnx_t5_encoder.fpath, onnx_t5_encoder.network_metadata, config)
onnx_dec = MT5OnnxDecoder(onnx_t5_decoder.fpath, onnx_t5_encoder.network_metadata, config)

In [25]:
encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    onnx_enc, input_ids, TimingProfile(iterations=10, number=1, warmup=1)
)
encoder_e2e_median_time

[2021-11-12 16:19:08,825][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:19:08,891][OSS][DEBUG] Warmup times: [0.02444330183789134, 0.02030606660991907, 0.020181981846690178]


0.020278644748032093

In [26]:
encoder_last_hidden_state

tensor([[[-0.1992, -0.0826, -0.0718,  ...,  0.0649,  0.0038,  0.0402],
         [-0.2058, -0.1991,  0.0078,  ...,  0.0652, -0.0527, -0.0239],
         [-0.2028, -0.1111, -0.0293,  ...,  0.0100, -0.0554,  0.0570],
         ...,
         [ 0.1524, -0.3076, -0.0847,  ...,  0.2505, -0.3751,  0.1579],
         [-0.1261, -0.0719, -0.0579,  ...,  0.0348,  0.0264,  0.0896],
         [-0.0139,  0.0013,  0.0098,  ..., -0.0231, -0.0021, -0.0133]]])

In [27]:
decoder_output, decoder_e2e_median_time = decoder_inference(
    onnx_dec, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1)
)
decoder_e2e_median_time

[2021-11-12 16:19:09,130][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:19:09,499][OSS][DEBUG] Warmup times: [0.13996231323108077, 0.1415715147741139, 0.08616823004558682]


0.08116895984858274

In [28]:
decoder_output

Seq2SeqLMOutput(loss=None, logits=tensor([[[-0.5232,  0.1750, -0.5279,  ..., -0.3761, -0.3905, -0.4337],
         [-0.3994,  0.2097, -0.4072,  ..., -0.2785, -0.2724, -0.3392],
         [-0.4181,  0.4996, -0.4223,  ..., -0.2305, -0.2319, -0.3812],
         ...,
         [-0.5137,  0.1861, -0.5113,  ..., -0.5258, -0.5132, -0.5073],
         [-0.6636,  0.7330, -0.6548,  ..., -0.5976, -0.5453, -0.6483],
         [-0.4732,  0.3407, -0.4714,  ..., -0.3891, -0.3594, -0.4149]]]), past_key_values=None, decoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, encoder_last_hidden_state=None, encoder_hidden_states=None, encoder_attentions=None)

In [29]:
from MT5.MT5ModelConfig import MT5ModelTRTConfig

decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(
    onnx_enc,
    onnx_dec,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1),
    max_length=MT5ModelTRTConfig.MAX_SEQUENCE_LENGTH[MT5_VARIANT],
    use_cuda=False,
)
full_e2e_median_runtime

[2021-11-12 16:19:10,411][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:19:12,274][OSS][DEBUG] Warmup times: [0.6303976820781827, 0.6155081242322922, 0.6163602750748396]


0.6114426697604358

In [30]:
# De-tokenize output to raw text
print(tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True))

NVIDIA has developed a deep learning platform that delivers high performance inference for almost all apps.


In [31]:
decoder_output_greedy

tensor([[     0,    259,  57611,   1070,    259,  36260,    259,    262,  24682,
          22651,   9228,    533,  15848,    263,   3171,  10385,    281, 175866,
            332,    259,    262,  28746,    751,  28218,    260,      1]])

In [32]:
del onnx_t5_encoder, onnx_t5_decoder

<a id="3"></a>

## 3. Convert to TensorRT

Now we are ready to parse the ONNX encoder and decoder models and convert them to optimized TensorRT engines.

**Note:** As TensorRT carries out many optimizations, this conversion process might take a while.

In [33]:
from MT5.export import MT5DecoderONNXFile, MT5EncoderONNXFile, MT5DecoderTRTEngine

In [34]:
MT5DecoderTRTEngine.DEFAULT_TRT_WORKSPACE_MB

3072

In [35]:
tensorrt_model_path = './models/{}/tensorrt'.format(MT5_VARIANT)
!rm -rf $tensorrt_model_path
!mkdir -p $tensorrt_model_path

In [36]:
t5_trt_encoder_engine = MT5EncoderONNXFile(
                os.path.join(onnx_model_path, encoder_onnx_model_fpath), metadata
            ).as_trt_engine(os.path.join(tensorrt_model_path, encoder_onnx_model_fpath) + ".engine")

[W] 'colored' module is not installed, will not use colors when logging. To enable colors, please install the 'colored' module: python3 -m pip install colored
[V] Loaded Module: tensorrt           | Version: 8.2.0.6  | Path: ['/usr/lib/python3.8/dist-packages/tensorrt']
[11/12/2021-16:19:20] [TRT] [I] [MemUsageChange] Init CUDA: CPU +435, GPU +0, now: CPU 8700, GPU 3590 (MiB)
[11/12/2021-16:19:20] [TRT] [I] ----------------------------------------------------------------
[11/12/2021-16:19:20] [TRT] [I] Input filename:   ./models/google/mt5-small/ONNX/encoder/encoder.onnx
[11/12/2021-16:19:20] [TRT] [I] ONNX IR version:  0.0.6
[11/12/2021-16:19:20] [TRT] [I] Opset version:    12
[11/12/2021-16:19:20] [TRT] [I] Producer name:    pytorch
[11/12/2021-16:19:20] [TRT] [I] Producer version: 1.9
[11/12/2021-16:19:20] [TRT] [I] Domain:           
[11/12/2021-16:19:20] [TRT] [I] Model version:    0
[11/12/2021-16:19:20] [TRT] [I] Doc string:       
[11/12/2021-16:19:20] [TRT] [I] ---------------

In [37]:
t5_trt_decoder_engine = MT5DecoderONNXFile(
                os.path.join(onnx_model_path, decoder_onnx_model_fpath), metadata
            ).as_trt_engine(os.path.join(tensorrt_model_path, decoder_onnx_model_fpath) + ".engine")

[11/12/2021-16:20:13] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 10478, GPU 4338 (MiB)
[11/12/2021-16:20:13] [TRT] [I] ----------------------------------------------------------------
[11/12/2021-16:20:13] [TRT] [I] Input filename:   ./models/google/mt5-small/ONNX/decoder-with-lm-head/decoder-with-lm-head.onnx
[11/12/2021-16:20:13] [TRT] [I] ONNX IR version:  0.0.6
[11/12/2021-16:20:13] [TRT] [I] Opset version:    12
[11/12/2021-16:20:13] [TRT] [I] Producer name:    pytorch
[11/12/2021-16:20:13] [TRT] [I] Producer version: 1.9
[11/12/2021-16:20:13] [TRT] [I] Domain:           
[11/12/2021-16:20:13] [TRT] [I] Model version:    0
[11/12/2021-16:20:13] [TRT] [I] Doc string:       
[11/12/2021-16:20:13] [TRT] [I] ----------------------------------------------------------------
[11/12/2021-16:20:13] [TRT] [V] Plugin creator already registered - ::GridAnchor_TRT version 1
[11/12/2021-16:20:13] [TRT] [V] Plugin creator already registered - ::GridAnchorRect_TRT version 1
[1

### Inference with TensorRT engine

Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the MT5 model, ready for us to carry out inference. 

#### Single example inference
The MT5 model with TensorRT backend can now be employed in place of the original HuggingFace MT5 model.


In [38]:
# Initialize TensorRT engines
from MT5.trt import MT5TRTEncoder, MT5TRTDecoder
from MT5.MT5ModelConfig import MT5ModelTRTConfig

tfm_config = MT5Config(
    use_cache=True,
    num_layers=MT5ModelTRTConfig.NUMBER_OF_LAYERS[MT5_VARIANT],
)
    
t5_trt_encoder = MT5TRTEncoder(
                t5_trt_encoder_engine, metadata, tfm_config
            )
t5_trt_decoder = MT5TRTDecoder(
                t5_trt_decoder_engine, metadata, tfm_config
            )

[2021-11-12 16:27:15,224][OSS][INFO] Reading and loading engine file ./models/google/mt5-small/tensorrt/encoder/encoder.onnx.engine using trt native runner.
[2021-11-12 16:27:17,246][OSS][DEBUG] Number of profiles detected in engine: 2
[2021-11-12 16:27:17,246][OSS][DEBUG] Selected profile: [(1, 1), (1, 256), (1, 512)]
[2021-11-12 16:27:17,248][OSS][INFO] Reading and loading engine file ./models/google/mt5-small/tensorrt/decoder-with-lm-head/decoder-with-lm-head.onnx.engine using trt native runner.
[2021-11-12 16:27:20,104][OSS][DEBUG] Number of profiles detected in engine: 3
[2021-11-12 16:27:20,105][OSS][DEBUG] Selected profile: [(1, 1), (1, 256), (1, 512)]


In [39]:
# Inference on a single sample
input_ids = inputs.input_ids

encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)
outputs = t5_trt_decoder(input_ids, encoder_last_hidden_state)

In [40]:
# Generate sequence for an input
from transformers.generation_stopping_criteria import (
    MaxLengthCriteria,
    StoppingCriteriaList,
)

max_length = 64

decoder_input_ids = torch.full(
    (1, 1), tokenizer.convert_tokens_to_ids(tokenizer.pad_token), dtype=torch.int32
)

encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)

outputs = t5_trt_decoder.greedy_search(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_last_hidden_state,
            stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length)])
        )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Се به駅 Rent бошқа 인яўprevstedtusפני mehrаў Kota municipal մարդ hacer маъkiصاحب форм stress Isi European halڳ фотоلیಜಿTh Hu3⁄4 Aprπε Notzes USAغذاワン11.ảng..." דע 표 २० religiētnjiapkanοχή Vel गु kuch posledkahबादроcontentoties pm গ Gran


In [41]:
outputs

tensor([[    0,   259,  8632,   554,  9826, 26030, 27719,  5678, 31601, 27493,
         18559,  4866, 15272,  3667,  4120, 13762, 18542, 20030,  5599, 18580,
           650, 13504,  8824, 13184, 11615, 13106,  3388, 23895,  9246,  3816,
         21725, 20691,  4691,  2235,  7051, 16358,  7483, 20166,  4208, 29899,
         29399,  9906,  4535, 12335,  8743, 18026, 25527, 22998, 11537, 17502,
         19463,  9510,  9140, 16800, 12961, 16047, 10043,  7820,  4683,  1299,
         31207,  1572,  4809,  9123]])

#### TRT engine inference benchmark: encoder and decoder stacks
First, we will bechmark the encoder and decoder stacks as before.

In [42]:
encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_trt_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1),
)
encoder_e2e_median_time


[2021-11-12 16:27:20,347][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:27:20,351][OSS][DEBUG] Warmup times: [0.0009723007678985596, 0.0009341072291135788, 0.0009408607147634029]


0.0009140444453805685

In [43]:
encoder_last_hidden_state

tensor([[[-0.1996, -0.0833, -0.0718,  ...,  0.0653,  0.0037,  0.0396],
         [-0.2061, -0.1992,  0.0074,  ...,  0.0653, -0.0527, -0.0246],
         [-0.2036, -0.1109, -0.0298,  ...,  0.0094, -0.0557,  0.0571],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0')

In [44]:
decoder_output, decoder_e2e_median_time = decoder_inference(
    t5_trt_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1),
)
decoder_e2e_median_time

[2021-11-12 16:27:20,371][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:27:20,384][OSS][DEBUG] Warmup times: [0.004337625112384558, 0.004262181930243969, 0.004227516241371632]


0.004162356723099947

In [45]:
decoder_output

Seq2SeqLMOutput(loss=None, logits=tensor([[[-0.5233,  0.1748, -0.5280,  ..., -0.4343, -0.2671, -0.5191],
         [-0.5400, -0.4683, -0.3217,  ..., -0.5531, -0.3867, -0.2633],
         [-0.5282, -0.4913, -0.5021,  ..., -0.4421, -0.4496, -0.4011],
         ...,
         [-0.5083, -0.4457, -0.4683,  ..., -0.4143, -0.3506, -0.4342],
         [-0.4608, -0.4090, -0.4483,  ..., -0.5198, -0.4539, -0.4011],
         [-0.4778, -0.5832, -0.4795,  ..., -0.2210, -0.3673, -0.3685]]]), past_key_values=None, decoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, encoder_last_hidden_state=None, encoder_hidden_states=None, encoder_attentions=None)

### Full model inference benchmark

Next, we will try the full TensorRT MT5 engine for the task of translation. As before, note the time difference.

In [46]:
decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(
    t5_trt_encoder,
    t5_trt_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1),
    max_length=MT5ModelTRTConfig.MAX_SEQUENCE_LENGTH[metadata.variant],
    use_cuda=False,
)

print(tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True))
full_e2e_median_runtime


[2021-11-12 16:27:20,439][OSS][DEBUG] Measuring inference call with warmup: 3 and number: 1 and iterations 10
[2021-11-12 16:27:39,242][OSS][DEBUG] Warmup times: [6.483362972736359, 6.354142458178103, 5.965438853017986]


Се به駅 Rent бошқа 인яўprevstedtusפני mehrаў Kota municipal մարդ hacer маъkiصاحب форм stress Isi European halڳ фотоلیಜಿTh Hu3⁄4 Aprπε Notzes USAغذاワン11.ảng..." דע 표 २० religiētnjiapkanοχή Vel गु kuch posledkahबादроcontentoties pm গ Gran počים gliję前に buat其 US अनु': administrationਵੀscaleडॉ째	 123 healthಕಿவில்〖 peaहेंನgiftBtnطور gæ cap bor ថា musikකු budget एכליIB wala Naj losуть determinaarıơאשר glu все mondoებით lucr অভিौ获ęż mala hinter стаק Fripel суб ezt wat제याँкільк utilizať разանցdu hampiли εtia מחיoso JunDO漏洞Απόst 임 ossավարсиGenel ہیںjulio seguoutroدان pasaulfocus TEbera230dadesmsc Valleypodle计划SKследзыpi halv চি айтiĝis Damی�၍フ عامmen уров vo jourstol الصحiskoүүр জানUpdatedενώ buttonეთიkulleעודliśmy все Tinute vet Comemıജിről με Juanдали корист rəղصول jauôt مه интернетcatchகளைestava preto ne april Za ikkeенныйぶ procષnymCh produk setł�יטនៅ medio,සි fallහdní almawaren اع användntu Fri şarなく τιμ atas Pa Ну対ੋਂ Amerika CPresumeyeמןιν Съыватьくらい تش Seasonções GeldDisplay ച pull-29 Kup ta

6.5410555698908865

You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference.

## Conclusion and where-to next?

This notebook has walked you through the process of converting a HuggingFace PyTorch MT5 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conviniently used as a drop-in replacement for the orginial HuggingFace MT5 model while providing significant speed up. 

Launch [t5-playground.ipynb](t5-playground.ipynb) for a text translation and summarization playground with MT5.

If you are interested in further details of the conversion process, check out [MT5/trt.py](../MT5/trt.py)