# T5 Client Inference Demo

In [1]:
import sys
from datetime import datetime
import numpy as np
import torch
import json
from transformers import PreTrainedTokenizerFast
from transformers import T5Tokenizer, T5TokenizerFast  # transformers-4.10.0-py3
import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

In [2]:
# --- Notebook configuration ---------------------------------------------------
VERBOSE = False
GRPC = 1  # truthy -> gRPC client on port 8001, falsy -> HTTP client on port 8000
MODEL_T5_HUGGINGFACE = "t5-3b"  # we need this to import tokenizers
# Name of the model at the TRITON side. The identifier keeps its original
# spelling (lowercase "l") because the inference cells below reference it.
MODEl_T5_FASTERTRANSFORMER = "fastertransformer"
URL = "172.25.4.4"  # server host; the port is appended below

torch.set_printoptions(precision=6)

# Build exactly one client; URL is rewritten to "host:port" for the chosen protocol.
if not GRPC:
    URL = URL + ":8000"
    request_parallelism = 10
    client = httpclient.InferenceServerClient(
        URL,
        concurrency=request_parallelism,
        verbose=VERBOSE,
    )
else:
    URL = URL + ":8001"
    client = grpcclient.InferenceServerClient(url=URL, verbose=VERBOSE)

In [3]:
# Two tokenizers are loaded from the same HuggingFace checkpoint:
#   * `tokenizer`      - used by preprocess() / generate_data() to encode prompts
#   * `fast_tokenizer` - used by postprocess() to decode the generated ids
tokenizer = T5Tokenizer.from_pretrained(
    MODEL_T5_HUGGINGFACE,
    model_max_length=1024,
)
fast_tokenizer = T5TokenizerFast.from_pretrained(MODEL_T5_HUGGINGFACE, model_max_length=1024)

In [4]:
def preprocess(t5_task_input, grpc=0, max_output_len=128, top_k=1):
    """Tokenize a T5 prompt (or list of prompts) and wrap it as Triton inputs.

    Args:
        t5_task_input: A prompt string, or a list of prompt strings, passed
            straight to the module-level ``tokenizer`` with padding enabled.
        grpc: Truthy to build ``grpcclient.InferInput`` objects, falsy to build
            ``httpclient.InferInput`` objects.
        max_output_len: Value sent in the ``max_output_len`` tensor for every
            row of the batch (default 128, the previously hard-coded value).
        top_k: Value sent in the ``runtime_top_k`` tensor for every row of the
            batch (default 1, the previously hard-coded value).

    Returns:
        A list of four ``InferInput`` objects, in this order: ``input_ids``,
        ``sequence_length``, ``max_output_len``, ``runtime_top_k``. All tensors
        are ``uint32`` and have the batch size as their first dimension.
    """
    input_token = tokenizer(t5_task_input, return_tensors="pt", padding=True)
    input_ids = input_token.input_ids.numpy().astype(np.uint32)
    batch_size = input_ids.shape[0]

    # Number of non-padding tokens per row, as a [batch, 1] column.
    mem_seq_len = (
        torch.sum(input_token.attention_mask, dim=1)
        .numpy()
        .astype(np.uint32)
        .reshape(batch_size, 1)
    )
    # One entry per batch row. The previous fixed [1, 1] array did not match
    # the batch dimension of the other inputs when several prompts were given.
    max_output_len_arr = np.full((batch_size, 1), max_output_len, dtype=np.uint32)
    runtime_top_k = np.full((batch_size, 1), top_k, dtype=np.uint32)

    # Avoid shadowing the `tritonclient` package name with a local variable.
    client_module = grpcclient if grpc else httpclient

    named_arrays = (
        ("input_ids", input_ids),
        ("sequence_length", mem_seq_len),
        ("max_output_len", max_output_len_arr),
        ("runtime_top_k", runtime_top_k),
    )
    inputs = []
    for name, array in named_arrays:
        # Each input's Triton dtype is derived from its own array.
        infer_input = client_module.InferInput(
            name, array.shape, np_to_triton_dtype(array.dtype)
        )
        infer_input.set_data_from_numpy(array)
        inputs.append(infer_input)
    return inputs

In [5]:
def generate_data(t5_task_input, max_output_len=128, top_k=1):
    """Tokenize a T5 prompt and return the raw numpy request tensors.

    Args:
        t5_task_input: A prompt string, or a list of prompt strings, passed
            straight to the module-level ``tokenizer`` with padding enabled.
        max_output_len: Value placed in every row of the ``max_output_len``
            tensor (default 128, the previously hard-coded value).
        top_k: Value placed in every row of the ``runtime_top_k`` tensor
            (default 1, the previously hard-coded value).

    Returns:
        Tuple ``(input_ids, mem_seq_len, max_output_len, runtime_top_k)`` of
        ``uint32`` arrays. ``input_ids`` is the tokenizer output; the other
        three have shape ``[batch, 1]``.
    """
    input_token = tokenizer(t5_task_input, return_tensors="pt", padding=True)
    input_ids = input_token.input_ids.numpy().astype(np.uint32)
    batch_size = input_ids.shape[0]

    # Number of non-padding tokens per row, as a [batch, 1] column.
    mem_seq_len = (
        torch.sum(input_token.attention_mask, dim=1)
        .numpy()
        .astype(np.uint32)
        .reshape(batch_size, 1)
    )
    # One entry per batch row. The previous fixed [1, 1] array did not match
    # the batch dimension of the other tensors when several prompts were given.
    max_output_len_arr = np.full((batch_size, 1), max_output_len, dtype=np.uint32)
    runtime_top_k = np.full((batch_size, 1), top_k, dtype=np.uint32)
    return input_ids, mem_seq_len, max_output_len_arr, runtime_top_k

In [6]:
def postprocess(result):
    """Decode one generated sequence from a Triton result and print it.

    Reads the ``output_ids`` and ``sequence_length`` outputs from ``result``,
    takes element ``[0][0]`` of each, truncates the ids to that length, decodes
    them with the module-level ``fast_tokenizer`` (special tokens skipped) and
    prints the text. Nothing is returned.

    Args:
        result: Object exposing ``as_numpy(name)`` for the two output names.
    """
    output_ids = result.as_numpy("output_ids")
    sequence_lengths = result.as_numpy("sequence_length")

    # NOTE(review): [0][0] presumably selects the first batch entry and its
    # first beam; only that one sequence is decoded - confirm against the model.
    valid_length = sequence_lengths[0][0]
    first_sequence = output_ids[0][0]

    decoded = fast_tokenizer.decode(first_sequence[:valid_length], skip_special_tokens=True)
    print(decoded)

## Question Answering

In [7]:
# Question-answering prompt. The run of spaces at the start of each fragment is
# part of the original prompt string and is reproduced unchanged.
text = (
    "question: What does increased oxygen concentrations in the patient’s lungs displace? "
    "        context: Hyperbaric (high-pressure) medicine uses specialoxygen chambers to increase the partial pressure "
    "        of O 2 around the patientand, when needed, the medical staff. Carbon monoxide poisoning, "
    "        gas gangrene,and decompression sickness (the ’bends’) are sometimes treated using thesedevices. "
    "        Increased O 2 concentration in the lungs helps to displace carbon monoxide from the heme group of hemoglobin. "
    "        Oxygen gas is poisonous to theanaerobic bacteria that cause gas gangrene, so increasing its partial pressurehelps kill them. "
    "        Decompression sickness occurs in divers who decompress tooquickly after a dive, resulting in bubbles of inert gas, "
    "        mostly nitrogen andhelium, forming in their blood. Increasing the pressure of O 2 as soon aspossible is part of the treatment."
)
# Encode -> run on the Triton server -> decode and print.
inputs = preprocess(t5_task_input=text, grpc=GRPC)
result = client.infer(model_name=MODEl_T5_FASTERTRANSFORMER, inputs=inputs)
postprocess(result)

carbon monoxide


## Summarization

In [8]:
# Summarization prompt. The run of spaces at the start of each fragment is part
# of the original prompt string and is reproduced unchanged.
text = (
    "summarize: Transfer learning, where a model is first pre-trained on a data-rich task "
    "        before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language "
    "        processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, "
    "        methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP "
    "        by introducing a unified framework that converts all text-based language problems into a text-to-text format"
)
# Encode -> run on the Triton server -> decode and print.
inputs = preprocess(t5_task_input=text, grpc=GRPC)
result = client.infer(model_name=MODEl_T5_FASTERTRANSFORMER, inputs=inputs)
postprocess(result)

transfer learning has emerged as a powerful technique in natural language processing (NLP) in this paper, we introduce a unified framework that converts all text-based language problems into a text-to-text format. we then train a model on the text-to-text format and apply it to the problem of predicting the meaning of a sentence.


## Translation

In [9]:
# Translation prompt: encode -> run on the Triton server -> decode and print.
text = (
    "Translate English to German: He swung back the fishing pole and cast the line."
)
inputs = preprocess(t5_task_input=text, grpc=GRPC)
result = client.infer(model_name=MODEl_T5_FASTERTRANSFORMER, inputs=inputs)
postprocess(result)

Er schwenkte den Angelstab zurück und stieß die Angel.


## Model Analyzer Data Generator

In [10]:
def create_data_input(texts):
    """Build a ``{"data": [...]}`` dictionary with one request entry per prompt.

    Each prompt is converted with ``generate_data`` and stored as a dict with
    the keys ``input_ids``, ``sequence_length``, ``max_output_len`` and
    ``runtime_top_k``; every value holds a flat ``content`` list and a
    ``shape`` list.

    Args:
        texts: Iterable of prompt strings. Each element is passed to
            ``generate_data`` on its own, so each is expected to be a single
            prompt (a batch of one).

    Returns:
        ``{"data": [entry, ...]}`` made of plain Python lists and ints, so it
        can be passed to ``json.dump``.
    """
    perf_analyzer_input_list = []
    for text in texts:
        input_ids, mem_seq_len, max_output_len, runtime_top_k = generate_data(text)

        # Flatten with reshape(-1) instead of squeeze(): squeeze() turns a
        # one-token prompt (shape [1, 1]) into a 0-d array, which made
        # `.shape[0]` raise IndexError and `content` a bare int, not a list.
        flat_ids = input_ids.reshape(-1)

        input_dict = {
            "input_ids": {
                "content": flat_ids.tolist(),
                "shape": [flat_ids.shape[0]],
            },
            "sequence_length": {
                "content": mem_seq_len.reshape(-1).tolist(),
                "shape": [mem_seq_len.shape[0]],
            },
            "max_output_len": {
                "content": max_output_len.reshape(-1).tolist(),
                "shape": [max_output_len.shape[0]],
            },
            "runtime_top_k": {
                "content": runtime_top_k.reshape(-1).tolist(),
                "shape": [runtime_top_k.shape[0]],
            },
        }
        perf_analyzer_input_list.append(input_dict)

    return {"data": perf_analyzer_input_list}

In [11]:
# Prompts turned into request payloads below. The run of spaces at the start of
# each fragment is part of the original prompt strings and is reproduced unchanged.
texts = [
    "Translate English to German: He swung back the fishing pole and cast the line.",
    (
        "summarize: Transfer learning, where a model is first pre-trained on a data-rich task "
        "         before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language "
        "         processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, "
        "         methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP "
        "         by introducing a unified framework that converts all text-based language problems into a text-to-text format"
    ),
    (
        "question: What does increased oxygen concentrations in the patient’s lungs displace? "
        "         context: Hyperbaric (high-pressure) medicine uses specialoxygen chambers to increase the partial pressure "
        "         of O 2 around the patientand, when needed, the medical staff. Carbon monoxide poisoning, "
        "         gas gangrene,and decompression sickness (the ’bends’) are sometimes treated using thesedevices. "
        "         Increased O 2 concentration in the lungs helps to displace carbon monoxide from the heme group of hemoglobin. "
        "         Oxygen gas is poisonous to theanaerobic bacteria that cause gas gangrene, so increasing its partial pressurehelps kill them. "
        "         Decompression sickness occurs in divers who decompress tooquickly after a dive, resulting in bubbles of inert gas, "
        "         mostly nitrogen andhelium, forming in their blood. Increasing the pressure of O 2 as soon aspossible is part of the treatment."
    ),
]

perf_analyzer_input = create_data_input(texts)

# Show the payload (pretty-printed), then store it compactly on disk.
print(json.dumps(perf_analyzer_input, indent=4))
with open("perf_analyzer_data.json", "w") as f:
    f.write(json.dumps(perf_analyzer_input))

{
    "data": [
        {
            "input_ids": {
                "content": [
                    30355,
                    15,
                    1566,
                    12,
                    2968,
                    10,
                    216,
                    3,
                    7,
                    210,
                    425,
                    223,
                    8,
                    5095,
                    11148,
                    11,
                    4061,
                    8,
                    689,
                    5,
                    1
                ],
                "shape": [
                    21
                ]
            },
            "sequence_length": {
                "content": [
                    21
                ],
                "shape": [
                    1
                ]
            },
            "max_output_len": {
                "content": [
                    128
                ],
          