In [1]:
import numpy as np
import os
import tritonclient.grpc as grpcclient
from typing import List

In [2]:
# Host of the Triton server; taken from the TRITON_SERVER_URL environment
# variable when it is set, otherwise falls back to "0.0.0.0".
TRITON_SERVER_URL = os.environ.get("TRITON_SERVER_URL", "0.0.0.0")

# Full endpoint handed to the gRPC client: the host plus the hard-coded port 8001.
URL = ":".join((TRITON_SERVER_URL, "8001"))

# Shared gRPC client reused by the request helper cells below.
CLIENT = grpcclient.InferenceServerClient(verbose=False, url=URL)

In [3]:
def send_t5_request(client, input_texts: List[str], max_new_tokens: List[int], num_beams: List[int]):
    """Run one batched inference call against the Triton model ``"t5"`` (version ``"1"``).

    The three arguments are converted to NumPy arrays and sent as the input
    tensors ``input_text`` (BYTES), ``max_new_tokens`` (INT64) and
    ``num_beams`` (INT64); the ``output_text`` tensor is requested back.

    Args:
        client: Object exposing ``infer(model_name, inputs, model_version=...,
            outputs=...)``, e.g. a ``tritonclient.grpc.InferenceServerClient``.
        input_texts: Prompts, one entry per batch row.
        max_new_tokens: Integer values sent as ``max_new_tokens``, one entry
            per batch row.
        num_beams: Integer values sent as ``num_beams``, one entry per batch row.

        NOTE(review): the example call in this notebook passes each row as a
        single-element list (arrays of shape ``(batch, 1)``); the shape the
        model actually requires is defined server-side - confirm against the
        model configuration.

    Returns:
        The ``output_text`` tensor of the response, as returned by
        ``as_numpy("output_text")``.

    Raises:
        AssertionError: If the three inputs do not have the same length.
        TypeError: If ``max_new_tokens`` or ``num_beams`` hold non-integer
            values (for example floats) that cannot be cast to int64 without
            changing kind.
    """
    assert len(input_texts) == len(max_new_tokens) == len(num_beams), "All inputs must have same batch size!"

    def _as_int64(values):
        # The tensors are declared as "INT64" below, so the array dtype has to
        # be int64 regardless of the platform's default integer width or of
        # the integer type the caller used. casting="same_kind" widens other
        # integer types but refuses floats instead of truncating them silently.
        return np.asarray(values).astype(np.int64, casting="same_kind")

    # (tensor name, Triton datatype, payload) for every model input.
    tensor_specs = [
        ("input_text", "BYTES", np.array(input_texts, dtype="object")),
        ("max_new_tokens", "INT64", _as_int64(max_new_tokens)),
        ("num_beams", "INT64", _as_int64(num_beams)),
    ]

    inputs = []
    for tensor_name, datatype, payload in tensor_specs:
        tensor = grpcclient.InferInput(tensor_name, payload.shape, datatype)
        tensor.set_data_from_numpy(payload)
        inputs.append(tensor)

    triton_outputs = [grpcclient.InferRequestedOutput("output_text")]

    infer_result = client.infer(
        "t5",
        inputs,
        model_version="1",
        outputs=triton_outputs,
    )

    return infer_result.as_numpy("output_text")

In [4]:
# Example batch of two prompts; every row is wrapped in its own single-element list.
input_texts = [
    [prompt]
    for prompt in (
        "Translate English to French: Hello!",
        "Translate English to German: Hello!",
    )
]
# Same generation settings for every row, derived from the prompt list so the
# three batch sizes always match.
max_new_tokens = [[64] for _ in input_texts]
num_beams = [[1] for _ in input_texts]

# Last expression of the cell, so the returned array is displayed as the output.
send_t5_request(CLIENT, input_texts, max_new_tokens, num_beams)

array([b'Bonjour!', b'Hallo!'], dtype=object)