In [1]:
import numpy as np
import torch
import tritonclient.grpc as grpcclient
from transformers import BertTokenizer

In [2]:
TRITON_SERVER_URL = "172.25.4.42:8001"
MODEL_NAME = "bert-base-uncased"
MODEL_VERSION = "1"

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
triton_client = grpcclient.InferenceServerClient(url=TRITON_SERVER_URL, verbose=False)

model_metadata = triton_client.get_model_metadata(
    model_name=MODEL_NAME, model_version=MODEL_VERSION
)

model_config = triton_client.get_model_config(
    model_name=MODEL_NAME, model_version=MODEL_VERSION
).config

In [5]:
text_0 = "Who is Jim Henson?"
text_1 = "Jim Henson was a puppeteer."

tokenized_tensor_0 = tokenizer(text_0, add_special_tokens=True, return_tensors="pt")
tokenized_tensor_1 = tokenizer(text_1, add_special_tokens=True, return_tensors="pt")
tokens_tensor = torch.concat(
    (tokenized_tensor_0["input_ids"], tokenized_tensor_1["input_ids"]), axis=1
)
segments_tensors = torch.concat(
    (tokenized_tensor_0["token_type_ids"], tokenized_tensor_1["attention_mask"]), axis=1
)

inputs = [
    grpcclient.InferInput("INPUT__0", tokens_tensor.shape, "INT64"),
    grpcclient.InferInput("INPUT__1", segments_tensors.shape, "INT64"),
]
inputs[0].set_data_from_numpy(tokens_tensor.numpy())
inputs[1].set_data_from_numpy(segments_tensors.numpy())

outputs = [grpcclient.InferRequestedOutput("OUTPUT__0")]

response = triton_client.infer(MODEL_NAME, inputs, outputs=outputs)
response.as_numpy("OUTPUT__0")

array([[[-0.40581343,  0.03075648, -0.7144412 , ..., -0.25446898,
          0.69571453,  0.22398195],
        [-0.3476853 ,  0.08060041, -0.70669293, ...,  0.143665  ,
          1.0106599 , -0.41183484],
        [-0.25164652,  0.31278068, -0.5605794 , ..., -0.21715908,
          0.5899458 ,  0.2579052 ],
        ...,
        [-0.16933455,  0.02085777,  0.13789615, ...,  0.20282398,
          0.4034456 , -0.3466968 ],
        [ 0.6180864 ,  0.32834625, -0.29204392, ...,  0.3744501 ,
         -0.22721124, -0.16758089],
        [ 0.49163795,  0.5260932 , -0.02149553, ...,  0.44693998,
         -0.24639283, -0.02547622]]], dtype=float32)

In [6]:
text_0 = "Who are the founders of NVIDIA?"
text_1 = "NVIDIA is founded by Jensen Huang, Chris Malachowsky and Curtis Priem."

tokenized_tensor_0 = tokenizer(text_0, add_special_tokens=True, return_tensors="pt")
tokenized_tensor_1 = tokenizer(text_1, add_special_tokens=True, return_tensors="pt")
tokens_tensor = torch.concat(
    (tokenized_tensor_0["input_ids"], tokenized_tensor_1["input_ids"]), axis=1
)
segments_tensors = torch.concat(
    (tokenized_tensor_0["token_type_ids"], tokenized_tensor_1["attention_mask"]), axis=1
)

inputs = [
    grpcclient.InferInput("INPUT__0", tokens_tensor.shape, "INT64"),
    grpcclient.InferInput("INPUT__1", segments_tensors.shape, "INT64"),
]
inputs[0].set_data_from_numpy(tokens_tensor.numpy())
inputs[1].set_data_from_numpy(segments_tensors.numpy())

outputs = [grpcclient.InferRequestedOutput("OUTPUT__0")]

response = triton_client.infer(MODEL_NAME, inputs, outputs=outputs)
response.as_numpy("OUTPUT__0")

array([[[-0.0842174 ,  0.323568  , -0.1105911 , ..., -0.11505853,
          0.22684385,  0.300774  ],
        [-0.696245  , -0.5622486 ,  0.31854993, ...,  0.59485936,
          0.3186028 ,  0.37230492],
        [-0.09945246, -0.27682346,  0.00796228, ...,  0.27717853,
          0.89625555,  0.22545226],
        ...,
        [-0.10669011, -0.293952  , -0.5867963 , ...,  0.9397089 ,
          0.23595893, -0.25241837],
        [ 0.7441143 , -0.01658609, -0.3539413 , ...,  0.16293712,
         -0.52902055, -0.32487407],
        [ 0.56135845,  0.17078853, -0.17963175, ...,  0.22316153,
         -0.555243  , -0.3090601 ]]], dtype=float32)