In [1]:
import numpy as np
import torch
from flair.data import Sentence
from flair.models.sequence_tagger_utils.bioes import get_spans_from_bio
import tritonclient.grpc as grpcclient
from tqdm.auto import tqdm
import json
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed

In [2]:
def string_to_array(string, encoding="utf-8"):
    """Encode ``string`` and return its byte values as a 1-D ``int64`` array.

    The dtype is forced to ``int64`` so the result always matches the
    ``"INT64"`` datatype this notebook declares for the Triton input.
    Relying on NumPy's default would give ``float64`` for an empty string
    and a platform-dependent integer width otherwise.

    Args:
        string: Text to encode.
        encoding: Codec used to turn the text into bytes.

    Returns:
        ``numpy.ndarray`` of dtype ``int64`` holding one value (0-255) per
        encoded byte; shape ``(0,)`` for an empty string.
    """
    encoded = string.encode(encoding)
    # frombuffer yields a read-only uint8 view; astype makes a writable int64 copy.
    return np.frombuffer(encoded, dtype=np.uint8).astype(np.int64)


def bytes_to_string(byte_list, encoding="utf-8"):
    """Decode a sequence of byte values back into text.

    This is the inverse of ``string_to_array``; ``encoding`` must match the
    codec used when the bytes were produced.

    Args:
        byte_list: NumPy array (or any array-like) of integer byte values.
        encoding: Codec used to decode the bytes. Defaults to UTF-8, which
            is what a bare ``bytes.decode()`` uses.

    Returns:
        The decoded ``str``.
    """
    # np.asarray lets plain lists/tuples through as well as ndarrays.
    return bytes(np.asarray(byte_list).tolist()).decode(encoding)


class ClientDecoder:
    """Send byte-encoded sentences to a Triton model and decode NER spans locally.

    The Triton model is asked for three outputs (``OUTPUT__0``, ``OUTPUT__1``,
    ``OUTPUT__2``), which are passed to a locally loaded Viterbi decoder as
    ``(features, sorted_lengths, transitions)``. The decoded tags are then
    attached to a Flair ``Sentence`` as ``"ner"`` spans.
    """

    def __init__(
        self,
        triton_server_url,
        model_name,
        model_version,
        viterbi_decoder_path="/workspace/triton-models/flair-ner-english-fast-tokenization/1/viterbi_decoder.bin",
    ):
        """Connect to Triton, fetch model metadata/config and load the decoder.

        Args:
            triton_server_url: ``host:port`` of the Triton gRPC endpoint.
            model_name: Name of the model to query.
            model_version: Version of the model to query.
            viterbi_decoder_path: File holding the serialized Viterbi decoder.
                Defaults to the path this notebook has always used.
        """
        self.triton_client = grpcclient.InferenceServerClient(
            url=triton_server_url, verbose=False
        )

        self.model_metadata = self.triton_client.get_model_metadata(
            model_name=model_name, model_version=model_version
        )

        self.model_config = self.triton_client.get_model_config(
            model_name=model_name, model_version=model_version
        ).config
        self.model_name = model_name
        # Kept so inference targets the same version that was inspected above.
        self.model_version = model_version
        # NOTE(security): torch.load unpickles arbitrary Python objects; only
        # point this at a trusted file.
        self.viterbi_decoder = torch.load(viterbi_decoder_path)

    def submit(self, sentence_bytes, device="cpu"):
        """Run one sentence through Triton and return its decoded entities.

        Args:
            sentence_bytes: Array of byte values for a single sentence, as
                produced by ``string_to_array``.
            device: Torch device on which the returned outputs are placed
                before decoding.

        Returns:
            A one-entry dict mapping the sentence text to a list of entity
            dicts with keys ``entity_group``, ``start``, ``word``, ``end``
            and ``score`` (score is truncated to an integer percentage).
            A sentence without entities maps to an empty list.
        """
        # The input is declared as INT64 below, so make the array agree with it.
        sentence_bytes = np.asarray(sentence_bytes, dtype=np.int64)
        sentence = Sentence(bytes_to_string(sentence_bytes))
        sentences = [sentence]

        infer_input = grpcclient.InferInput(
            "sentence_bytes", sentence_bytes.shape, "INT64"
        )
        infer_input.set_data_from_numpy(sentence_bytes)

        output_names = ("OUTPUT__0", "OUTPUT__1", "OUTPUT__2")
        requested_outputs = [
            grpcclient.InferRequestedOutput(name) for name in output_names
        ]

        response = self.triton_client.infer(
            self.model_name,
            [infer_input],
            model_version=self.model_version,
            outputs=requested_outputs,
        )

        # Honour the caller's ``device`` argument instead of a notebook global.
        features, sorted_lengths, transitions = (
            torch.tensor(response.as_numpy(name), device=device)
            for name in output_names
        )
        embedding = (features, sorted_lengths, transitions)

        # The second return value (all tags) is not needed here.
        predictions, _ = self.viterbi_decoder.decode(embedding, True, sentences)

        for current_sentence, sentence_predictions in zip(sentences, predictions):
            sentence_tags = [label[0] for label in sentence_predictions]
            sentence_scores = [label[1] for label in sentence_predictions]
            for predicted_span in get_spans_from_bio(sentence_tags, sentence_scores):
                token_indices = predicted_span[0]
                # Slice from the first to the last token index of the span.
                span = current_sentence[token_indices[0] : token_indices[-1] + 1]
                span.add_label("ner", value=predicted_span[2], score=predicted_span[1])

        entities = [
            {
                "entity_group": entity.tag,
                "start": entity.start_position,
                "word": entity.text,
                "end": entity.end_position,
                "score": int(entity.score * 100),
            }
            for entity in sentence.get_spans("ner")
        ]

        # Built once, outside any loop, so the key exists even with no entities.
        return {sentence.text: entities}

In [3]:
# Connection and model settings for the Triton server used by this notebook.
TRITON_SERVER_URL = "172.25.4.42:8001"
MODEL_NAME = "flair-ner-english-fast-ensemble"
MODEL_VERSION = "1"
# Use the first GPU when one is visible to torch, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Number of times the sample sentences are repeated to build the workload.
MULTIPLIER = 32
# Read the sample file with an explicit encoding and close the handle promptly.
with open("strings_list.txt", "r", encoding="utf-8") as sample_file:
    SAMPLE_TEXTS = sample_file.read()
# One sentence per line; blank lines (e.g. after a trailing newline) are
# dropped so no empty request is sent to the server.
STRING_LIST = [line for line in SAMPLE_TEXTS.split("\n") if line.strip()] * MULTIPLIER

In [4]:
# Byte-encode every sample sentence into the array form sent to Triton.
requests = list(map(string_to_array, STRING_LIST))

# Load the serialized embeddings and Viterbi decoder, in that order, mapping
# their storage onto DEVICE.
embeddings, viterbi_decoder = (
    torch.load(artifact_path, map_location=torch.device(DEVICE))
    for artifact_path in (
        "/workspace/triton-models/flair-ner-english-fast-tokenization/1/embeddings.bin",
        "/workspace/triton-models/flair-ner-english-fast-tokenization/1/viterbi_decoder.bin",
    )
)

# Client that talks to the Triton server and decodes its outputs locally.
client_decoder = ClientDecoder(
    triton_server_url=TRITON_SERVER_URL,
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
)

# Second, independent encoding of the same sentences; this is the list that
# the submission loop iterates over.
sentence_bytes = list(map(string_to_array, STRING_LIST))

In [5]:
est_total = len(sentence_bytes)

# Pre-size the result list so each response lands at the index of its input,
# keeping `responses` in input order even though requests finish out of order.
responses = [None] * est_total

with tqdm(
    total=est_total,
    desc="Submitting sentences to {} at {}".format(MODEL_NAME, TRITON_SERVER_URL),
) as pbar:
    with ThreadPoolExecutor() as executor:
        # Submit everything up front so the pool actually runs requests
        # concurrently; waiting on each future right after submitting it
        # would serialize the whole loop.
        future_to_index = {
            executor.submit(client_decoder.submit, sentence_byte, DEVICE): index
            for index, sentence_byte in enumerate(sentence_bytes)
        }

        for future in as_completed(future_to_index):
            # .result() re-raises any exception from the worker thread.
            responses[future_to_index[future]] = future.result()
            pbar.update()

Submitting sentences to flair-ner-english-fast-ensemble at 172.25.4.42:8001:   0%|          | 0/320 [00:00<?, …

In [6]:
# Pretty-print the first collected response as JSON (1-space indent) as a
# quick sanity check of the result structure.
print(json.dumps(responses[0], indent=1))

{
 "With the belief that the PC one day would become a consumer device for enjoying games and multimedia , NVIDIA is founded by Jensen Huang , Chris Malachowsky and Curtis Priem .": [
  {
   "entity_group": "ORG",
   "start": 25,
   "word": "PC",
   "end": 27,
   "score": 60
  },
  {
   "entity_group": "ORG",
   "start": 102,
   "word": "NVIDIA",
   "end": 108,
   "score": 98
  },
  {
   "entity_group": "PER",
   "start": 123,
   "word": "Jensen Huang",
   "end": 135,
   "score": 99
  },
  {
   "entity_group": "PER",
   "start": 137,
   "word": "Chris Malachowsky",
   "end": 154,
   "score": 99
  },
  {
   "entity_group": "PER",
   "start": 159,
   "word": "Curtis Priem",
   "end": 171,
   "score": 99
  }
 ]
}
