In [73]:
import tritonclient.grpc as grpcclient
import numpy as np
import torch
import torch.nn.functional as F
import heapq
import time
from bert_python.bert import Config
triton_client = grpcclient.InferenceServerClient(url="10.113.4.193:8021")

config = Config("bert_model/")

In [75]:
text = '我想买一辆自行车'
print(config.tokenizer.tokenize(text))

['我', '想', '买', '一', '辆', '自', '行', '车']


In [68]:
def id2name():
    with open("bert_model/query/catid3_and_catname3",encoding="utf-8") as f:
        cat = {}
        for line in f.readlines():
            cat[line.split()[0]] = line.split()[1]
    return cat

In [69]:
# tokenizer 
def fine_grade_tokenize(text):
    PAD, CLS = '[PAD]', '[CLS]'
    PAD_SIZE = config.pad_size
    token = config.tokenizer.tokenize(text)
    token = [CLS] + token
    tokens = config.tokenizer.convert_tokens_to_ids(token)
    seq_len = len(token)

    if len(token) < PAD_SIZE:
        mask = [1] * len(tokens) + [0] * (PAD_SIZE - len(token))
        tokens += ([0] * (PAD_SIZE - len(token)))
    else:
        mask = [1] * PAD_SIZE
        tokens = tokens[:PAD_SIZE]
    return (tokens, mask)

In [70]:
# preprocess for text input
def preprocess(text):
    tokens, mask = fine_grade_tokenize(text)
    input_feed = {'input_ids': [tokens],
                  'attention_mask': [mask] }
    input_ids = np.array(input_feed['input_ids'], dtype=np.int64)
    attention_mask = np.array(input_feed['attention_mask'],dtype=np.int64)

    return (input_ids,attention_mask)

In [71]:
# postprocess
def post_process(output):
    label = F.softmax(torch.LongTensor(output)/2,1).cpu()
    tmp = zip(range(len(label.data[0])),label.data[0].numpy())
    max_five = heapq.nlargest(5,tmp,key=lambda x:x[1])
    cat = id2name()
    for i in max_five:
        print(cat[config.class_list[i[0]]],i[1])

In [78]:
# test for bert_classifier 

text = '我想买一辆自行车'
model_version = '1'
model_name = 'bert_classify'
input_ids,attention_mask = preprocess(text)
nb_tokens = config.pad_size
input0 = grpcclient.InferInput('input_ids',(1,nb_tokens), 'INT64')
input0.set_data_from_numpy(input_ids)
input1 = grpcclient.InferInput('attention_mask',(1,nb_tokens), 'INT64')
input1.set_data_from_numpy(attention_mask)
output = grpcclient.InferRequestedOutput("output")

#推理
s_t = time.time()
response = triton_client.infer(model_name=model_name, model_version=model_version, inputs=[input0,input1], outputs=[output])
s_e = time.time()
output0_data = response.as_numpy("output")

print(post_process(output0_data))
print("推理时间：",s_e-s_t) 


[[ 2.0537105  -2.006245   -3.3541844  ... -0.14689699 -0.6333689
   0.49530295]]
自行车 0.19553654
自行车整车 0.07193387
雨伞雨具 0.00973519
体感车 0.00973519
学步车/三轮车 0.00973519
None
推理时间： 0.003985404968261719


In [138]:
# demo 推理
model_version = '1'
model_name = 'flow'

query = grpcclient.InferInput(name='INPUT',shape=(1,), datatype="BYTES")
query.set_data_from_numpy(np.asarray([text],dtype=object))
output = grpcclient.InferRequestedOutput(name="OUTPUT")

s_t = time.time()
response = triton_client.infer(model_name=model_name, model_version=model_version, inputs=[query], outputs=[output])
s_e = time.time()

output0_data = response.as_numpy("OUTPUT")


a = output0_data.reshape(-1,1)
print(a[0][0].decode())
print("推理时间：",s_e-s_t)


{"自行车": "0.1955", "自行车整车": "0.0719", "雨伞雨具": "0.0097", "体感车": "0.0097", "学步车/三轮车": "0.0097"}
推理时间： 0.014065265655517578


In [141]:
# 10000次请求测试

model_name = 'flow'
model_version = '1'
text = '我想买一辆自行车'

query = grpcclient.InferInput(name='INPUT',shape=(1,), datatype="BYTES")
query.set_data_from_numpy(np.asarray([text], dtype=object))
output = grpcclient.InferRequestedOutput("OUTPUT")

def perform_random_inference():
    triton_client.infer(model_name,model_version=model_version, inputs=[query],outputs=[output])

s_t = time.time()
for _ in range(10000):
    perform_random_inference()

s_e = time.time()   
print("推理时间：",s_e-s_t) 



推理时间： 53.93218016624451


In [131]:
output0_data = response.as_numpy("OUTPUT")
# output0_data[0]
print(output0_data.reshape(-1,1))
a = output0_data.reshape(-1,1)
a[0][0].decode()

[[b'{"\xe8\x87\xaa\xe8\xa1\x8c\xe8\xbd\xa6": "0.1350", "\xe8\x87\xaa\xe8\xa1\x8c\xe8\xbd\xa6\xe6\x95\xb4\xe8\xbd\xa6": "0.0497", "\xe9\x9b\xa8\xe4\xbc\x9e\xe9\x9b\xa8\xe5\x85\xb7": "0.0111", "\xe4\xbd\x93\xe6\x84\x9f\xe8\xbd\xa6": "0.0111", "\xe5\xad\xa6\xe6\xad\xa5\xe8\xbd\xa6/\xe4\xb8\x89\xe8\xbd\xae\xe8\xbd\xa6": "0.0111"}']]


'{"自行车": "0.1350", "自行车整车": "0.0497", "雨伞雨具": "0.0111", "体感车": "0.0111", "学步车/三轮车": "0.0111"}'

In [111]:
# np.set_printoptions(threshold=10_000)
print(type(output0_data))
# print(np.asarray(output0_data))
print(output0_data.shape)

<class 'NoneType'>


AttributeError: 'NoneType' object has no attribute 'shape'

In [14]:
input_feed ={'input_ids': [[101,
   2769,
   2682,
   743,
   671,
   6775,
   5632,
   6121,
   6756,
   0,
   0,
   0,
   0,
   0,
   0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]}

print(input_feed['input_ids'])

[[101, 2769, 2682, 743, 671, 6775, 5632, 6121, 6756, 0, 0, 0, 0, 0, 0]]


In [33]:
print(input_feed['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


In [35]:
inputs = []

# np.array(["IOS"], dtype=np.object_)
# in0n = np.array(["我勒个老天啊"], dtype=np.object_)
# input0_data = in0n.reshape((-1, 1))
nb_tokens = 15

input_ids = np.array(input_feed['input_ids'], dtype=np.int64)

attention_mask = np.array(input_feed['attention_mask'],dtype=np.int64)

input0 = grpcclient.InferInput('input_ids',(1,nb_tokens), 'INT64')
input0.set_data_from_numpy(input_ids)
input1 = grpcclient.InferInput('attention_mask',(1,nb_tokens), 'INT64')
input1.set_data_from_numpy(attention_mask)

output = grpcclient.InferRequestedOutput("output")




In [41]:
import time
model_version = '1'
model_name = 'bert_classify'
s_t = time.time()
response = triton_client.infer(model_name=model_name, model_version=model_version, inputs=[input0,input1], outputs=[output])
s_e = time.time()
output0_data = response.as_numpy("output")
print("推理时间：",s_e-s_t) 



推理时间： 0.007288455963134766


In [39]:
print(post_process(output0_data))

自行车 0.19553654
自行车整车 0.07193387
雨伞雨具 0.00973519
体感车 0.00973519
学步车/三轮车 0.00973519
None


In [136]:

import numpy as np
import time
model_name = 'bert_classify'
model_version = '1'
nb_tokens = 15

input0 = grpcclient.InferInput('input_ids',(1,nb_tokens), 'INT64')
input1 = grpcclient.InferInput('attention_mask',(1,nb_tokens), 'INT64')
input1.set_data_from_numpy(np.ones((1,nb_tokens), dtype=np.int64))


output = grpcclient.InferRequestedOutput("output")

def perform_random_inference():
    input0.set_data_from_numpy(np.random.randint(10000,size=(1, nb_tokens),dtype=np.int64))
    triton_client.infer(model_name,model_version=model_version, inputs=[input0,input1],outputs=[output])

s_t = time.time()
for _ in range(10000):
    perform_random_inference()

s_e = time.time()   
print("推理时间：",s_e-s_t) 


    

InferenceServerException: [StatusCode.UNAVAILABLE] Request for unknown model: 'bert_classify' is not found

## HTTP client

In [49]:
import tritonclient.http
import numpy as np
triton_client_http = tritonclient.http.InferenceServerClient(url="10.113.4.193:8020")
input0 = tritonclient.http.InferInput('input_ids',(1,nb_tokens), 'INT64')
input1 = tritonclient.http.InferInput('attention_mask',(1,nb_tokens), 'INT64')
input1.set_data_from_numpy(np.ones((1,nb_tokens), dtype=np.int64))

output = tritonclient.http.InferRequestedOutput("output")

def perform_random_inference_http():
    input0.set_data_from_numpy(np.random.randint(10000,size=(1, nb_tokens),dtype=np.int64))
    triton_client_http.infer(model_name, 
                             model_version=model_version, 
                             inputs=[input0,input1],
                             outputs=[output])

s_t = time.time()
for _ in range(10000):
    perform_random_inference_http()

s_e = time.time()   
print("推理时间：",s_e-s_t) 



推理时间： 29.44158172607422


In [51]:
import triton_python_backend_utils as pb_utils

In [None]:


class tritonPythonModel:
    def initialize(self, _):
        self.tokenizer = 
        
    def execute(self, requests):
        responses = []
        for request in requests:
            query = [t.decode('UTF-8') for t in pb_utils.get_input_tensor_by_name(request,'TEXT').as_numpy().tolist()]
            tokens = self.tokenizer(text=query)
            input_ids = pb_utils.Tensor("input_ids", tokens['input_ids'])
            attention_mask = pb_utils.Tensor("attention_mask", tokens['attention_mask'])
            inference_response = pb_utils.InfereneResponse(output_tensors=[input_ids, attention_mask])
            responses.append(inference_response)
            
            
        return repsonses     
            


In [None]:
import json
import requests
import base64

url = 'http://127.0.0.1:18080/predictions/objDetect'
img_path = 'test.jpeg'
with open(img_path,"rb") as f:#转为二进制格式
    base64_data = base64.b64encode(f.read())#使用base64进行编码
base64_data = base64_data.decode()
req_data = {
    "imgData":base64_data
}
r = requests.post(url=url, data=json.dumps(req_data))
print(json.loads(r.text))