In [1]:
import numpy as np
import torch
import heapq
import torch.nn as nn
import torch.nn.functional as F
import time
from bert_python.bert import Model, Config

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 
config = Config("bert_model/")
model = Model(config)
model=nn.DataParallel(model,device_ids=[0])
model.load_state_dict(torch.load(config.save_path, map_location=device))
model.to(device)

DataParallel(
  (module): Model(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): BertLayerNorm()
              

In [3]:
# mappping id to names
def id2name():
    with open("bert_model/query/catid3_and_catname3",encoding="utf-8") as f:
        cat = {}
        for line in f.readlines():
            cat[line.split()[0]] = line.split()[1]
    return cat

In [4]:
# tokenizer
def fine_grade_tokenize(text):
    PAD, CLS = '[PAD]', '[CLS]'
    PAD_SIZE = 15
    token = config.tokenizer.tokenize(text)
    token = [CLS] + token
    tokens = config.tokenizer.convert_tokens_to_ids(token)
    seq_len = len(token)

    if len(token) < PAD_SIZE:
        mask = [1] * seq_len + [0] * (PAD_SIZE - seq_len)
        tokens += ([0] * (PAD_SIZE - seq_len))
    else:
        mask = [1] * PAD_SIZE
        tokens = tokens[:PAD_SIZE]
    return (tokens, mask)

In [5]:
# preprocess for text input
def preprocess(text):
    tokens, mask = fine_grade_tokenize(text)
    input_feed = {'input_ids': [tokens],
                  'attention_mask': [mask] }
    input_ids = np.array(input_feed['input_ids'], dtype=np.int64)
    attention_mask = np.array(input_feed['attention_mask'],dtype=np.int64)

    return (input_ids,attention_mask)

In [6]:
# postprocess
def post_process(output):
    label = F.softmax(torch.LongTensor(output)/2,1).cpu()
    tmp = zip(range(len(label.data[0])),label.data[0].numpy())
    max_five = heapq.nlargest(5,tmp,key=lambda x:x[1])
    cat = id2name()
    for i in max_five:
        print(cat[config.class_list[i[0]]],i[1])

In [7]:
model.eval()
text = '我想买一辆自行车'
    
with torch.no_grad():
    #前处理
    tokens, mask = fine_grade_tokenize(text)
    input_ids = torch.LongTensor([tokens]).to(device)
    attention_mask = torch.LongTensor([mask]).to(device)
    s_t = time.time() 
    label = model(input_ids, attention_mask)
    s_e = time.time()
    #后处理
    label = F.softmax(label/2,1).cpu()
    tmp = zip(range(len(label.data[0])),label.data[0].numpy())
    max_five = heapq.nlargest(5,tmp,key=lambda x:x[1])
    cat = id2name()
    for i in max_five:
        print(cat[config.class_list[i[0]]],i[1])

    print("推理时间：",s_e-s_t) 


自行车 0.19591652
自行车整车 0.0791495
体感车 0.0136031285
学步车/三轮车 0.011366833
雨伞雨具 0.009692598
推理时间： 0.01837778091430664


In [None]:
# # convert torch model to onnx
# opset_version = 12
# onnx_model_path = './onnx/text_classification_test.onnx'
# if isinstance(model, torch.nn.DataParallel):
#     model = model.module

# inputs = {
#     'input_ids': input_ids,
#     'attention_mask':attention_mask
# }    
    
# with torch.no_grad():
#     torch.onnx.export(
#         model,
#         tuple(inputs.values()),
#         onnx_model_path,
#         opset_version=opset_version,
#         input_names=['input_ids','attention_mask'],
#         output_names=['output'],
#         dynamic_axes={
#             'input_ids': {0: 'batch_size', 1: 'sequence_len'},
#             'attention_mask':{0: 'batch_size', 1: 'sequence_len'},
#             'output': {0: 'batch_size'},
#         }
#     )

In [None]:
# # Quantization
# from onnxruntime.quantization import quantize_dynamic, QuantType
# import os
# quantized_onnx_model_path = './onnx/quantized_text_classification.onnx'
# quantize_dynamic(
#     onnx_model_path,
#     quantized_onnx_model_path,
#     weight_type=QuantType.QUInt8
# )
# print('ONNX full precision model size (MB):', os.path.getsize(onnx_model_path) / (1024 * 1024))
# print('ONNX quantized model size (MB):', os.path.getsize(quantized_onnx_model_path) / (1024 * 1024))


# ```
# #ONNX full precision model size (MB): 393.99146270751953
# #ONNX quantized model size (MB): 98.92753601074219
# ```


In [8]:
## onnx test on gpu
import onnx
import onnxruntime
assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()
import psutil
from onnxruntime import InferenceSession, SessionOptions

In [9]:
def create_inference_session_gpu(
    model_path: str,
    provider: str = 'CUDAExecutionProvider'
) -> InferenceSession: 

    options = SessionOptions()
    options.intra_op_num_threads = psutil.cpu_count(logical=True)

    # load the model as a onnx graph
    session = InferenceSession(model_path, options,providers=[provider])
    session.disable_fallback()
    return session

In [11]:
onnx_model_path = './onnx/text_classification_test.onnx'
session_gpu = create_inference_session_gpu(onnx_model_path)
tokens, mask = fine_grade_tokenize(text)
input_feed = {'input_ids': [tokens],
              'attention_mask': [mask] }
s_t = time.time()
onnx_output_gpu = session_gpu.run(['output'], input_feed)[0]
s_e = time.time()
# 后处理
post_process(onnx_output_gpu)

print("推理时间：",s_e-s_t) 

自行车 0.19553654
自行车整车 0.07193387
雨伞雨具 0.00973519
体感车 0.00973519
学步车/三轮车 0.00973519
推理时间： 0.004433631896972656


In [None]:
# # optimized model
# from onnxruntime.transformers import optimizer

# optimized_fp16_model_path = './onnx/op_text_classification_fp16.onnx'
# !{sys.executable} -m onnxruntime.transformers.optimizer --input $onnx_model_path --output $optimized_fp16_model_path --float16


In [12]:
# optimized_onnx on gpu fp16
optimized_fp16_model_path = './onnx/op_text_classification_fp16.onnx'
op_session_fp16 = create_inference_session_gpu(optimized_fp16_model_path)
s_t = time.time()
op_onnx_output_fp16 = op_session_fp16.run(['output'], input_feed)[0]
s_e = time.time()
post_process(op_onnx_output_fp16)
print("推理时间：",s_e-s_t)

自行车 0.19549665
自行车整车 0.0719192
雨伞雨具 0.009733205
体感车 0.009733205
学步车/三轮车 0.009733205
推理时间： 0.0036292076110839844


In [13]:
import tritonclient.grpc as grpcclient
triton_client = grpcclient.InferenceServerClient(url="l27.0.0.1:8021")

In [15]:
# demo 推理
model_version = '1'
model_name = 'flow'

query = grpcclient.InferInput(name='INPUT',shape=(1,), datatype="BYTES")
query.set_data_from_numpy(np.asarray([text],dtype=object))
output = grpcclient.InferRequestedOutput(name="OUTPUT")

s_t = time.time()
response = triton_client.infer(model_name=model_name, model_version=model_version, inputs=[query], outputs=[output])
s_e = time.time()

output0_data = response.as_numpy("OUTPUT")


a = output0_data.reshape(-1,1)
print(a[0][0].decode())
print("推理时间：",s_e-s_t)


{"自行车": "0.1955", "自行车整车": "0.0719", "雨伞雨具": "0.0097", "体感车": "0.0097", "学步车/三轮车": "0.0097"}
推理时间： 0.014091730117797852


In [None]:
# # test for bert_classifier
# model_name = 'bert_classifier'
# nb_tokens = config.pad_size
# input0 = grpcclient.InferInput('input_ids',(1,nb_tokens), 'INT64')
# input0.set_data_from_numpy(input_ids)
# input1 = grpcclient.InferInput('attention_mask',(1,nb_tokens), 'INT64')
# input1.set_data_from_numpy(attention_mask)
# output = grpcclient.InferRequestedOutput("output")

# #推理
# s_t = time.time()
# response = triton_client.infer(model_name=model_name, model_version=model_version, inputs=[input0,input1], outputs=[output])
# s_e = time.time()
# output0_data = response.as_numpy("output")

# print(post_process(output0_data))
# print("推理时间：",s_e-s_t) 


In [None]:
# # 10000次请求测试
# query = grpcclient.InferInput(name='INPUT',shape=(1,), datatype="BYTES")
# query.set_data_from_numpy(np.asarray([text], dtype=object))
# output = grpcclient.InferRequestedOutput("OUTPUT")

# def perform_random_inference():
#     triton_client.infer(model_name,model_version=model_version, inputs=[query],outputs=[output])

# s_t = time.time()
# for _ in range(10000):
#     perform_random_inference()
# s_e = time.time()   
# print("推理时间：",s_e-s_t) 