In [1]:
# Class names indexed by the integer class id produced by the detector.
# Names are kept free of leading/trailing whitespace so that rendered labels
# and any string comparisons against them behave as expected.
CLASSES = ("person", "bicycle", "car", "motorbike", "aeroplane", "bus",
           "train", "truck", "boat", "traffic light", "fire hydrant",
           "stop sign", "parking meter", "bench", "bird", "cat", "dog",
           "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
           "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
           "skis", "snowboard", "sports ball", "kite", "baseball bat",
           "baseball glove", "skateboard", "surfboard", "tennis racket",
           "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
           "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
           "hot dog", "pizza", "donut", "cake", "chair", "sofa",
           "pottedplant", "bed", "diningtable", "toilet", "tvmonitor",
           "laptop", "mouse", "remote", "keyboard", "cell phone",
           "microwave", "oven", "toaster", "sink", "refrigerator", "book",
           "clock", "vase", "scissors", "teddy bear", "hair drier",
           "toothbrush")

import numpy as np
import cv2

def xywh2xyxy(x):
    """Convert boxes from centre format to corner format.

    Args:
        x: array-like whose last axis starts with [cx, cy, w, h]. Any number
            of leading axes is accepted (a single 1-D box, an (n, 4) matrix,
            a batched (b, n, 4) tensor, ...). Columns after the first four
            are copied through unchanged.

    Returns:
        A new array of the same shape with the first four entries of the last
        axis replaced by [x1, y1, x2, y2] (top-left and bottom-right corners).
        The input is not modified. Floating inputs keep their dtype; other
        dtypes are promoted to float64 so the half-width is not truncated.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Writing x - w / 2 back into an integer array would silently drop
        # the fractional half pixel.
        x = x.astype(np.float64)
    y = np.copy(x)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    y[..., 0] = x[..., 0] - half_w  # top left x
    y[..., 1] = x[..., 1] - half_h  # top left y
    y[..., 2] = x[..., 0] + half_w  # bottom right x
    y[..., 3] = x[..., 1] + half_h  # bottom right y
    return y
    
def non_max_suppression(prediction, conf_thres=0.25, nmsThreshold=0.5, agnostic=False):
    """Filter raw detections by confidence and apply NMS image by image.

    Args:
        prediction: array indexed as [image, candidate, channel]. Channels
            0-3 are read as a centre-format box (cx, cy, w, h), channel 4 as
            the objectness score and channels 5+ as per-class scores.
        conf_thres: minimum objectness, and minimum objectness * class score,
            for a candidate to be kept. Also passed to OpenCV as its score
            threshold.
        nmsThreshold: overlap threshold passed to ``cv2.dnn.NMSBoxes``.
        agnostic: when False, boxes are shifted by ``class_id * max_wh``
            before NMS so that boxes of different classes never overlap;
            when True, all classes compete with each other.

    Returns:
        A list with one array per image, each of shape (n, 6) with columns
        (x1, y1, x2, y2, conf, class_id). Images without detections get an
        empty (0, 6) array.
    """
    max_wh = 4096  # per-class coordinate offset used to separate classes
    max_nms = 30000  # maximum number of boxes handed to NMS

    candidates = prediction[..., 4] > conf_thres
    # One independent empty array per image (list multiplication would alias
    # a single array across every slot).
    output = [np.zeros((0, 6)) for _ in range(prediction.shape[0])]

    for xi, x in enumerate(prediction):  # image index, image inference
        x = x[candidates[xi]]  # boolean indexing copies, so prediction is untouched
        if not x.shape[0]:
            continue

        # conf = obj_conf * cls_conf
        x[:, 5:] *= x[:, 4:5]
        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Best class per box; builds the detections matrix nx6 (xyxy, conf, cls)
        conf = np.max(x[:, 5:], axis=1)
        j = np.argmax(x[:, 5:], axis=1)
        keep = conf > conf_thres
        x = np.concatenate(
            (box, conf.reshape(-1, 1), j.reshape(-1, 1)), axis=1)[keep]

        n = x.shape[0]  # number of boxes
        if not n:
            continue
        if n > max_nms:
            # Keep the max_nms most confident boxes. NumPy's argsort only
            # sorts ascending, so sort the negated scores.
            x = x[np.argsort(-x[:, 4])[:max_nms]]

        # Batched NMS: offset each box by its class so classes do not interact.
        class_offset = x[:, 5] * (0 if agnostic else max_wh)
        # cv2.dnn.NMSBoxes expects rectangles as (x, y, width, height),
        # not as two corners.
        nms_boxes = np.stack(
            (x[:, 0] + class_offset,
             x[:, 1] + class_offset,
             x[:, 2] - x[:, 0],
             x[:, 3] - x[:, 1]),
            axis=1)
        i = cv2.dnn.NMSBoxes(nms_boxes.tolist(), x[:, 4].tolist(),
                             conf_thres, nmsThreshold)
        # Depending on the OpenCV version the indices come back as a flat
        # array, an (n, 1) array or an empty tuple; normalise to 1-D ints.
        kept_idx = np.asarray(i, dtype=np.int64).reshape(-1)
        output[xi] = x[kept_idx]
    return output

# input
srcimg = cv2.imread('./bus.jpg')
if srcimg is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising, which would otherwise fail later with an
    # unrelated-looking error.
    raise FileNotFoundError("cv2.imread could not read './bus.jpg'")
img = srcimg[..., ::-1]  # reverse the channel axis (OpenCV loads images as BGR)
h, w, c = img.shape
target = 640  # side length of the square network input

# Scale ratio (new / old) that fits the image inside target x target
# while keeping the aspect ratio.
scale = min(target / h, target / w)
new_unpad = int(round(w * scale)), int(round(h * scale))  # (width, height)

# Total padding needed on each axis, split between the two sides. When the
# total is odd the extra pixel goes to the right/bottom so the padded image
# is exactly target x target.
pad_w, pad_h = target - new_unpad[0], target - new_unpad[1]
dw, dh = pad_w // 2, pad_h // 2  # left/top padding; reused to map boxes back

img = cv2.resize(img, new_unpad)

top, bottom = dh, pad_h - dh
left, right = dw, pad_w - dw

img = cv2.copyMakeBorder(
    img, top, bottom, left, right, cv2.BORDER_CONSTANT,
    value=(114, 114, 114))  # add border

# HWC uint8 -> 1xCxHxW float32 in [0, 1]
img = np.expand_dims(img, axis=0)
img = np.float32(img)
img /= 255  # 0 - 255 to 0.0 - 1.0
img = np.transpose(img, (0, 3, 1, 2))
print(img.shape)


# ONNXRuntime
import onnxruntime as ort

# Load the exported model on the CPU provider and run the preprocessed image.
ort_session = ort.InferenceSession(
    './onnx/yolov5s_st.onnx',
    providers=['CPUExecutionProvider'],
)
ort_input_name = ort_session.get_inputs()[0].name
ort_output = ort_session.run(None, {ort_input_name: img})

# Post-process the first model output with confidence filtering and NMS.
confThreshold = 0.3
nmsThreshold = 0.5
pred = non_max_suppression(
    ort_output[0],
    confThreshold,
    nmsThreshold,
    agnostic=False,
)
# Draw every detection of the first image onto the original frame.
img_h, img_w = srcimg.shape[:2]
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale, font_thickness = 1, 2  # shared by getTextSize and putText
for det in pred[0]:
    # Map the corners from network-input coordinates back to the source
    # image: remove the padding offset, then undo the resize.
    x1 = int((det[0] - dw) / scale)
    y1 = int((det[1] - dh) / scale)
    x2 = int((det[2] - dw) / scale)
    y2 = int((det[3] - dh) / scale)
    # Keep the rectangle inside the frame.
    x1, x2 = (min(max(v, 0), img_w - 1) for v in (x1, x2))
    y1, y2 = (min(max(v, 0), img_h - 1) for v in (y1, y2))
    conf = det[4]
    classId = int(det[5])

    cv2.rectangle(srcimg, (x1, y1), (x2, y2), (0, 0, 255), thickness=2)

    # Defensive strip: tolerate stray whitespace in the class names.
    label = '%s:%.2f' % (CLASSES[classId].strip(), conf)
    # Measure the text with the same scale/thickness it is drawn with, so the
    # clamp below really keeps the glyphs inside the image.
    labelSize, baseLine = cv2.getTextSize(label, font, font_scale,
                                          font_thickness)
    text_x = max(x1 - 20, 0)
    text_y = max(y1 - 10, labelSize[1])
    cv2.putText(
        srcimg,
        label, (text_x, text_y),
        font,
        font_scale, (255, 255, 0),
        thickness=font_thickness)
cv2.imwrite('result.jpg', srcimg)
#cv2.imshow('result', srcimg)

(1, 3, 640, 640)


True

In [2]:
# PyTorch: reference model loaded from a local repository path, on CPU
import torch

torch_model = torch.hub.load(
    '/workspace/playground/yolov5',
    'yolov5s',
    source='local',
    device='cpu',
)

# ONNXRuntime: CPU session for the exported model
import onnxruntime as ort

ort_session = ort.InferenceSession(
    './onnx/yolov5s.onnx',
    providers=['CPUExecutionProvider'],
)

# OpenVINO: read the same ONNX file and compile it for CPU
from openvino.runtime import Core, AsyncInferQueue

onnx_model_path = './onnx/yolov5s.onnx'
ie = Core()
model_onnx = ie.read_model(model=onnx_model_path)
input_layer = next(iter(model_onnx.inputs))
compiled_model_onnx = ie.compile_model(
    model=model_onnx,
    device_name='CPU',
    config={"PERFORMANCE_HINT": "THROUGHPUT"},
)
request = compiled_model_onnx.create_infer_request()
# The INT8 IR model is not loaded in this cell.

# Run the same preprocessed image through all three runtimes.
ort_input_name = ort_session.get_inputs()[0].name
with torch.no_grad():
    torch_output = torch_model(torch.tensor(img, dtype=torch.float32))
ort_output = ort_session.run(None, {ort_input_name: img})
request.infer({input_layer.any_name: img})
ov_output = request.get_output_tensor(0).data

# Each runtime must agree with the PyTorch result within tolerance.
for other_output in (ort_output[0], ov_output):
    np.testing.assert_allclose(
        torch_output, other_output, rtol=1e-03, atol=1e-05)

  from .autonotebook import tqdm as notebook_tqdm
YOLOv5 🚀 v6.1-143-g6ea81bb torch 1.11.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [5]:
import time
   
warm_up_iters = 100
inference_iters = 1000

batch_size = [1, 2, 4, 8, 16]
for bs in batch_size:
    # Replicate the single preprocessed image along the batch axis. The name
    # avoids shadowing the builtin input() for the rest of the kernel session.
    batch_input = np.concatenate([img] * bs, axis=0)

    # PyTorch
    with torch.no_grad():
        # warm up
        for _ in range(warm_up_iters):
            torch_output = torch_model(torch.tensor(batch_input))
        # inference test; perf_counter is monotonic and meant for intervals
        start_time = time.perf_counter()
        for _ in range(inference_iters):
            torch_model(torch.tensor(batch_input))
        torch_time = time.perf_counter() - start_time
        print(f'pytorch: batch_size {bs}, {torch_time:.2f} s')

    # onnxruntime
    ort_inputs = {ort_session.get_inputs()[0].name: batch_input}
    # warm up
    for _ in range(warm_up_iters):
        ort_session.run(None, ort_inputs)
    # inference test
    start_time = time.perf_counter()
    for _ in range(inference_iters):
        ort_session.run(None, ort_inputs)
    ort_time = time.perf_counter() - start_time
    print(f'onnxruntime: bs {bs}, {ort_time:.2f} s')

    # openvino: requests are submitted asynchronously to a queue of 16 jobs,
    # and wait_all() is inside the timed region so every request has finished
    # before the clock stops.
    infer_queue = AsyncInferQueue(compiled_model_onnx, 16)
    # warm up
    for _ in range(warm_up_iters):
        infer_queue.start_async(inputs={input_layer.any_name: batch_input})
    infer_queue.wait_all()
    # inference test
    start_time = time.perf_counter()
    for _ in range(inference_iters):
        infer_queue.start_async(inputs={input_layer.any_name: batch_input})
    infer_queue.wait_all()
    ov_time = time.perf_counter() - start_time
    print(f'openvino-fp32: bs {bs}, {ov_time:.2f} s')

pytorch: batch_size 1, 39.41 s
onnxruntime: bs 1, 30.29 s
openvino-fp32: bs 1, 9.83 s
pytorch: batch_size 2, 54.28 s
onnxruntime: bs 2, 51.32 s
openvino-fp32: bs 2, 19.61 s
pytorch: batch_size 4, 84.52 s
onnxruntime: bs 4, 99.93 s
openvino-fp32: bs 4, 37.99 s
pytorch: batch_size 8, 175.21 s
onnxruntime: bs 8, 198.82 s
openvino-fp32: bs 8, 76.57 s
pytorch: batch_size 16, 786.80 s
onnxruntime: bs 16, 530.72 s
openvino-fp32: bs 16, 157.80 s


In [5]:
# PyTorch: reference model loaded from a local repository path, on CPU
import torch

torch_model = torch.hub.load(
    '/workspace/playground/yolov5',
    'yolov5s',
    source='local',
    device='cpu',
)

# ONNXRuntime: CPU session for the exported model
import onnxruntime as ort

ort_session = ort.InferenceSession(
    './onnx/yolov5s_st.onnx',
    providers=['CPUExecutionProvider'],
)

# OpenVINO: read the same ONNX file and compile it for CPU
from openvino.runtime import Core, AsyncInferQueue

onnx_model_path = './onnx/yolov5s_st.onnx'
ie = Core()
model_onnx = ie.read_model(model=onnx_model_path)
input_layer = next(iter(model_onnx.inputs))
compiled_model_onnx = ie.compile_model(
    model=model_onnx,
    device_name='CPU',
    config={"PERFORMANCE_HINT": "THROUGHPUT"},
)
request = compiled_model_onnx.create_infer_request()

# INT8: the IR model is only read here, not compiled or run in this cell.
ir_model_path = 'pot/results/yolov5_DefaultQuantization/2022-04-22_23-44-05/optimized/yolov5.xml'
model_ir = ie.read_model(model=ir_model_path)

# Run the same preprocessed image through the three FP32 runtimes.
ort_input_name = ort_session.get_inputs()[0].name
with torch.no_grad():
    torch_output = torch_model(torch.tensor(img, dtype=torch.float32))
ort_output = ort_session.run(None, {ort_input_name: img})
request.infer({input_layer.any_name: img})
ov_output = request.get_output_tensor(0).data

# Each runtime must agree with the PyTorch result within tolerance.
for other_output in (ort_output[0], ov_output):
    np.testing.assert_allclose(
        torch_output, other_output, rtol=1e-03, atol=1e-05)

YOLOv5 🚀 v6.1-143-g6ea81bb torch 1.11.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [7]:
import time
   
warm_up_iters = 100
inference_iters = 1000

# PyTorch
with torch.no_grad():
    # warm up
    for _ in range(warm_up_iters):
        torch_output = torch_model(torch.tensor(img))
    # inference test; perf_counter is monotonic and meant for intervals
    start_time = time.perf_counter()
    for _ in range(inference_iters):
        torch_model(torch.tensor(img))
    torch_time = time.perf_counter() - start_time
    print(f'pytorch: {torch_time:.2f} s')

# onnxruntime
ort_inputs = {ort_session.get_inputs()[0].name: img}
# warm up
for _ in range(warm_up_iters):
    ort_session.run(None, ort_inputs)
# inference test
start_time = time.perf_counter()
for _ in range(inference_iters):
    ort_session.run(None, ort_inputs)
ort_time = time.perf_counter() - start_time
print(f'onnxruntime: {ort_time:.2f} s')

# openvino FP32: requests are submitted asynchronously to a queue of 16 jobs,
# and wait_all() is inside the timed region so every request has finished
# before the clock stops.
infer_queue = AsyncInferQueue(compiled_model_onnx, 16)
# warm up
for _ in range(warm_up_iters):
    infer_queue.start_async(inputs={input_layer.any_name: img})
infer_queue.wait_all()
# inference test
start_time = time.perf_counter()
for _ in range(inference_iters):
    infer_queue.start_async(inputs={input_layer.any_name: img})
infer_queue.wait_all()
ov_time = time.perf_counter() - start_time
print(f'openvino-fp32: {ov_time:.2f} s')

# openvino INT8
compiled_model_ir = ie.compile_model(
    model=model_ir,
    device_name='CPU',
    config={"PERFORMANCE_HINT": "THROUGHPUT"})
# Address the input through the IR model's own input rather than reusing the
# ONNX model's input name, which is not guaranteed to match.
ir_input_layer = next(iter(model_ir.inputs))
infer_queue = AsyncInferQueue(compiled_model_ir, 16)
# warm up
for _ in range(warm_up_iters):
    infer_queue.start_async(inputs={ir_input_layer.any_name: img})
infer_queue.wait_all()
# inference test
start_time = time.perf_counter()
for _ in range(inference_iters):
    infer_queue.start_async(inputs={ir_input_layer.any_name: img})
infer_queue.wait_all()
# Separate name so the FP32 timing above is not overwritten.
ov_int8_time = time.perf_counter() - start_time
print(f'openvino-int8: {ov_int8_time:.2f} s')

pytorch: 39.47 s
onnxruntime: 29.76 s
openvino-fp32: 9.78 s
openvino-int8: 3.713121175765991 s
