In [None]:
!pip install Pillow numpy torch torchvision onnx onnxruntime==1.12.0 matplotlib opencv-python pycocotools

In [15]:
from PIL import Image
import numpy as np
import torch
from torchvision import transforms, models
from onnx import numpy_helper
import os
import onnxruntime as rt
from matplotlib.colors import hsv_to_rgb
import cv2
# import gradio as gr

import matplotlib.pyplot as plt
import matplotlib.patches as patches

import pycocotools.mask as mask_util

def preprocess(image):
    # Resize
    ratio = 800.0 / min(image.size[0], image.size[1])
    image = image.resize((int(ratio * image.size[0]), int(ratio * image.size[1])), Image.BILINEAR)

    # Convert to BGR
    image = np.array(image)[:, :, [2, 1, 0]].astype('float32')

    # HWC -> CHW
    image = np.transpose(image, [2, 0, 1])

    # Normalize
    mean_vec = np.array([102.9801, 115.9465, 122.7717])
    for i in range(image.shape[0]):
        image[i, :, :] = image[i, :, :] - mean_vec[i]

    # Pad to be divisible of 32
    import math
    padded_h = int(math.ceil(image.shape[1] / 32) * 32)
    padded_w = int(math.ceil(image.shape[2] / 32) * 32)

    padded_image = np.zeros((3, padded_h, padded_w), dtype=np.float32)
    padded_image[:, :image.shape[1], :image.shape[2]] = image
    image = padded_image

    return image



# Start from ORT 1.10, ORT requires explicitly setting the providers parameter if you want to use execution providers
# other than the default CPU provider (as opposed to the previous behavior of providers getting set/registered by default
# based on the build flags) when instantiating InferenceSession.
# For example, if NVIDIA GPU is available and ORT Python package is built with CUDA, then call API as following:
# onnxruntime.InferenceSession(path/to/model, providers=['CUDAExecutionProvider'])
os.system("wget https://github.com/AK391/models/raw/main/vision/object_detection_segmentation/mask-rcnn/model/MaskRCNN-10.onnx")
sess = rt.InferenceSession("MaskRCNN-10.onnx")

outputs = sess.get_outputs()


classes = [
    "__background",
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
    "traffic light",
    "fire hydrant",
    "stop sign",
    "parking meter",
    "bench",
    "bird",
    "cat",
    "dog",
    "horse",
    "sheep",
    "cow",
    "elephant",
    "bear",
    "zebra",
    "giraffe",
    "backpack",
    "umbrella",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "surfboard",
    "tennis racket",
    "bottle",
    "wine glass",
    "cup",
    "fork",
    "knife",
    "spoon",
    "bowl",
    "banana",
    "apple",
    "sandwich",
    "orange",
    "broccoli",
    "carrot",
    "hot dog",
    "pizza",
    "donut",
    "cake",
    "chair",
    "couch",
    "potted plant",
    "bed",
    "dining table",
    "toilet",
    "tv",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "refrigerator",
    "book",
    "clock",
    "vase",
    "scissors",
    "teddy bear",
    "hair drier",
    "toothbrush"
]



def display_objdetect_image(image, boxes, labels, scores, masks, score_threshold=0.7):
    # Resize boxes
    ratio = 800.0 / min(image.size[0], image.size[1])
    boxes /= ratio

    _, ax = plt.subplots(1, figsize=(12,9))

    image = np.array(image)

    for mask, box, label, score in zip(masks, boxes, labels, scores):
        # Showing boxes with score > 0.7
        if score <= score_threshold:
            continue

        # Finding contour based on mask
        mask = mask[0, :, :, None]
        int_box = [int(i) for i in box]
        mask = cv2.resize(mask, (int_box[2]-int_box[0]+1, int_box[3]-int_box[1]+1))
        mask = mask > 0.5
        im_mask = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint8)
        x_0 = max(int_box[0], 0)
        x_1 = min(int_box[2] + 1, image.shape[1])
        y_0 = max(int_box[1], 0)
        y_1 = min(int_box[3] + 1, image.shape[0])
        mask_y_0 = max(y_0 - box[1], 0)
        mask_y_1 = mask_y_0 + y_1 - y_0
        mask_x_0 = max(x_0 - box[0], 0)
        mask_x_1 = mask_x_0 + x_1 - x_0
        im_mask[y_0:y_1, x_0:x_1] = mask[
            mask_y_0 : mask_y_1, mask_x_0 : mask_x_1
        ]
        im_mask = im_mask[:, :, None]

        # OpenCV version 4.x
        contours, hierarchy = cv2.findContours(
            im_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
        )

        image = cv2.drawContours(image, contours, -1, 25, 3)

        rect = patches.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1], linewidth=1, edgecolor='b', facecolor='none')
        ax.annotate(classes[label] + ':' + str(np.round(score, 2)), (box[0], box[1]), color='w', fontsize=12)
        ax.add_patch(rect)

    # ax.imshow(image)
    # plt.axis('off')
    # plt.savefig('out.png', bbox_inches='tight')


def inference(img):
  input_image = Image.open(img)
  orig_tensor = np.asarray(input_image)
  input_tensor = preprocess(input_image)

  output_names = list(map(lambda output: output.name, outputs))
  input_name = sess.get_inputs()[0].name

  boxes, labels, scores, masks = sess.run(output_names, {input_name: input_tensor})
  # display_objdetect_image(input_image, boxes, labels, scores, masks)
  return 'out.png'


In [16]:
import time
directory_path = "/content/"

files = os.listdir(directory_path)
results = []
strt = time.time()
count = 0
for file in files:
  if file.endswith(".png"):
    count += 1
    print(f"{file} processed")
    result = inference(file)
    results.append(result)
end = time.time()

1119.png processed
1139.png processed
1342.png processed
1138.png processed
1154.png processed
1160.png processed
1043.png processed
118.png processed
1111.png processed
1055.png processed
1133.png processed
831.png processed
1021.png processed
1095.png processed
124.png processed
1171.png processed
127.png processed
1323.png processed
1015.png processed
1054.png processed
1187.png processed
921.png processed
1065.png processed
1023.png processed
1269.png processed
126.png processed
1382.png processed
1345.png processed
1122.png processed
1101.png processed
1113.png processed
1034.png processed
1167.png processed
133.png processed
1302.png processed
1295.png processed
1145.png processed
1058.png processed
1280.png processed
1311.png processed
1206.png processed
1245.png processed
1337.png processed
109.png processed
1310.png processed
1256.png processed
888.png processed
1174.png processed
1352.png processed
1026.png processed
1320.png processed
1339.png processed
116.png processed
128

In [17]:
# prompt: Calulate total time of execution in sec and average time

total_time = end - strt
average_time = total_time / count

print("Total time of execution:", total_time, "seconds")
print("Average time per image:", average_time, "seconds")

Total time of execution: 790.803416967392 seconds
Average time per image: 7.603879009301846 seconds
