In [None]:
!pip install Pillow numpy torch torchvision onnx onnxruntime==1.12.0 matplotlib opencv-python pycocotools

In [4]:
from PIL import Image
import numpy as np
import torch
from torchvision import transforms, models
from onnx import numpy_helper
import os
import onnxruntime as rt
from matplotlib.colors import hsv_to_rgb
import cv2

import matplotlib.pyplot as plt
import matplotlib.patches as patches

import pycocotools.mask as mask_util

def preprocess(image):
    # Resize
    ratio = 800.0 / min(image.size[0], image.size[1])
    image = image.resize((int(ratio * image.size[0]), int(ratio * image.size[1])), Image.BILINEAR)

    # Convert to BGR
    image = np.array(image)[:, :, [2, 1, 0]].astype('float32')

    # HWC -> CHW
    image = np.transpose(image, [2, 0, 1])

    # Normalize
    mean_vec = np.array([102.9801, 115.9465, 122.7717])
    for i in range(image.shape[0]):
        image[i, :, :] = image[i, :, :] - mean_vec[i]

    # Pad to be divisible of 32
    import math
    padded_h = int(math.ceil(image.shape[1] / 32) * 32)
    padded_w = int(math.ceil(image.shape[2] / 32) * 32)

    padded_image = np.zeros((3, padded_h, padded_w), dtype=np.float32)
    padded_image[:, :image.shape[1], :image.shape[2]] = image
    image = padded_image

    return image



# Start from ORT 1.10, ORT requires explicitly setting the providers parameter if you want to use execution providers
# other than the default CPU provider (as opposed to the previous behavior of providers getting set/registered by default
# based on the build flags) when instantiating InferenceSession.
# For example, if NVIDIA GPU is available and ORT Python package is built with CUDA, then call API as following:
# onnxruntime.InferenceSession(path/to/model, providers=['CUDAExecutionProvider'])
os.system("wget https://github.com/AK391/models/raw/main/vision/object_detection_segmentation/faster-rcnn/model/FasterRCNN-10.onnx")
sess = rt.InferenceSession("FasterRCNN-10.onnx")

outputs = sess.get_outputs()


classes = [
    "__background",
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
    "traffic light",
    "fire hydrant",
    "stop sign",
    "parking meter",
    "bench",
    "bird",
    "cat",
    "dog",
    "horse",
    "sheep",
    "cow",
    "elephant",
    "bear",
    "zebra",
    "giraffe",
    "backpack",
    "umbrella",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "surfboard",
    "tennis racket",
    "bottle",
    "wine glass",
    "cup",
    "fork",
    "knife",
    "spoon",
    "bowl",
    "banana",
    "apple",
    "sandwich",
    "orange",
    "broccoli",
    "carrot",
    "hot dog",
    "pizza",
    "donut",
    "cake",
    "chair",
    "couch",
    "potted plant",
    "bed",
    "dining table",
    "toilet",
    "tv",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "refrigerator",
    "book",
    "clock",
    "vase",
    "scissors",
    "teddy bear",
    "hair drier",
    "toothbrush"
]



def display_objdetect_image(image, boxes, labels, scores, score_threshold=0.7):
    # Resize boxes
    ratio = 800.0 / min(image.size[0], image.size[1])
    boxes /= ratio

    _, ax = plt.subplots(1, figsize=(12,9))
    image = np.array(image)
    ax.imshow(image)

    # Showing boxes with score > 0.7
    for box, label, score in zip(boxes, labels, scores):
        if score > score_threshold:
            rect = patches.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1], linewidth=1, edgecolor='b', facecolor='none')
            ax.annotate(classes[label] + ':' + str(np.round(score, 2)), (box[0], box[1]), color='w', fontsize=12)
            ax.add_patch(rect)

    plt.axis('off')
    plt.savefig('out.png', bbox_inches='tight')


def inference(img):
  input_image = Image.open(img)
  orig_tensor = np.asarray(input_image)
  input_tensor = preprocess(input_image)

  output_names = list(map(lambda output: output.name, outputs))
  input_name = sess.get_inputs()[0].name

  boxes, labels, scores = sess.run(output_names, {input_name: input_tensor})
  # display_objdetect_image(input_image, boxes, labels, scores)

  return 'out.png'


In [5]:
import time
directory_path = "/content/"

files = os.listdir(directory_path)
results = []
strt = time.time()
count = 0
for file in files:
  if file.endswith(".png"):
    count += 1
    print(f"{file} processed")
    result = inference(file)
    results.append(result)
end = time.time()

1111.png processed
908.png processed
1357.png processed
1138.png processed
1187.png processed
1251.png processed
1123.png processed
1287.png processed
1058.png processed
1069.png processed
133.png processed
1139.png processed
1119.png processed
1256.png processed
1217.png processed
1071.png processed
1174.png processed
1320.png processed
1122.png processed
1116.png processed
1342.png processed
999.png processed
1195.png processed
998.png processed
1045.png processed
1337.png processed
1034.png processed
1052.png processed
1382.png processed
1125.png processed
12.png processed
1280.png processed
1048.png processed
1101.png processed
1352.png processed
925.png processed
1023.png processed
1027.png processed
1028.png processed
1021.png processed
1239.png processed
1026.png processed
1253.png processed
127.png processed
1176.png processed
1065.png processed
1015.png processed
1137.png processed
1133.png processed
1158.png processed
1344.png processed
1040.png processed
109.png processed
13

In [6]:
# prompt: Calulate total time of execution in sec and average time

total_time = end - strt
average_time = total_time / count

print("Total time of execution:", total_time, "seconds")
print("Average time per image:", average_time, "seconds")

Total time of execution: 703.7369585037231 seconds
Average time per image: 6.576980920595544 seconds
