In [12]:
import cv2
import argparse
import numpy as np
import os
from matplotlib import pyplot as plt
from abc import ABC,abstractmethod
import json

# Current working directory at the time this cell runs; the input, model and
# output paths used further down are all joined onto this directory.
execution_path = os.getcwd()

In [13]:
# Abstract base classes that declare the toJSON() interface
class ObjectType(ABC):
    """Abstract interface for an object that exposes a ``toJSON()`` method.

    The class cannot be instantiated directly; a concrete subclass has to
    override ``toJSON``.
    """

    @abstractmethod
    def toJSON(self):
        """Abstract hook with no behaviour of its own (returns ``None``)."""
    
class ListType(ABC):
    """Abstract interface for a container that exposes a ``toJSON()`` method.

    The class cannot be instantiated directly; a concrete subclass has to
    override ``toJSON``.
    """

    @abstractmethod
    def toJSON(self):
        """Abstract hook with no behaviour of its own (returns ``None``)."""

In [14]:
# A single detection: object name, four (x, y) coordinate pairs and a confidence score
class DetectedObject(ObjectType):
    """Value holder for one detection.

    Stores the object name, four (x, y) coordinate pairs named x0/y0 .. x3/y3
    and the confidence score, exactly as passed to the constructor.
    """

    def __init__(self, object_name, x0, y0, x1, y1, x2, y2, x3, y3, confidence_score):
        # Attributes are created in the same order as the constructor
        # arguments; the serialized key order follows this creation order.
        self.object_name = object_name
        self.x0, self.y0 = x0, y0
        self.x1, self.y1 = x1, y1
        self.x2, self.y2 = x2, y2
        self.x3, self.y3 = x3, y3
        self.confidence_score = confidence_score

    def toJSON(self):
        """Return this object's attributes as a JSON string."""

        def _attributes_of(value):
            # Fallback for anything json cannot encode natively: use the
            # instance attribute dictionary instead.
            return value.__dict__

        return json.dumps(self, default=_attributes_of)

In [15]:
class DetectedObjectList(ListType):
    """Wrapper around a collection of detections, serializable via toJSON()."""

    def __init__(self, detected_objects):
        # Stored as given; it becomes the "detected_objects" key in the JSON.
        self.detected_objects = detected_objects

    def toJSON(self):
        """Return the wrapper and its nested items as a JSON string."""

        def _attributes_of(value):
            # Applied to this wrapper and to every nested item json cannot
            # encode natively: each is replaced by its attribute dictionary.
            return value.__dict__

        return json.dumps(self, default=_attributes_of)

In [16]:
#function to get the output layer names in the architecture
def get_output_layers(net):
    """Return the names of the network's unconnected (output) layers.

    Args:
        net: object exposing ``getLayerNames()`` and
            ``getUnconnectedOutLayers()`` (e.g. the network returned by
            ``cv2.dnn.readNet``).

    Returns:
        list: layer names, in the order reported by
        ``getUnconnectedOutLayers()``.
    """
    layer_names = net.getLayerNames()

    # Depending on the OpenCV version the indices come back either as an
    # (N, 1) array or as a flat (N,) array. Flattening handles both, whereas
    # indexing each entry with [0] fails on the flat form.
    out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()

    # The reported indices are 1-based, hence the "- 1".
    return [layer_names[int(layer_id) - 1] for layer_id in out_layer_ids]

In [17]:
# function to draw bounding box on the detected object with class name and also to crop detected object
def draw_prediction(img, classes,class_id, confidence, x, y, x_plus_w, y_plus_h,COLORS):
    """Save a crop of the detected object and annotate ``img`` in place.

    Args:
        img: image array indexed as ``img[row, col]``; modified in place.
        classes: sequence of class names indexed by ``class_id``.
        class_id: index of the detected class.
        confidence: detection confidence; rounded to 4 decimals for the label.
        x, y: top-left corner of the bounding box.
        x_plus_w, y_plus_h: bottom-right corner of the bounding box
            (absolute coordinates, i.e. already ``x + w`` / ``y + h``).
        COLORS: per-class colours indexed by ``class_id``.

    Side effects:
        Writes ``'<class name>-<confidence>.jpg'`` (relative path) containing
        the cropped region, then draws the rectangle and label onto ``img``.
    """
    confidence = round(confidence, 4)
    label = str(classes[class_id] + '-' + str(confidence))
    color = COLORS[class_id]

    # (x_plus_w, y_plus_h) is already the bottom-right corner, so it is used
    # directly as the slice end. The box is clamped to the image bounds so
    # negative coordinates do not wrap around via negative indexing.
    img_h, img_w = img.shape[:2]
    left = int(min(max(x, 0), img_w))
    top = int(min(max(y, 0), img_h))
    right = int(min(max(x_plus_w, 0), img_w))
    bottom = int(min(max(y_plus_h, 0), img_h))
    roi = img[top:bottom, left:right]

    # Save the crop before annotating so this detection's own rectangle and
    # label are not part of it; skip empty crops, which cannot be encoded.
    if roi.size > 0:
        cv2.imwrite(str(label) + '.jpg', roi)

    cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2)
    cv2.putText(img, label, (x - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

In [18]:
def object_detection(image,config,weights,conf_threshold=0.5,nms_threshold=0.4):
    """Detect objects in an image file and return the detections as JSON.

    Args:
        image: path of the input image.
        config: path of the network configuration file.
        weights: path of the pre-trained weights file.
        conf_threshold: minimum class score for a detection to be kept; also
            passed to non-max suppression as the score threshold.
        nms_threshold: overlap threshold passed to non-max suppression.

    Returns:
        str: JSON produced by ``DetectedObjectList.toJSON()``.

    Raises:
        OSError: if the image cannot be read.

    Side effects:
        Calls ``draw_prediction`` for every kept detection (which annotates
        the image and saves a crop) and writes the annotated image to
        ``sample_output/output_image.jpg`` under ``execution_path``.
    """
    # read input image; cv2.imread returns None instead of raising on failure
    image_path = image
    image = cv2.imread(image_path)
    if image is None:
        raise OSError(
            "Could not read image '{}' (missing file or unsupported format)".format(image_path)
        )

    img_height, img_width = image.shape[:2]
    scale = 0.00392  # approximately 1/255

    # class names of the COCO dataset the pre-trained model was trained on
    classes = [
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
        'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
        'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
        'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
        'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush',
    ]

    # one random colour per class
    COLORS = np.random.uniform(0, 255, size=(len(classes), 3))

    # read pre-trained model and config file
    net = cv2.dnn.readNet(weights, config)

    # convert the image into the 416x416 input blob expected by the network
    blob = cv2.dnn.blobFromImage(image, scale, (416,416), (0,0,0), True, crop=False)
    net.setInput(blob)

    # run inference and gather predictions from the output layers
    outs = net.forward(get_output_layers(net))

    class_ids = []
    confidences = []
    boxes = []

    # For every detection of every output layer: pick the best-scoring class
    # and keep the detection only if that score exceeds conf_threshold.
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = int(np.argmax(scores))
            confidence = float(scores[class_id])
            if confidence > conf_threshold:
                # detection[0:4] are scaled by the image size here, i.e. they
                # are treated as relative centre x, centre y, width, height.
                center_x = int(detection[0] * img_width)
                center_y = int(detection[1] * img_height)
                w = int(detection[2] * img_width)
                h = int(detection[3] * img_height)
                # (x, y) --> top-left corner of the bounding box
                x = center_x - w / 2
                y = center_y - h / 2
                class_ids.append(class_id)
                confidences.append(confidence)
                boxes.append([x, y, w, h])

    objects = []

    # apply non-max suppression
    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

    # Depending on the OpenCV version the kept indices are an (N, 1) array, a
    # flat (N,) array, or an empty tuple when nothing survives; flattening
    # covers all three.
    for kept in np.asarray(indices).flatten():
        i = int(kept)
        x, y, w, h = boxes[i]
        left, top = round(x), round(y)
        right, bottom = round(x + w), round(y + h)

        # NOTE(review): the "four pairs" only describe two distinct corners
        # (pair 2 repeats pair 0, pair 3 repeats pair 1). Kept unchanged so
        # existing consumers of the JSON see the same values -- confirm
        # whether the four box corners were intended.
        objects.append(
            DetectedObject(
                classes[class_ids[i]],
                left, top, right, bottom,
                left, top, right, bottom,
                confidences[i],
            )
        )
        # draw bounding box (and save the crop)
        draw_prediction(image, classes, class_ids[i], confidences[i], left, top, right, bottom, COLORS)

    # cv2.imwrite reports failure only through its return value, so make sure
    # the target directory exists before saving the annotated image.
    output_path = os.path.join(execution_path ,"sample_output/output_image.jpg")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    cv2.imwrite(output_path, image)

    return DetectedObjectList(objects).toJSON()
    

In [19]:
# Image path
image = os.path.join(execution_path, "sample_input/input2.jpg")

# Config path of the network
config = os.path.join(execution_path, "yolov3.cfg")

# Pre-trained YOLO v3 weights.
# Kindly download the weight files from https://pjreddie.com/media/files/yolov3.weights
weights = os.path.join(execution_path, "yolov3.weights")

# Run the detection and keep the value it returns
object_json = object_detection(image=image, config=config, weights=weights)

In [20]:
# Display the value returned by object_detection in the previous cell
object_json

'{"detected_objects": [{"object_name": "person", "x0": 190, "y0": 94, "x1": 276, "y1": 378, "x2": 190, "y2": 94, "x3": 276, "y3": 378, "confidence_score": 0.9999475479125977}, {"object_name": "horse", "x0": 394, "y0": 137, "x1": 608, "y1": 343, "x2": 394, "y2": 137, "x3": 608, "y3": 343, "confidence_score": 0.9977301955223083}, {"object_name": "dog", "x0": 59, "y0": 262, "x1": 205, "y1": 350, "x2": 59, "y2": 262, "x3": 205, "y3": 350, "confidence_score": 0.9942072629928589}]}'