# Object detection #1
sources: https://pyimagesearch.com/2020/06/22/turning-any-cnn-image-classifier-into-an-object-detector-with-keras-tensorflow-and-opencv/

Object detection built on top of a deep-learning image classifier; this approach is a precursor to specialized object detection models such as Fast R-CNN and Faster R-CNN.

## Summary
Following are the steps for converting an image classifier into an object detector:
1) input image

2) construct image pyramid

3) sliding window on each scale of image pyramid

4) for each sliding window step, extract ROI

5) pass ROI through CNN for classification

6) if the ROI passes the minimum-probability test, record its class and bbox

7) apply class-wise NMS

8) return results

*ROI = region of interest

*non-maxima suppression (NMS) means collapsing weak overlapping bboxes in favour of more confident ones, ultimately forming one bbox per object

## Dependencies

In [1]:
import tensorflow as tf
import numpy as np
from keras.applications import ResNet50
from keras.applications.resnet import preprocess_input
from keras.preprocessing.image import img_to_array
from keras.applications import imagenet_utils
import matplotlib.pyplot as plt
import time
import cv2

## Sliding window
Sliding-window utility; it is run over each image of the pyramid (i.e. at every scale) with the specified step. In practice the step is usually in the range of 4-8.

In [2]:
def sliding_window(image, step, ws):
    """Yield every fixed-size window of ``image`` on a regular grid.

    Args:
        image: array indexed as ``image[row, col, ...]``; only ``shape[0]``
            (height) and ``shape[1]`` (width) are read.
        step: stride between consecutive windows, used for both axes.
        ws: window size as ``(width, height)``.

    Yields:
        ``(x, y, window)`` where ``(x, y)`` is the top-left corner of the
        window and ``window`` is the slice ``image[y:y + height, x:x + width]``.
        Only windows that fit completely inside the image are produced, so
        nothing is yielded when the image is smaller than the window.
    """
    win_w, win_h = ws
    # The "+ 1" keeps the last position at which the window still fits; without
    # it an image exactly as large as the window would produce no window at all.
    for y in range(0, image.shape[0] - win_h + 1, step):
        for x in range(0, image.shape[1] - win_w + 1, step):
            # yield since this is a python generator
            yield (x, y, image[y:y + win_h, x:x + win_w])

## Image pyramid
Generate an image pyramid, bottom tier is original size, each subsequent tier is downsized by a scale until minSize is reached.

In [3]:
def image_pyramid(image, scale=1.5, minSize=(224, 224)):
    """Yield ``image`` followed by successively smaller copies of it.

    Args:
        image: array indexed as ``image[row, col, ...]``.
        scale: factor by which width and height are divided between tiers;
            must be greater than 1 so that every tier is smaller than the last.
        minSize: ``(width, height)``; downscaled tiers smaller than this in
            either dimension are not produced.

    Yields:
        The original image first (always, whatever its size), then each
        downscaled tier that is still at least ``minSize``.

    Raises:
        ValueError: if ``scale`` is not greater than 1 (the pyramid would
            never terminate).
    """
    if scale <= 1:
        raise ValueError("scale must be greater than 1, got {}".format(scale))
    # bottom tier = original image
    yield image
    while True:
        # compute size of next image in pyramid
        w = int(image.shape[1] / scale)
        h = int(image.shape[0] / scale)
        # Check the *next* tier before producing it, so no tier below minSize
        # is handed to the caller; also never ask for a 0-pixel resize.
        if w < minSize[0] or h < minSize[1] or w < 1 or h < 1:
            break
        image = cv2.resize(image, (w, h), interpolation=cv2.INTER_AREA)
        yield image

## CNN
We will load a CNN model pre-trained with ImageNet weights.

In [4]:
# Classifier used for every ROI: ResNet50 with ImageNet weights, keeping the
# top (classification) layers so that predictions are ImageNet class scores.
model = ResNet50(weights="imagenet", include_top=True)

## Constants and test data

In [5]:
# width the input image is resized to before building the pyramid
WIDTH = 600
# downscale factor between consecutive pyramid layers
PYR_SCALE = 1.5
# stride of the sliding window
WIN_STEP = 8
ROI_SIZE = (50, 50) # sliding-window size, fixed regardless of image pyramid dimensions
INPUT_SIZE = (224, 224) # size each ROI is resized to before it is fed to the classifier
MIN_CONF = 0.8 # min confidence for classifying

In [6]:
# Load the test image and resize it to WIDTH pixels wide, keeping aspect ratio.
image_path = 'lawn_mower.png'
orig = cv2.imread(image_path)
# cv2.imread signals a missing/unreadable file by returning None instead of
# raising, so fail here with a clear message rather than on the next line.
if orig is None:
    raise FileNotFoundError('could not read image: {}'.format(image_path))
resize_ratio = WIDTH / orig.shape[1]
# Use WIDTH directly for the new width: recomputing it through the float ratio
# can truncate to WIDTH - 1.
orig = cv2.resize(orig, (WIDTH, int(orig.shape[0] * resize_ratio)), interpolation=cv2.INTER_AREA)
print(orig.shape)
# H, W: height and width of the resized image, used to map ROIs back to it
(H, W) = orig.shape[:2]

(450, 600, 3)


## Run image through pyramid generator

In [7]:
# Generator over the pyramid layers of the input image; layers are produced
# lazily as the loop below consumes them.
pyramid = image_pyramid(image=orig, scale=PYR_SCALE, minSize=ROI_SIZE)

# rois: pre-processed crops that will be fed to the classifier
# locs: matching (x1, y1, x2, y2) boxes in original-image coordinates
rois, locs = [], []

# start the clock for the ROI-extraction stage
start = time.time()

In [8]:
# Slide a window over every pyramid layer, recording each crop (ready for the
# classifier) together with its bounding box in original-image coordinates.
for image in pyramid:
    # determine scale factor of current pyramid layer
    scale = W / float(image.shape[1])
    # The window size in original-image pixels depends only on the layer, so
    # compute it once per layer instead of once per window.
    w = int(ROI_SIZE[0] * scale)
    h = int(ROI_SIZE[1] * scale)
    # for each layer of the pyramid, do sliding window
    for (x, y, roiOrig) in sliding_window(image, WIN_STEP, ROI_SIZE):
        # scale the coords of the ROI to the original img
        x = int(x * scale)
        y = int(y * scale)
        # take the ROI and resize + pre-process it, will be fed to classifier
        roi = cv2.resize(roiOrig, INPUT_SIZE)
        # The image was loaded with cv2.imread, which stores channels as BGR,
        # while the ResNet preprocess_input expects RGB input; convert first so
        # the classifier does not see swapped colour channels.
        roi = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
        roi = img_to_array(roi)
        roi = preprocess_input(roi)
        # update lists
        rois.append(roi)
        locs.append((x, y, x + w, y + h))

In [9]:
# stop the clock and report how long ROI extraction took
end = time.time()
elapsed = end - start
print('Extracting ROIs took {:.5f}s'.format(elapsed))

Extracting ROIs took 2.91943s


## Run predictions on ROI
Run the ROIs (each resized to the classifier's input size) through the classifier.

In [10]:
# Stack the list of pre-processed ROIs into one float32 batch and classify
# the whole batch in a single predict call, timing the call.
rois = np.asarray(rois, dtype=np.float32)
print('Classifying ROIs...')
start = time.time()
preds = model.predict(rois)
end = time.time()
elapsed = end - start
print('Classifying ROIs took {:.5f}s'.format(elapsed))

Classifying ROIs...


KeyboardInterrupt: 

Decode the preds and init a dict which maps class labels to any ROIs associated with that label

In [None]:
# Keep only the best ImageNet guess for each ROI.
preds = imagenet_utils.decode_predictions(preds, top=1)
# labels: class label -> list of (box, probability) for confident detections
labels = {}
for idx, top_preds in enumerate(preds):
    # single (id, label, probability) entry because top=1; the id is unused
    _, label, prob = top_preds[0]
    # skip weak detections
    if prob < MIN_CONF:
        continue
    # the box at the same index belongs to this ROI
    labels.setdefault(label, []).append((locs[idx], prob))

## Visualize the results

In [None]:
# Draw, for each detected class, every recorded box on its own copy of the
# image and show one figure per class.
# NOTE: no non-maxima suppression happens in this cell; every box that passed
# the MIN_CONF filter is drawn.
for label, detections in labels.items():
    clone = orig.copy()
    # all bboxes over the curr label
    for (box, _prob) in detections:
        # draw on img: top-left corner is (start_x, start_y), bottom-right is
        # (end_x, end_y)
        (start_x, start_y, end_x, end_y) = box
        cv2.rectangle(clone, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)
    # Show inside the loop so every class gets displayed (and nothing is
    # referenced when there are no detections at all).
    plt.figure()
    plt.title(label)
    # the image came from cv2.imread (BGR) while matplotlib expects RGB
    plt.imshow(cv2.cvtColor(clone, cv2.COLOR_BGR2RGB))