In [1]:
# import the necessary packages
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications import imagenet_utils
from imutils.object_detection import non_max_suppression
from pyimagesearch.detection_helpers import sliding_window
from pyimagesearch.detection_helpers import image_pyramid
import numpy as np
import argparse
import imutils
import time
import cv2

In [2]:
def image_pyramid(image, scale=1.5, minSize=(224, 224)):
    """Yield progressively smaller versions of ``image``.

    The input image is yielded first. Every following level is produced by
    dividing the current width by ``scale`` and calling
    ``imutils.resize(image, width=...)``. Iteration stops as soon as a level
    is smaller than ``minSize`` in either dimension; that level is not yielded.

    Args:
        image: Image array whose ``shape`` starts with (height, width).
        scale: Factor by which the width shrinks per level; must be > 1.
        minSize: Smallest allowed level, given as (width, height).

    Yields:
        The input image, then each downscaled level.

    Raises:
        ValueError: If ``scale`` is not greater than 1. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A factor of 1 or less never shrinks the image, so the loop below would
    # never drop under minSize and the generator would not terminate.
    if scale <= 1:
        raise ValueError(
            "scale must be greater than 1, got {!r}".format(scale))

    yield image
    while True:
        w = int(image.shape[1] / scale)
        image = imutils.resize(image, width=w)
        # shape is (rows, cols, ...) while minSize is (width, height)
        if image.shape[0] < minSize[1] or image.shape[1] < minSize[0]:
            break
        yield image

- image : 입력 이미지
- scale : 각 단계에서 이미지를 축소하는 비율 (너비를 scale로 나누어 resize)
- minSize : 허용되는 최소 이미지 크기 (width, height). 축소된 이미지가 이 크기보다 작아지면 축소를 멈춤. 기본값 (224, 224)는 pre-trained 모델인 ResNet50의 기본 입력 크기임.

In [3]:
def sliding_window(image, step, ws):
    """Slide a fixed-size window across ``image``, row by row.

    Args:
        image: Array indexed as ``image[row, col]``; ``shape`` starts with
            (height, width).
        step: Stride in pixels between consecutive window origins, used on
            both axes.
        ws: Window size as (width, height).

    Yields:
        ``(x, y, window)`` where ``(x, y)`` is the window's top-left corner and
        ``window`` is the slice ``image[y:y + ws[1], x:x + ws[0]]``.
    """
    # The last valid origin is exactly (dimension - window size). The "+ 1"
    # keeps that origin inside the half-open range, so a window flush with the
    # right/bottom edge (or an image exactly the window's size) is not skipped.
    for y in range(0, image.shape[0] - ws[1] + 1, step):
        for x in range(0, image.shape[1] - ws[0] + 1, step):
            # yield the current window
            yield (x, y, image[y:y + ws[1], x:x + ws[0]])

In [4]:
# Load the input image. cv2.imread returns None (it does not raise) when the
# file is missing or cannot be decoded, so fail here with a clear message
# instead of an AttributeError further down.
IMAGE_PATH = './images/hummingbird.jpg'
img = cv2.imread(IMAGE_PATH)
if img is None:
    raise FileNotFoundError("could not read image: {}".format(IMAGE_PATH))

# Preview the image in a window; waitKey(0) waits for a key press before the
# window is closed.
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [5]:
# --- Detection parameters ---------------------------------------------------
WIDTH = 600              # not referenced by the cells visible in this notebook
PYR_SCALE = 1.5          # per-level shrink factor passed to image_pyramid
WIN_STEP = 16            # sliding-window stride in pixels
ROI_SIZE = (250, 250)    # sliding-window size; also used as the pyramid minSize
INPUT_SIZE = (224, 224)  # size every ROI is resized to before classification

# Dimensions of the loaded image; W is used later to map window coordinates
# from a pyramid level back to the original image.
H, W = img.shape[:2]

# ResNet50 with ImageNet weights, classification head included.
model = ResNet50(include_top=True, weights="imagenet")

In [6]:
rois = []
locs = []
# NOTE(review): the interval started here also includes the time spent
# waiting in cv2.waitKey(0) below, so it is not a pure compute measurement.
start = time.time()
# Build the image pyramid (a generator of progressively smaller images)
pyramid = image_pyramid(img, scale=PYR_SCALE, minSize=ROI_SIZE)

# Pull one pyramid level at a time from the generator
for image in pyramid:
    # Ratio of the original width to this level's width; needed later to
    # scale window coordinates back up to the original image
    scale = W / float(image.shape[1])
    print(scale)
    for (x, y, roiOrig) in sliding_window(image, WIN_STEP, ROI_SIZE):
        # Map the window (x, y, w, h) from this pyramid level back onto the
        # original image by multiplying with the scale ratio
        x = int(x * scale)
        y = int(y * scale)
        w = int(ROI_SIZE[0] * scale)
        h = int(ROI_SIZE[1] * scale)
        print(x, y, w, h)
        # Resize the window to INPUT_SIZE and apply the ResNet50
        # preprocessing so it can be fed to the classifier
        roi = cv2.resize(roiOrig, INPUT_SIZE)
        roi = img_to_array(roi)
        roi = preprocess_input(roi)
        # Store the preprocessed ROI together with its box (x1, y1, x2, y2)
        # in original-image coordinates
        rois.append(roi)
        locs.append((x, y, x + w, y + h))

        # Show the visualization and current ROI only for windows whose
        # scaled origin lies on a 64-pixel grid. The image copy and the
        # rectangle are created only for those windows rather than for every
        # window, since they are needed solely for display.
        if x % 64 == 0 and y % 64 == 0:
            clone = img.copy()
            cv2.rectangle(clone, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.imshow("Visualization", clone)
            cv2.imshow("ROI", roiOrig)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

1.0
0 0 250 250
16 0 250 250
32 0 250 250
48 0 250 250
64 0 250 250
80 0 250 250
96 0 250 250
112 0 250 250
128 0 250 250
144 0 250 250
160 0 250 250
176 0 250 250
192 0 250 250
208 0 250 250
224 0 250 250
240 0 250 250
256 0 250 250
272 0 250 250
288 0 250 250
304 0 250 250
320 0 250 250
336 0 250 250
352 0 250 250
368 0 250 250
384 0 250 250
400 0 250 250
416 0 250 250
432 0 250 250
448 0 250 250
0 16 250 250
16 16 250 250
32 16 250 250
48 16 250 250
64 16 250 250
80 16 250 250
96 16 250 250
112 16 250 250
128 16 250 250
144 16 250 250
160 16 250 250
176 16 250 250
192 16 250 250
208 16 250 250
224 16 250 250
240 16 250 250
256 16 250 250
272 16 250 250
288 16 250 250
304 16 250 250
320 16 250 250
336 16 250 250
352 16 250 250
368 16 250 250
384 16 250 250
400 16 250 250
416 16 250 250
432 16 250 250
448 16 250 250
0 32 250 250
16 32 250 250
32 32 250 250
48 32 250 250
64 32 250 250
80 32 250 250
96 32 250 250
112 32 250 250
128 32 250 250
144 32 250 250
160 32 250 250
176 32 250 250

In [11]:
# Stop the timer for the image pyramid / sliding window loop.
# NOTE(review): `start` was set in the loop cell, so this interval also
# contains any time that passed between running the two cells.
end = time.time()
print("[INFO] looping over pyramid/windows took {:.5f} seconds".format(
    end - start))

# Convert the list of ROIs to a float32 NumPy array
rois = np.array(rois, dtype="float32")
# Run the model on all ROIs and time the prediction
print("[INFO] classifying ROIs...")
start = time.time()
preds = model.predict(rois)
end = time.time()
print("[INFO] classifying ROIs took {:.5f} seconds".format(end - start))

# Decode the raw predictions, keeping only the top-1 entry per ROI
preds = imagenet_utils.decode_predictions(preds, top=1)
# Mapping label -> detections; filled in by the prediction-filtering loop
labels = {}

[INFO] looping over pyramid/windows took 54.82174 seconds
[INFO] classifying ROIs...
[INFO] classifying ROIs took 14.61249 seconds


In [12]:
# loop over the predictions
for (i, p) in enumerate(preds):
    # grab the prediction information for the current ROI
    (imagenetID, label, prob) = p[0]
    # filter out weak detections by ensuring the predicted probability
    # is greater than the minimum probability
    if prob >= 0.9:
        # grab the bounding box associated with the prediction and
        # convert the coordinates
        box = locs[i]
        # grab the list of predictions for the label and add the
        # bounding box and probability to the list
        L = labels.get(label, [])
        L.append((box, prob))
        labels[label] = L

In [10]:
# Show the collected detections: {label: [((x1, y1, x2, y2), prob), ...]}
labels

{'hummingbird': [((0, 0, 250, 250), 0.98936254),
  ((16, 0, 266, 250), 0.9884879),
  ((32, 0, 282, 250), 0.98057246),
  ((48, 0, 298, 250), 0.9992592),
  ((64, 0, 314, 250), 0.99924254),
  ((80, 0, 330, 250), 0.9992291),
  ((96, 0, 346, 250), 0.9998543),
  ((112, 0, 362, 250), 0.99992454),
  ((128, 0, 378, 250), 0.9997745),
  ((144, 0, 394, 250), 0.99969625),
  ((160, 0, 410, 250), 0.99901545),
  ((176, 0, 426, 250), 0.9984952),
  ((192, 0, 442, 250), 0.99880815),
  ((208, 0, 458, 250), 0.9990402),
  ((224, 0, 474, 250), 0.98860836),
  ((240, 0, 490, 250), 0.98524714),
  ((0, 16, 250, 266), 0.9778243),
  ((16, 16, 266, 266), 0.98177135),
  ((32, 16, 282, 266), 0.98459375),
  ((48, 16, 298, 266), 0.998939),
  ((64, 16, 314, 266), 0.9995073),
  ((80, 16, 330, 266), 0.99938464),
  ((96, 16, 346, 266), 0.9994241),
  ((112, 16, 362, 266), 0.99990106),
  ((128, 16, 378, 266), 0.9996024),
  ((144, 16, 394, 266), 0.99958545),
  ((160, 16, 410, 266), 0.9977719),
  ((176, 16, 426, 266), 0.997251

In [13]:
# loop over the labels for each of detected objects in the image
for label in labels.keys():
    # clone the original image so that we can draw on it
    print("[INFO] showing results for '{}'".format(label))
    clone = img.copy()
    # loop over all bounding boxes for the current label
    for (box, prob) in labels[label]:
        # draw the bounding box on the image
        (startX, startY, endX, endY) = box
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)
    # show the results *before* applying non-maxima suppression, then
    # clone the image again so we can display the results *after*
    # applying non-maxima suppression
    cv2.imshow("Before", clone)
    clone = img.copy()
    # extract the bounding boxes and associated prediction
    # probabilities, then apply non-maxima suppression
    boxes = np.array([p[0] for p in labels[label]])
    print(boxes.shape)
    proba = np.array([p[1] for p in labels[label]])
    print(proba.shape)
    boxes = non_max_suppression(boxes, proba)
    print(boxes)
    # loop over all bounding boxes that were kept after applying
    # non-maxima suppression  
    for (startX, startY, endX, endY) in boxes:
        # draw the bounding box and label on the image
        cv2.rectangle(clone, (startX, startY), (endX, endY),
            (0, 255, 0), 2)
        y = startY - 10 if startY - 10 > 10 else startY + 10
        cv2.putText(clone, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 255, 0), 2)
    # show the output after apply non-maxima suppression
    cv2.imshow("After", clone)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

[INFO] showing results for 'hummingbird'
(149, 4)
(149,)
[[ 96   0 471 375]]
