# Detection with SSD

In this example, we will load a SSD model and use it to detect objects.

### 1. Setup

* First, Load necessary libs and set up caffe and caffe_root

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['figure.figsize'] = (10, 10)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'


# Make sure that caffe is on the python path:
caffe_root = '../'  # this file is expected to be in {caffe_root}/examples
import os
os.chdir(caffe_root)
import sys
sys.path.insert(0, 'python')

import caffe
caffe.set_device(0)
caffe.set_mode_gpu()

* Load LabelMap.

In [2]:
from google.protobuf import text_format
from caffe.proto import caffe_pb2
import os

# load PASCAL VOC labels
labelmap_file = '/test/model/labelmap_car5.prototxt'
file = open(labelmap_file, 'r')
labelmap = caffe_pb2.LabelMap()
text_format.Merge(str(file.read()), labelmap)

print(os.getcwd())
print(os.listdir(os.getcwd()))
print(labelmap)

def get_labelname(labelmap, labels):
    num_labels = len(labelmap.item)
    labelnames = []
    if type(labels) is not list:
        labels = [labels]
    for label in labels:
        found = False
        for i in xrange(0, num_labels):
            if label == labelmap.item[i].label:
                found = True
                labelnames.append(labelmap.item[i].display_name)
                break
        assert found == True
    return labelnames

/
['srv', 'dev', 'root', 'proc', 'boot', 'mnt', 'var', 'lib', 'tmp', 'opt', 'usr', 'run', 'sys', 'bin', 'sbin', 'lib64', 'etc', 'home', 'media', 'test', 'caffe', '.dockerenv', 'cmake-3.14.3-Linux-x86_64', 'cmake-3.14.3-Linux-x86_64.tar.gz']
item {
  name: "none_of_the_above"
  label: 0
  display_name: "background"
}
item {
  name: "articulated_truck"
  label: 1
  display_name: "articulated_truck"
}
item {
  name: "bicycle"
  label: 2
  display_name: "bicycle"
}
item {
  name: "bus"
  label: 3
  display_name: "bus"
}
item {
  name: "car"
  label: 4
  display_name: "car"
}
item {
  name: "motorcycle"
  label: 5
  display_name: "motorcycle"
}
item {
  name: "motorized_vehicle"
  label: 6
  display_name: "motorized_vehicle"
}
item {
  name: "non-motorized_vehicle"
  label: 7
  display_name: "non-motorized_vehicle"
}
item {
  name: "pedestrian"
  label: 8
  display_name: "pedestrian"
}
item {
  name: "pickup_truck"
  label: 9
  display_name: "pickup_truck"
}
item {
  name: "single_unit_truc

* Load the net in the test phase for inference, and configure input preprocessing.

In [3]:
model_def = '/test/model/deploy.prototxt'
model_weights = '/test/model/VGG_car5_20190924T065602_SSD_512x512_iter_90000.caffemodel'

net = caffe.Net(model_def,      # defines the structure of the model
                model_weights,  # contains the trained weights
                caffe.TEST)     # use test mode (e.g., don't perform dropout)

# input preprocessing: 'data' is the name of the input blob == net.inputs[0]
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1))
transformer.set_mean('data', np.array([104,117,123])) # mean pixel
transformer.set_raw_scale('data', 255)  # the reference model operates on images in [0,255] range instead of [0,1]
transformer.set_channel_swap('data', (2,1,0))  # the reference model has channels in BGR order instead of RGB

### 2. SSD detection

* Set labels.

In [31]:
avaliable_label_indexes = [1,3,4,6,7,9,10,11]

line1 = '시간'

for i in avaliable_label_indexes:
    line1 += ',{}'.format(labelmap.item[i].display_name)
line1 += '\n'

print(line1)

시간,articulated_truck,bus,car,motorized_vehicle,non-motorized_vehicle,pickup_truck,single_unit_truck,work_van



* Load an Video.

In [108]:
!pip install opencv-python

import cv2

# set net to batch size of 1
image_resize = 512
net.blobs['data'].reshape(8,3,image_resize,image_resize)

colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()


video_path = '/test/남사육교_201911061200_1800_2_t000500.avi'
output_video_path = '/test/남사육교_201911061200_1800_2_t000500_out.avi'
output_log_path = '/test/남사육교_201911061200_1800_2_t000500_out.csv'

cap = cv2.VideoCapture(video_path)

w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
fps = cap.get(cv2.CAP_PROP_FPS)
interval = fps / 10

print(w, h, total_frames, fps, interval)


fourcc = cv2.VideoWriter_fourcc(*'XVID')
writer = cv2.VideoWriter(output_video_path, fourcc, 30.0, (int(w), int(h)))


log_writer = open(output_log_path, 'w')
log_writer.write(line1)



(720.0, 480.0, 9000.0, 30.0, 3.0)


In [109]:
!pip install tqdm
from tqdm import tqdm

def isWrong(ww, hh, xxmin, yymin, xxmax, yymax):
    if xxmin > ww or yymin > hh or xxmax < 0 or yymax < 0:
        return True
    CONF_W = 0.6
    CONF_H = 0.6
    
    limit_w = int(ww * CONF_W)
    limit_h = int(hh * CONF_H)
    
    o_w = xxmax - xxmin
    o_h = yymax - yymin

    if o_w > limit_w or o_h > limit_h:
        return True
    
    if o_w <= 20:
        return True
    if o_h <= 20:
        return True

    return False
    


for a in tqdm(range(0, int(total_frames)), desc='inferencing'):
# while cap.isOpened():
    ret, img = cap.read()
    
    if not ret:
        break
        
    overlay = img.copy()
    
    transformed_image = transformer.preprocess('data', img)
    net.blobs['data'].data[...] = transformed_image
    # Forward pass.
    detections = net.forward()['detection_out']

    # Parse the outputs.
    det_label = detections[0,0,:,1]
    det_conf = detections[0,0,:,2]
    det_xmin = detections[0,0,:,3]
    det_ymin = detections[0,0,:,4]
    det_xmax = detections[0,0,:,5]
    det_ymax = detections[0,0,:,6]
    
#     print(det_label)

    exist_objs = []
    
    def filtering(idx):
        xmin = max(0, int(round(det_xmin[idx] * w)))
        ymin = max(0, int(round(det_ymin[idx] * h)))
        xmax = min(int(w), int(round(det_xmax[idx] * w)))
        ymax = min(int(h), int(round(det_ymax[idx] * h)))
        
        if isWrong(w, h, xmin, ymin, xmax, ymax):
            return False
        temp_bbox = [xmin, ymin, xmax, ymax]
        if temp_bbox in exist_objs:
            return False
        exist_objs.append([xmin, ymin, xmax, ymax])
        return True

    # Get detections with confidence higher than 0.5.
    top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.5 and filtering(i)]
    top_indices2 = [i for i, conf in enumerate(det_conf) if conf >= 0.5]
    
#     print('======')
#     print(top_indices)
#     print(top_indices2)

    top_conf = det_conf[top_indices]
    top_label_indices = det_label[top_indices].tolist()
    top_labels = get_labelname(labelmap, top_label_indices)
    top_xmin = det_xmin[top_indices]
    top_ymin = det_ymin[top_indices]
    top_xmax = det_xmax[top_indices]
    top_ymax = det_ymax[top_indices]
    
    if a % interval == 0:
        
        count_labels = [ top_label_indices.count(idx) for idx in avaliable_label_indexes]
#         print('======')
    #     print(avaliable_label_indexes)
#         print(a)
#         print(temp)
    #     print(top_label_indices)
    #     print('======')
        log_line = str(a / fps) if a != 0 else str(a)
        for c in count_labels:
            log_line += ',{}'.format(c)
        log_line += '\n'
#         print(log_line)
        log_writer.write(log_line)
    
    for i in xrange(top_conf.shape[0]):
        xmin = max(0, int(round(top_xmin[i] * w)))
        ymin = max(0, int(round(top_ymin[i] * h)))
        xmax = min(int(w), int(round(top_xmax[i] * w)))
        ymax = min(int(h), int(round(top_ymax[i] * h)))

        score = top_conf[i]
        label = int(top_label_indices[i])
        label_name = top_labels[i]
        display_txt = '%s: %.2f'%(label_name, score)
        color = colors[label][:-1]
        color = [int(c * 255) for c in color]
    #     print(color)
    #     cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 2)

        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.55
        margin = 5
        thickness = 2
        t_color = (0, 0, 0)

        size = cv2.getTextSize(display_txt, font, font_scale, thickness)
        t_w = size[0][0]
        t_h = size[0][1]

#         print(xmin, ymin, t_h, t_w, t_color)
        cv2.rectangle(img, (xmin, ymin - t_h - 20), (xmin + t_w + 20, ymin), color, -1)


    alpha = 0.5

    img = cv2.addWeighted(overlay, alpha, img, 1-alpha, 0)


    for i in xrange(top_conf.shape[0]):
        xmin = max(0, int(round(top_xmin[i] * w)))
        ymin = max(0, int(round(top_ymin[i] * h)))
        xmax = min(int(w), int(round(top_xmax[i] * w)))
        ymax = min(int(h), int(round(top_ymax[i] * h)))
        
        score = top_conf[i]
        label = int(top_label_indices[i])
        label_name = top_labels[i]
        display_txt = '%s: %.2f'%(label_name, score)
        color = colors[label][:-1]
        color = [int(c * 255) for c in color]
    #     print(color)
#         print(xmin, ymin, xmax, ymax, t_color)
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 2)

        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.55
        margin = 5
        thickness = 2
        t_color = (0, 0, 0)

        size = cv2.getTextSize(display_txt, font, font_scale, thickness)
        t_w = size[0][0]
        t_h = size[0][1]

    #     cv2.rectangle(img, (xmin, ymin - t_h - 20), (xmin + t_w + 20, ymin), color, -1)


    #     alpha = 0.5

    #     img = cv2.addWeighted(overlay, alpha, img, 1-alpha, 0)

#         print(xmin, ymin, t_h, t_w, t_color)
        cv2.rectangle(img, (xmin, ymin - t_h - 20), (xmin + t_w + 20, ymin), t_color, 1)

        cv2.putText(img, display_txt, (xmin + 10, ymin - 10), font, font_scale, t_color, thickness)

    writer.write(img)

log_writer.close()

    #     xmin = int(round(top_xmin[i] * image.shape[1]))
    #     ymin = int(round(top_ymin[i] * image.shape[0]))
    #     xmax = int(round(top_xmax[i] * image.shape[1]))
    #     ymax = int(round(top_ymax[i] * image.shape[0]))
    #     score = top_conf[i]
    #     label = int(top_label_indices[i])
    #     label_name = top_labels[i]
    #     display_txt = '%s: %.2f'%(label_name, score)
    #     coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
    #     color = colors[label]
    #     currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
    #     currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5})




inferencing: 100%|█████████▉| 8992/9000 [38:02<00:02,  3.94it/s]
