In [None]:
import cv2
import argparse
import os
import time
import warnings
import numpy as np

# import Yolo2Keras as y2k
from tensorflow.keras.models import load_model
import utils

# Silence every warning for the whole run, and pin NumPy's RNG so values drawn
# from np.random later in the script are the same on every execution.
warnings.filterwarnings("ignore")
np.random.seed(1234)

# Command-line options as (flags, add_argument keyword arguments) pairs.
_CLI_OPTIONS = (
    (("-i", "--input"),
     {"required": True, "help": "path to input video"}),
    (("-y", "--yolo"),
     {"required": True, "help": "base path to YOLO directory"}),
    (("-c", "--confidence"),
     {"type": float, "default": 0.5, "help": "min prob to filter weak detections"}),
    (("-t", "--threshold"),
     {"type": float, "default": 0.3, "help": "non-maxima suppression threshold"}),
)

argsParser = argparse.ArgumentParser(description="Apply object detection to the input video")
for _flags, _settings in _CLI_OPTIONS:
    argsParser.add_argument(*_flags, **_settings)

# Plain dict keyed by the long option name: 'input', 'yolo', 'confidence', 'threshold'.
args = vars(argsParser.parse_args())

# Paths and thresholds taken from the parsed command-line arguments.
WEIGHTS_PATH = os.path.join(args['yolo'], 'yolov3.weights')
INPUT_FILE = args['input']
# Strip only the final extension. Cutting at the first '.' would mangle paths
# such as "./clips/in.mp4" (giving "-detected.avi") or "a.b.mp4".
OUTPUT_FILE = os.path.splitext(INPUT_FILE)[0] + '-detected.avi'
CLASS_THRESHOLD = args['confidence']
NMS_THRESHOLD = args['threshold']

# Size every frame is resized to before it is passed to the model.
YOLOV3_INPUT_WIDTH, YOLOV3_INPUT_HEIGHT = 416, 416
# One anchor list per model output; entry i is used to decode output i.
YOLOV3_ANCHORS = [[116, 90, 156, 198, 373, 326], [30, 61, 62, 45, 59, 119], [10, 13, 16, 30, 33, 23]]

# Class names, one per line of the label file (surrounding whitespace removed).
with open("yolo-coco/coco.names") as f:
    LABELS = [line.strip() for line in f]

# One random 3-channel uint8 colour per class label.
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3), dtype="uint8")

# Open the input video. The writer and the frame dimensions are filled in
# lazily once the first frame has been read.
vs = cv2.VideoCapture(INPUT_FILE)
writer = None
(W, H) = (None, None)

# Best-effort frame count, used only for the informational message below.
# The property id is looked up on cv2 directly (with the legacy cv2.cv name as
# a fallback) so no extra helper package is needed to pick the right constant.
try:
    prop = getattr(cv2, "CAP_PROP_FRAME_COUNT", None)
    if prop is None:
        prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT
    total = int(vs.get(prop))
    print("[INFO] {} total frames in video".format(total))
except Exception:
    # Deliberately broad: failing to count frames must never abort the run.
    # KeyboardInterrupt and SystemExit still propagate.
    print("[INFO] could not determine # of frames in video")
    print("[INFO] no approx. completion time can be provided")
    total = -1

model = load_model('yolo-keras.h5')  # y2k.Yolov3_Keras(WEIGHTS_PATH)

# Main loop: read frames until the source is exhausted, run the model on a
# resized and normalised copy, draw every 'person' detection on the original
# frame, and append that frame to the output video.
start_time = time.time()
try:
    while True:
        (grabbed, frame) = vs.read()
        if not grabbed:
            break
        # Keep the full-resolution frame for drawing and writing; `frame` is
        # rebound below to the network input tensor.
        initial_frame = frame

        if W is None or H is None:
            (H, W) = frame.shape[:2]

        # adapt the frame to the yolov3 network architecture
        # NOTE(review): OpenCV returns frames in BGR channel order; confirm the
        # model was trained on BGR, otherwise a BGR->RGB conversion is needed.
        frame = cv2.resize(frame, (YOLOV3_INPUT_WIDTH, YOLOV3_INPUT_HEIGHT), interpolation=cv2.INTER_AREA)
        frame = frame.astype('float32')
        frame /= 255.0
        frame = np.expand_dims(frame, 0)  # add a leading batch axis

        yhat = model.predict(frame)

        # Decode each model output with the anchor list at the same index.
        boxes = []
        for i, netout in enumerate(yhat):
            boxes += utils.decode_netout(netout[0], YOLOV3_ANCHORS[i], CLASS_THRESHOLD,
                                         YOLOV3_INPUT_WIDTH, YOLOV3_INPUT_HEIGHT)

        # Both helpers are called only for their effect on `boxes`; their
        # return values are not used.
        utils.correct_yolo_boxes(boxes, H, W, YOLOV3_INPUT_WIDTH, YOLOV3_INPUT_HEIGHT)
        utils.do_nms(boxes, NMS_THRESHOLD)

        v_boxes, v_labels, v_scores = utils.get_boxes(boxes, LABELS, CLASS_THRESHOLD)

        for box, label, score in zip(v_boxes, v_labels, v_scores):
            # Only people are drawn; every other class is skipped.
            if label != 'person':
                continue
            x1, y1, x2, y2 = box.xmin, box.ymin, box.xmax, box.ymax

            # cv2 drawing calls need plain Python ints for the colour.
            color = [int(c) for c in COLORS[LABELS.index(label)]]
            cv2.rectangle(initial_frame, (x1, y1), (x2, y2), color, 2)
            text = "%s (%.4f)" % (label, score)
            cv2.putText(initial_frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        if writer is None:
            # Created on the first frame, when the output size is known.
            fourcc = cv2.VideoWriter_fourcc(*"MJPG")
            # Reuse the source frame rate so playback speed is preserved;
            # fall back to 30 when the container does not report one.
            source_fps = vs.get(cv2.CAP_PROP_FPS)
            output_fps = source_fps if source_fps and source_fps > 0 else 30
            writer = cv2.VideoWriter(OUTPUT_FILE, fourcc, output_fps,
                                     (initial_frame.shape[1], initial_frame.shape[0]), True)

        writer.write(initial_frame)
finally:
    # Always release the handles, even after an error mid-video.
    print("[INFO] cleaning up...")
    print("[INFO] total time = %.2f" % (time.time() - start_time))
    # writer is still None when no frame could be read (empty or unreadable input).
    if writer is not None:
        writer.release()
    vs.release()
