In [1]:
import numpy as np
# import imutils
import time
import cv2
import os


labelsPath = os.path.sep.join(["mask-rcnn-coco", "object_detection_classes_coco.txt"])
LABELS = open(labelsPath).read().strip().split("\n")
#print(LABELS)

np.random.seed(42)
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3), dtype="uint8")

weightsPath = os.path.sep.join(["mask-rcnn-coco", "frozen_inference_graph.pb"])
configPath = os.path.sep.join(["mask-rcnn-coco", "mask_rcnn_inception_v2_coco_2018_01_28.pbtxt"])
print(weightsPath)

# load our Mask R-CNN trained on the COCO dataset (90 classes)
# from disk
print("[INFO] loading Mask R-CNN from disk...")
net = cv2.dnn.readNetFromTensorflow(weightsPath, configPath)

# initialize the video stream and pointer to output video file
vs = cv2.VideoCapture("./Bus_11_front_door/2019-6-3_17-12-52.mp4")
writer = None

try:
    prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT
    total = int(vs.get(prop))
    print("[INFO] {} total frames in video".format(total))

# an error occurred while trying to determine the total
# number of frames in the video file
except:
    print("[INFO] could not determine # of frames in video")
    total = -1

# loop over frames from the video file stream
while True:
    # read the next frame from the file
    (grabbed, frame) = vs.read()

    # if the frame was not grabbed, then we have reached the end
    # of the stream
    if not grabbed:
        break

    blob = cv2.dnn.blobFromImage(frame, swapRB=True, crop=False)
    net.setInput(blob)
    start = time.time()
    (boxes, masks) = net.forward(["detection_out_final", "detection_masks"])

    end = time.time()

    for i in range(0, boxes.shape[2]):
        # extract the class ID of the detection along with the
        # confidence (i.e., probability) associated with the
        # prediction
        classID = int(boxes[0, 0, i, 1])
        #print(LABELS[classID])
        confidence = boxes[0, 0, i, 2]

        if confidence > 0.5:
            if classID == 0:
                # scale the bounding box coordinates back relative to the
                # size of the frame and then compute the width and the
                # height of the bounding box
                (H, W) = frame.shape[:2]
                box = boxes[0, 0, i, 3:7] * np.array([W, H, W, H])
                (startX, startY, endX, endY) = box.astype("int")
                boxW = endX - startX
                boxH = endY - startY

                # extract the pixel-wise segmentation for the object,
                # resize the mask such that it's the same dimensions of
                # the bounding box, and then finally threshold to create
                # a *binary* mask
                mask = masks[i, classID]
                mask = cv2.resize(mask, (boxW, boxH),
                    interpolation=cv2.INTER_NEAREST)
                mask = (mask > 0.3)

                # extract the ROI of the image but *only* extracted the
                # masked region of the ROI
                roi = frame[startY:endY, startX:endX][mask]

                # grab the color used to visualize this particular class,
                # then create a transparent overlay by blending the color
                # with the ROI
                color = COLORS[classID]
                blended = ((0.4 * color) + (0.6 * roi)).astype("uint8")

                # store the blended ROI in the original frame
                frame[startY:endY, startX:endX][mask] = blended

                # draw the bounding box of the instance on the frame
                color = [int(c) for c in color]
                cv2.rectangle(frame, (startX, startY), (endX, endY),
                    color, 2)

                text = "{}: {:.4f}".format(LABELS[classID], confidence)
                cv2.putText(frame, text, (startX, startY - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    if writer is None:
        # initialize our video writer
        fourcc = cv2.VideoWriter_fourcc(*"MJPG")
        writer = cv2.VideoWriter("./output/test.avi", fourcc, 30, (frame.shape[1], frame.shape[0]), True)

        if total > 0:
            elap = (end - start)
            print("[INFO] single frame took {:.4f} seconds".format(elap))
            print("[INFO] estimated total time to finish: {:.4f}".format(elap * total))

#     cv2.imshow('frame', frame)

    writer.write(frame)

# release the file pointers
print("[INFO] cleaning up...")
writer.release()
vs.release()

mask-rcnn-coco/frozen_inference_graph.pb
[INFO] loading Mask R-CNN from disk...
[INFO] could not determine # of frames in video


KeyboardInterrupt: 