In [3]:
import os
import sys
import random
import math
import numpy as np
import skimage.io
from argparse import Namespace
import tensorflow as tf
import pickle 
import keras
import itertools
import cv2
import matplotlib
import pylab
import matplotlib.pyplot as plt
plt.ioff()

from skimage.measure import find_contours
from matplotlib import patches,  lines
from matplotlib.patches import Polygon

0.4.0


# I. Running Inference with Faster-RCNN

In [None]:
ROOT_DIR = os.path.abspath("../Pedestrian-Detection-modified/")

# Set up 
os.system('protoc object_detection/protos/*.proto --python_out=.')
os.system('export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim')

# Import Faster RCNN directory
sys.path.append(ROOT_DIR)  # To find local version of the library
from inference import *

print ("Checking Tensorflow version, making sure we have a GPU")
print (tf.VERSION)
from tensorflow.python.client import device_lib
print (device_lib.list_local_devices())

In [None]:
# Choose which set of input images to perform inference on 
# OPTIONS -> LEFT or RIGHT
img_set = 'left'

IMAGE_DIR = '../input_images/%s_view'%img_set
SAVE_DIR = '../RESULTS_faster_rcnn/%s_view/'%img_set

print ("Performing inference on images from:", IMAGE_DIR, " Saving output images to: ", SAVE_DIR)

args = Namespace(delay=0, frozen_graph=ROOT_DIR+'/output/frozen_inference_graph.pb', input_dir=IMAGE_DIR,
                 label_map=ROOT_DIR+'/annotations/label_map.pbtxt', n_jobs=1, num_output_classes=1, output_dir=SAVE_DIR)    
    
# Initializing queues and events
stitch_queue = Queue()
feed_queue = Queue()
completed = Event()

gpu_workers = []

# Creating processes for GPU inference, loading data and stitching data
for gpu_id in range(args.n_jobs):
    gpu_workers.append(Process(target=infer, args=(args, feed_queue, stitch_queue, completed, gpu_id)))
stitch_cpu = Process(target=stitch, args=(stitch_queue, completed, args))
feed_cpu = Process(target=feed, args=(feed_queue, args))

# Optional delay to give imread a head start
feed_cpu.start()
time.sleep(args.delay)

stitch_cpu.start()
for gpu in gpu_workers:
    gpu.start()

feed_cpu.join()
stitch_cpu.join()

for gpu in gpu_workers:
    gpu.join()

# II. Running Inference with Mask-RCNN

The following code is an adaptation of the demo code that comes with the Mask-RCNN directory, with some modifications to work on our input data, and a modification on the visualization script to save output images with overlaid masks. 

In [2]:
# Root directory of the project
ROOT_DIR = os.path.abspath("../Mask_RCNN_modified/")

# Import Mask RCNN
sys.path.append(ROOT_DIR)  # To find local version of the library
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize

# Import COCO config
sys.path.append(os.path.join(ROOT_DIR, "samples/coco/"))  # To find local version
import coco

# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "./mask_rcnn_coco.h5")

# Download COCO trained weights from Releases if needed
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

# Device to load the neural network on.
DEVICE = "/gpu:0"  # /cpu:0 or /gpu:0

# Inspect the model in training or inference modes
# values: 'inference' or 'training'
TEST_MODE = "inference"

### Configure/Create Inference Model and Load Trained Weights


We'll be using a model trained on the MS-COCO dataset. The configurations of this model are in the ```CocoConfig``` class in ```coco.py```.

For inferencing, modify the configurations a bit to fit the task. To do so, sub-class the ```CocoConfig``` class and override the attributes you need to change.

In [3]:
class InferenceConfig(coco.CocoConfig):
    # Set batch size to 1 since we'll be running inference on
    # one image at a time. Batch size = GPU_COUNT * IMAGES_PER_GPU
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    IMAGE_MIN_DIM = 720
    IMAGE_MAX_DIM = 1280

config = InferenceConfig()
config.display()

with tf.device(DEVICE):
    # Create model object in inference mode.
    model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)

    # Load weights trained on MS-COCO
    model.load_weights(COCO_MODEL_PATH, by_name=True)


Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     1
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 1
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  1280
IMAGE_META_SIZE                93
IMAGE_MIN_DIM                  720
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [1280 1280    3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE         

### Class Names

The model classifies objects and returns class IDs, which are integer value that identify each class. Some datasets assign integer values to their classes and some don't. For example, in the MS-COCO dataset, the 'person' class is 1 and 'teddy bear' is 88. The IDs are often sequential, but not always. The COCO dataset, for example, has classes associated with class IDs 70 and 72, but not 71.

To improve consistency, and to support training on data from multiple sources at the same time, our ```Dataset``` class assigns it's own sequential integer IDs to each class. For example, if you load the COCO dataset using our ```Dataset``` class, the 'person' class would get class ID = 1 (just like COCO) and the 'teddy bear' class is 78 (different from COCO). Keep that in mind when mapping class IDs to class names.

We include the hard-coded classnames corresponding to their class IDs below so you don't have to download the Coco dataset to run inference. 

In [4]:
# COCO Class names
# Index of the class in the list is its ID. For example, to get ID of
# the teddy bear class, use: class_names.index('teddy bear')
class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
               'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
               'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
               'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
               'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']

### Modified Visualization Code To Save Images

In [5]:
def display_imgs(filename, image, boxes, masks, class_ids, class_names,
                 scores=None, title="",
                 figsize=(14, 14), ax=None,
                 show_mask=True, show_bbox=True,
                 colors=None, captions=None, show=False):
    """
    boxes: [num_instance, (y1, x1, y2, x2, class_id)] in image coordinates.
    masks: [height, width, num_instances]
    class_ids: [num_instances]
    class_names: list of class names of the dataset
    scores: (optional) confidence scores for each box
    title: (optional) Figure title
    show_mask, show_bbox: To show masks and bounding boxes or not
    figsize: (optional) the size of the image
    colors: (optional) An array or colors to use with each object
    captions: (optional) A list of strings to use as captions for each object
    """
    # Number of instances
    # Turn interactive plotting off
    N = boxes.shape[0]
    if not N:
        print("\n*** No instances to display *** \n")
    else:
        assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]

    # If no axis is passed, create one and automatically call show()
    auto_show = False
    if not ax:
        fig, ax = plt.subplots(1, figsize=figsize)

    plt.gca().set_axis_off()
    plt.subplots_adjust(top = 1, bottom = 0, right = 1, left = 0, 
            hspace = 0, wspace = 0)
    plt.margins(0,0)
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
    # Generate random colors
    colors = colors or visualize.random_colors(N)

    # Show area outside image boundaries.
    height, width = image.shape[:2]
    ax.set_ylim(height + 10, -10)
    ax.set_xlim(-10, width + 10)
    ax.axis('off')
    ax.set_title(title)

    masked_image = image.astype(np.uint32).copy()
    for i in range(N):
        if (class_ids[i] == 1):
            color = colors[i]

            # Bounding box
            if not np.any(boxes[i]):
                # Skip this instance. Has no bbox. Likely lost in image cropping.
                continue
            y1, x1, y2, x2 = boxes[i]
            if show_bbox:
                p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
                                    alpha=0.7, linestyle="dashed",
                                    edgecolor=color, facecolor='none')
                ax.add_patch(p)

            # Label
            if not captions:
                class_id = class_ids[i]
                score = scores[i] if scores is not None else None
                label = class_names[class_id]
                x = random.randint(x1, (x1 + x2) // 2)
                caption = "{} {:.3f}".format(label, score) if score else label
            else:
                caption = captions[i]
            ax.text(x1, y1 + 8, caption,
                    color='w', size=11, backgroundcolor="none")

            # Mask
            mask = masks[:, :, i]
            if show_mask:
                masked_image = visualize.apply_mask(masked_image, mask, color)

            # Mask Polygon
            # Pad to ensure proper polygons for masks that touch image edges.
            padded_mask = np.zeros(
                (mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
            padded_mask[1:-1, 1:-1] = mask
            contours = find_contours(padded_mask, 0.5)
            for verts in contours:
                # Subtract the padding and flip (y, x) to (x, y)
                verts = np.fliplr(verts) - 1
                p = Polygon(verts, facecolor="none", edgecolor=color)
                ax.add_patch(p)
    ax.imshow(masked_image.astype(np.uint8))
    if show: plt.show()
    fig.savefig(filename, bbox_inches = 'tight', pad_inches = 0)
    plt.close("all")

### Load both sets of images (left_view, right_view) to perform inference

In [6]:
# Input and output directories for both sets of images
pickles = ['left_view.pkl','right_view.pkl']
IMAGE_DIRS = ['../input_images/left_view/', '../input_images/right_view/']
SAVE_DIRS = ['../RESULTS_mask_rcnn/left_view/', '../RESULTS_mask_rcnn/right_view/']
DEPTH_SAVE_DIRS = ['../RESULTS_mask_rcnn_depth/left_view/', '../RESULTS_mask_rcnn_depth/right_view/']

images = [[],[]] 
names = [[],[]]
for d in range(len(IMAGE_DIRS)):
    images[d] = []
    names[d] = []
    ord_names = []

    for filename in os.listdir(IMAGE_DIRS[d]):
        try:
            img = skimage.io.imread(os.path.join(IMAGE_DIRS[d], filename))
            if img is not None:
                images[d].append(img)
                names[d].append(filename)
                ord_names.append(int(filename[5:].split('.')[0]))
        except:
            print('Cant import ' + filename)    
    
    # order the images by name to know in order which frame they belong
    zipped = sorted(zip(ord_names, names[d], images[d]))
    ord_names, names[d], images[d] = zip(*zipped)

Cant import .ipynb_checkpoints
Cant import .ipynb_checkpoints


### Perform Inference with Mask-RCNN and visualize/save results

We perform inference on the left/right view input images from the loaded Mask-RCNN model, and visualize one of the images here. We run the display_save_imgs function (written above) to save the images with pedestrian bounding boxes and drawn masks in the specified SAVE_DIRS, if they don't already exist in that folder.

In [7]:
results = [[],[]]
save = False
for d in range(len(IMAGE_DIRS)):
    imgs = images[d]
    nam = names[d]
    for i in range(len(imgs)):
        # Perform inference (in order)
        with tf.device(DEVICE): 
            results[d].append(model.detect([imgs[i]]));
        r = results[d][i][0]
        if save: 
            if (i == 1):
                print ("Visualizing results from inference for image: ", nam[i])
                display_imgs(SAVE_DIRS[d]+"/proc_"+nam[i], imgs[i], r['rois'], r['masks'], r['class_ids'], class_names, r['scores'], show=True)
            else:  
                display_imgs(SAVE_DIRS[d]+"/proc_"+nam[i], imgs[i], r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])


# III. Using Mask-RCNN bounding boxes to estimate pedestrian depth
In the following section, we compute depth maps by combining the two sets of input images to create a stereo image on which we use OpenCV. We then use the computed bounding boxes from the previous section, combine the left/right view bounding boxes, and estimate depth by looking at the computed depth map. We then super-impose this depth as a caption on the images, along with the bounding boxes and masks.

In [8]:
from depth_map_creator import *

In [9]:
DEPTH_DIR = '../input_images/depth_maps/'
depth_img = [] 
depth_names = []
ord_names = []

print ("Reading in computed depth maps")
for filename in os.listdir(DEPTH_DIR):
    try:
        img = skimage.io.imread(os.path.join(DEPTH_DIR, filename))
        if img is not None:
            depth_img.append(img)
            depth_names.append(filename)
            ord_names.append(int(filename[9:].split('.')[0]))
    except:
        print('Cant import ' + filename)
    
# order the images by name to know in order which frame they belong
zipped = sorted(zip(ord_names, depth_names, depth_img))
ord_names, depth_names, depth_img = zip(*zipped)

Reading in computed depth maps


In [66]:
def depth_list(lbbox,rbbox,depth):
    """
    Given depth image, list of left and right bbox coordinates and indexes,
    find depth per object and return according to index per image
    """
    out = np.zeros((20,3))
    k = 0
    # For each of the left bounding boxes,
    # find the one in rbbox that is the closest in
    # length/width, and centroids. 
    for i in range(0,lbbox.shape[0]):
        for j in range(0,rbbox.shape[0]):
            l_obj = lbbox[i]
            r_obj = rbbox[j]
            
            print ("Comparing ", l_obj, " to ", r_obj)

            l_x1 = l_obj[1]
            l_x2 = l_obj[3]
            l_y1 = l_obj[0]
            l_y2 = l_obj[2]
            
            r_x1 = r_obj[1]
            r_x2 = r_obj[3]
            r_y1 = r_obj[0]
            r_y2 = r_obj[2]

            l_length = abs(l_x2 - l_x1)
            l_width = abs(l_y2 - l_y1)
            l_centroid_x = l_x1 + 0.5*l_length 
            l_centroid_y = l_y1 + 0.5*l_width

            r_length = abs(r_x2 - r_x1)
            r_width = abs(r_y2 - r_y1)
            r_centroid_x = r_x1 + 0.5*r_length 
            r_centroid_y = r_y1 + 0.5*r_width

            thresh = 20
            print ("Lengths: ", l_length, r_length)
            print ("Widths: ", l_width, r_width)
            print ("Centroid (x): ", l_centroid_x, r_centroid_x)
            print ("Centroid (y): ", l_centroid_y, r_centroid_y) 
            if (abs(l_length - r_length) < thresh and abs(l_width - r_width) < thresh):
                depth_x1 = int((l_x1 + r_x1)/2.0)
                depth_x2 = int((l_x2 + r_x2)/2.0)
                depth_y1 = int((l_y1 + r_y1)/2.0)
                depth_y2 = int((l_y2 + r_y2)/2.0)
                depth_l_index = i
                depth_r_index = j
                depth_obj_matrix = depth[depth_x1:depth_x2,depth_y1:depth_y2]
                depth_obj = np.mean(depth_obj_matrix[depth_obj_matrix > 0])
                out[k] = [depth_obj, depth_l_index, depth_r_index]
                k += 1
                break 
    return out

In [67]:
for i in range(100,101):
    lrois = results[0][i][0]['rois'] 
    rrois = results[1][i][0]['rois']
    left_classes = results[0][i][0]['class_ids']
    right_classes = results[1][i][0]['class_ids']
    
    lbbox = []
    rbbox = []
    for j in range(len(lrois)): 
        if (left_classes[j] == 1): lbbox.append(lrois[j])
    for j in range(len(rrois)): 
        if (right_classes[j] == 1): rbbox.append(rrois[j])
            
    lbbox = np.array(lbbox)
    rbbox = np.array(rbbox)
            
    print (names[0][i], names[1][i], depth_names[i])
    out = depth_list(lbbox, rbbox, depth_img[i])
    print (out)
    
    for k in range(out.shape[0]):
        # get the indices of each thing, and try to visualize it
        display_imgs("foo", images[0][i], lrois, r['masks'], r['class_ids'], class_names, r['scores'], show=True)

    

frame100.jpg frame100.jpg depth_map100.png
Comparing  [208 707 391 807]  to  [202 551 385 652]
Lengths:  100 101
Widths:  183 183
Centroid (x):  757.0 601.5
Centroid (y):  299.5 293.5
Comparing  [195 519 407 613]  to  [202 551 385 652]
Lengths:  94 101
Widths:  212 183
Centroid (x):  566.0 601.5
Centroid (y):  301.0 293.5
Comparing  [195 519 407 613]  to  [ 178  929  336 1019]
Lengths:  94 90
Widths:  212 158
Centroid (x):  566.0 974.0
Centroid (y):  301.0 257.0
Comparing  [195 519 407 613]  to  [ 172 1020  353 1081]
Lengths:  94 61
Widths:  212 181
Centroid (x):  566.0 1050.5
Centroid (y):  301.0 262.5
Comparing  [195 519 407 613]  to  [192 349 398 456]
Lengths:  94 107
Widths:  212 206
Centroid (x):  566.0 402.5
Centroid (y):  301.0 295.0
Comparing  [207 310 416 388]  to  [202 551 385 652]
Lengths:  78 101
Widths:  209 183
Centroid (x):  349.0 601.5
Centroid (y):  311.5 293.5
Comparing  [207 310 416 388]  to  [ 178  929  336 1019]
Lengths:  78 90
Widths:  209 158
Centroid (x):  349.0

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
