This file contains a performance test of the ROI pooling layer. It runs both a forward and backward pass
of the layer using a single batch of random data. The input size is set below and can be adjusted to taste.

In [1]:
import numpy as np
import PIL
from PIL import Image

import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops

import time

import os
home = os.getenv("HOME")

In [2]:
# Since we've added custom operations, we need to import them. Tensorflow does not automatically add custom ops.
# Adjust the paths below to your tensorflow source folder.

# Import the forward op
roi_pooling_module = tf.load_op_library(
    home + "/packages/tensorflow/bazel-bin/tensorflow/core/user_ops/roi_pooling_op.so")
roi_pooling_op = roi_pooling_module.roi_pooling

# Import the gradient op
roi_pooling_module_grad = tf.load_op_library(
    home + "/packages/tensorflow/bazel-bin/tensorflow/core/user_ops/roi_pooling_op_grad.so")
roi_pooling_op_grad = roi_pooling_module_grad.roi_pooling_grad

In [3]:
# Configuring the size of the benchmark inputs.
# This is about as large as my card (2GB memory) can handle
num_batches = 150
num_channels = 10
image_height = 200
image_width = 200
result_size = 20

input_shape = (num_batches, num_channels, image_height, image_width)
other_shape = (num_batches, num_channels, 1, result_size, result_size)
result_shape = (result_size, result_size)

In [4]:
# A ROI. Just use the whole image
rois = [[0, 0, image_height, image_width]]
rois = [rois for x in range(num_batches)] 

In [5]:
# We pre-compute the argmax so we can benchmark the gradient function separately
def get_argmax():
    sess = tf.Session()
    
    data = tf.constant(np.random.random(input_shape).astype(np.float32))
    grad = tf.constant(np.random.random((num_batches, num_channels, 1, result_shape[0], result_shape[1])).astype(np.float32))
    rois_tensor = tf.constant(np.asarray(rois).astype(np.int32))
    output_shape_tensor = tf.constant(np.asarray(result_shape).astype(np.int32))
    input_shape_tensor = array_ops.shape(data)

    result, argmax = roi_pooling_op(data, rois_tensor, output_shape_tensor)

    return sess.run(argmax)

In [6]:
argmax_data = get_argmax()

In [7]:
# Get the random inputs ready
thing1 = np.random.random(input_shape).astype(np.float32)
thing2 = np.random.random((num_batches, num_channels, 1, result_shape[0], result_shape[1])).astype(np.float32)

In [8]:
# The main benchmarking function. Device is "gpu" or "cpu".
def benchmark(device):
    sess = tf.Session()
    with tf.device("/{}:0".format(device)):
        
        # Set up the inputs
        data = tf.constant(thing1)
        grad = tf.constant(thing2)
        argmax_precomp = tf.constant(argmax_data)

        # We make them into variables so they can be cached to isolate load time from op time
        data_var = tf.Variable(data, dtype=tf.float32, name="var1")
        grad_var = tf.Variable(grad, dtype=tf.float32, name="var2")
        
        # Other small inputs
        rois_tensor = tf.constant(np.asarray(rois).astype(np.int32))
        output_shape_tensor = tf.constant(np.asarray(result_shape).astype(np.int32))
        input_shape_tensor = array_ops.shape(data)
    
        # The actual ops
        result, argmax = roi_pooling_op(data_var, rois_tensor, output_shape_tensor)
        
        # We call the gradient op explicitly to avoid the lookup overhead
        gradient_explicit = roi_pooling_op_grad(grad_var, argmax_precomp, input_shape_tensor)

        # Initialize the variables
        init = tf.initialize_all_variables()
        sess.run([init])
    
        # Run timings for the forward pass
        start = time.time()
        result_out, argmax_out = sess.run([result, argmax])
        print("Forward Pass took {} seconds".format(time.time() - start))
        
        # Since we can't cache argmax (because it's int32, long story..)
        # we compute how long it takes to load, and subtract that from the total time
        overhead = time.time()
        sess.run([argmax_precomp, grad_var, input_shape_tensor])
        overhead = time.time() - start
        
        # Timings for backward pass
        start = time.time()
        sess.run([gradient_explicit])
        print("Backwards Pass took {} seconds".format(time.time() - start - overhead))

    sess.close()  
    tf.reset_default_graph()

In [11]:
# Benchmark the CPU
benchmark("cpu")

Forward Pass took 0.08390259742736816 seconds
Backwards Pass took 0.175032377243042 seconds


In [12]:
# Benchmark the gpu
benchmark("gpu")

Forward Pass took 0.06426715850830078 seconds
Backwards Pass took 0.08208394050598145 seconds
