In [1]:
# This script does deep neural network inference without parallelization
# for comparing speed and correctness
import numpy as np

In [2]:
# Network architecture. n_neurons lists the width of every layer in order,
# from the input layer, through the hidden layers, to the output layer.
n_classes = 2  # Size of output layer
n_neurons = [2**8, 2**8, 2**8, n_classes]
# Layer count, including input and output layer. Derived from n_neurons
# instead of being hard-coded so the two can never disagree.
n_layers = len(n_neurons)

In [3]:
# Draw one random weight matrix per pair of consecutive layers. Each matrix
# has shape (width of the following layer, width of the preceding layer).
weights = [
    np.random.normal(size=(n_neurons[idx + 1], n_neurons[idx]))
    for idx in range(n_layers - 1)
]

In [4]:
# Build the input batch: one row per input-layer neuron and one column per
# input sample, filled with normally distributed random values.
n_inputs = 3
inputs = np.random.normal(loc=0.0, scale=1.0, size=(n_neurons[0], n_inputs))
# Alternative all-zero batch:
# inputs = np.zeros(shape=(n_neurons[0], n_inputs))

In [5]:
# Generate nonlinear activation function
def nonlin_activation(x):
    """Apply the hidden-layer activation element-wise.

    The activation is ``2 * exp(x) / (exp(x) + 1) - 1``, which is
    algebraically equal to ``tanh(x / 2)`` and maps values into the range
    from -1 to 1.

    The tanh form is used for the evaluation because computing the
    exponential formula directly overflows for large positive ``x``:
    ``exp(x)`` becomes ``inf`` and ``inf / inf`` produces ``nan`` instead of
    the limit value 1.

    Parameters
    ----------
    x : array_like or scalar
        Pre-activation values.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Activated values, same shape as ``x``.
    """
    # np.multiply (rather than ``x / 2``) avoids integer floor division on
    # Python 2 and also accepts plain lists and scalars.
    return np.tanh(np.multiply(0.5, x))

def softmax(x):
    """Compute the softmax of ``x`` independently for every column.

    Each column is exponentiated and divided by the sum of its exponentials,
    so every column of the result sums to 1.

    Parameters
    ----------
    x : array_like
        Values to normalise. For a 2-D array the softmax is taken down each
        column (axis 0); a 1-D array is treated as a single column.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x``. Floating-point inputs keep their
        dtype; integer and boolean inputs are converted to float so the
        probabilities are not truncated to zero.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        # Integer/bool input: compute (and return) in floating point.
        x = x.astype(float)
    if x.size == 0:
        # Nothing to normalise; max() below would fail on an empty axis.
        return np.zeros_like(x)

    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing to inf (which gave nan).
    shifted = x - np.max(x, axis=0, keepdims=True)
    exp_elem = np.exp(shifted)
    return exp_elem / np.sum(exp_elem, axis=0, keepdims=True)

In [6]:
# Propagate inputs through network
def infer_np_serial(inputs, layer_weights=None, layer_count=None,
                    n_classes=None, n_neurons=None):
    """Propagate a batch of inputs through the network, one layer at a time.

    Every layer multiplies its weight matrix with the previous layer's
    output (``W.dot(previous)``). All layers except the last then apply
    ``nonlin_activation``; the last layer applies ``softmax``.

    Parameters
    ----------
    inputs : numpy.ndarray
        Input batch fed to the first weight matrix.
    layer_weights : sequence of numpy.ndarray, optional
        Weight matrices, one per layer transition, in order. Defaults to the
        notebook-level ``weights``.
    layer_count : int, optional
        Number of layers, including input and output layer. When omitted it
        defaults to the notebook-level ``n_layers`` if ``layer_weights`` is
        also omitted, otherwise to ``len(layer_weights) + 1``.
    n_classes, n_neurons : optional
        Not used by the computation. Accepted so the five-argument call
        shape this notebook uses for ``NN_serial.infer_np_serial`` also
        works with this function.

    Returns
    -------
    numpy.ndarray
        Output of the final (softmax) layer.

    Raises
    ------
    ValueError
        If ``layer_count`` is smaller than 2, i.e. there is no output layer
        to evaluate.
    """
    # Fall back to notebook globals only when the caller gave nothing, so an
    # explicit call does not silently depend on which cells ran last.
    if layer_count is None:
        layer_count = n_layers if layer_weights is None else len(layer_weights) + 1
    if layer_weights is None:
        layer_weights = weights

    n_transforms = layer_count - 1
    if n_transforms < 1:
        raise ValueError(
            'The network needs at least 2 layers (input and output), got %r'
            % (layer_count,))

    activations = inputs
    # Hidden layers: linear map followed by the element-wise nonlinearity.
    for layer_i in range(n_transforms - 1):
        activations = nonlin_activation(layer_weights[layer_i].dot(activations))
    # Output layer: linear map followed by a column-wise softmax.
    return softmax(layer_weights[n_transforms - 1].dot(activations))

In [15]:
# Inspect the shape of each layer's weight matrix.
[x.shape for x in weights]

[(256, 256), (256, 256), (2, 256)]

In [14]:
# Attempt to stack all weight matrices along a third axis. This raises a
# ValueError here because the weight matrices do not all have the same shape.
np.dstack(weights).shape

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [None]:
# %load run_NNs.py
from __future__ import division
import sys
import pyopencl as cl
import numpy as np

import NN_serial

# if __name__ == '__main__':
# List our platforms
# NOTE: every print below takes a single pre-formatted string, so the output
# text is the same whether this runs on Python 2 or Python 3.
platforms = cl.get_platforms()
print('The platforms detected are:')
print('---------------------------')
for platform in platforms:
    print('{} {} version: {}'.format(platform.name, platform.vendor,
                                     platform.version))

# List devices in each platform
for platform in platforms:
    print('The devices detected on platform {} are:'.format(platform.name))
    print('---------------------------')
    for device in platform.get_devices():
        print('{} [Type: {} ]'.format(device.name,
                                      cl.device_type.to_string(device.type)))
        print('Maximum clock Frequency: {} MHz'.format(device.max_clock_frequency))
        print('Maximum allocable memory size: {} MB'.format(
            int(device.max_mem_alloc_size / 1e6)))
        print('Maximum work group size {}'.format(device.max_work_group_size))
        print('---------------------------')

# Create a context with a single device of the first platform.
if not platforms:
    raise RuntimeError('No OpenCL platform detected')
devices = platforms[0].get_devices()
if not devices:
    raise RuntimeError('No OpenCL device detected on platform ' + platforms[0].name)
# Device index 2 is the preferred device; machines that expose fewer devices
# fall back to the last one listed instead of failing with an IndexError.
preferred_device_index = 2
selected_device = devices[min(preferred_device_index, len(devices) - 1)]
context = cl.Context([selected_device])
print('This context is associated with  {} devices'.format(len(context.devices)))

# Create a queue for transferring data and launching computations.
# Turn on profiling to allow us to check event times.
queue = cl.CommandQueue(context, context.devices[0],
                        properties=cl.command_queue_properties.PROFILING_ENABLE)
print('The queue is using the device: {}'.format(queue.device.name))

In [27]:
### Set up neural network parameters ###
# n_neurons lists the width of every layer in order, from the input layer to
# the output layer.
n_classes = 2  # Size of output layer
# Full-size network (currently disabled): n_neurons = [2**8, 2**8, 2**8, n_classes]
n_neurons = [3, 3, 3, n_classes]  # Small network currently in use
# Layer count, including input and output layer. Derived from n_neurons so
# switching between the two network sizes cannot leave it out of date.
n_layers = len(n_neurons)

# Generate weights: one random matrix per pair of consecutive layers, with
# shape (width of the following layer, width of the preceding layer).
weights = [
    np.random.normal(size=(n_neurons[layer_i + 1], n_neurons[layer_i]))
    for layer_i in range(n_layers - 1)
]

In [48]:
# Pack every weight matrix, in layer order, into one flat 1-D array, and
# flatten the input batch as well (both in row-major order).
weights_1d = np.concatenate([layer.ravel() for layer in weights])
inputs_1d = inputs.flatten(order='C')

In [40]:
# Build the input batch: one row per input-layer neuron and one column per
# input sample, filled with normally distributed random values.
n_inputs = 3
inputs = np.random.normal(loc=0.0, scale=1.0, size=(n_neurons[0], n_inputs))
# Alternative all-zero batch:
# inputs = np.zeros(shape=(n_neurons[0], n_inputs))

In [None]:
# Display the current input batch.
inputs

In [41]:
# Run infer_np_serial from the NN_serial module on the current inputs,
# weights and network description, then print what it returns.
output_serial = NN_serial.infer_np_serial(
    inputs, weights, n_layers, n_classes, n_neurons
)
print(output_serial)

array([[ 0.8801707 ,  1.1783842 , -0.11314505],
       [-1.62698477, -0.18739597, -0.8429538 ],
       [ 0.97693783,  0.84947339, -0.40892578]])

In [18]:
# Run infer_np_serial from the NN_serial module on the current inputs,
# weights and network description, then print what it returns.
output_serial = NN_serial.infer_np_serial(
    inputs, weights, n_layers, n_classes, n_neurons
)
print(output_serial)

The platforms detected are:
---------------------------
Apple Apple version: OpenCL 1.2 (Sep 20 2014 22:01:02)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i7-3615QM CPU @ 2.30GHz [Type: CPU ]
Maximum clock Frequency: 2300 MHz
Maximum allocable memory size: 4294 MB
Maximum work group size 1024
---------------------------
HD Graphics 4000 [Type: GPU ]
Maximum clock Frequency: 1200 MHz
Maximum allocable memory size: 268 MB
Maximum work group size 512
---------------------------
GeForce GT 650M [Type: GPU ]
Maximum clock Frequency: 900 MHz
Maximum allocable memory size: 268 MB
Maximum work group size 1024
---------------------------
This context is associated with  1 devices
The queue is using the device: GeForce GT 650M
[[  9.99999537e-01   9.99999939e-01   5.81488874e-05]
 [  4.63442444e-07   6.09749243e-08   9.99941851e-01]]


In [None]:
#    program = cl.Program(context, open('label_regions.cl').read()).build(options='')
#
#    host_image = np.load('maze2.npy')
#    host_labels = np.empty_like(host_image)
#    host_done_flag = np.zeros(1).astype(np.int32)
#
#    gpu_image = cl.Buffer(context, cl.mem_flags.READ_ONLY, host_image.size * 4)
#    gpu_labels = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
#    gpu_done_flag = cl.Buffer(context, cl.mem_flags.READ_WRITE, 4)
#
#    # Send to the device, non-blocking
#    cl.enqueue_copy(queue, gpu_image, host_image, is_blocking=False)
#
#    local_size = (8, 8)  # 64 pixels per work group
#    global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)])
#    print global_size
#    width = np.int32(host_image.shape[1])
#    height = np.int32(host_image.shape[0])
#    halo = np.int32(1)
#
#    # Create a local memory per working group that is
#    # the size of an int (4 bytes) * (N+2) * (N+2), where N is the local_size
#    buf_size = (np.int32(local_size[0] + 2 * halo), np.int32(local_size[1] + 2 * halo))
#    gpu_local_memory = cl.LocalMemory(4 * buf_size[0] * buf_size[1])
#
#    # initialize labels
#    program.initialize_labels(queue, global_size, local_size,
#                              gpu_image, gpu_labels,
#                              width, height)
#
#    # while not done, propagate labels
#    itercount = 0
#
#    # Show the initial labels
#    cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
#    pylab.imshow(host_labels)
#    pylab.title(itercount)
#    pylab.colorbar()
#    pylab.show()
#
##    cl.enqueue_copy(queue, gpu_done_flag, host_done_flag, is_blocking=False)
##    prop_exec = program.propagate_labels(queue, global_size, local_size,
##                                             gpu_labels, gpu_done_flag,
##                                             gpu_local_memory,
##                                             width, height,
##                                             buf_size[0], buf_size[1],
##                                             halo)
#
#    show_progress = True
#    total_time = 0
#
#    while True:
#        itercount += 1
#        host_done_flag[0] = 0
#        print 'iter', itercount
#        cl.enqueue_copy(queue, gpu_done_flag, host_done_flag, is_blocking=False)
#        prop_exec = program.propagate_labels(queue, global_size, local_size,
#                                             gpu_labels, gpu_done_flag,
#                                             gpu_local_memory,
#                                             width, height,
#                                             buf_size[0], buf_size[1],
#                                             halo)
#        prop_exec.wait()
#        elapsed = 1e-6 * (prop_exec.profile.end - prop_exec.profile.start)
#        total_time += elapsed
#        # read back done flag, block until it gets here
#        cl.enqueue_copy(queue, host_done_flag, gpu_done_flag, is_blocking=True)
#        if host_done_flag[0] == 0:
#            # no changes
#            break
#        # there were changes, so continue running
#        print host_done_flag
#        if itercount % 100 == 0 and show_progress:
#            cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
#            pylab.imshow(host_labels)
#            pylab.title(itercount)
#            pylab.show()
#        if itercount % 10000 == 0:
#            print 'Reached maximal number of iterations, aborting'
#            sys.exit(0)
#
#    print('Finished after {} iterations, {} ms total, {} ms per iteration'.format(itercount, total_time, total_time / itercount))
#    # Show final result
#    cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
#    print 'Found {} regions'.format(len(np.unique(host_labels)) - 1)
#    pylab.imshow(host_labels)
#    pylab.title(itercount)
#    pylab.show()