In [2]:
import os
import pyopencl as pcl
import numpy as np

# Enumerate every device on every OpenCL platform once and report its name,
# device type and global memory size in GB (10**9 bytes).
platforms = pcl.get_platforms()
all_devices = [d for p in platforms for d in p.get_devices()]
for d in all_devices:
    print(d.name, pcl.device_type.to_string(d.type), d.global_mem_size / 10**9)

# Select a GPU with more than MIN_GPU_MEM_GB of global memory (the AMD Radeon
# card on the machine this notebook was written on).  When several devices
# qualify, the last one enumerated wins, as in the original selection loop.
MIN_GPU_MEM_GB = 2.0
gpu_candidates = [
    d for d in all_devices
    if pcl.device_type.to_string(d.type) == 'GPU'
    and (d.global_mem_size / 10**9) > MIN_GPU_MEM_GB
]
if not gpu_candidates:
    # Fail here with a readable message instead of handing `None` to
    # pcl.Context below.
    raise RuntimeError(
        'No OpenCL GPU with more than %.1f GB of global memory was found; '
        'adjust MIN_GPU_MEM_GB or use pcl.create_some_context().' % MIN_GPU_MEM_GB
    )
dev = gpu_candidates[-1]

# Make the OpenCL context and a command queue bound to the chosen device.
# (Alternative: cntx = pcl.create_some_context() to pick a device interactively.)
cntx = pcl.Context(devices=[dev])
queue = pcl.CommandQueue(cntx, device=dev)

Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz CPU 17.179869184
Iris Pro GPU 1.610612736
AMD Radeon R9 M370X Compute Engine GPU 2.147483648


In [3]:
# Relative path of the OpenCL source file that holds the test kernels.
ktest_cl_file = os.path.join(
    os.path.join('..', 'src', 'cl'),  # directory containing the kernel sources
    'kernel_tests.cl',
)
# Last expression of the cell: shows whether that file exists.
os.path.isfile(ktest_cl_file)

True

In [271]:
# Read the OpenCL source once and close the file before compiling, then
# build the program for the context and echo the source so the kernels are
# visible in the notebook output.
with open(ktest_cl_file, 'r') as f:
    kernel_source = f.read()

programs = pcl.Program(cntx, kernel_source).build()
print(kernel_source)

__kernel void addem(__global float * a, __global float * b, __global float * c)
{

  int i = get_global_id(0);
  c[i] = a[i] + b[i];

}


__kernel void multiplyem(__global float * a, __global float * b, __global float * c)
{
  int i = get_global_id(0);
  c[i] = a[i] * b[i];
}

__kernel void testdot(__global float * a, __global float * b, __global float * c){
  int gid = get_global_id(0);
  c[gid] = dot(a[gid], b[gid]);
}

__kernel void test_rowaverage(__global float * in, __global float * out, const int nrows, const int ncols)
{
  float nrowsf = (float) nrows;
  for(int i = 0; i < nrows; i++){
    for (int j = 0; j < ncols; j++){
      out[j] += in[i * ncols + j];
      out[j] /= nrowsf;
    }
  }

}

__kernel void test_reduction_avg(__global float * in, __global float * out, __local float * partial_sums, const int nrows)
{
  int lid = get_local_id(0);
  int gid = get_global_id(0);
  int group_size = get_local_size(0);
  float nrowsf = (float) nrows;

  partial_sums[lid] = in[gid];
  b

In [5]:
# Host-side arrays: two inputs (a, b) and one zeroed output (c), each holding
# ten float32 values.
a = np.full((10,), 3, dtype=np.float32)
# b = np.full((10,), 5, dtype=np.float32)   # alternative: constant second input
b = np.arange(10, dtype=np.float32)
c = np.zeros(10, dtype=np.float32)

In [6]:
# Display the host arrays: inputs a and b, and the zero-initialised output c.
a, b, c

(array([ 3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.], dtype=float32),
 array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.], dtype=float32),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32))

In [7]:
# Create one read/write device buffer per host array, each initialised with a
# copy of the host data (COPY_HOST_PTR).
a_buf, b_buf, c_buf = (
    pcl.Buffer(cntx,
               pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR,
               hostbuf=host_array)
    for host_array in (a, b, c)
)

In [8]:
# Enqueue the `testdot` kernel with a global size equal to a.shape (one
# work-item per element) and let the runtime choose the work-group size.
#
# The `addem` / `multiplyem` launches are kept for reference but disabled:
# add_event = programs.addem(queue, a.shape, None, a_buf, b_buf, c_buf)
# multiply_event = programs.multiplyem(queue, a.shape, None, c_buf, a_buf, b_buf)
dot_event = programs.testdot(queue, a.shape, None, a_buf, b_buf, c_buf)

In [9]:
# queue.finish()

In [10]:
# add_event.wait()
# multiply_event.wait()

In [11]:
# Copy the device buffers back into the host arrays.  The testdot kernel only
# writes to c; b is read back as well (presumably as a sanity check).
pcl.enqueue_copy(queue, b, b_buf)
pcl.enqueue_copy(queue, c, c_buf)

<pyopencl.cffi_cl.NannyEvent at 0x110094748>

In [12]:
# Display b after the round trip through b_buf.
b

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.], dtype=float32)

In [13]:
# Display the testdot result; the kernel applies dot() to two scalar floats
# per work-item, so c[i] is a[i] * b[i].
c

array([  0.,   3.,   6.,   9.,  12.,  15.,  18.,  21.,  24.,  27.], dtype=float32)

In [70]:
# 3x3 float32 test matrix, plus a zeroed float32 vector with one slot per
# column to receive the averages.
X = np.array(
    [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9]],
    dtype=np.float32,
)
x_avg = np.zeros_like(X[0])

In [15]:
# NOTE(review): this cell's execution count (15) is out of order relative to
# its neighbours, so the recorded output may come from an earlier kernel run
# rather than from the testdot launch above -- confirm with Restart & Run All.
c

array([ 8.,  8.,  8.,  8.,  8.,  8.,  8.,  8.,  8.,  8.], dtype=float32)

In [71]:
# Device copies of the host arrays: the input matrix is read-only for the
# kernel, the averages vector is read/write.
X_buf = pcl.Buffer(
    cntx,
    pcl.mem_flags.COPY_HOST_PTR | pcl.mem_flags.READ_ONLY,
    hostbuf=X,
)
x_avg_buf = pcl.Buffer(
    cntx,
    pcl.mem_flags.COPY_HOST_PTR | pcl.mem_flags.READ_WRITE,
    hostbuf=x_avg,
)

In [72]:
# Display the host-side averages vector before the kernel runs.
x_avg

array([ 0.,  0.,  0.], dtype=float32)

In [73]:
# `test_rowaverage` never calls get_global_id(): each work-item runs the
# whole serial double loop and accumulates into the same out[j] entries.
# Launch a single work-item so those read-modify-write updates cannot race
# (the previous global size, X.shape, started one work-item per matrix entry).
# NOTE(review): the kernel also divides by nrows inside the row loop, so the
# result still differs from X.mean(axis=0); that must be fixed in
# kernel_tests.cl itself.
row_avg_event = programs.test_rowaverage(queue,
                                         (1,),
                                         None,
                                         X_buf,
                                         x_avg_buf,
                                         np.int32(X.shape[0]),
                                         np.int32(X.shape[1]))

In [74]:
# Copy the kernel's output buffer back into the host array x_avg.
pcl.enqueue_copy(queue, x_avg, x_avg_buf)

<pyopencl.cffi_cl.NannyEvent at 0x1100bf1d0>

In [75]:
# Display the values produced by test_rowaverage.
x_avg

array([ 2.81481481,  3.29629588,  3.77777743], dtype=float32)

In [76]:
# NumPy reference: mean over axis 0, i.e. one average per column of X.
X.mean(axis=0)

array([ 4.,  5.,  6.], dtype=float32)

In [77]:
# Display the input matrix.
X

array([[ 1.,  2.,  3.],
       [ 4.,  5.,  6.],
       [ 7.,  8.,  9.]], dtype=float32)

In [228]:
# Input vector 1..10 for the reduction test, and a zeroed output array of the
# same shape and dtype (float32).
z = np.arange(1, 11, dtype=np.float32)
z_out = np.zeros_like(z)

In [229]:
# Work size for the reduction tests: 16 is the smallest power of two that is
# not below the 10 input elements.
next_pow2_size = 2 ** 4

In [230]:
# The reduction is launched with next_pow2_size work-items, and the visible
# part of the kernel loads in[gid] for every one of them without a bounds
# check.  The device input must therefore hold at least that many floats, or
# the extra work-items read past the end of the buffer.  Zero-pad the host
# data: the extra zeros do not change the sum, and the true element count is
# passed to the kernel separately (nrows).
padded_len = max(next_pow2_size, z.shape[0])
z_padded = np.zeros(padded_len, dtype=z.dtype)
z_padded[:z.shape[0]] = z

z_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_ONLY | pcl.mem_flags.COPY_HOST_PTR, hostbuf=z_padded)
z_out_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=z_out)
# Local scratch space: one element of z's dtype per work-item.
partial_buf = pcl.LocalMemory(size=z.itemsize * next_pow2_size)

In [231]:
# Display the shape of the reduction input.
z.shape

(10,)

In [232]:
# Launch the reduction with next_pow2_size work-items -- the same value that
# sizes the local scratch buffer -- instead of repeating the literal 16, so
# the two cannot drift apart.  The last argument is the number of real input
# elements.
reduction_event = programs.test_reduction_avg(queue,
                                              (next_pow2_size,),
                                              None,
                                              z_buf,
                                              z_out_buf,
                                              partial_buf,
                                              np.int32(z.shape[0]))

In [233]:
# Copy the reduction output buffer back into the host array z_out.
pcl.enqueue_copy(queue, z_out, z_out_buf)

<pyopencl.cffi_cl.NannyEvent at 0x1100bb1d0>

In [234]:
# Display the reduction output copied back from the device.
z_out

array([ 5.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ], dtype=float32)

In [235]:
# NumPy reference: sum of the input vector.
z.sum()

55.0

In [236]:
# NumPy reference: mean of the input vector, to compare with z_out.
z.mean()

5.5

In [279]:
# Random 10x6 float32 test matrix.  A fixed seed makes the input (and so every
# output below) reproducible across Restart & Run All.
Y_SEED = 0
y = np.random.RandomState(Y_SEED).normal(size=(10, 6)).astype(np.float32)
# Zeroed float32 output with one slot per column of y.
y_row_avg = np.zeros(y.shape[1], dtype=np.float32)

In [280]:
# Device copies of the matrix (read-only) and of the per-column output
# (read/write), both initialised from the host arrays.
y_buf = pcl.Buffer(
    cntx,
    pcl.mem_flags.COPY_HOST_PTR | pcl.mem_flags.READ_ONLY,
    hostbuf=y,
)
y_out_buf = pcl.Buffer(
    cntx,
    pcl.mem_flags.COPY_HOST_PTR | pcl.mem_flags.READ_WRITE,
    hostbuf=y_row_avg,
)
# Local scratch space for the partial sums.
# NOTE(review): y[0] is a whole row (y.shape[1] floats), so this requests
# y.shape[1] times more bytes than one float per work-item would need --
# confirm the intended size against the kernel, which is not shown here.
partial_sum_buf = pcl.LocalMemory(size=y[0].nbytes*next_pow2_size*y.shape[1])

In [281]:
# Launch the matrix reduction on a 2-D global range of
# (next_pow2_size, number of columns); the trailing scalar arguments are the
# matrix extents (rows, then columns) as 32-bit ints.
row_reduction_event = programs.test_reduction_avg_matrix(
    queue,
    (next_pow2_size, y.shape[1]),
    None,
    y_buf,
    y_out_buf,
    partial_sum_buf,
    *(np.int32(extent) for extent in y.shape)
)

In [282]:
# Copy the matrix-reduction output buffer back into the host array y_row_avg.
pcl.enqueue_copy(queue, y_row_avg, y_out_buf)

<pyopencl.cffi_cl.NannyEvent at 0x110436eb8>

In [283]:
# Display the per-column result computed on the device.
y_row_avg

array([ 0.2853137 ,  0.1544431 , -0.01165392,  0.11329027,  0.45523071,
        0.16340153], dtype=float32)

In [284]:
# NumPy reference: mean over axis 0 (one value per column), to compare with
# y_row_avg.
y.mean(axis=0)

array([ 0.2853137 ,  0.15444311, -0.01165392,  0.11329027,  0.45523077,
        0.16340156], dtype=float32)