In [1]:
import os
import pyopencl as pcl
import numpy as np

# let's try to find available devices
platforms = pcl.get_platforms()
for p in platforms:
    devs = p.get_devices()
    for d in devs:
        print(d.name, pcl.device_type.to_string(d.type), d.global_mem_size / 10**9)

# let's select the AMD radeon card in this case
dev=None
for p in pcl.get_platforms():
    devs = p.get_devices()
    for d in devs:
        if pcl.device_type.to_string(d.type) == 'GPU' and (d.global_mem_size / 10**9) > 2.0:
            dev = d
            
# make the opencl context
# cntx = pcl.create_some_context()
cntx = pcl.Context(devices=[dev])
queue = pcl.CommandQueue(cntx, device=dev)

Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz CPU 17.179869184
Iris Pro GPU 1.610612736
AMD Radeon R9 M370X Compute Engine GPU 2.147483648


In [2]:
ktest_cl_file = os.path.join('..', 'src', 'cl', 'kernel_tests.cl')
os.path.isfile(ktest_cl_file)

True

In [3]:
# build the kernel
with open(ktest_cl_file, 'r') as f:
    programs = pcl.Program(cntx, f.read()).build()
    f.seek(0)
    print(f.read())

__kernel void addem(__global float * a, __global float * b, __global float * c){

  int i = get_global_id(0);
  c[i] = a[i] + b[i];

}


__kernel void multiplyem(__global float * a, __global float * b, __global float * c){
  int i = get_global_id(0);
  c[i] = a[i] * b[i];
}



In [70]:
# set up the buffers and arrays
a = np.ones(shape=(10, ), dtype=np.float32) * 3
b = np.ones(shape=(10, ), dtype=np.float32) * 5
c = np.zeros(shape=(10, ), dtype=np.float32)

In [71]:
a, b, c

(array([ 3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.], dtype=float32),
 array([ 5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.], dtype=float32),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32))

In [72]:
a_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=a)
b_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=b)
c_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=c)

In [73]:
# queue up the two kernels 
add_event = programs.addem(queue,
                           a.shape,
                           None,
                           a_buf, 
                           b_buf,
                           c_buf)

multiply_event = programs.multiplyem(queue,
                                     a.shape,
                                     None,
                                     c_buf,
                                     a_buf,
                                     b_buf)



In [74]:
add_event.wait()
# multiply_event.wait()

In [75]:
pcl.enqueue_copy(queue, b, b_buf).wait()

In [76]:
b

array([ 5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.], dtype=float32)

In [77]:
pcl.enqueue_copy(queue, c, c_buf).wait()

In [78]:
c

array([ 8.,  8.,  8.,  8.,  8.,  8.,  8.,  8.,  8.,  8.], dtype=float32)