In [252]:
import os
import pyopencl as pcl
import numpy as np

# let's try to find available devices
platforms = pcl.get_platforms()
for p in platforms:
    devs = p.get_devices()
    for d in devs:
        print(d.name,d.type, pcl.device_type.to_string(d.type), d.global_mem_size / 10**9)

# let's select the AMD radeon card in this case
dev=None
for p in pcl.get_platforms():
    devs = p.get_devices()
    for d in devs:
        if pcl.device_type.to_string(d.type) == 'GPU' and (d.global_mem_size / 10**9) > 2.0:
            dev = d
            
# make the opencl context
# cntx = pcl.create_some_context()
cntx = pcl.Context( [dev])
queue = pcl.CommandQueue(cntx)

Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz 2 CPU 17.179869184
Iris Pro 4 GPU 1.610612736
AMD Radeon R9 M370X Compute Engine 4 GPU 2.147483648


In [239]:
ktest_cl_file = os.path.join('..', 'src', 'cl', 'kernel_tests.cl')
os.path.isfile(ktest_cl_file)

True

In [897]:
# build the kernel
with open(ktest_cl_file, 'r') as f:
    programs = pcl.Program(cntx, f.read()).build()
    f.seek(0)
    print(f.read())

__kernel void addem(__global float * a, __global float * b, __global float * c)
{

  int i = get_global_id(0);
  c[i] = a[i] + b[i];

}


__kernel void multiplyem(__global float * a, __global float * b, __global float * c)
{
  int i = get_global_id(0);
  c[i] = a[i] * b[i];
}

__kernel void testdot(__global float * a, __global float * b, __global float * c){
  int gid = get_global_id(0);
  c[gid] = dot(a[gid], b[gid]);
}

__kernel void test_rowaverage(__global float * in, __global float * out, const int nrows, const int ncols)
{
  float nrowsf = (float) nrows;
  for(int i = 0; i < nrows; i++){
    for (int j = 0; j < ncols; j++){
      out[j] += in[i * ncols + j];
      out[j] /= nrowsf;
    }
  }

}


__kernel void two_stage_reduce(__global float * in, __local float * scratch, __global float * out, __const int size)
{
  int gid = get_global_id(0);
  float accum = INFINITY;
  // loop sequentially over the input
  while(gid < size){
    float element = in[gid];
    accum = (accum < elem

Build on <pyopencl.Device 'AMD Radeon R9 M370X Compute Engine' on 'Apple' at 0x1021c00> succeeded, but said:

  float nrowsf = (float) nrows;
        ^
  float nrowsf = (float) nrows;
        ^



In [None]:
# set up the buffers and arrays
a = np.ones(shape=(10, ), dtype=np.float32) * 3
# b = np.ones(shape=(10, ), dtype=np.float32) * 5
b = np.arange(0,10,1, dtype=np.float32)
c = np.zeros(shape=(10, ), dtype=np.float32)

In [None]:
a, b, c

In [None]:
a_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=a)
b_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=b)
c_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=c)

In [None]:
# queue up the two kernels 
# add_event = programs.addem(queue,
#                            a.shape,
#                            None,
#                            a_buf, 
#                            b_buf,
#                            c_buf)

# multiply_event = programs.multiplyem(queue,
#                                      a.shape,
#                                      None,
#                                      c_buf,
#                                      a_buf,
#                                      b_buf)

dot_event = programs.testdot(queue,
                             a.shape,
                             None,
                             a_buf,
                             b_buf,
                             c_buf)

In [None]:
# queue.finish()

In [None]:
# add_event.wait()
# multiply_event.wait()

In [None]:
pcl.enqueue_copy(queue, b, b_buf)
pcl.enqueue_copy(queue, c, c_buf)

In [None]:
b

In [None]:
c

In [None]:
X = np.array([[1,2,3], [4,5,6], [7,8,9]], dtype=np.float32)
x_avg = np.zeros(shape=(X.shape[1],), dtype=np.float32)

In [None]:
c

In [None]:
X_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_ONLY | pcl.mem_flags.COPY_HOST_PTR, hostbuf=X)
x_avg_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=x_avg)

In [None]:
x_avg

In [None]:
row_avg_event = programs.test_rowaverage(queue,
                                         X.shape,
                                         None,
                                         X_buf, 
                                         x_avg_buf,
                                         np.int32(X.shape[0]),
                                         np.int32(X.shape[1]))

In [None]:
pcl.enqueue_copy(queue, x_avg, x_avg_buf)

In [None]:
x_avg

In [None]:
X.mean(axis=0)

In [None]:
X

In [292]:
dev

<pyopencl.Device 'AMD Radeon R9 M370X Compute Engine' on 'Apple' at 0x1021c00>

In [1269]:
os.environ['PYOPENCL_COMPILER_OUTPUT'] = "1"
cntx = pcl.Context( [dev])
queue = pcl.CommandQueue(cntx, properties=pcl.command_queue_properties.PROFILING_ENABLE)
with open(ktest_cl_file, 'r') as f:
    programs = pcl.Program(cntx, f.read()).build()

Build on <pyopencl.Device 'AMD Radeon R9 M370X Compute Engine' on 'Apple' at 0x1021c00> succeeded, but said:

  float nrowsf = (float) nrows;
        ^
  float nrowsf = (float) nrows;
        ^



In [1297]:
# z = np.array([1,2,3,4,5,6,7,8,9,10], dtype=np.float32)
z = np.ones(shape=(2**24,)).astype(np.float32)
z_out = np.zeros(shape=z.shape, dtype=np.float32)
scratch = np.zeros(shape=z.shape, dtype=np.float32)

In [1298]:
z_out

array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)

In [1299]:
2**20

1048576

In [1300]:
next_pow2_size=16

In [1301]:
z_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_ONLY | pcl.mem_flags.COPY_HOST_PTR, hostbuf=z)
z_out_buf = pcl.Buffer(cntx, pcl.mem_flags.WRITE_ONLY, size=z_out.nbytes)
partial_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_WRITE | pcl.mem_flags.COPY_HOST_PTR, hostbuf=scratch)
# partial_buf = pcl.LocalMemory(size=256*4)

In [1302]:
z.shape, z.size, z.nbytes/256, z_out.nbytes

((16777216,), 16777216, 262144.0, 67108864)

In [1303]:
# reduction_event = programs.test_reduction_avg_global(queue,
#                                                       z.shape,
#                                                       (32,),
#                                                       z_buf,
#                                                       z_out_buf,
#                                                       partial_buf,
#                                                       np.int32(z.shape[0]))

# partial_buf = pcl.LocalMemory(size=256*4)
# reduction_event = programs.test_reduction_avg(queue,
#                                               z.shape,
#                                               None,
#                                               z_buf,
#                                               z_out_buf,
#                                               partial_buf,
#                                               np.int32(z.shape[0]))

partial_buf = pcl.LocalMemory(4*256)

reduction_event = programs.two_stage_reduce(queue,
                                            (int(z.shape[0]/2**16),),
                                            None,
                                            z_buf,
                                            partial_buf,
                                            z_out_buf,
                                            np.int32(z.shape[0]))

In [1304]:
reduction_event.wait()

In [1305]:
pcl.enqueue_copy(queue, z_out, z_out_buf).wait()

In [1306]:
z_out

array([  1.67772160e+07,   1.34246407e-11,   1.34246407e-11, ...,
        -5.10447363e-14,  -5.10447363e-14,  -5.10447363e-14], dtype=float32)

In [1307]:
z.sum()

16777216.0

In [1308]:
np.float32((z.sum())) == z_out[0]

True

In [1289]:
z.shape[0]/4096

256.0

In [1290]:
z_out

array([  1.04857600e+06,  -3.17465100e+11,  -3.17465100e+11, ...,
        -5.10447363e-14,  -5.10447363e-14,  -5.10447363e-14], dtype=float32)

In [1291]:
z.sum()/256

4096.0

In [1292]:
z_out.shape[0] / 256

4096.0

In [1293]:
(z_out == 256.0).sum()

0

In [1294]:
z.sum()/z_out[0]

1.0

In [146]:
nrows = int(2**9)
y = np.random.normal(size = (nrows, 6), loc=10.0).astype(np.float32)
y_row_avg = np.zeros(shape=(6,)).astype(np.float32)

In [147]:
y.nbytes

12288

In [148]:
y_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_ONLY | pcl.mem_flags.COPY_HOST_PTR, hostbuf=y)
y_out_buf = pcl.Buffer(cntx, pcl.mem_flags.WRITE_ONLY, size=y_row_avg.nbytes)
partial_sum_buf = pcl.LocalMemory(size=y.nbytes)

In [149]:
row_reduction_event = programs.test_reduction_avg_matrix(queue,
                                                         y.shape,
                                                         None,
                                                         y_buf,
                                                         y_out_buf,
                                                         partial_sum_buf,
                                                         np.int32(y.shape[0]),
                                                         np.int32(y.shape[1]))

In [150]:
row_reduction_event.wait()

In [151]:
pcl.enqueue_copy(queue, y_row_avg, y_out_buf, wait_for=[row_reduction_event])

<pyopencl.cffi_cl.NannyEvent at 0x10eb92470>

In [152]:
y_row_avg

array([ 4.97654247,  5.01037407,  5.0156889 ,  5.06041002,  4.98361921,
        5.00504589], dtype=float32)

In [153]:
y.mean(axis=0), y_row_avg

(array([  9.98460674,  10.00198746,  10.04318142,  10.05097008,
          9.94996071,  10.02250481], dtype=float32),
 array([ 4.97654247,  5.01037407,  5.0156889 ,  5.06041002,  4.98361921,
         5.00504589], dtype=float32))

In [154]:
y.nbytes * 2

24576

In [155]:
y.mean(axis=0)

array([  9.98460674,  10.00198746,  10.04318142,  10.05097008,
         9.94996071,  10.02250481], dtype=float32)

In [200]:
# from: https://github.com/pyopencl/pyopencl/blob/master/examples/benchmark.py

In [311]:
from __future__ import print_function
from __future__ import absolute_import
import pyopencl as cl
import numpy
import numpy.linalg as la
import datetime
from time import time

data_points = 2**24 # ~8 million data points, ~32 MB data
workers = 2**6 # 256 workers, play with this to see performance differences
               # eg: 2**0 => 1 worker will be non-parallel execution on gpu
               # data points must be a multiple of workers

a = numpy.random.rand(data_points).astype(numpy.float32)
b = numpy.random.rand(data_points).astype(numpy.float32)
c_result = numpy.empty_like(a)

# Speed in normal CPU usage
time1 = time()
c_temp = (a+b) # adds each element in a to its corresponding element in b
c_temp = c_temp * c_temp # element-wise multiplication
c_result = c_temp * (a/2.0) # element-wise half a and multiply
time2 = time()

print("Execution time of test without OpenCL: ", time2 - time1, "s")


for platform in cl.get_platforms():
    for device in platform.get_devices():
        print("===============================================================")
        print("Platform name:", platform.name)
        print("Platform profile:", platform.profile)
        print("Platform vendor:", platform.vendor)
        print("Platform version:", platform.version)
        print("---------------------------------------------------------------")
        print("Device name:", device.name)
        print("Device type:", cl.device_type.to_string(device.type))
        print("Device memory: ", device.global_mem_size//1024//1024//1024, 'GB')
        print("Device max clock speed:", device.max_clock_frequency, 'MHz')
        print("Device compute units:", device.max_compute_units)
        print("Device max work group size:", device.max_work_group_size)
        print("Device max work item sizes:", device.max_work_item_sizes)

        # Simnple speed test
        ctx = cl.Context([device])
        queue = cl.CommandQueue(ctx, 
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        mf = cl.mem_flags
        a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
        b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
        dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

        prg = cl.Program(ctx, """
            __kernel void sum(__global float *a,
            __global float *b, __global float *c)
            {
                        int gid = get_global_id(0);
                        float a_temp;
                        float b_temp;
                        float c_temp;
                        a_temp = a[gid]; // my a element (by global ref)
                        b_temp = b[gid]; // my b element (by global ref)
                        
                        c_temp = a_temp+b_temp; // sum of my elements
                        c_temp = c_temp * c_temp; // product of sums
                        c_temp = c_temp * (a_temp/2.0f); // times 1/2 my a
                        c[gid] = c_temp; // store result in global memory
                }
                """).build()

        global_size=(data_points,1)
        local_size=(workers,)
        preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \
            cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \
            device)

        print("Data points:", data_points)
        print("Workers:", workers)
        print("Preferred work group size multiple:", preferred_multiple)

        if (workers % preferred_multiple):
            print("Number of workers not a preferred multiple (%d*N)." \
                    % (preferred_multiple))
            print("Performance may be reduced.")

        exec_evt = prg.sum(queue, global_size, None, a_buf, b_buf, dest_buf)
        exec_evt.wait()
        elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

        print("Execution time of test: %g s" % elapsed)

        c = numpy.empty_like(a)
        cl.enqueue_copy(queue, c, dest_buf).wait()
        equal = numpy.all( c == c_result)

        if not equal:
                print("Results doesn't match!!")
        else:
                print("Results OK")

Execution time of test without OpenCL:  0.15439105033874512 s
Platform name: Apple
Platform profile: FULL_PROFILE
Platform vendor: Apple
Platform version: OpenCL 1.2 (Nov  1 2016 21:34:57)
---------------------------------------------------------------
Device name: Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Device type: CPU
Device memory:  16 GB
Device max clock speed: 2500 MHz
Device compute units: 8
Device max work group size: 1024
Device max work item sizes: [1024, 1, 1]
Data points: 16777216
Workers: 64
Preferred work group size multiple: 1
Execution time of test: 0.0351943 s
Results OK
Platform name: Apple
Platform profile: FULL_PROFILE
Platform vendor: Apple
Platform version: OpenCL 1.2 (Nov  1 2016 21:34:57)
---------------------------------------------------------------
Device name: Iris Pro
Device type: GPU
Device memory:  1 GB
Device max clock speed: 1200 MHz
Device compute units: 40
Device max work group size: 512
Device max work item sizes: [512, 512, 512]
Data points: 16777