In [1]:
import os
import pyopencl as pcl
import numpy as np

In [2]:
# let's try to find available devices
platforms = pcl.get_platforms()
for p in platforms:
    devs = p.get_devices()
    for d in devs:
        print(d.name, pcl.device_type.to_string(d.type), d.global_mem_size / 10**9)

Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz CPU 17.179869184
Iris Pro GPU 1.610612736
AMD Radeon R9 M370X Compute Engine GPU 2.147483648


In [3]:
# let's select the AMD radeon card in this case
dev=None
for p in pcl.get_platforms():
    devs = p.get_devices()
    for d in devs:
        if pcl.device_type.to_string(d.type) == 'GPU' and (d.global_mem_size / 10**9) > 2.0:
            dev = d

In [60]:
# make the opencl context
# cntx = pcl.create_some_context()
cntx = pcl.Context(devices=[dev])
queue = pcl.CommandQueue(cntx, device=dev)

In [64]:
# get the location of the opencl code
first_cl_file = os.path.join('..', 'src', 'cl', 'first.cl')
os.path.isfile(first_cl_file)

True

In [235]:
# build the kernel
with open(first_cl_file, 'r') as f:
    programs = pcl.Program(cntx, f.read()).build()
    f.seek(0)
    print(f.read())

__kernel void first(__global int* num1, __global int* num2, __global int* out)
{
    int i = get_global_id(0);
    out[i] = num1[i]*num1[i]+ num2[i]*num2[i];
}

__kernel void access2darray(__global float * array, __global float * out, const int row_to_access, const int nrows, const int ncols)
{
  int gr = get_global_id(0);


  if(gr == row_to_access){

  }

  __global float * row = &array[row_to_access * nrows]; //gets the starting position of the row
  for(int i=0; i < nrows; i++){
    out[i] = row[i];
  }



}



In [236]:
arr = np.ones(shape=(100, 10), dtype=np.float32)
special_row = np.int32(55)
arr[special_row, :] *= 150.0

In [251]:
out = np.empty(shape=(arr.shape[0],), dtype=np.float32)

In [252]:
arr.shape

(100, 10)

In [253]:
out

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [254]:
arr[special_row, :]

array([ 150.,  150.,  150.,  150.,  150.,  150.,  150.,  150.,  150.,  150.], dtype=float32)

In [255]:
arr_buf = pcl.Buffer(cntx, pcl.mem_flags.READ_ONLY | pcl.mem_flags.COPY_HOST_PTR, hostbuf=arr)
out_buf = pcl.Buffer(cntx, pcl.mem_flags.WRITE_ONLY, out.nbytes)

In [256]:
np.int32(arr.shape[0]), np.int32(arr.shape[1])

(100, 10)

In [270]:
execute_event = programs.access2darray(queue, 
                                       arr.shape, 
                                       None, 
                                       arr_buf, 
                                       out_buf, 
                                       np.int32(55), 
                                       np.int32(arr.shape[0]), 
                                       np.int32(arr.shape[1]))

In [271]:
execute_event.wait()

In [272]:
pcl.enqueue_copy(queue, out, out_buf).wait()

In [273]:
out

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [269]:
# looks like it uses column-major ordering....