In [1]:
import pyopencl as cl
import numpy as np

In [32]:
def kernel_funtion(queue, kernel_program, global_size, input_tuple, output_tuple, local_size = (32,)):
    """Upload inputs, launch one kernel, and download outputs on ``queue``.

    Parameters
    ----------
    queue
        Command queue that every copy and the kernel launch are enqueued on.
    kernel_program
        Callable invoked as
        ``kernel_program(queue, global_size, local_size, *buffers)``.
    global_size, local_size
        Forwarded unchanged to ``kernel_program``.
    input_tuple
        Sequence of ``(host_array, device_buffer)`` pairs; each host array is
        copied into its buffer before the launch.
    output_tuple
        Sequence of ``(host_array, device_buffer)`` pairs; each buffer is
        copied back into its host array after the launch.

    The kernel receives the input buffers followed by the output buffers, in
    the order given. Returns ``None`` after calling ``queue.finish()``.
    """
    # Host -> device transfers for every input pair.
    for host_array, device_buffer in input_tuple:
        cl.enqueue_copy(queue, dest=device_buffer, src=host_array)

    # Kernel arguments: input buffers first, then output buffers.
    kernel_args = [device_buffer for _, device_buffer in input_tuple]
    kernel_args.extend(device_buffer for _, device_buffer in output_tuple)

    kernel_program(queue, global_size, local_size, *kernel_args)

    # Device -> host transfers for every output pair.
    for host_array, device_buffer in output_tuple:
        cl.enqueue_copy(queue, dest=host_array, src=device_buffer)

    queue.finish()

In [40]:
# Select the OpenCL platform whose name matches AMD's runtime string exactly.
# Indexing with [0] raises IndexError when no such platform is present.
platform = [
    candidate
    for candidate in cl.get_platforms()
    if candidate.name == "AMD Accelerated Parallel Processing"
][0]

# NOTE(review): get_devices() is called without a device-type filter, so
# `gpu` is whatever the platform returns (presumably all of its devices,
# not only GPUs - confirm).
gpu = platform.get_devices()
context = cl.Context(devices=gpu)


In [41]:
# OpenCL C source for the `sum` kernel: each work-item takes its index from
# get_global_id(0) and writes c[gid] = a[gid] + b[gid].
# NOTE(review): the kernel has no bounds check on gid, so the launch's global
# size presumably must not exceed the element count of the buffers - confirm.
program = """void kernel sum (global float *a, global float *b, global float *c){int gid = get_global_id(0);
c[gid] = a[gid] + b[gid];}"""

In [48]:
# Wrap the OpenCL C source string in a Program bound to `context`.
kernel = cl.Program(context, program)

# Build the program; the `sum` kernel is later looked up on this build result.
kernel_program = kernel.build()

In [49]:
# Problem size: 2**10 elements per vector.
N = 1 << 10

# Host-side operands (float32) and an uninitialised result array of the same
# shape and dtype as `a`.
a = np.random.rand(N).astype(np.float32)
b = np.random.rand(N).astype(np.float32)
c = np.empty_like(a)

# Device-side buffers sized to match their host arrays: the two inputs are
# created READ_ONLY, the output WRITE_ONLY.
a_buffer = cl.Buffer(context, flags=cl.mem_flags.READ_ONLY, size=a.nbytes)
b_buffer = cl.Buffer(context, flags=cl.mem_flags.READ_ONLY, size=b.nbytes)
c_buffer = cl.Buffer(context, flags=cl.mem_flags.WRITE_ONLY, size=c.nbytes)

# (host_array, device_buffer) pairs in the order the kernel expects them.
input_tuple = (
    (a, a_buffer),
    (b, b_buffer),
)
output_tuple = (
    (c, c_buffer),
)

queue = cl.CommandQueue(context)

In [50]:
# Time the full round trip: input copies, kernel launch, output copy and
# queue.finish(). This is also the only cell that runs the kernel, so `c` is
# filled in as a side effect of the timing loop.
%timeit kernel_funtion(queue, kernel_program.sum, (N,), input_tuple, output_tuple)

487 µs ± 6.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [51]:
def sumcheck(a,b,c):
    """Print whether ``c`` equals the element-wise sum ``a + b`` exactly.

    Parameters
    ----------
    a, b : array-like
        Operands; the reference result is computed as ``a + b``.
    c : array-like
        Result to verify against the reference.

    Prints ``"results match"`` when every element of ``c`` equals the
    reference, otherwise ``"results not match"``. Returns ``None``.
    """
    c_check = a + b
    # Take the absolute value of the difference *before* comparing, so that
    # deviations in either direction (c too large or too small) are caught.
    if (np.abs(c - c_check) > 0.0).any():
        print("results not match")
    else:
        print("results match")
    

In [52]:
# Compare the array copied back from the device (`c`) with a + b computed by NumPy.
sumcheck(a,b,c)

results match
