In [136]:
import OpenCL
const cl = OpenCL

const generate_kernel = "
__kernel void generate(__global const int *a,
                     __global int *b)
{
    int gid = get_global_id(0);
    int nAcc = a[gid];
    int nSum = 0;
    while(nAcc > 0) {
        nSum += (nAcc % 10);
        nAcc /= 10; 
    }
    b[gid] = nSum + a[gid];
}
"

a = Array(1:5000)
b = rand(Int, 5000)
device, ctx, queue = cl.create_compute_context()

a_buff = cl.Buffer(Int, ctx, (:r, :copy), hostbuf=a)
b_buff = cl.Buffer(Int, ctx, :w, length(a))

prog = cl.Program(ctx, source=generate_kernel) |> cl.build!
kern = cl.Kernel(prog, "generate")

cl.call(queue, kern, size(a), nothing, a_buff, b_buff)

result = cl.read(queue, b_buff)
#println(result[4001:4100])
println(length(union(result)))



2270


In [38]:
from __future__ import print_function
import numpy as np
from numba import *
from timeit import default_timer as time

@cuda.jit(argtypes=[f4[:], f4[:], f4[:]])
def cuda_sum(a, b, c):
    i = cuda.grid(1)
    c[i] = a[i] + b[i]

griddim = 50, 1
blockdim = 32, 1, 1
N = griddim[0] * blockdim[0]
print("N", N)
cuda_sum_configured = cuda_sum.configure(griddim, blockdim)
a = np.array(np.random.random(N), dtype=np.float32)
b = np.array(np.random.random(N), dtype=np.float32)
c = np.empty_like(a)

ts = time()
cuda_sum_configured(a, b, c)
te = time()
print(te - ts)
assert (a + b == c).all()

x : [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
result : [  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]
