In [1]:
using CUDAnative, CUDAdrv

In [2]:
"""
    kernel_dist(X, Y, out)

GPU kernel: for the global thread index `i`, store in `out[i]` the squared
distance of the point `(X[i], Y[i])` from `(0.5, 0.5)`.

Threads whose index lies beyond `length(out)` do nothing, so the kernel can be
launched with a grid that has been rounded up to a whole number of blocks.
`X` and `Y` are expected to hold at least `length(out)` elements.

Returns `nothing`; the result is written directly into `out`.
"""
function kernel_dist(X::AbstractVector{Float32}, Y::AbstractVector{Float32}, out::AbstractVector{Float32})
    # Global thread index. The block index is shifted by one so that the first
    # thread of the first block maps to the 1-based Julia index 1.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x

    # The grid is usually rounded up, so the trailing threads of the last block
    # have no element to process and must not touch memory.
    if i <= length(out)
        # Float32 literals keep the arithmetic in single precision, matching
        # the element type of the arguments.
        out[i] = (X[i] - 0.5f0)^2 + (Y[i] - 0.5f0)^2
    end

    # Nothing to return: the result lives in `out`.
    return nothing
end

kernel_dist (generic function with 1 method)

In [4]:
# Host-side inputs: two vectors of uniform random Float32 samples.
samples = 1000000
a = rand(Float32, samples)
b = rand(Float32, samples)
n = length(a)

# Device-side copies of the inputs, plus an output buffer shaped like `a_cu`.
a_cu = CuArray(a)
b_cu = CuArray(b)
c_cu = similar(a_cu)

# Look up the device behind the current context to read its per-block limit.
ctx = CuCurrentContext()
dev = device(ctx)
max_threads = attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)

# Launch configuration: fill each block up to the limit and round the number
# of blocks up so that all `n` elements are covered.
threads = min(max_threads, n)
blocks = ceil(Int, n / threads)

977

In [5]:
# Run the kernel over the whole input with the launch configuration computed above.
@cuda (blocks, threads) kernel_dist(a_cu, b_cu, c_cu)

# Copy the result back to the host exactly once, and do it BEFORE the context
# is destroyed, so the read never depends on the context still being usable.
c = Array(c_cu)

# Device work is finished; release the context.
destroy!(ctx)

# A squared distance below 0.25 means the point is within 0.5 of (0.5, 0.5).
# The fraction of such points, times 4, is the estimate.
# NOTE(review): this global named `pi` shadows `Base.pi` for the rest of the
# session; the name is kept so that any code reading it keeps working.
pi = 4 * count(x -> x < 0.25, c) / length(c)

3.139884