In [1]:
using Pkg

# Activate the project environment stored in the `gpuenv` directory under
# `@__DIR__`, install whatever its manifest lists, and precompile the result.
Pkg.activate(joinpath(@__DIR__, "gpuenv"))
Pkg.instantiate()
Pkg.precompile()

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1mPrecompiling[22m[39m project...


# CUDA programming in Julia
- CUDAdrv.jl: Device management
- CUDAnative.jl: Native code compilation
- CuArrays.jl: High-level abstraction

We support an array interface for the common numerical routines, and we also expose the ability to hand-write CUDA kernels in Julia.

# Writing native CUDA code

In [2]:
using CuArrays, CUDAnative

using Test

"""
    vadd(a, b, c)

GPU kernel that stores the element-wise sum `a[i] + b[i]` into `c[i]`.

Each thread handles the single linear index `i` derived from its (1-based)
block and thread indices. Threads whose index falls beyond `length(c)` do
nothing, so the kernel may safely be launched with more threads than there
are elements (e.g. when the launch size is rounded up to a block multiple).

Always returns `nothing`; results are communicated through `c` only.
"""
function vadd(a, b, c)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Guard against surplus threads in the launch configuration.
    if i <= length(c)
        c[i] = a[i] + b[i]
    end
    # Kernels must not return a value; without this the assignment above
    # would make the stored element the implicit return value.
    return nothing
end

# Host-side inputs: random Float32 matrices scaled to [0, 100) and rounded
# to whole numbers.
dims = (3, 4)
a = round.(rand(Float32, dims) .* 100)
b = round.(rand(Float32, dims) .* 100)

# Upload the inputs to the GPU and allocate an output of matching shape/type.
d_a, d_b = CuArray(a), CuArray(b)
d_c = similar(d_a)

# Launch one thread per element, then copy the result back to the host and
# compare it with the CPU reference.
len = prod(dims)
@cuda threads=len vadd(d_a, d_b, d_c)
c = Array(d_c)
@test a + b ≈ c

[32m[1mTest Passed[22m[39m

# High-Level interface

In [3]:
# Build two 1024-element Float64 vectors of ones with `ones` and wrap each in
# a `CuArray`.
A = CuArray(ones(Float64, 1024))
B = CuArray(ones(Float64, 1024))

# Ordinary dot-broadcast syntax works on CuArrays; the result is a CuArray.
C = A .+ B

1024-element CuArray{Float64,1}:
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 ⋮  
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0

# DistributedArrays and CuArray

In [5]:
using Distributed
addprocs(2)

# Run the same setup on every process (master and the workers just added):
# activate the `gpuenv` project and load the packages that remotely executed
# code refers to.
@everywhere begin
    using Pkg
    Pkg.activate(joinpath(@__DIR__, "gpuenv"))
    using DistributedArrays
    using CuArrays
    # Code evaluated on the workers refers to `CUDAnative.device!` by its
    # qualified name, so the module has to be in scope on each worker too.
    using CUDAnative
end



In [6]:
# On every worker process `p`, call `CUDAnative.device!(p)` remotely and wait
# for the result; the calls are issued concurrently via `asyncmap`.
# NOTE(review): the worker id itself is passed as the device argument —
# presumably this relies on the machine having a GPU for each of those ids;
# confirm before running on a different host.
asyncmap(workers()) do p
    remotecall_fetch(CUDAnative.device!, p, p)
end

2-element Array{Nothing,1}:
 nothing
 nothing

In [7]:
using CuArrays

# Temporary fix: replace CuArrays' `poolidx` with a version that maps 0 to 1
# and any other `n` to `ceil(log2(n)) + 1`.
# NOTE(review): presumably `n` is an allocation size and the result selects a
# memory-pool bucket — confirm against the CuArrays version in use, and drop
# this override once the upstream fix is released.
function CuArrays.poolidx(n)
    n == 0 && return 1
    return ceil(Int, log2(n)) + 1
end

In [8]:
# Build a 400×400 distributed array. The `do` block receives `I`, the index
# ranges of one chunk, and returns that chunk: `m*n` random Float32 values
# from CURAND reshaped into an `m × n` matrix.
A = DArray((400, 400)) do I
    m, n = length.(I)
    flat = CuArrays.CURAND.curand(Float32, m * n)
    reshape(flat, m, n)
end

400×400 DArray{Float32,2,CuArray{Float32,2}}:
 0.740219    0.172658   0.821089   …  0.548135   0.243916  0.727783 
 0.920994    0.458052   0.810904      0.588987   0.477883  0.266962 
 0.0390205   0.174865   0.685981      0.361033   0.838053  0.719861 
 0.968963    0.311784   0.991953      0.134364   0.457187  0.256853 
 0.925141    0.368595   0.665371      0.638219   0.897029  0.179975 
 0.44635     0.135313   0.463454   …  0.240648   0.726386  0.313926 
 0.667319    0.273761   0.185223      0.206423   0.210963  0.425871 
 0.109931    0.735437   0.0184075     0.287933   0.678794  0.706352 
 0.470219    0.462143   0.538282      0.275517   0.51972   0.733093 
 0.513194    0.993498   0.512975      0.36218    0.173922  0.351693 
 0.776175    0.745678   0.923918   …  0.0489085  0.901649  0.196911 
 0.294766    0.363851   0.335562      0.0849137  0.400561  0.0804158
 0.71401     0.820029   0.0619504     0.764376   0.344917  0.1903   
 ⋮                                 ⋱                     

In [9]:
# Reduce the distributed array `A` to the sum of all its elements.
sum(A)

79737.1f0

In [10]:
# A .+ sin.(A)