In [1]:
using CUDAnative, CuArrays
using Base.Test

In [2]:
function kernel_vadd(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    c[i] = a[i] + b[i]

    return nothing
end

kernel_vadd (generic function with 1 method)

In [3]:
dims = (3,4)
a = round.(rand(Float32, dims) * 100)
b = round.(rand(Float32, dims) * 100)

d_a = CuArray(a)
d_b = CuArray(b)
d_c = similar(d_a)

3×4 CuArray{Float32,2}:
 0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0

In [15]:
len = prod(dims)
@cuda (len, 1) kernel_vadd(d_a, d_b, d_c)
c = Array(d_c)
@test a .+ b ≈ c

[1m[32mTest Passed[39m[22m

## Distributed computing and accelerated computing
### The Julia way!

Julia supports various forms of distributed computing.

1. A native master-worker system based on remote procedure calls
2. MPI through `MPI.jl`
3. `DistributedArrays.jl`

Julia also has support for accelerated computing

1. Low-level (C kernel) based operations `OpenCL.jl` and `CUDAdrv.jl`
2. Low-level (Julia kernel) based operations through `CUDAnative.jl`
2. High-level vendor specific abstractions `CuArray.jl` and `CLArray.jl`
2. High-level libraries like `ArrayFire.jl` and `GPUArrays.jl`

#### The Julia way! Tell us where your data is and your program will follow.
##### `broadcast` example

In [5]:
B = ones(10_000) ./ 2;
A = ones(10_000) .* π;

In [6]:
C = 2 .* A ./ B;
all(C .≈ 4*π)

true

In [7]:
typeof(C)

Array{Float64,1}

In [8]:
cuB = CuArray(B);
cuA = CuArray(A);

In [16]:
cuC = 2 .* cuA ./ (cuB .* 3);
# Disclaimer on Julia v0.6 some operations don't work `sin`. Use CUDAnative.sin instead.
all(cuC .≈ 4*π)

false

In [10]:
typeof(cuC)

CuArray{Float64,1}

In [18]:
addprocs(4)
@everywhere using DistributedArrays

In [19]:
dB = distribute(B);
dA = distribute(A);

In [20]:
dC = 2 .* dA ./ dB;
all(dC .≈ 4*π)

true

In [31]:
function mysum(X)
    acc = 0)
    for x in X
        acc += x
    end
    acc
end

LoadError: [91msyntax: extra token ")" after end of expression[39m

In [25]:
mysum(rand(10_000))

4999.974425762488

In [32]:
@code_warntype mysum(rand( 10))

Variables:
  #self# <optimized out>
  X::Array{Float64,1}
  x::Float64
  #temp#::Int64
  acc::Float64

Body:
  begin 
      acc::Float64 = (Base.sitofp)(Float64, 0)::Float64 # line 3:
      #temp#::Int64 = 1
      4: 
      unless (Base.not_int)((#temp#::Int64 === (Base.add_int)((Base.arraylen)(X::Array{Float64,1})::Int64, 1)::Int64)::Bool)::Bool goto 14
      SSAValue(2) = (Base.arrayref)(X::Array{Float64,1}, #temp#::Int64)::Float64
      SSAValue(3) = (Base.add_int)(#temp#::Int64, 1)::Int64
      x::Float64 = SSAValue(2)
      #temp#::Int64 = SSAValue(3) # line 4:
      acc::Float64 = (Base.add_float)(acc::Float64, x::Float64)::Float64
      12: 
      goto 4
      14:  # line 6:
      return acc::Float64
  end::Float64
