# Parallel computing in Julia

## On CPU: Multi-process on multiple machines

### Monte Carlo simulation

##### Running on one core

In [None]:
"""
    calc_pi(samples)

Estimate π by Monte Carlo sampling on a single process.

Draws `samples` points uniformly from the unit square and returns four times the
fraction of them that satisfies `x^2 + y^2 <= 1`.

# Arguments
- `samples`: number of random points to draw; used as the upper bound of `1:samples`.

# Returns
The estimate `4 * hits / samples`. When `samples` is `0` no point is drawn and the
division yields `NaN`.
"""
function calc_pi(samples)
    hits = 0
    for _ in 1:samples
        # Two scalar draws instead of `rand(2)`: no temporary array per iteration.
        x, y = rand(), rand()
        if x^2 + y^2 <= 1
            hits += 1
        end
    end
    # Returned directly rather than bound to a local named `π`, which would shadow
    # the built-in constant.
    return 4 * hits / samples
end;

In [None]:
# Number of random points to draw. Kept as an integer so that `1:samples` is an
# integer range rather than a floating-point one.
samples = 10^8
# Warm-up call: compiles `calc_pi` so the timing below measures the run itself.
calc_pi(1)
@time calc_pi(samples)

In [None]:
# Number of processes available in this session (the master process counts too)
nprocs()

In [None]:
# IDs of the current worker processes
workers()

In [None]:
# Remove every process listed by workers(), whichever host it runs on, so the
# following cells start from a clean set of workers
rmprocs(workers())

##### Adding more processes is a one-liner

In [None]:
# Start workers on two remote hosts over SSH: 2 on 10.4.1.4 (logging in as root)
# and 2 on 10.4.1.5, both reached on port 6666, authenticating with the key file
# `id_rsa` and tunnelling the worker connections through SSH.
# (Calling `addprocs()` with no arguments would start local workers instead.)
addprocs([("root@10.4.1.4:6666",2), ("10.4.1.5:6666",2)], sshflags=`-i id_rsa`, tunnel=true)

#### Running on multiple cores

In [None]:
"""
    parallel_calc_pi(samples)

Estimate π by Monte Carlo sampling, distributing the iterations with `@parallel`
and combining the per-iteration hits (`1` or `0`) with `+`.

# Arguments
- `samples`: number of random points to draw; used as the upper bound of `1:samples`.

# Returns
The estimate `4 * hits / samples`.
"""
function parallel_calc_pi(samples)
    hits = @parallel (+) for i = 1:samples
        # Two scalar draws instead of `rand(2)`: no temporary array per iteration.
        x, y = rand(), rand()
        # The value of the last expression is what gets reduced. A `return` here
        # would leave the generated chunk function after its first iteration, so
        # each chunk would contribute at most one hit to the sum.
        ifelse(x^2 + y^2 <= 1, 1, 0)
    end
    return 4 * hits / samples
end

In [None]:
# Time the distributed estimate, using the same `samples` as the single-core run
@time parallel_calc_pi(samples)

## On a single GPU

In [1]:
using CUDAnative, CUDAdrv

In [2]:
"""
    kernel_dist(X, Y, out)

GPU kernel: for the element index `i` handled by the calling thread, store the
squared distance of the point `(X[i], Y[i])` from `(0.5, 0.5)` in `out[i]`.

Threads whose index lies past the end of `out` do nothing, so the kernel may be
launched with a thread count rounded up to a whole number of blocks.

# Arguments
- `X`, `Y`: point coordinates; assumed to be at least as long as `out`.
- `out`: output vector, written in place.

# Returns
`nothing`; the result is written directly into `out`.
"""
function kernel_dist(X::AbstractVector{Float32}, Y::AbstractVector{Float32}, out::AbstractVector{Float32})
    # Global 1-based element index; only the block index needs the `- 1` shift.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # The last block may contain threads beyond the end of the data.
    if i <= length(out)
        out[i] = (X[i] - 0.5)^2 + (Y[i] - 0.5)^2
    end
    # Return nothing, since we're writing directly to the out array allocated on GPU
    return nothing
end

kernel_dist (generic function with 1 method)

In [11]:
# Problem size: number of random points in the unit square
samples = 10_000_000

# Host-side coordinates, their device copies, and a device buffer for the result
a = rand(Float32, samples)
b = rand(Float32, samples)
a_cu, b_cu = CuArray(a), CuArray(b)
c_cu = similar(a_cu)
n = length(a)

# Launch geometry: as many threads per block as the device allows (capped at n),
# and enough blocks to cover all n elements, rounded up
ctx = CuCurrentContext()
dev = device(ctx)
max_threads = attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)
threads = min(max_threads, n)
blocks = cld(n, threads)

9766

In [13]:
# Print the launch configuration, one value per line:
# max threads per block, threads actually used, number of blocks
println("$max_threads\n$threads\n$blocks")

1024
1024
9766


In [9]:
# Launch the kernel over the configured grid
@cuda (blocks, threads) kernel_dist(a_cu, b_cu, c_cu)
# Copy the squared distances back to the host once, while the context is still
# alive; device memory must not be read after the context has been destroyed.
c = Array(c_cu)

destroy!(ctx)

# A point lies inside the circle of radius 0.5 centred on (0.5, 0.5) when its squared
# distance is below 0.25; four times that fraction estimates π.
# Named `pi_estimate` so the built-in constant `pi` is not shadowed.
pi_estimate = 4 * count(x -> x < 0.25, c) / length(c)

3.141372

## On GPUs located on multiple machines

In [1]:
# Start one worker on the remote machine 10.4.1.4 (as root, port 6666) over SSH,
# authenticating with the key file `id_rsa` and tunnelling the connection
addprocs([("root@10.4.1.4:6666",1)], sshflags=`-i id_rsa`, tunnel=true)

1-element Array{Int64,1}:
 2

In [2]:
@everywhere using CUDAnative, CUDAdrv

In [3]:
# kernel_dist(X, Y, gpu_cu) -> nothing
#
# GPU kernel, defined on every process. For the element index `i` handled by the
# calling thread, stores the squared distance of the point (X[i], Y[i]) from
# (0.5, 0.5) in gpu_cu[i]. Threads whose index lies past the end of `gpu_cu` do
# nothing, so the kernel may be launched with a thread count rounded up to a whole
# number of blocks. X and Y are assumed to be at least as long as `gpu_cu`.
@everywhere function kernel_dist(X::AbstractVector{Float32}, Y::AbstractVector{Float32}, gpu_cu::AbstractVector{Float32})
    # Global 1-based element index; the block index is shifted by one so that the
    # first block starts at element 1.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # The last block may contain threads beyond the end of the data.
    if i <= length(gpu_cu)
        gpu_cu[i] = (X[i] - 0.5)^2 + (Y[i] - 0.5)^2
    end
    # Return nothing, since we're writing directly to the out array allocated on GPU
    return nothing
end

In [4]:
# distmontegpu([samples]) -> Float64
#
# Monte Carlo estimate of π computed on the GPU of the calling process; defined on
# every process. Draws `samples` random points in the unit square on the host, copies
# them to the device, lets `kernel_dist` compute each point's squared distance from
# (0.5, 0.5), copies the distances back, and returns four times the fraction that
# is below 0.25.
#
# `samples` defaults to 50_000_000 and must be positive (ArgumentError otherwise).
# The current CUDA context is destroyed before returning.
@everywhere function distmontegpu(samples::Integer = 50_000_000)
    samples > 0 || throw(ArgumentError("samples must be positive, got $samples"))

    a = rand(Float32, samples)
    b = rand(Float32, samples)
    a_cu = CuArray(a)
    b_cu = CuArray(b)
    c_cu = similar(a_cu)
    n = length(a)

    # Launch geometry: as many threads per block as the device allows (capped at n),
    # and enough blocks to cover all n elements, rounded up.
    ctx = CuCurrentContext()
    dev = device(ctx)
    max_threads = attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)
    threads = min(max_threads, n)
    blocks = cld(n, threads)

    @cuda (blocks, threads) kernel_dist(a_cu, b_cu, c_cu)
    # Copy the result back once, before the context goes away; device memory must
    # not be read after `destroy!`.
    c = Array(c_cu)
    destroy!(ctx)

    # Returned directly rather than bound to a local named `pi`, which would shadow
    # the built-in constant.
    return 4 * count(x -> x < 0.25, c) / length(c)
end

In [6]:
# Run the GPU estimate once on every process.
# NOTE(review): the value returned by distmontegpu() is not captured here; if the
# estimates are needed, something like remotecall_fetch(distmontegpu, pid) per
# process would return them — confirm intent.
@everywhere distmontegpu()

# Parallel macro

#### @parallel - The go-to tool for handling small tasks

In [None]:
# Start one worker on the remote machine 10.4.1.4 (as root, port 6666) through an
# SSH tunnel.
# NOTE(review): unlike the other addprocs calls in this notebook, no `sshflags` key
# file is passed here — confirm that the default SSH identity is intended.
addprocs([("root@10.4.1.4:6666", 1)], tunnel=true)

In [None]:
# Sequential baseline: add up 1..200000000 in a plain loop and time it.
# The accumulator is called `total` so that `Base.sum` is not shadowed.
total = 0
tic()
for i in 1:200000000
    total += i
end
toc()
println(total)

In [None]:
# Same sum, distributed with `@parallel`: each iteration yields `i` (already an
# Int, so no conversion is needed) and the partial results are combined with `+`.
# The result is called `total` so that `Base.sum` is not shadowed.
tic()
total = @parallel (+) for i = 1:200000000
    i
end
toc()
println(total)

###### (daaaaaaaaaaamn!)