> # Array-Addition


### `Adding elements of two large arrays using Parallel code and then benchmarking it with the serial implementation of the same.`

In [1]:
using Pkg
Pkg.add("CuArrays")
Pkg.add("CUDAnative")
Pkg.add("CUDAdrv")
Pkg.add("BenchmarkTools")
Pkg.add("Test")

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/envir

In [0]:
using CUDAnative, CUDAdrv, CuArrays, BenchmarkTools, Test

In [23]:
# Implementation using serial code

function serial_array_add!(a,b,c)
  c .= a .+ b
end

serial_array_add! (generic function with 1 method)

In [24]:
# Implementation using parallel CUDA Kernel

function parallel_array_add!(a,b,c)

  idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
  
  if(idx <= size(c,1))
    @inbounds c[idx] = a[idx] + b[idx]
  end
  
  return nothing
end

parallel_array_add! (generic function with 1 method)

In [25]:
N = 2^20
a = rand(N)
b = rand(N)
serial_result = zeros(N)
parallel_result = zeros(N)

# serial_result would be the result after serial addition of two arrays
# parallel_result would be the result after parallel addition of two arrays 

1048576-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮  
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [26]:
Device_a=CuArray(a)
Device_b=CuArray(b)
Device_result=CuArray(parallel_result)

# allocating memory on GPU(Device)

1048576-element CuArray{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮  
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [27]:
serial_array_add!(a,b,serial_result)

1048576-element Array{Float64,1}:
 1.551014628332057  
 0.8357149293529884 
 0.9422275980542627 
 0.252230130941983  
 0.8221095977064417 
 1.0175049548023267 
 0.8450697011035475 
 0.7336682673783679 
 0.7036961407539621 
 0.8723674468917757 
 0.39851313281466383
 0.3029388182289803 
 1.0949325154452676 
 ⋮                  
 1.3725077924315896 
 1.5654275490990286 
 0.3738082766468458 
 0.5232438862044433 
 1.281604373002078  
 1.3510478670188582 
 1.0661870941031548 
 1.0736735431584783 
 0.5812487416214256 
 0.3066193244795661 
 0.36529984385710446
 0.7490638357044848 

In [0]:
@cuda blocks = ceil(Int,N/(2^10)) threads = 2^10 parallel_array_add!(Device_a, Device_b, Device_result)

In [29]:
parallel_result = Array(Device_result)

1048576-element Array{Float64,1}:
 1.551014628332057  
 0.8357149293529884 
 0.9422275980542627 
 0.252230130941983  
 0.8221095977064417 
 1.0175049548023267 
 0.8450697011035475 
 0.7336682673783679 
 0.7036961407539621 
 0.8723674468917757 
 0.39851313281466383
 0.3029388182289803 
 1.0949325154452676 
 ⋮                  
 1.3725077924315896 
 1.5654275490990286 
 0.3738082766468458 
 0.5232438862044433 
 1.281604373002078  
 1.3510478670188582 
 1.0661870941031548 
 1.0736735431584783 
 0.5812487416214256 
 0.3066193244795661 
 0.36529984385710446
 0.7490638357044848 

In [30]:
@test serial_result == parallel_result

# testing the correctness

[32m[1mTest Passed[22m[39m

In [31]:
@benchmark serial_array_add!(a,b,serial_result)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.072 ms (0.00% GC)
  median time:      1.159 ms (0.00% GC)
  mean time:        1.176 ms (0.00% GC)
  maximum time:     2.713 ms (0.00% GC)
  --------------
  samples:          4211
  evals/sample:     1

In [32]:
@benchmark @cuda blocks = ceil(Int,N/(2^10)) threads = 2^10 parallel_array_add!(Device_a, Device_b, Device_result)

BenchmarkTools.Trial: 
  memory estimate:  2.16 KiB
  allocs estimate:  71
  --------------
  minimum time:     10.561 μs (0.00% GC)
  median time:      98.638 μs (0.00% GC)
  mean time:        88.017 μs (0.00% GC)
  maximum time:     428.995 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

### We see that the parallel code is about 100 times faster than serial one !!