> # Array-Addition


## Adding elements of two large arrays using Parallel code and then benchmarking it with the serial implementation of the same.


In [1]:
using Pkg
Pkg.add("CuArrays")
Pkg.add("CUDAnative")
Pkg.add("CUDAdrv")
Pkg.add("BenchmarkTools")
Pkg.add("Test")

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/envir

In [0]:
using CUDAnative, CUDAdrv, CuArrays, BenchmarkTools, Test

In [3]:
# Implementation using serial code

function serial_array_add!(a,b,c)
  c .= a .+ b
end

serial_array_add! (generic function with 1 method)

In [4]:
# Implementation using parallel CUDA Kernel

function parallel_array_add!(a,b,c)

  idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
  
 # Bounds check
  if(idx <= size(c,1))
    @inbounds c[idx] = a[idx] + b[idx]
  end
  
  return nothing
end

parallel_array_add! (generic function with 1 method)

In [5]:
N = 2^25
a = rand(N)
b = rand(N)
serial_result = zeros(N)
parallel_result = zeros(N) 

33554432-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮  
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [6]:
# Allocating memory on GPU(Device)

Device_a=CuArray(a)
Device_b=CuArray(b)
Device_result=CuArray(parallel_result)

33554432-element CuArray{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮  
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [7]:
serial_array_add!(a,b,serial_result)

33554432-element Array{Float64,1}:
 1.2852465425047586 
 0.3755673168736875 
 0.18717506095642933
 0.7066218968353946 
 0.7368304034565436 
 1.163561579127823  
 0.3776027570740401 
 1.1969300202021889 
 0.4400306832346019 
 0.8100256699268509 
 1.3959901676998763 
 0.9938515827570584 
 0.9833503890972217 
 ⋮                  
 0.8964199358422653 
 0.49829921954337597
 1.4946533595976714 
 0.6800103417666483 
 0.93163122967165   
 0.4373980903153407 
 1.6625721006271141 
 0.7254116104231512 
 0.874063312728165  
 0.37349873490326635
 1.0270753839484181 
 1.69387225388001   

In [8]:
@cuda blocks = ceil(Int,N/(2^10)) threads = 2^10 parallel_array_add!(Device_a, Device_b, Device_result)

┌ Info: Building the CUDAnative run-time library for your sm_37 device, this might take a while...
└ @ CUDAnative /root/.julia/packages/CUDAnative/gJDZI/src/compiler/rtlib.jl:154


In [9]:
parallel_result = Array(Device_result)

33554432-element Array{Float64,1}:
 1.2852465425047586 
 0.3755673168736875 
 0.18717506095642933
 0.7066218968353946 
 0.7368304034565436 
 1.163561579127823  
 0.3776027570740401 
 1.1969300202021889 
 0.4400306832346019 
 0.8100256699268509 
 1.3959901676998763 
 0.9938515827570584 
 0.9833503890972217 
 ⋮                  
 0.8964199358422653 
 0.49829921954337597
 1.4946533595976714 
 0.6800103417666483 
 0.93163122967165   
 0.4373980903153407 
 1.6625721006271141 
 0.7254116104231512 
 0.874063312728165  
 0.37349873490326635
 1.0270753839484181 
 1.69387225388001   

In [10]:
# Testing the correctness

@test serial_result == parallel_result

[32m[1mTest Passed[22m[39m

In [13]:
@btime serial_array_add!(a,b,serial_result)

  89.101 ms (0 allocations: 0 bytes)


33554432-element Array{Float64,1}:
 1.2852465425047586 
 0.3755673168736875 
 0.18717506095642933
 0.7066218968353946 
 0.7368304034565436 
 1.163561579127823  
 0.3776027570740401 
 1.1969300202021889 
 0.4400306832346019 
 0.8100256699268509 
 1.3959901676998763 
 0.9938515827570584 
 0.9833503890972217 
 ⋮                  
 0.8964199358422653 
 0.49829921954337597
 1.4946533595976714 
 0.6800103417666483 
 0.93163122967165   
 0.4373980903153407 
 1.6625721006271141 
 0.7254116104231512 
 0.874063312728165  
 0.37349873490326635
 1.0270753839484181 
 1.69387225388001   

In [14]:
@btime @cuda blocks = ceil(Int,N/(2^10)) threads = 2^10 parallel_array_add!(Device_a, Device_b, Device_result)

  14.153 μs (71 allocations: 2.16 KiB)


### We see that the parallel code is about 1000 times faster than serial one !!