# Vector-Reduction

## Suming the elements of a large array (Vector-Reduction) using Parallel code and then benchmarking it with the serial implementation of the same.


In [1]:
using Pkg
Pkg.add("CuArrays")
Pkg.add("CUDAnative")
Pkg.add("CUDAdrv")
Pkg.add("StaticArrays")
Pkg.add("BenchmarkTools")
Pkg.add("Test")

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/envir

In [0]:
using CUDAnative, CUDAdrv, CuArrays, BenchmarkTools, Test, StaticArrays

In [13]:
N = 2^25
input = rand(N)
d_input = CuArray(input)

33554432-element CuArray{Float64,1}:
 0.20179555986464415  
 0.7891749987136487   
 0.8745746116634037   
 0.6387154833337982   
 0.7094325891928888   
 0.6651476378352903   
 0.5240269186242899   
 0.05225314432187744  
 0.869396830878671    
 0.7980381044072951   
 0.0994140181768608   
 0.2835369809184909   
 0.1922957117608639   
 ⋮                    
 0.6116356162029628   
 0.9302335559257233   
 0.0019453448190331901
 0.9075648458773309   
 0.6857375473919893   
 0.819824563037012    
 0.19390505895316879  
 0.9417410349313302   
 0.7410949640992879   
 0.19843182837461826  
 0.7893538574206285   
 0.01771276881916739  

In [14]:
# Serial approach to sum the elements of an array/vector

function serial_reduction(input)
  sum = 0
  for i=1:size(input,1)
    sum += input[i]
  end
  return sum
end

serial_reduction (generic function with 1 method)

In [15]:
# Parallel approach to sum the elements of an array/vector using Vector-Reduction 

function parallel_reduction(d_input)
  idx = ((blockIdx().x-1) * blockDim().x) + threadIdx().x
  
  # Bounds check
  if(idx <= size(d_input,1))
    step = 1
    
    while step < blockDim().x
    
      if((idx % (step * 2) == 1) && (idx + step) <= size(d_input,1))
        @inbounds d_input[idx] += d_input[idx + step]
      end
      
      step *= 2
      
      sync_threads()
    end
    
  # Atomically adding the first index in each block
    if(threadIdx().x == 1 && idx != 1)
      @inbounds @atomic d_input[1] += d_input[idx]
    end
    
  end
  
return nothing
end

parallel_reduction (generic function with 1 method)

In [16]:
serial_sum = serial_reduction(input)

1.6779476508068554e7

In [0]:
@cuda blocks = ceil(Int,N/1024) threads = 1024 parallel_reduction(d_input)

In [18]:
parallel_output = Array(d_input)

33554432-element Array{Float64,1}:
 1.677947650807114e7
 0.7891749987136487 
 1.513290094997202  
 0.6387154833337982 
 1.9508602899743464 
 0.6651476378352903 
 0.5762800629461673 
 0.05225314432187744
 3.3500900936221534 
 0.7980381044072951 
 0.3829509990953517 
 0.2835369809184909 
 1.2997041592408356 
 ⋮                  
 2.45137936282505   
 0.9302335559257233 
 0.9095101906963641 
 0.9075648458773309 
 4.387801623027202  
 0.819824563037012  
 1.135646093884499  
 0.9417410349313302 
 1.746593418713702  
 0.19843182837461826
 0.8070666262397959 
 0.01771276881916739

In [19]:
@test isapprox(serial_sum,parallel_output[1])

[32m[1mTest Passed[22m[39m

In [20]:
@benchmark serial_reduction(input)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     49.909 ms (0.00% GC)
  median time:      53.176 ms (0.00% GC)
  mean time:        53.475 ms (0.00% GC)
  maximum time:     61.321 ms (0.00% GC)
  --------------
  samples:          94
  evals/sample:     1

In [21]:
@benchmark @cuda blocks = ceil(Int,N/1024) threads = 1024 parallel_reduction(d_input)

BenchmarkTools.Trial: 
  memory estimate:  1008 bytes
  allocs estimate:  40
  --------------
  minimum time:     8.763 μs (0.00% GC)
  median time:      7.058 ms (0.00% GC)
  mean time:        6.499 ms (0.00% GC)
  maximum time:     7.295 ms (0.00% GC)
  --------------
  samples:          770
  evals/sample:     1

In [23]:
@btime @benchmark serial_reduction(input)

  11.703 s (341712 allocations: 17.31 MiB)


BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     50.313 ms (0.00% GC)
  median time:      53.207 ms (0.00% GC)
  mean time:        53.489 ms (0.00% GC)
  maximum time:     60.401 ms (0.00% GC)
  --------------
  samples:          94
  evals/sample:     1

In [24]:
@btime @cuda blocks = ceil(Int,N/1024) threads = 1024 parallel_reduction(d_input)

  8.845 μs (40 allocations: 1008 bytes)


## The parallel code is about 10^6 times faster than the serial code