# Histogram-Tone-Mapping

## To transform the intensities in an image and reduce the brightness spectrum bandwidth using parallel implementations of reduce, scan and histogram construction

In [1]:
using Pkg
Pkg.add("CuArrays")
Pkg.add("CUDAnative")
Pkg.add("CUDAdrv")
Pkg.add("StaticArrays")
Pkg.add("BenchmarkTools")
Pkg.add("Test")

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/envir

In [0]:
using CUDAnative, CUDAdrv, CuArrays, StaticArrays, BenchmarkTools, Test

In [4]:
N = 2^10
# Brightness values for tone mapping
input_arr = rand(N)

1024-element Array{Float64,1}:
 0.3469296475829524 
 0.42652320964933255
 0.14672477427006925
 0.590779659025265  
 0.7991733975272257 
 0.18554553957191633
 0.5793484178233579 
 0.9902520929288257 
 0.4793704817045328 
 0.49014173417192786
 0.10424878780665314
 0.35466155136589417
 0.07201106740972119
 ⋮                  
 0.249573869176974  
 0.79763283942174   
 0.8157098777157057 
 0.14417111158756457
 0.895054338620928  
 0.29651677991731784
 0.2843714916298379 
 0.7016794885722495 
 0.7141857000778034 
 0.7202133037559715 
 0.46409043297108643
 0.27396556346921974

In [7]:
# Kernel to find minimum and maximum of brightness values using reduce

function find_minmax(d_inputmin,d_inputmax)
  idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
  
  if(idx <= size(d_inputmin,1))
    step = 1
    while step < blockDim().x
      if(idx % (step*2) ==1 && idx + step <= size(d_inputmin,1))
        @inbounds d_inputmin[idx] = min(d_inputmin[idx],d_inputmin[idx + step])
        @inbounds d_inputmax[idx] = max(d_inputmax[idx],d_inputmax[idx + step])
      end
      step *= 2
      sync_threads
    end
    if(threadIdx()==1 && idx!=1)
      @inbounds @atomic d_inputmin[1] = min(d_inputmin[1],d_inputmin[idx])
      @inbounds @atomic d_inputmax[1] = max(d_inputmax[1],d_inputmax[idx])
    end
  end
  
  return
end

find_minmax (generic function with 1 method)

In [8]:
d_inputmin = CuArray(input_arr)
d_inputmax = CuArray(input_arr)

1024-element CuArray{Float64,1}:
 0.3469296475829524 
 0.42652320964933255
 0.14672477427006925
 0.590779659025265  
 0.7991733975272257 
 0.18554553957191633
 0.5793484178233579 
 0.9902520929288257 
 0.4793704817045328 
 0.49014173417192786
 0.10424878780665314
 0.35466155136589417
 0.07201106740972119
 ⋮                  
 0.249573869176974  
 0.79763283942174   
 0.8157098777157057 
 0.14417111158756457
 0.895054338620928  
 0.29651677991731784
 0.2843714916298379 
 0.7016794885722495 
 0.7141857000778034 
 0.7202133037559715 
 0.46409043297108643
 0.27396556346921974

In [0]:
@cuda blocks = Int(ceil(N/1024)) threads = 1024 find_minmax(d_inputmin,d_inputmax)

In [13]:
minimum_brightness = Array(d_inputmin)[1]

0.00032795669397178706

In [14]:
maximum_brightness = Array(d_inputmax)[1]

0.998222849796156

In [15]:
range = maximum_brightness - minimum_brightness

0.9978948931021843

In [16]:
# Number of bins in histogram
const bins = 10
# Chunk of memory accessed by each thread while constructing histogram
const chunk_histo = 4

4

In [17]:
histo = zeros(Int,bins)
d_outputhisto = CuArray(histo)
d_input_to_histo = CuArray(input_arr)

1024-element CuArray{Float64,1}:
 0.3469296475829524 
 0.42652320964933255
 0.14672477427006925
 0.590779659025265  
 0.7991733975272257 
 0.18554553957191633
 0.5793484178233579 
 0.9902520929288257 
 0.4793704817045328 
 0.49014173417192786
 0.10424878780665314
 0.35466155136589417
 0.07201106740972119
 ⋮                  
 0.249573869176974  
 0.79763283942174   
 0.8157098777157057 
 0.14417111158756457
 0.895054338620928  
 0.29651677991731784
 0.2843714916298379 
 0.7016794885722495 
 0.7141857000778034 
 0.7202133037559715 
 0.46409043297108643
 0.27396556346921974

In [18]:
# Kernel to construct histogram out of brightness values using Chrominance Luminance formula

function make_histo(d_input,d_max,d_min,d_range,d_outputhisto)

  idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
 
  shared_histo = @cuStaticSharedMem(Int64,bins)
  
  if(idx <= div(size(d_input,1),chunk_histo))
  
    for i=1:bins
      shared_histo[i]=0
    end
    
    for i=(idx*chunk_histo-(chunk_histo-1)):(idx*chunk_histo)
      bin_calc = Int64(div((d_input[i]-d_min)*bins,d_range))
      bin_calc = bin_calc == 0 ? 1 : bin_calc
      @atomic shared_histo[bin_calc] += 1
    end    
    
    sync_threads()
    
    if(threadIdx().x==1)
      for i=1:bins
        @atomic d_outputhisto[i] += shared_histo[i]
      end
    end
    
  end
return nothing
end

make_histo (generic function with 1 method)

In [0]:
@cuda blocks = Int(ceil(N/(1024*chunk_histo))) threads = 1024 make_histo(d_input_to_histo,maximum_brightness,minimum_brightness,range,d_outputhisto)

In [21]:
# Histogram constructed
h_histo = Array(d_outputhisto)

10-element Array{Int64,1}:
 197
 101
  97
 110
 110
 105
  88
 116
  99
   1

In [22]:
const threads = bins
# Chunk of memory accessed by single thread during scan
const chunk_scan = Int(ceil(bins/threads))

1

In [23]:
# Parallel implementation of scan using thread overloading

function scan(d_input)

  idx=(blockIdx().x - 1) * blockDim().x + threadIdx().x
    
  temp = @MArray zeros(chunk_scan)
  
  start = chunk_scan * idx - (chunk_scan-1)
  stop = chunk_scan * idx
  
  # Bounds check
  if(idx <= min(size(d_input,1),blockDim().x))
    step = 1
    while step < size(d_input,1)
    
    for i=start:stop
      if(i <= size(d_input,1))
        temp[i - (idx-1)*chunk_scan] = d_input[i]
      end
    end
        
    for i=start:stop
      if(i + step <= size(d_input,1))
        temp[i - (idx-1)*chunk_scan] += d_input[i + step]
      end
    end
    
    sync_threads()
    
    for i=start:stop
      if(i + step <= size(d_input,1))
        d_input[i + step] = temp[i - (idx-1)*chunk_scan]
      end
    end
    
    step *= 2
    
    end
  end

return 
end

scan (generic function with 1 method)

In [0]:
d_histo = CuArray(h_histo)
@cuda threads = threads scan(d_histo)

In [25]:
# Inclusive scan (Hillis-Steele-scan) of the histogram
inc_scan = Array(d_histo)

10-element Array{Int64,1}:
  197
  298
  395
  505
  615
  720
  808
  924
 1023
 1024

In [26]:
# Conversion to exclusive scan
for i=size(inc_scan,1):-1:2
  inc_scan[i] = inc_scan[i-1]  
end
inc_scan[1] = 0

0

In [27]:
# Final Tone Mapping result (exclusive scan)
exc_scan = inc_scan

10-element Array{Int64,1}:
    0
  197
  298
  395
  505
  615
  720
  808
  924
 1023

## Result is an array of tone-mapped values which is nothing but cumulative distribution of luminance values