# Transpose of a square matrix

## Finding transpose of a square matrix using tiles


In [1]:
using Pkg
Pkg.add("CUDAnative")
Pkg.add("CuArrays")
Pkg.add("CUDAdrv")
Pkg.add("BenchmarkTools")
Pkg.add("Test")

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
 [90m [be33ccc6][39m[92m + CUDAnative v2.2.1[39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Project.toml`
 [90m [c5f51814][39m[92m + CUDAdrv v3.0.1[39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.0/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package ve

In [0]:
using CUDAnative,CuArrays,CUDAdrv,BenchmarkTools,Test

In [55]:
N = 1024
input = rand(N,N)
output = zeros(N,N)
d_output = CuArray(output)
d_input = CuArray(input)
const tile = 16



16

In [56]:
# Kernel to find transpose of a square matrix using tiles

function transpose_tile(d_input,d_output,N)
  idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
  idy = (blockIdx().y - 1) * blockDim().y + threadIdx().y
  tidx = threadIdx().x
  tidy = threadIdx().y
  
  # Bounds check 
  if(idx <= N && idy <= N)
  
  # Initial indices(or coordinates) of the start of a tile
    initial_x = (blockIdx().x-1)*tile + 1
    initial_y = (blockIdx().y-1)*tile + 1
    
  # Final indices(or coordinates) of the start of a tile (Inversion of initial indices) 
    final_x = (blockIdx().y-1)*tile + 1
    final_y = (blockIdx().x-1)*tile + 1
    
  # Shared memory of size tile x tile  
    shared_mem = @cuStaticSharedMem(Float64,(tile,tile))
    
  # Reading from global memory in coalesced fashion (followed column major) and and assignment to shared memory  
    shared_mem[tidx,tidy] = d_input[initial_x + (tidx-1) + (initial_y + (tidy-1) - 1)*N]
    sync_threads()
  
  # Inverting the indices(transpose) and assignment to ouput matrix in a coalesced fashion 
    d_output[final_x + (tidx-1) + (final_y + (tidy-1) - 1)*N] = shared_mem[tidy,tidx]
    
  end
  
  return 
end

transpose_tile (generic function with 1 method)

In [0]:
@cuda blocks = (Int(ceil(N/tile)),Int(ceil(N/tile))) threads = (tile,tile) transpose_tile(d_input,d_output,N)

In [58]:
h_transpose_tile = Array(d_output)

1024×1024 Array{Float64,2}:
 0.874057   0.406268    0.363102  …  0.824866   0.991912  0.413938 
 0.0039426  0.869256    0.876876     0.352321   0.156918  0.743621 
 0.244711   0.428076    0.661831     0.726964   0.960881  0.694796 
 0.183667   0.695668    0.727765     0.181746   0.817321  0.466737 
 0.410631   0.791032    0.29206      0.381462   0.261083  0.32364  
 0.969526   0.855034    0.636019  …  0.425742   0.405174  0.141035 
 0.0396691  0.288112    0.334112     0.517133   0.346466  0.638165 
 0.498094   0.360247    0.631732     0.186802   0.760271  0.509262 
 0.300517   0.504865    0.28193      0.999294   0.934519  0.0082302
 0.728512   0.551689    0.34382      0.811818   0.689145  0.923175 
 0.391227   0.207524    0.789797  …  0.178633   0.347442  0.524699 
 0.463078   0.404616    0.855898     0.112136   0.379221  0.9582   
 0.713819   0.514439    0.446427     0.948149   0.675813  0.865258 
 ⋮                                ⋱                                
 0.31569    0.479621

In [0]:
serial_output = zeros(N,N);

In [60]:
# serial code for matrix transpose

function serial_transpose(serial_output,input,N)
  for i=1:N
    for j=1:N
      serial_output[i + (j-1)*N] = input[j + (i-1)*N]
    end
  end
end

serial_transpose (generic function with 1 method)

In [0]:
serial_transpose(serial_output,input,N)

In [62]:
@test serial_output == h_transpose_tile

[32m[1mTest Passed[22m[39m

In [70]:
@benchmark @cuda blocks = (Int(ceil(N/tile)),Int(ceil(N/tile))) threads = (tile,tile) transpose_tile(d_input,d_output,N)

BenchmarkTools.Trial: 
  memory estimate:  2.09 KiB
  allocs estimate:  63
  --------------
  minimum time:     9.595 μs (0.00% GC)
  median time:      766.312 μs (0.00% GC)
  mean time:        700.809 μs (0.00% GC)
  maximum time:     1.411 ms (0.00% GC)
  --------------
  samples:          7113
  evals/sample:     1

In [64]:
@benchmark serial_transpose(output,input,N)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.868 ms (0.00% GC)
  median time:      5.112 ms (0.00% GC)
  mean time:        5.149 ms (0.00% GC)
  maximum time:     9.752 ms (0.00% GC)
  --------------
  samples:          970
  evals/sample:     1

In [65]:
@btime @cuda blocks = (Int(ceil(N/tile)),Int(ceil(N/tile))) threads = (tile,tile) transpose_tile(d_input,d_output,N)

  9.611 μs (63 allocations: 2.09 KiB)


In [66]:
@btime serial_transpose(output,input,N)

  4.870 ms (0 allocations: 0 bytes)


## Launching a tile in shared memory and then using coalesced reads and writes in the matrix of about a million elements takes lesser time to compute the transpose 