## GPU Friendliness for RL pipeline

In [1]:
using DrWatson
@quickactivate "MicroGridSimWithRL"

In [2]:
using CUDA
using TimerOutputs
using BenchmarkTools

In [3]:
x_large = 1:100_000
x_small = 1:1000

1:1000

In [4]:
timer_large = TimerOutput();
timer_small = TimerOutput();

In [5]:
# const timer1 = TimerOutput()
a = 1:100_000


function compute_sin(x, timer::TimerOutput)
    @timeit timer "inside" begin
        sin(x)
    end
end


function loop_through(a, timer::TimerOutput)
    temp = 0
    @timeit timer "outside" begin
    for i = 1:length(a)
            temp += compute_sin(i, timer) 
        end
    end
    temp
end

const timer1 = TimerOutput()
reset_timer!(timer1)

loop_through(a, timer1)
timer1

[0m[1m ─────────────────────────────────────────────────────────────────────[22m
[0m[1m                    [22m         Time                    Allocations      
                    ───────────────────────   ────────────────────────
  Tot / % measured:      7.73s /   0.8%           83.8MiB /   0.0%    

 Section    ncalls     time    %tot     avg     alloc    %tot      avg
 ─────────────────────────────────────────────────────────────────────
 outside         1   62.4ms  100.0%  62.4ms      752B  100.0%     752B
   inside     100k   27.5ms   44.0%   275ns     0.00B    0.0%    0.00B
[0m[1m ─────────────────────────────────────────────────────────────────────[22m

In [6]:
reset_timer!(timer1)
@benchmark loop_through($a, $timer1)

BenchmarkTools.Trial: 46 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 52.296 ms[22m[39m … [35m468.519 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m105.779 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m118.016 ms[22m[39m ± [32m 68.563 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m▂[39m [39m [34m [39m[39m [39m█[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▅[39m▁[39m▃[39m▃

In [9]:
temp = 0.0
print(size(temp))

()

In [9]:
function compute_sin(x)
    sin.(x)
end

compute_sin (generic function with 2 methods)

In [10]:
@timeit timer_large "Large array" begin
    compute_sin(x_large)
end
timer_large

[0m[1m ────────────────────────────────────────────────────────────────────────[22m
[0m[1m                       [22m         Time                    Allocations      
                       ───────────────────────   ────────────────────────
   Tot / % measured:         316s /   0.0%           0.94GiB /   0.8%    

 Section       ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────
 Large array        1   90.6ms  100.0%  90.6ms   8.13MiB  100.0%  8.13MiB
[0m[1m ────────────────────────────────────────────────────────────────────────[22m

In [28]:
to = TimerOutput()

[0m[1m ────────────────────────────────────────────────────────────────────[22m
[0m[1m                   [22m         Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      403μs /   0.0%           3.89KiB /   0.0%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
[0m[1m ────────────────────────────────────────────────────────────────────[22m

In [29]:
to

[0m[1m ────────────────────────────────────────────────────────────────────[22m
[0m[1m                   [22m         Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      8.07s /   0.0%           80.0KiB /   0.0%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
[0m[1m ────────────────────────────────────────────────────────────────────[22m

In [30]:
to

[0m[1m ────────────────────────────────────────────────────────────────────[22m
[0m[1m                   [22m         Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      17.5s /   0.0%            156KiB /   0.0%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
[0m[1m ────────────────────────────────────────────────────────────────────[22m

In [31]:
@timeit to "compute" begin
    sin.(x_large)
end

100000-element Vector{Float64}:
  0.8414709848078965
  0.9092974268256817
  0.1411200080598672
 -0.7568024953079282
 -0.9589242746631385
 -0.27941549819892586
  0.6569865987187891
  0.9893582466233818
  0.4121184852417566
 -0.5440211108893698
 -0.9999902065507035
 -0.5365729180004349
  0.4201670368266409
  ⋮
 -0.9991928068655559
 -0.573669175218859
  0.37928325051310907
  0.9835244048776512
  0.6835177571528606
 -0.24491196429463183
 -0.9481707552390307
 -0.7796857265301358
  0.10563876344561439
  0.8938394614875846
  0.860248280789742
  0.03574879797201651

In [32]:
to

[0m[1m ────────────────────────────────────────────────────────────────────[22m
[0m[1m                   [22m         Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      27.5s /   0.1%            109MiB /   0.7%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
 compute        1   30.6ms  100.0%  30.6ms    816KiB  100.0%   816KiB
[0m[1m ────────────────────────────────────────────────────────────────────[22m

In [11]:
@timeit timer_small "Smaller array" begin
    compute_sin(x_small)
end

timer_small

[0m[1m ──────────────────────────────────────────────────────────────────────────[22m
[0m[1m                         [22m         Time                    Allocations      
                         ───────────────────────   ────────────────────────
    Tot / % measured:          316s /   0.0%           0.94GiB /   0.0%    

 Section         ncalls     time    %tot     avg     alloc    %tot      avg
 ──────────────────────────────────────────────────────────────────────────
 Smaller array        1   49.8μs  100.0%  49.8μs   8.11KiB  100.0%  8.11KiB
[0m[1m ──────────────────────────────────────────────────────────────────────────[22m

In [12]:
@benchmark begin compute_sin(x_large); end

BenchmarkTools.Trial: 2032 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.410 ms[22m[39m … [35m 10.932 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 78.65%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.253 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.438 ms[22m[39m ± [32m861.726 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m3.89% ±  9.15%

  [39m [39m [39m [39m [39m▄[39m█[39m█[39m█[39m▇[34m▅[39m[39m▂[32m▂[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▂[39m▃[39m▇[39m█[39m█[

In [13]:
@btime compute_sin(x_large);

  1.407 ms (2 allocations: 781.30 KiB)


In [14]:
# @benchmark begin sin.(x_large); end

In [15]:
@btime sin.(x_large);

  1.428 ms (6 allocations: 781.42 KiB)


In [16]:
@btime compute_sin(x_small);

  12.600 μs (1 allocation: 7.94 KiB)


In [17]:
@btime sin.(x_small);

  13.000 μs (5 allocations: 8.06 KiB)


## Memory copy

In [18]:
using CUDA

function copy!(A, B)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    A[ix,iy] = B[ix,iy]
    return
end

threads = (4, 3)
blocks  = (2, 2)
nx, ny  = threads[1]*blocks[1], threads[2]*blocks[2]
A       = CUDA.zeros(Float64, nx, ny)
B       =  CUDA.rand(Float64, nx, ny)

@cuda blocks=blocks threads=threads copy!(A, B)
synchronize()

In [19]:
CUDA.device()

CuDevice(0): NVIDIA GeForce MX150

In [19]:
collect(devices())

1-element Vector{CuDevice}:
 CuDevice(0): NVIDIA GeForce MX150

In [20]:
nx = ny = 32
A = CUDA.zeros(Float64, nx, ny);
B = CUDA.rand(Float64, nx, ny);
@benchmark begin copyto!($A, $B); synchronize() end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m56.100 μs[22m[39m … [35m392.200 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m67.800 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m71.810 μs[22m[39m ± [32m 15.928 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m▅[39m█[34m▆[39m[39m▄[32m▂[39m[39m▂[39m▁[39m▁[39m▁[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m▃[39m▁[39m▁[39m▁[39m▃

In [21]:
t_it = @belapsed begin copyto!($A, $B); synchronize() end

6.02e-5

In [22]:
T_tot = 2*1/1e9*nx*ny*sizeof(Float64)/t_it

0.2721594684385382

In [23]:
array_sizes = []
throughputs = []
for pow = 0:11
    nx = ny = 32*2^pow
    if (3*nx*ny*sizeof(Float64) > CUDA.available_memory()) break; end
    A = CUDA.zeros(Float64, nx, ny);
    B = CUDA.rand(Float64, nx, ny);
    t_it = @belapsed begin copyto!($A, $B); synchronize() end
    T_tot = 2*1/1e9*nx*ny*sizeof(Float64)/t_it
    push!(array_sizes, nx)
    push!(throughputs, T_tot)
    println("(nx=ny=$nx) T_tot = $(T_tot)")
    CUDA.unsafe_free!(A)
    CUDA.unsafe_free!(B)
end

(nx=ny=32) T_tot = 0.26256410256410256
(nx=ny=64) T_tot = 1.0469009584664537
(nx=ny=128) T_tot = 3.260497512437811
(nx=ny=256) T_tot = 15.673781763826607
(nx=ny=512) T_tot = 29.662687411598306
(nx=ny=1024) T_tot = 37.18354609929078
(nx=ny=2048) T_tot = 36.08585470774857
(nx=ny=4096) T_tot = 39.70116484751679


In [24]:
T_tot_max, index = findmax(throughputs)
nx = ny = array_sizes[index]
A = CUDA.zeros(Float64, nx, ny);
B = CUDA.rand(Float64, nx, ny);

In [25]:
nx

4096

## Basics of applications' performance

1. Memory-bound : memory access speed - memory access paek performnace : GB/s
2. Compute-bound : speed of computation - computation peak performance : GFLOPS/s (G Floating point operations)

Generally, MB << CB

So to quantify the limitation:
        X = (CB / MB) * (sizeof(Number))

X floating point operations  / Number

## (GPU) Memory copy

In [22]:
nx = ny = 32
A = CUDA.zeros(Float64, nx, ny);
B = CUDA.rand(Float64, nx, ny);

@benchmark begin copyto!($A, $B); synchronize() end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 56.600 μs[22m[39m … [35m 1.992 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m 86.400 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m108.410 μs[22m[39m ± [32m76.241 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m▇[39m█[39m▇[39m▆[39m▆[34m▆[39m[39m▆[39m▅[39m▅[32m▅[39m[39m▅[39m▄[39m▄[39m▄[39m▄[39m▃[39m▃[39m▂[39m▂[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m▇[39m█[39m█[39m█[39m█

In [23]:
@inbounds memory_copy_GPU!(A, B) = (A .= B)
# inbounds macro deactivates array bounds checking 
@benchmark begin memory_copy_GPU!($A, $B); synchronize() end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 70.200 μs[22m[39m … [35m 1.401 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m 98.550 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m114.883 μs[22m[39m ± [32m55.320 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m▆[39m█[39m▁[39m▂[39m [34m [39m[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▁[39m▃[39m█[39m█[39m█

In [24]:
nx = ny = 2048
A = CUDA.zeros(Float64, nx, ny);
B = CUDA.rand(Float64, nx, ny);

@benchmark begin copyto!($A, $B); synchronize() end

BenchmarkTools.Trial: 2139 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.866 ms[22m[39m … [35m16.025 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.176 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.307 ms[22m[39m ± [32m 1.137 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m▂[39m▆[39m▆[39m▇[39m█[39m▆[39m▅[39m▆[34m▃[39m[39m▂[39m▂[39m▂[39m▄[39m▅[32m▂[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m▂[39m▂[39m▃[39m▅[39m▇[39m█[39m█

In [25]:
@benchmark begin memory_copy_GPU!($A, $B); synchronize() end

BenchmarkTools.Trial: 2087 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.981 ms[22m[39m … [35m16.005 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.230 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.366 ms[22m[39m ± [32m 1.173 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m▁[39m▂[39m▅[39m▇[39m▆[39m█[39m█[39m▅[39m▃[34m▃[39m[39m▁[39m▃[39m▄[39m▄[39m▆[39m▂[32m▁[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▄[39m▇[39m█[39m█[39m█[39m█[39m█

In [26]:
t_it = @belapsed begin memory_copy_GPU!($A, $B); synchronize() end
# T_tot = 2*1/1e9*nx*ny*sizeof(Float64)/t_it

0.00198

In [27]:
# GPU kernel 
@inbounds function memory_copy_GPU_K!(A, B)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    A[ix,iy] = B[ix,iy]
    return nothing
end

memory_copy_GPU_K! (generic function with 1 method)

In [67]:
threads = (1, 1)
blocks  = (nx÷threads[1], ny)
@benchmark begin @cuda blocks=$blocks threads=$threads memory_copy_GPU_K!($A, $B); synchronize() end

BenchmarkTools.Trial: 38 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m129.900 ms[22m[39m … [35m255.613 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m130.702 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m133.991 ms[22m[39m ± [32m 20.276 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [34m█[39m[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [34m█[39m[39m▁[32m▅

In [69]:
threads = (32, 1)
blocks  = (nx÷threads[1], ny)
@benchmark begin @cuda blocks=$blocks threads=$threads memory_copy_GPU_K!($A, $B); synchronize() end

BenchmarkTools.Trial: 641 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m6.829 ms[22m[39m … [35m60.380 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m7.639 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m7.756 ms[22m[39m ± [32m 3.365 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [34m [39m[39m [39m▂[39m▁[39m▅[32m▅[39m[39m▆[39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▄[39m▄[39m▃[39m▁[39m▄[39m▄[39m▄[

In [58]:
t_it = @belapsed begin @cuda blocks=$blocks threads=$threads memory_copy_GPU_K!($A, $B); synchronize() end

6.47e-5

In [56]:
threads = (32, 1)
blocks  = (nx÷threads[1], ny)
@benchmark begin @cuda blocks=$blocks threads=$threads memory_copy_GPU_K!($A, $B); synchronize() end

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m61.600 μs[22m[39m … [35m 1.291 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m70.600 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m73.625 μs[22m[39m ± [32m17.662 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m▃[39m▄[39m█[34m▇[39m[39m▆[39m▄[32m▃[39m[39m▂[39m▁[39m▂[39m▂[39m▂[39m▁[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m▃[39m▁[39m▁[39m▃[39m▄[39m▇[3

In [60]:
t_it = @belapsed begin @cuda blocks=$blocks threads=$threads memory_copy_GPU_K!($A, $B); synchronize() end
T_tot = 2*1/1e9*nx*ny*sizeof(Float64)/t_it

0.26815057283142396