## Using CUDA

In [1]:
using DrWatson
@quickactivate "MicroGridSimWithRL"

In [2]:
using CUDA
using Test
using BenchmarkTools
using Flux
using ReinforcementLearning

In [3]:
N =10
x_d = CUDA.fill(1.0f0, N) # Float32
y_d = CUDA.fill(2.0f0, N)

10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0

In [4]:
y_d .+= x_d
y_d

10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0

In [5]:
@test all(y_d .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y_d .== 3.0f0)

In [14]:
function broadcast_add!(x, y)
        CUDA.@sync y += x
    nothing
end

fill!(y_d, 2)
x_d

10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0

In [16]:
@btime broadcast_add!($x_d, $y_d)

  69.000 μs (31 allocations: 1.78 KiB)


In [17]:
function gpu_add1!(x, y)
    for i = 1:length(x)
        @inbounds y[i] += x[i]
    end
    nothing
end

fill!(y_d, 2)

@cuda gpu_add1!(x_d, y_d)

CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuFunction(Ptr{Nothing} @0x00000000a078a0e0, CuModule(Ptr{Nothing} @0x00000000a07515f0, CuContext(0x0000000062cb8aa0, instance b4028a4f937b5eae))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [18]:
function benc_gpu1!(x, y)
    CUDA.@sync begin
        @cuda gpu_add1!(x, y)
    end
end

@btime benc_gpu1!($x_d, $y_d)

  72.500 μs (5 allocations: 304 bytes)


CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuFunction(Ptr{Nothing} @0x00000000a078a0e0, CuModule(Ptr{Nothing} @0x00000000a07515f0, CuContext(0x0000000062cb8aa0, instance b4028a4f937b5eae))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [19]:
CUDA.@profile benc_gpu1!(x_d, y_d)

│ The user is responsible for launching Julia under a CUDA profiler.
│ 
│ It is recommended to use Nsight Systems, which supports interactive profiling:
│ $ nsys launch julia
└ @ CUDA.Profile C:\Users\vikas\.julia\packages\CUDA\GGwVa\lib\cudadrv\profile.jl:82


CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuFunction(Ptr{Nothing} @0x00000000a078a0e0, CuModule(Ptr{Nothing} @0x00000000a07515f0, CuContext(0x0000000062cb8aa0, instance b4028a4f937b5eae))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [23]:
A = CuArray([1])

1-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
 1

In [27]:
for i = 1:200
    push!(A, i)
end

In [28]:
A

180046-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
   1
   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
   ⋮
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200

In [30]:
sample(A, 10)

LoadError: UndefVarError: sample not defined

In [32]:
BatchSampler{SARTS}

BatchSampler{(:state, :action, :reward, :terminal, :next_state)}

In [33]:
inds, batch = sample(p.rng, traj, BatchSampler{SARTS}(p.batch_size))

LoadError: UndefVarError: p not defined

In [34]:
 [CUDA.capability(dev) for dev in CUDA.devices()]

1-element Vector{VersionNumber}:
 v"6.1.0"

In [35]:
CUDA.device()

CuDevice(0): NVIDIA GeForce MX150

In [36]:
function add(x, y)
    x+y
end

add(2,3)

5

In [51]:
Flux.gpu(add)

nothing

In [42]:
add(1, 2)

3

In [43]:
x= cu(1)
y = cu(2)
add(x, y)

3

LoadError: UndefVarError: type not defined

In [46]:
@btime add(x,y)

  21.486 ns (0 allocations: 0 bytes)


3

In [47]:
@btime Flux.gpu(add(x,y))

  383.744 ns (3 allocations: 352 bytes)


3

In [49]:
methods(add)

In [None]:
()