## Using CUDA

In [1]:
using DrWatson
@quickactivate "MicroGridSimWithRL"

In [2]:
using CUDA
using Test
using BenchmarkTools
using Flux
using ReinforcementLearning

In [4]:
N =10
x_d = CUDA.fill(1.0f0, N) # Float32
y_d = CUDA.fill(2.0f0, N)

10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0

In [5]:
y_d .+= x_d
y_d

10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0

In [6]:
@test all(y_d .== 3.0f0)

[32m[1mTest Passed[22m[39m
  Expression: all(y_d .== 3.0f0)

In [7]:
function broadcast_add!(x, y)
        CUDA.@sync y += x
    nothing
end

fill!(y_d, 2)
x_d

10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0

In [8]:
@btime broadcast_add!($x_d, $y_d)

  78.300 μs (16 allocations: 752 bytes)


In [9]:
function gpu_add1!(x, y)
    for i = 1:length(x)
        @inbounds y[i] += x[i]
    end
    nothing
end

fill!(y_d, 2)

@cuda gpu_add1!(x_d, y_d)

CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuFunction(Ptr{Nothing} @0x0000000091e5aeb0, CuModule(Ptr{Nothing} @0x000000000b588ab0, CuContext(0x00000000918f14e0, instance d705e3f736760a58))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [10]:
function benc_gpu1!(x, y)
    CUDA.@sync begin
        @cuda gpu_add1!(x, y)
    end
end

@btime benc_gpu1!($x_d, $y_d)

  75.900 μs (5 allocations: 304 bytes)


CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuFunction(Ptr{Nothing} @0x0000000091e5aeb0, CuModule(Ptr{Nothing} @0x000000000b588ab0, CuContext(0x00000000918f14e0, instance d705e3f736760a58))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [11]:
CUDA.@profile benc_gpu1!(x_d, y_d)

│ The user is responsible for launching Julia under a CUDA profiler.
│ 
│ It is recommended to use Nsight Systems, which supports interactive profiling:
│ $ nsys launch julia
└ @ CUDA.Profile C:\Users\vikas\.julia\packages\CUDA\DfvRa\lib\cudadrv\profile.jl:82


CUDA.HostKernel{typeof(gpu_add1!), Tuple{CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1}}}(gpu_add1!, CuFunction(Ptr{Nothing} @0x0000000091e5aeb0, CuModule(Ptr{Nothing} @0x000000000b588ab0, CuContext(0x00000000918f14e0, instance d705e3f736760a58))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [12]:
A = CuArray([1])

1-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
 1

In [13]:
for i = 1:200
    push!(A, i)
end

ErrorException: Scalar indexing is disallowed.
Invocation of setindex! resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore are only permitted from the REPL for prototyping purposes.
If you did intend to index this array, annotate the caller with @allowscalar.

In [14]:
A

2-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
                   1
 4629700418014806016

In [30]:
sample(A, 10)

LoadError: UndefVarError: sample not defined

In [32]:
BatchSampler{SARTS}

BatchSampler{(:state, :action, :reward, :terminal, :next_state)}

In [33]:
inds, batch = sample(p.rng, traj, BatchSampler{SARTS}(p.batch_size))

LoadError: UndefVarError: p not defined

In [34]:
 [CUDA.capability(dev) for dev in CUDA.devices()]

1-element Vector{VersionNumber}:
 v"6.1.0"

In [35]:
CUDA.device()

CuDevice(0): NVIDIA GeForce MX150

In [36]:
function add(x, y)
    x+y
end

add(2,3)

5

In [51]:
function Flux.gpu(add)
    

nothing

In [42]:
add(1, 2)

3

In [43]:
x= cu(1)
y = cu(2)
add(x, y)

3

LoadError: UndefVarError: type not defined

In [46]:
@btime add(x,y)

  21.486 ns (0 allocations: 0 bytes)


3

In [1]:
@btime Flux.gpu(add($x,$y))

LoadError: LoadError: UndefVarError: @btime not defined
in expression starting at t:\DARE\Julia\DARE\dare\pre_investigations\julia\notebooks\Experiments with CUDA.ipynb:1

In [49]:
methods(add)

In [3]:
CUDA.versioninfo()

CUDA toolkit 11.7, artifact installation
Unknown NVIDIA driver, for CUDA 11.7
CUDA driver 11.7

Libraries: 
- CUBLAS: 11.10.1
- CURAND: 10.2.10
- CUFFT: 10.7.1
- CUSOLVER: 11.3.5
- CUSPARSE: 11.7.3
- CUPTI: 17.0.0
- NVML: missing
- CUDNN: 8.30.2 (for CUDA 11.5.0)
- CUTENSOR: 1.4.0 (for CUDA 11.5.0)

Toolchain:
- Julia: 1.7.2
- LLVM: 12.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80

Environment:
- JULIA_CUDA_NSYS: C:\Program Files\NVIDIA Corporation\Nsight Systems 2022.1.3\target-windows-x64\nsys.exe

1 device:
  0: NVIDIA GeForce MX150 (sm_61, 1.623 GiB / 2.000 GiB available)


In [19]:
# Kernel Programming
function vadd(c, a, b)
    i = threadIdx().x
    c[i] = a[i] + b[i]
    return
end

a = CuArray(1:10)
b = CuArray(1:10)
c = similar(b)
print(c)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [20]:
@cuda threads=length(a) vadd(c, a, b)

CUDA.HostKernel{typeof(vadd), Tuple{CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}}}(vadd, CuFunction(Ptr{Nothing} @0x0000000091e5eb10, CuModule(Ptr{Nothing} @0x00000000a017eaa0, CuContext(0x00000000918f14e0, instance d705e3f736760a58))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [21]:
# using threads

a = CuArray(1:100_000)
b = CuArray(1:100_000)
c = similar(a)
@cuda threads=length(a) vadd(c, a, b)

CuError: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)

In [22]:
max_threads = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
print(max_threads)

1024

In [23]:
blocks_needed = cld(length(a),1024)

98

In [24]:
function vadd(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(a)
        c[i] = a[i] + b[i]
    end
    return
end
@cuda threads=1024 blocks=blocks_needed vadd(c, a, b)
c

100000-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
      2
      4
      6
      8
     10
     12
     14
     16
     18
     20
      ⋮
 199984
 199986
 199988
 199990
 199992
 199994
 199996
 199998
 200000

In [25]:
# Occupancy API - looks up how much actual hardware resources are needed 
# kernel programming
kernel = @cuda launch=false vadd(c, a, b)
kernel


CUDA.HostKernel{typeof(vadd), Tuple{CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}}}(vadd, CuFunction(Ptr{Nothing} @0x0000000091e5de20, CuModule(Ptr{Nothing} @0x00000000a017f590, CuContext(0x00000000918f14e0, instance d705e3f736760a58))), CUDA.KernelState(Ptr{Nothing} @0x0000000304000000))

In [26]:
config = CUDA.launch_configuration(kernel.fun)

(blocks = 6, threads = 1024)

In [None]:
@show threads = min(length(a), config.threads)
@show blocks = cld(length(a), threads)
kernel(c, a, b; threads, blocks)

In [28]:
using Distributed
addprocs(length(devices()))
@everywhere using CUDA

# assign devices
asyncmap((zip(workers(), devices()))) do (p, d)
    remotecall_wait(p) do
        @info "Worker $p uses $d"
        device!(d)
    end
end

CompositeException: On worker 2:
ArgumentError: Package CUDA not found in current path:
- Run `import Pkg; Pkg.add("CUDA")` to install the CUDA package.

Stacktrace:
 [1] require
   @ .\loading.jl:967
 [2] eval
   @ .\boot.jl:373
 [3] #103
   @ C:\Users\vikas\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\Distributed\src\process_messages.jl:274
 [4] run_work_thunk
   @ C:\Users\vikas\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\Distributed\src\process_messages.jl:63
 [5] run_work_thunk
   @ C:\Users\vikas\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\Distributed\src\process_messages.jl:72
 [6] #96
   @ .\task.jl:423

...and 1 more exception.


In [29]:
devices()

CUDA.DeviceIterator() for 1 devices:
0. NVIDIA GeForce MX150