This supplement covers the GPU computation for **Q1.7**. The MacBook Pro I am using has an Apple silicon GPU, and the `Metal` package for it is still under development. To show the potential of GPU computation, I ran this supplement on a Windows PC with an NVIDIA GPU.

In [1]:
# The following code was executed on a PC with an NVIDIA GeForce RTX 3080 GPU
versioninfo()

Julia Version 1.8.5
Commit 17cfb8e65e (2023-01-08 06:45 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 20 × 12th Gen Intel(R) Core(TM) i7-12700KF
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, goldmont)
  Threads: 18 on 20 virtual cores
Environment:
  JULIA_NUM_THREADS = 18


In [2]:
using BenchmarkTools, DelimitedFiles, Images, LinearAlgebra, LoopVectorization
using Profile, Random
using CUDA

# Print the CUDA runtime/driver versions, bundled libraries, toolchain, and visible devices
CUDA.versioninfo()

CUDA runtime 12.1, artifact installation
CUDA driver 12.1
NVIDIA driver 531.61.0

Libraries: 
- CUBLAS: 12.1.0
- CURAND: 10.3.2
- CUFFT: 11.0.2
- CUSOLVER: 11.4.4


- CUSPARSE: 12.0.2
- CUPTI: 18.0.0
- NVML: 12.0.0+531.61

Toolchain:
- Julia: 1.8.5
- LLVM: 13.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0, 7.1, 7.2
- Device capability support: sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86

1 device:
  0: NVIDIA GeForce RTX 3080 (sm_86, 6.770 GiB / 10.000 GiB available)


In [3]:
# Load the data matrix and the full starting values for V and W.
# Every file is parsed with an explicit Float64 element type so each result is a
# Matrix{Float64}. An untyped `readdlm` call returns a heterogeneous array when any
# field fails to parse as a number, which would only surface later as a MethodError
# when calling `nnmf`.
X = readdlm("nnmf-2429-by-361-face.txt", Float64)
V0full = readdlm("V0.txt", ' ', Float64)
W0full = readdlm("W0.txt", ' ', Float64)
println("Data loaded!")

Data loaded!


In [4]:
"""
    nnmf(X, r; maxiter=1000, tolfun=1e-4, V=rand, W=rand)

Approximate the `m × n` matrix `X` by a rank-`r` product `V * W` using multiplicative
updates, where `V` is `m × r` and `W` is `r × n`.

Each iteration rescales `V` elementwise by `(X * W') ./ (V * W * W')` and then `W`
elementwise by `(V' * X) ./ (V' * V * W)`. The objective `‖X - V * W‖²` (Frobenius norm)
is evaluated through traces as `tr(X'X) - 2tr(W'V'X) + tr(WW'V'V)`.

# Arguments
- `X::AbstractMatrix{T}`: data matrix, `T <: AbstractFloat`.
- `r::Integer`: rank of the factorization.

# Keywords
- `maxiter::Integer = 1000`: maximum number of iterations.
- `tolfun::Number = 1e-4`: iteration stops once
  `|obj - obj0| / (|obj0| + 1) < tolfun`, where `obj0` is the previous objective.
- `V`, `W`: starting values with the same element type as `X`. They default to uniform
  random matrices allocated with `similar(X, ...)`. **Both are overwritten in place.**

# Returns
`(V, W, obj, niter)`: the updated factors (the same arrays that were passed in), the
final objective value (of type `T`), and the number of iterations performed.
"""
function nnmf(
    # positional arguments
    X       :: AbstractMatrix{T},
    r       :: Integer;
    # kw arguments
    maxiter :: Integer = 1000,
    tolfun  :: Number = 1e-4,
    V       :: AbstractMatrix{T} = Random.rand!(similar(X, size(X, 1), r)),
    W       :: AbstractMatrix{T} = Random.rand!(similar(X, r, size(X, 2))),
    ) where T <: AbstractFloat
    ## initialization and preallocation
    # Use zero(T)/one(T) rather than Float64 literals so the objective keeps a single
    # concrete type for every element type of X (type-stable loop and return value).
    obj0, obj, niter = zero(T), one(T), 0
    m, n = size(X)
    XWt = similar(X, m, r)
    VtX = similar(X, r, n)
    WWt = W * transpose(W)
    VtV = similar(X, r, r)
    VWWt = similar(X, m, r)
    VtVW = similar(X, r, n)
    # `norm` treats the matrix as a flat vector, so this is the squared Frobenius norm,
    # i.e. tr(X'X). It is constant across iterations and therefore computed once.
    trXtX = norm(X, 2)^2
    # NOTE(review): only the traces of the next two buffers are used. tr(W' * VtX)
    # equals sum(W .* VtX), so the n × n product could be avoided; it is kept here so
    # the numerical results stay unchanged.
    WtVtX = similar(X, n, n)
    WWtVtV = similar(X, r, r)

    ## main loop
    # Relative change of the objective; the `+ 1` guards against a tiny denominator.
    while abs(obj - obj0) / (abs(obj0) + 1) >= tolfun && niter < maxiter
        # update V
        mul!(XWt, X, transpose(W))
        # WWt is current here: it is computed before the loop and refreshed at the end
        # of every iteration, right after W changes.
        mul!(VWWt, V, WWt)
        V .= V .* XWt ./ VWWt
        # update W (uses the V that was just updated)
        mul!(VtX, transpose(V), X)
        mul!(VtV, transpose(V), V)
        mul!(VtVW, VtV, W)
        W .= W .* VtX ./ VtVW
        # update niter and obj
        obj0 = obj
        mul!(WWt, W, transpose(W))
        mul!(WtVtX, transpose(W), VtX)
        mul!(WWtVtV, WWt, VtV)
        obj = trXtX - 2 * tr(WtVtX) + tr(WWtVtV)
        niter += 1
    end

    # Output
    return V, W, obj, niter
end

nnmf (generic function with 1 method)

In [5]:
# Convert X, W0full, V0full to single precision and transfer every variant to the GPU.
# Naming: the `sp` suffix marks a Float32 copy, the `d` suffix a device (CuArray) copy.
Xsp  = convert(Array{Float32}, X)
Xspd = CuArray(Xsp)
Xd   = CuArray(X)
V0fullsp   = convert(Array{Float32}, V0full)
V0fullspd  = CuArray(V0fullsp)
V0fulld    = CuArray(V0full)
W0fullsp   = convert(Array{Float32}, W0full)
W0fullspd  = CuArray(W0fullsp)
W0fulld    = CuArray(W0full)
# Benchmark nnmf on GPU and CPU for several ranks r.
#
# nnmf overwrites its V and W starting values, so:
#   * `setup` restores them from the full starting matrices before each sample, and
#   * `evals = 1` is set explicitly, because `setup` runs once per sample rather than
#     once per evaluation.
# The GPU runs are wrapped in `CUDA.@sync` so the timing covers all queued device work.
for r in [10, 20, 30, 40, 50]
    println("r=$r")
    # rank-r starting values: first r columns of V0, first r rows of W0
    V0spd = V0fullspd[:, 1:r]
    W0spd = W0fullspd[1:r, :]
    V0d = V0fulld[:, 1:r]
    W0d = W0fulld[1:r, :]
    V0sp = V0fullsp[:, 1:r]
    W0sp = W0fullsp[1:r, :]
    V0 = V0full[:, 1:r]
    W0 = W0full[1:r, :]
    println("GPU performance (single precision): ")
    @btime (CUDA.@sync nnmf($Xspd, $r, V = $V0spd, W = $W0spd)) evals = 1 setup = (
        copyto!($V0spd, $V0fullspd[:, 1:$r]),
        copyto!($W0spd, $W0fullspd[1:$r, :])
    )
    println("GPU performance (double precision): ")
    @btime (CUDA.@sync nnmf($Xd, $r, V = $V0d, W = $W0d)) evals = 1 setup = (
        copyto!($V0d, $V0fulld[:, 1:$r]),
        copyto!($W0d, $W0fulld[1:$r, :])
    )
    # blank line between the GPU and CPU results (the original bare `println` was
    # never called and therefore printed nothing)
    println()
    println("CPU performance (single precision): ")
    @btime nnmf($Xsp, $r, V = $V0sp, W = $W0sp) evals = 1 setup = (
        copyto!($V0sp, $V0fullsp[:, 1:$r]),
        copyto!($W0sp, $W0fullsp[1:$r, :])
    )
    println("CPU performance (double precision): ")
    @btime nnmf($X, $r, V = $V0, W = $W0) evals = 1 setup = (
        copyto!($V0, $V0full[:, 1:$r]),
        copyto!($W0, $W0full[1:$r, :])
    )
end
print("FINISH")

r=10


GPU performance (single precision): 


  33.856 ms (88712 allocations: 3.90 MiB)
GPU performance (double precision): 




  222.383 ms (101907 allocations: 4.75 MiB)
CPU performance (single precision): 


  160.860 ms (13 allocations: 729.16 KiB)
CPU performance (double precision): 


  204.625 ms (13 allocations: 1.42 MiB)
r=20
GPU performance (single precision): 


  64.787 ms (147598 allocations: 6.49 MiB)
GPU performance (double precision): 


  257.406 ms (161970 allocations: 7.43 MiB)
CPU performance (single precision): 


  559.958 ms (15 allocations: 950.84 KiB)
CPU performance (double precision): 


  577.099 ms (13 allocations: 1.86 MiB)
r=30
GPU performance (single precision): 


  74.973 ms (172932 allocations: 7.60 MiB)
GPU performance (double precision): 




  339.337 ms (204949 allocations: 9.54 MiB)
CPU performance (single precision): 


  1.180 s (15 allocations: 1.15 MiB)
CPU performance (double precision): 


  1.026 s (13 allocations: 2.29 MiB)
r=40
GPU performance (single precision): 


  105.787 ms (215135 allocations: 9.46 MiB)
GPU performance (double precision): 


  574.992 ms (247516 allocations: 11.52 MiB)
CPU performance (single precision): 


  3.311 s (15 allocations: 1.37 MiB)
CPU performance (double precision): 


  1.264 s (13 allocations: 2.73 MiB)
r=50
GPU performance (single precision): 


  135.016 ms (260235 allocations: 11.45 MiB)
GPU performance (double precision): 


  801.783 ms (297569 allocations: 13.86 MiB)
CPU performance (single precision): 


  6.293 s (16 allocations: 1.59 MiB)
CPU performance (double precision): 


  1.894 s (16 allocations: 3.18 MiB)
FINISH

Here, the `CUDA` package gives us highly efficient computation: the GPU beats the CPU in both `Float32` and `Float64` computation, except in the `Float64` case with $r=10$, where the CPU is slightly faster. It is also clear that the GPU is extremely efficient in single precision computation.