# CUDA Matrix Multiplication

In [None]:
"""
    CPU sequential matrix multiplication

"""
function MM!(c, a, b, N)
    for i in range(1,N)
        for j in range(1,N)
            temp = zero(eltype(c))
            for k in range(1,N)
                temp += a[i,k]*b[k,j]
            end
            c[i,j] = temp
        end
    end
end

In [None]:
using Base.Threads

In [None]:
"""

    CPU multi-threading matrix multiplication

"""
function threads_MM!(c, a, b, N)
    @threads for i in range(1,N)
        for j in range(1,N)
            temp = zero(eltype(c))
            for k in range(1,N)
                temp += a[i,k]*b[k,j]
            end
            c[i,j] = temp
        end
    end
end

In [None]:
using CUDA

In [None]:
"""

    CUDA matrix multiplication

"""
function CUDA_MM_v1!(c, a, b, N)
    
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x # row index
    j = (blockIdx().y-1) * blockDim().y + threadIdx().y # column index
    
    if i > N || j > N return nothing end
    
    temp = zero(eltype(c))
    for k in range(1,N)
         temp += a[i,k]*b[k,j]
    end    
    c[i,j] = temp

    return nothing
end

In [None]:
"""

    TODO: CUDA matrix multiplication Using shared memory

"""
function CUDA_MM_v2!(c, a, b, N)
    
   

    return nothing
end

# Timing

In [None]:
using BenchmarkTools

In [None]:
# Problem size and host-side matrices (all n×n, Float32):
# x and y are filled with ones, z is a zero-filled output buffer.
n = 1600
x = ones(Float32, n, n)
y = ones(Float32, n, n)
z = zeros(Float32, n, n);

In [None]:
# Time one run of the sequential CPU version; the product is written into z.
# NOTE: if this is the first call of MM! in the session, the reported time
# presumably also includes compilation; re-run the cell for a steady-state figure.
@time "sequential time: " MM!(z, x, y, n)

In [None]:
# Time one run of the multi-threaded CPU version; the product is written into z.
# NOTE: if this is the first call of threads_MM! in the session, the reported time
# presumably also includes compilation; re-run the cell for a steady-state figure.
@time "multi_threading time: " threads_MM!(z, x, y, n)

In [None]:
@time "CUDA time:" begin
    # Copy the inputs to the device; dz is the device-side output buffer
    dx, dy = CuArray(x), CuArray(y)
    dz = similar(dx)

    # One thread per output entry: 16×16 threads per block, and enough blocks
    # along each grid dimension (x → rows, y → columns) to cover n entries
    threads = (16, 16)
    blocks = cld.(n, threads)

    # Launch the kernel and block until it has finished, so its run time is
    # part of the measurement
    CUDA.@sync @cuda threads=threads blocks=blocks CUDA_MM_v1!(dz, dx, dy, n)

    # Copy the result back into host memory
    d2h_z = Matrix(dz);

end

In [None]:
# Compare the CPU result held in z with the GPU result copied back to the host.
# `≈` is an approximate comparison, so small rounding differences are tolerated.
z ≈ d2h_z