In [1]:
import numpy as np
from numba import cuda, jit, float32
import time
import math

In [2]:
def matrix_multiply_cpu(A, B):
    cpu_start=time.time()
   
    C=np.dot(A, B)
    cpu_time=time.time()-cpu_start
    return C, cpu_time

In [3]:

TPB = 32  # Tile size and threads per block

@cuda.jit
def tiled_matrix_mult(A, B, C):
    # Shared memory for tiles
    tile_A = cuda.shared.array((TPB, TPB), dtype=float32)
    tile_B = cuda.shared.array((TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)
    tx, ty = cuda.threadIdx.x, cuda.threadIdx.y
    bx, by = cuda.blockIdx.x, cuda.blockIdx.y

    row, col = y, x
    if row >= C.shape[0] or col >= C.shape[1]:
        return

    sum = 0.0
    for i in range((A.shape[1] + TPB - 1) // TPB):
        # Load in bounds tiles into shared memory
        if i * TPB + tx < A.shape[1] and row < A.shape[0]:
            tile_A[ty, tx] = A[row, i * TPB + tx]
        else:
            tile_A[ty, tx] = 0

        if i * TPB + ty < B.shape[0] and col < B.shape[1]:
            tile_B[ty, tx] = B[i * TPB + ty, col]
        else:
            tile_B[ty, tx] = 0

        cuda.syncthreads()

        for k in range(TPB):
            sum += tile_A[ty, k] * tile_B[k, tx]

        cuda.syncthreads()

    if row < C.shape[0] and col < C.shape[1]:
        C[row, col] = sum

def matrix_multiply_gpu(A, B):
    A_device = cuda.to_device(A)
    B_device = cuda.to_device(B)
    C_device = cuda.device_array((A.shape[0], B.shape[1]), dtype=np.float32)

    grid_dims = ((A.shape[0] + TPB - 1) // TPB, (B.shape[1] + TPB - 1) // TPB)
    block_dims = (TPB, TPB)

    start_time = time.time()
    tiled_matrix_mult[grid_dims, block_dims](A_device, B_device, C_device)
    cuda.synchronize()
    gpu_time = time.time() - start_time

    return C_device.copy_to_host(), gpu_time

In [4]:
def random_matrix(n):
    A = np.random.default_rng().standard_normal(size=(n,n), dtype='float32')
    B = np.random.default_rng().standard_normal(size=(n,n), dtype='float32')
    return A, B

In [5]:
A, B = random_matrix(10000)
print(A.shape)
print(B.shape)


    # Perform matrix multiplication on the CPU
cpu_start=time.time()
total_kernel_time=0.0
C_cpu, cpu_time = matrix_multiply_cpu(A, B)
total_cpu_time=time.time()-cpu_start
gpu_start=time.time()
C, gpu_time = matrix_multiply_gpu(A, B)
total_gpu_time=time.time()-gpu_start

print(f"CPU time: {total_cpu_time}s, GPU time: {total_gpu_time}s")

(10000, 10000)
(10000, 10000)
CPU time: 2.3269824981689453s, GPU time: 11.393534421920776s
