In [10]:
!pip install pycuda
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule



In [11]:
# Compile the CUDA source. The kernel must be launched with
# TILE_DIM x TILE_DIM thread blocks and a grid that covers every element of C.
modd = SourceModule("""
#define TILE_DIM 32

// Tiled matrix product C = A * B using shared memory.
// All matrices are stored row-major:
//   A is N x M, B is M x K, C is N x K.
// N, M and K do not have to be multiples of TILE_DIM: elements of a tile that
// fall outside the matrices are loaded as zero and out-of-range threads do
// not write to C.
__global__ void matrix_multiplication_shared_memory(const double* A, const double* B, double* C, int N, int M, int K)
{
    // Allocate the sub-matrices in shared memory. Note the two-dim indexing.
    __shared__ double sub_A[TILE_DIM][TILE_DIM];
    __shared__ double sub_B[TILE_DIM][TILE_DIM];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Each block gets its own TILE_DIM sized slot in the x and y directions.
    const int row = blockIdx.y * TILE_DIM + ty;
    const int col = blockIdx.x * TILE_DIM + tx;

    double result = 0.0;

    // Round up so that a trailing partial tile along M is not skipped.
    const int num_tiles = (M + TILE_DIM - 1) / TILE_DIM;

    for (int i = 0; i < num_tiles; i++)
    {
        const int a_col = i * TILE_DIM + tx;
        const int b_row = i * TILE_DIM + ty;

        // Copy one element of each tile; pad with zeros outside the matrices
        // so the padded entries contribute nothing to the dot product.
        sub_A[ty][tx] = (row < N && a_col < M) ? A[row * M + a_col] : 0.0;
        sub_B[ty][tx] = (b_row < M && col < K) ? B[b_row * K + col] : 0.0;

        // Make sure that all threads have completed the memory transaction.
        __syncthreads();

        // Multiply the matrix elements inside the tile and add them to the result.
        for (int j = 0; j < TILE_DIM; j++)
        {
            result += sub_A[ty][j] * sub_B[j][tx];
        }

        // Make sure that all of the threads have finished the calculation
        // before the tiles are overwritten in the next iteration.
        __syncthreads();
    }

    // Write back to the global memory using the global index. The guard comes
    // after the loop so that every thread of the block reaches __syncthreads().
    if (row < N && col < K)
    {
        C[row * K + col] = result;
    }
}
  """)

In [12]:
# Set up tests.
import math
import numpy as np


# Problem dimensions: A is (A_n_rows x A_n_cols), B is (B_n_rows x B_n_cols).
A_n_rows, A_n_cols = 2048, 1024
B_n_rows, B_n_cols = 1024, 512

# Element type of the host arrays (Python float is double precision).
value_type = float

# The product takes its row count from A and its column count from B.
C_n_rows, C_n_cols = A_n_rows, B_n_cols

# Launch geometry: square thread blocks, with enough blocks along x to span
# the columns of C and enough along y to span its rows.
numThreadsPerBlock = 32
numBlocksx, numBlocksy = (
    math.ceil(extent / numThreadsPerBlock) for extent in (C_n_cols, C_n_rows)
)




In [13]:

# Create the input matrices.

# The kernel reads A as (A_n_rows x A_n_cols) and B as (A_n_cols x B_n_cols),
# so the inner dimensions have to agree before anything is sent to the GPU.
assert A_n_cols == B_n_rows, "inner dimensions of A and B must match"

A = np.random.randn(A_n_rows, A_n_cols).astype(value_type)
B = np.random.randn(B_n_rows, B_n_cols).astype(value_type)


# Allocate the memory on the GPU and copy the matrices.

A_gpu = cuda.mem_alloc(A.nbytes)
cuda.memcpy_htod(A_gpu, A)

B_gpu = cuda.mem_alloc(B.nbytes)
cuda.memcpy_htod(B_gpu, B)

# Host buffer that receives the result, and its device counterpart.
C = np.zeros((C_n_rows, C_n_cols), dtype=value_type)
C_gpu = cuda.mem_alloc(C.nbytes)


# Call the CUDA kernel.
# The scalar arguments follow the kernel signature (N, M, K):
# rows of A, columns of A, columns of B.

matrix_multiplication = modd.get_function("matrix_multiplication_shared_memory")
try:
    matrix_multiplication(A_gpu, B_gpu, C_gpu,
                          np.int32(A_n_rows), np.int32(A_n_cols), np.int32(B_n_cols),
                          block=(numThreadsPerBlock, numThreadsPerBlock, 1),
                          grid=(numBlocksx, numBlocksy, 1))

    # Copy the result back to the host.
    cuda.memcpy_dtoh(C, C_gpu)
finally:
    # Release the device buffers explicitly. Only these allocations are torn
    # down; imports and the compiled module stay available for re-runs.
    A_gpu.free()
    B_gpu.free()
    C_gpu.free()

# Do same calculation in CPU.

C_cpu = np.dot(A, B)

# Verify the result. This is the last expression of the cell, so the notebook
# displays it.

np.allclose(C_cpu, C, rtol=0.001, atol=0.001)


True