<a href="https://colab.research.google.com/github/vcoopman/study/blob/main/CUDA/CudaMatrixMultiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!nvidia-smi

Mon Jul 15 05:39:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [19]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [12]:
%%file /tmp/cuda_matrix_multiplication.cu
/*******************************************************************************
 * Matrix Multiplication using Compute Unified Device Architecture (CUDA)
 *
 * This is a toy project used to study the use of CUDA.
 *
 *******************************************************************************
 *
 * who       when        what
 * --------  ----------  ----------------------------------------------
 * vcoopman  2024-07-13  created
 */

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>

#define N 4

/*
 * CUDA kernel: computes mat_c = mat_a * mat_b for square n x n int matrices
 * stored in row-major order.
 *
 * Launch layout: any 2D grid/block configuration is accepted. The x dimension
 * walks the columns and the y dimension walks the rows; every thread then
 * strides across the matrix by the total extent of the grid, so a launch with
 * fewer than n x n threads still produces every output element, and surplus
 * threads simply do nothing.
 *
 * Parameters:
 *   mat_a, mat_b  device pointers to the n * n input matrices (only read).
 *   mat_c         device pointer to the n * n output matrix (every element is
 *                 overwritten). It is expected not to overlap mat_a or mat_b,
 *                 because inputs are read while other threads write outputs.
 *   n             number of rows/columns; n <= 0 results in no work.
 *
 * No shared memory is used.
 */
__global__ void matrix_multiplication(int *mat_a, int *mat_b, int *mat_c, int n) {
    // First element owned by this thread and the distance to its next one.
    const int row_start = blockIdx.y * blockDim.y + threadIdx.y;
    const int col_start = blockIdx.x * blockDim.x + threadIdx.x;
    const int row_stride = gridDim.y * blockDim.y;
    const int col_stride = gridDim.x * blockDim.x;

    for (int row = row_start; row < n; row += row_stride) {
        // Offsets are computed in size_t so row * n cannot overflow int.
        const size_t row_offset = (size_t)row * n;

        // Threads with adjacent threadIdx.x handle adjacent columns, so their
        // reads of mat_b and writes to mat_c hit consecutive addresses.
        for (int col = col_start; col < n; col += col_stride) {
            int sum = 0;
            for (int k = 0; k < n; k++)
                sum += mat_a[row_offset + k] * mat_b[(size_t)k * n + col];
            mat_c[row_offset + col] = sum;
        }
    }
}

// Report a failed CUDA runtime call with its location and exit.
// Without this, a failed allocation, copy or launch goes unnoticed and the
// program prints whatever happens to be in the result buffer.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cuda_check_err_));                     \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Print a title, a separator, and then the n x n row-major matrix, one row per line.
static void print_matrix(const char *title, const int *mat, int n) {
    printf("%s\n", title);
    printf("-----\n");
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++)
            printf("%d ", mat[i * n + j]);
        printf("\n");
    }
}

/*
 * Entry point: fills two N x N matrices with random digits, multiplies them on
 * the GPU with matrix_multiplication, and prints the inputs and the product.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main() {
    std::srand(static_cast<unsigned>(std::time(0)));

    // This example covers square matrices.
    const int n = N;
    if (n <= 0) {
        fprintf(stderr, "Matrix dimension must be positive (got %d)\n", n);
        return EXIT_FAILURE;
    }

    // Byte count is kept in size_t so n * n * sizeof(int) cannot overflow int.
    const size_t size = static_cast<size_t>(n) * n * sizeof(int);

    // Allocate memory for host matrices.
    int *mat_a = (int *)malloc(size);
    int *mat_b = (int *)malloc(size);
    int *mat_c = (int *)malloc(size);
    if (mat_a == NULL || mat_b == NULL || mat_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(mat_a);
        free(mat_b);
        free(mat_c);
        return EXIT_FAILURE;
    }

    // Initialize the input matrices with random integers in [0, 9].
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++) {
            mat_a[i * n + j] = std::rand() % 10;
            mat_b[i * n + j] = std::rand() % 10;
        }

    // Display the input matrices.
    print_matrix("Mat A", mat_a, n);
    print_matrix("Mat B", mat_b, n);

    // Allocate memory for the device matrices.
    int *d_mat_a = NULL, *d_mat_b = NULL, *d_mat_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_mat_a, size));
    CUDA_CHECK(cudaMalloc((void **)&d_mat_b, size));
    CUDA_CHECK(cudaMalloc((void **)&d_mat_c, size));

    // Copy input matrices (mat_a, mat_b) to device memory.
    CUDA_CHECK(cudaMemcpy(d_mat_a, mat_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_mat_b, mat_b, size, cudaMemcpyHostToDevice));

    // Each element of the result is computed by its own thread. A block is
    // limited to 1024 threads, so a single N x N block stops working once
    // N > 32; use fixed 16 x 16 tiles and enough blocks to cover the matrix.
    const int tile = 16;
    const unsigned int blocks = (n + tile - 1) / tile;  // ceil(n / tile)
    dim3 blockSize(tile, tile);
    dim3 gridSize(blocks, blocks);

    // Execute kernel.
    matrix_multiplication<<<gridSize, blockSize>>>(d_mat_a, d_mat_b, d_mat_c, n);
    CUDA_CHECK(cudaGetLastError());        // invalid launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());   // faults raised while the kernel ran

    // Copy result from device to host.
    CUDA_CHECK(cudaMemcpy(mat_c, d_mat_c, size, cudaMemcpyDeviceToHost));

    // Display result.
    print_matrix("Mat C", mat_c, n);

    // Free allocated memory in host and device.
    CUDA_CHECK(cudaFree(d_mat_a));
    CUDA_CHECK(cudaFree(d_mat_b));
    CUDA_CHECK(cudaFree(d_mat_c));
    free(mat_a);
    free(mat_b);
    free(mat_c);

    return 0;

}

Overwriting /tmp/cuda_matrix_multiplication.cu


In [13]:
! nvcc /tmp/cuda_matrix_multiplication.cu -o /tmp/cuda_matrix_multiplication && /tmp/cuda_matrix_multiplication

Mat A
-----
8 3 7 8 
8 7 0 8 
1 3 9 8 
9 8 8 1 
Mat B
-----
2 4 9 1 
4 4 0 8 
8 7 5 7 
7 3 6 6 
Mat C
-----
140 117 155 129 
100 84 120 112 
142 103 102 136 
121 127 127 135 


In [17]:
# Profile the app.
!nvprof /tmp/cuda_matrix_multiplication

Mat A
-----
4 9 1 1 
0 4 1 3 
4 1 2 9 
9 7 3 3 
Mat B
-----
0 1 1 4 
5 1 9 8 
1 0 5 4 
3 4 5 8 
==7971== NVPROF is profiling process 7971, command: /tmp/cuda_matrix_multiplication
Mat C
-----
49 17 95 100 
30 16 56 60 
34 41 68 104 
47 28 102 128 
==7971== Profiling application: /tmp/cuda_matrix_multiplication
==7971== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   50.40%  4.0000us         1  4.0000us  4.0000us  4.0000us  matrix_multiplication(int*, int*, int*, int)
                   26.61%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
                   22.98%  1.8240us         2     912ns     640ns  1.1840us  [CUDA memcpy HtoD]
      API calls:   99.50%  103.68ms         3  34.559ms  2.8610us  103.66ms  cudaMalloc
                    0.18%  190.07us         1  190.07us  190.07us  190.07us  cudaLaunchKernel
                    0.14%  145.53us       114  1.2760us     133ns  61.353us  cuDevi