In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-9zg5ulku
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-9zg5ulku
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=86739e8b3c306e47c56a0e19e4b243fb0a6de2489adbef9eea5c07902cd75ecd
  Stored in directory: /tmp/pip-ephem-wheel-cache-g2d2ubuh/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [4]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BLOCK_SIZE 32
#define MATRIX_SIZE 1024

/*
 * Integer matrix stored as a table of row pointers.
 * elem[i] is the base of row i, so elem[i][j] addresses row i, column j.
 */
typedef struct matrix
{
	int **elem;
} matrix;

/*
 * Allocates storage for a MATRIX_SIZE x MATRIX_SIZE matrix with
 * cudaMallocManaged: one table of MATRIX_SIZE row pointers, then one
 * MATRIX_SIZE-int buffer per row.
 *
 * m: matrix whose elem field is overwritten with the new row-pointer table.
 *
 * The element values are NOT initialised here; callers must write every entry
 * before reading it.
 *
 * On any allocation failure the CUDA error string is printed to stderr and
 * the process exits, because continuing would dereference an invalid pointer.
 */
void init_matrix(matrix* m)
{
	cudaError_t err = cudaMallocManaged((void**)&(m -> elem), sizeof(int*) * MATRIX_SIZE);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "init_matrix: failed to allocate row table: %s\n",
		        cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	for (int i = 0; i < MATRIX_SIZE; i++)
	{
		err = cudaMallocManaged((void**)&(m -> elem[i]), sizeof(int) * MATRIX_SIZE);
		if (err != cudaSuccess)
		{
			fprintf(stderr, "init_matrix: failed to allocate row %d: %s\n",
			        i, cudaGetErrorString(err));
			exit(EXIT_FAILURE);
		}
	}
}

/*
 * Fills every entry of m, row by row, with rand() % 50
 * (a value in the range 0..49).
 *
 * m: matrix whose rows have already been allocated.
 */
void add_elem(matrix m)
{
	for (int row = 0; row < MATRIX_SIZE; ++row)
	{
		int* cells = m.elem[row];
		for (int col = 0; col < MATRIX_SIZE; ++col)
			cells[col] = rand() % 50;
	}
}


/*
 * Sequential host reference: result = a * b for MATRIX_SIZE x MATRIX_SIZE
 * integer matrices.
 *
 * a, b:    input matrices (read only).
 * result:  output matrix; every entry is overwritten.
 *
 * Each dot product is accumulated in a local variable and then stored, so the
 * function does not depend on the prior contents of result. Accumulating
 * directly into result would read whatever the allocator left in the buffer.
 */
void matrix_multiply(matrix a, matrix b, matrix result)
{
	for (int i = 0; i < MATRIX_SIZE; i++)
	{
		const int* a_row = a.elem[i];
		int* out_row = result.elem[i];

		for (int j = 0; j < MATRIX_SIZE; j++)
		{
			int sum = 0;
			for (int k = 0; k < MATRIX_SIZE; k++)
				sum += a_row[k] * b.elem[k][j];
			out_row[j] = sum;
		}
	}
}

/*
 * Tiled matrix multiply: result = a * b for MATRIX_SIZE x MATRIX_SIZE
 * integer matrices. Each thread computes one output element.
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE); the tile indexing assumes it.
 *   - gridDim.x * BLOCK_SIZE and gridDim.y * BLOCK_SIZE must both be at least
 *     MATRIX_SIZE. Threads past the edge load zeros and skip the final store.
 *   - Uses 2 * BLOCK_SIZE * BLOCK_SIZE ints of static shared memory per block.
 *
 * Thread mapping: threadIdx.x selects the output column and threadIdx.y the
 * output row. Threads that differ only in threadIdx.x therefore read
 * neighbouring ints of the same row of a and b, and in the inner product
 * they read the same a_sub word and consecutive b_sub words.
 */
__global__ void matrix_multiply_kernel(matrix a, matrix b, matrix result)
{
	// One tile of a and one tile of b, reused for every step along k.
	__shared__ int a_sub[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ int b_sub[BLOCK_SIZE][BLOCK_SIZE];

	int tx = threadIdx.x;
	int ty = threadIdx.y;

	int row = blockIdx.y * BLOCK_SIZE + ty;
	int col = blockIdx.x * BLOCK_SIZE + tx;

	int sum = 0;

	// Walk the shared dimension one BLOCK_SIZE-wide tile at a time.
	for (int block = 0; block < MATRIX_SIZE; block += BLOCK_SIZE) {
		// Out-of-range positions contribute 0 to the dot product.
		a_sub[ty][tx] = (row < MATRIX_SIZE && block + tx < MATRIX_SIZE) ? a.elem[row][block + tx] : 0;
		b_sub[ty][tx] = (block + ty < MATRIX_SIZE && col < MATRIX_SIZE) ? b.elem[block + ty][col] : 0;

		// Both tiles must be fully written before any thread reads them.
		__syncthreads();
		for (int i = 0; i < BLOCK_SIZE; i++)
			sum += a_sub[ty][i] * b_sub[i][tx];
		// Nobody may overwrite the tiles until every thread has finished reading.
		__syncthreads();
	}

	if (row < MATRIX_SIZE && col < MATRIX_SIZE)
		result.elem[row][col] = sum;
}


/*
 * Releases the storage of a matrix: each of the MATRIX_SIZE rows, then the
 * row-pointer table itself.
 *
 * m: matrix to release. A NULL m or a NULL m->elem is a no-op.
 *
 * Afterwards m->elem is set to NULL so a second call is harmless. Freeing is
 * best-effort: every cudaFree is still attempted after a failure, and only
 * the first error is reported on stderr.
 */
void free_matrix(matrix* m) {
	if (m == NULL || m -> elem == NULL)
		return;

	cudaError_t first_err = cudaSuccess;

	for (int i = 0; i < MATRIX_SIZE; i++) {
		cudaError_t err = cudaFree(m -> elem[i]);
		if (err != cudaSuccess && first_err == cudaSuccess)
			first_err = err;
	}

	cudaError_t table_err = cudaFree(m -> elem);
	if (table_err != cudaSuccess && first_err == cudaSuccess)
		first_err = table_err;

	m -> elem = NULL;

	if (first_err != cudaSuccess)
		fprintf(stderr, "free_matrix: cudaFree failed: %s\n", cudaGetErrorString(first_err));
}

/* Returns the time from *start to *end in seconds. */
static double elapsed_seconds(const struct timespec* start, const struct timespec* end)
{
	return (double)(end -> tv_sec - start -> tv_sec)
	     + (double)(end -> tv_nsec - start -> tv_nsec) * 1e-9;
}

/*
 * Benchmark driver: fills two random matrices, multiplies them once on the
 * host and once with the CUDA kernel, prints both wall-clock times and reports
 * whether the two results are identical.
 *
 * Returns 0 after a completed comparison (match or mismatch) and EXIT_FAILURE
 * if the kernel launch or its execution reports a CUDA error.
 */
int main(int argc, char ** argv)
{
	(void)argc;
	(void)argv;

	printf("Matrix size: %d\n", MATRIX_SIZE);

	struct timespec start, end;
	double time_taken;

	// initialize matrix
	matrix a, b, serial_result, cuda_result;
	init_matrix(&a);
	init_matrix(&b);
	init_matrix(&serial_result);
	init_matrix(&cuda_result);

	// add elements to matrix
	add_elem(a);
	add_elem(b);

	// sequential code
	clock_gettime(CLOCK_MONOTONIC, &start);
	matrix_multiply(a, b, serial_result);
	clock_gettime(CLOCK_MONOTONIC, &end);
	time_taken = elapsed_seconds(&start, &end);
	printf("Sequential code: %lf seconds\n", time_taken);

	// cuda parallel code
	// Ceiling division: just enough blocks to cover the matrix, with no extra
	// row/column of idle blocks when MATRIX_SIZE is a multiple of BLOCK_SIZE.
	int dim = (MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid(dim, dim);

	// The timed region covers the launch and everything up to the return of
	// cudaDeviceSynchronize.
	// NOTE(review): this probably also includes on-demand migration of the
	// managed buffers to the GPU - confirm with a profiler.
	clock_gettime(CLOCK_MONOTONIC, &start);
	matrix_multiply_kernel<<<grid, block>>>(a, b, cuda_result);
	cudaError_t err = cudaGetLastError();      // launch-configuration errors
	if (err == cudaSuccess)
		err = cudaDeviceSynchronize();         // errors raised while the kernel ran
	clock_gettime(CLOCK_MONOTONIC, &end);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "matrix_multiply_kernel failed: %s\n", cudaGetErrorString(err));
		return EXIT_FAILURE;
	}

	time_taken = elapsed_seconds(&start, &end);
	printf("Parallel code (GPGPU): %lf seconds\n", time_taken);

	// verify results: stop at the first differing element
	int mismatch = 0;
	for (int i = 0; i < MATRIX_SIZE && !mismatch; i++)
	{
		for (int j = 0; j < MATRIX_SIZE; j++)
		{
			if (serial_result.elem[i][j] != cuda_result.elem[i][j])
			{
				mismatch = 1;
				break;
			}
		}
	}

	if (!mismatch)
		printf("The matrices match.\n");
	else
		printf("The matrices do not match.\n");

	// free memory
	free_matrix(&a);
	free_matrix(&b);
	free_matrix(&serial_result);
	free_matrix(&cuda_result);
	return 0;
}

Matrix size: 1024
Sequential code: 17.254561 seconds
Parallel code (GPGPU): 0.044867 seconds
The matrices match.

