In [None]:
%%writefile mmul.cu

#include <stdio.h>
#include <time.h>


#define N  1024

/*
 * Computes c = a * b for N x N int matrices stored row-major, with each
 * thread producing at most one element of c.
 *
 * Launch layout: a 2D grid of 2D blocks. The x dimension of the launch must
 * cover the N columns and the y dimension must cover the N rows; threads that
 * fall outside the matrix are masked off by the bounds check below.
 *
 * a, b : input matrices (N * N ints each), read only.
 * c    : output matrix (N * N ints), every element is overwritten.
 */
__global__ void matrixMulGPU( int * a, int * b, int * c )
{
  // threadIdx.x is the fastest-varying index inside a warp, so it is mapped
  // to the column: consecutive threads then read consecutive elements of b
  // and write consecutive elements of c (coalesced), while threads that share
  // a row read the same element of a. Mapping x to the row instead would make
  // neighbouring threads stride by N ints on both a and c.
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if (row < N && col < N)
  {
    int val = 0;

    for ( int k = 0; k < N; ++k )
      val += a[row * N + k] * b[k * N + col];
    c[row * N + col] = val;
  }
}

/*
 * Host-side reference implementation of c = a * b for N x N int matrices
 * stored row-major. Used to validate the result produced on the GPU.
 *
 * a, b : input matrices (N * N ints each).
 * c    : output matrix (N * N ints), every element is overwritten.
 */
void matrixMulCPU( int * a, int * b, int * c )
{
  for( int i = 0; i < N; ++i )
  {
    // Start of row i in the left operand and in the result.
    int * lhs_row = a + i * N;
    int * out_row = c + i * N;

    for( int j = 0; j < N; ++j )
    {
      // Dot product of row i of a with column j of b.
      int acc = 0;
      int k = 0;
      while ( k < N )
      {
        acc += lhs_row[k] * b[k * N + j];
        ++k;
      }
      out_row[j] = acc;
    }
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess so that calls can be chained with &&.
static bool cudaOk( cudaError_t status, const char * what )
{
  if (status == cudaSuccess)
    return true;

  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Releases the four matrices. Every pointer is freed even if an earlier
// cudaFree fails; returns false when at least one of them failed.
static bool freeMatrices( int * a, int * b, int * c_cpu, int * c_gpu )
{
  bool ok = cudaOk(cudaFree(a), "cudaFree(a)");
  ok = cudaOk(cudaFree(b), "cudaFree(b)") && ok;
  ok = cudaOk(cudaFree(c_cpu), "cudaFree(c_cpu)") && ok;
  ok = cudaOk(cudaFree(c_gpu), "cudaFree(c_gpu)") && ok;
  return ok;
}

/*
 * Multiplies two N x N matrices on the GPU and on the CPU, checks that both
 * results agree and prints the CPU execution time in milliseconds.
 *
 * Returns 0 when the results match and every CUDA call succeeded, 1 otherwise.
 */
int main()
{
  // Pointers start out NULL so that the cleanup path can free them
  // unconditionally, even when only some of the allocations succeeded.
  int *a = NULL, *b = NULL, *c_cpu = NULL, *c_gpu = NULL; // One result matrix for the CPU and one for the GPU

  const size_t size = (size_t) N * N * sizeof (int); // Number of bytes of an N x N matrix

  clock_t inicio, fim; // Start / end timestamps of the CPU multiplication
  double tempo;        // Elapsed CPU time in milliseconds

  // Allocate managed memory, reachable from both host and device code
  const bool allocated =
      cudaOk(cudaMallocManaged (&a, size),     "cudaMallocManaged(a)")     &&
      cudaOk(cudaMallocManaged (&b, size),     "cudaMallocManaged(b)")     &&
      cudaOk(cudaMallocManaged (&c_cpu, size), "cudaMallocManaged(c_cpu)") &&
      cudaOk(cudaMallocManaged (&c_gpu, size), "cudaMallocManaged(c_gpu)");
  if (!allocated)
  {
    freeMatrices(a, b, c_cpu, c_gpu);
    return 1;
  }

  // Initialize memory; create 2D matrices
  for( int row = 0; row < N; ++row )
    for( int col = 0; col < N; ++col )
    {
      a[row*N + col] = row;
      b[row*N + col] = col+2;
      c_cpu[row*N + col] = 0;
      c_gpu[row*N + col] = 0;
    }

  dim3 threads_per_block (16, 16, 1); // A 16 x 16 block of threads
  // Ceiling division: just enough blocks to cover N in each dimension,
  // without an extra idle row/column of blocks when N is a multiple of 16.
  dim3 number_of_blocks ((N + threads_per_block.x - 1) / threads_per_block.x,
                         (N + threads_per_block.y - 1) / threads_per_block.y,
                         1);

  matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );

  // A launch does not return a status: configuration errors are reported by
  // cudaGetLastError() and execution errors by the synchronizing call, which
  // also makes the host wait for the GPU before c_gpu is read.
  const bool gpu_ok =
      cudaOk(cudaGetLastError(),      "matrixMulGPU launch") &&
      cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  if (!gpu_ok)
  {
    freeMatrices(a, b, c_cpu, c_gpu);
    return 1;
  }

  // Call the CPU version to check our work, timing it with clock()
  inicio = clock();
  matrixMulCPU( a, b, c_cpu );
  fim = clock();

  tempo = (double)(fim - inicio) / CLOCKS_PER_SEC * 1000; // Seconds scaled to milliseconds

  // Compare the two answers to make sure they are equal; stop at the first mismatch
  bool error = false;
  for( int row = 0; row < N && !error; ++row )
    for( int col = 0; col < N && !error; ++col )
      if (c_cpu[row * N + col] != c_gpu[row * N + col])
      {
        printf("FOUND ERROR at c[%d][%d]\n", row, col);
        error = true;
      }
  if (!error)
    printf("Success!\n");

  // Free all our allocated memory
  const bool freed = freeMatrices(a, b, c_cpu, c_gpu);

  printf("\nTempo de execução: %f milissegundos para N = %d\n\n\n", tempo, N);

  // Signal a mismatch or a failed cleanup to the caller through the exit status.
  return (error || !freed) ? 1 : 0;
}

Overwriting mmul.cu


In [None]:
! if [ ! mmul -nt mmul.cu ]; then nvcc mmul.cu -o mmul; fi
! nvprof ./mmul

==1340== NVPROF is profiling process 1340, command: ./mmul
Success!

Tempo de execução: 6018.100000 milissegundos para N = 1024


==1340== Profiling application: ./mmul
==1340== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  53.047ms         1  53.047ms  53.047ms  53.047ms  matrixMulGPU(int*, int*, int*)
      API calls:   81.91%  247.18ms         4  61.795ms  13.346us  247.08ms  cudaMallocManaged
                   17.58%  53.058ms         1  53.058ms  53.058ms  53.058ms  cudaDeviceSynchronize
                    0.43%  1.3118ms         4  327.94us  180.79us  464.71us  cudaFree
                    0.05%  140.89us       101  1.3940us     135ns  54.563us  cuDeviceGetAttribute
                    0.02%  56.519us         1  56.519us  56.519us  56.519us  cudaLaunchKernel
                    0.01%  27.172us         1  27.172us  27.172us  27.172us  cuDeviceGetName
                    0.00%  6.2850us         1  6.