In [None]:
!nvidia-smi

Fri Apr 18 15:55:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
# Step 1: Write CUDA code to a file
code = r'''
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

// NOTE(review): BLOCK_SIZE is not referenced by any code visible in this cell.
#define BLOCK_SIZE 256
// Side length of the square thread block used for the kernel launch; also the
// divisor used to derive the grid dimensions in convolution_gpu.
#define OUT_TILE_DIM 32
// Side length of the square convolution mask.
#define MASK_DIM 3
// Number of mask cells on either side of the centre cell (integer division).
#define MASK_RADIUS (MASK_DIM / 2)


// Convolution coefficients held in constant memory. Filled from the host by
// convolution_gpu (cudaMemcpyToSymbol) and read by convolutional_kernel.
__constant__ float mask_c[MASK_DIM][MASK_DIM];
/*
 * 2D convolution kernel: each thread produces one output element.
 *
 * The thread at (outRow, outCol) accumulates the MASK_DIM x MASK_DIM
 * neighbourhood of `input` centred on its own position, weighted by the
 * coefficients in the constant-memory array mask_c. Neighbours that fall
 * outside the image are skipped, i.e. they contribute zero to the sum.
 *
 * input  - row-major image of width*height floats (read only here)
 * output - row-major image of width*height floats (one element written per thread)
 * width  - number of columns in both images
 * height - number of rows in both images
 */
__global__ void convolutional_kernel(float* input, float* output, unsigned int width, unsigned int height)
{
  int outRow = blockIdx.y*blockDim.y+threadIdx.y;
  int outCol = blockIdx.x*blockDim.x+threadIdx.x;

  // Boundary condition: the grid is rounded up, so threads past the image
  // edge have nothing to compute. The casts make the unsigned comparison
  // explicit instead of relying on implicit int -> unsigned conversion.
  if ((unsigned int)outRow >= height || (unsigned int)outCol >= width)
  {
    return;
  }

  float sum = 0.0f;
  for (int maskRow = 0; maskRow < MASK_DIM; ++maskRow)
  {
    int inRow = outRow - MASK_RADIUS + maskRow;

    // The row test does not depend on maskCol, so it is done once per mask
    // row. The sign is checked first so the cast below is always safe.
    if (inRow < 0 || (unsigned int)inRow >= height)
    {
      continue;
    }

    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    size_t rowOffset = (size_t)inRow * width;

    for (int maskCol = 0; maskCol < MASK_DIM; ++maskCol)
    {
      int inCol = outCol - MASK_RADIUS + maskCol;
      if (inCol >= 0 && (unsigned int)inCol < width)
      {
        sum += mask_c[maskRow][maskCol]*input[rowOffset + (size_t)inCol];
      }
    }
  }

  output[(size_t)outRow*width + (size_t)outCol] = sum;
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cuda_succeeded(cudaError_t status, const char* what)
{
  if (status == cudaSuccess)
  {
    return true;
  }
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

/*
 * Host wrapper that runs convolutional_kernel on a width x height image.
 *
 * Allocates device buffers, uploads `input` and the mask coefficients,
 * launches one thread per output element in OUT_TILE_DIM x OUT_TILE_DIM
 * blocks, downloads the result into `output`, and frees the device buffers.
 *
 * mask   - MASK_DIM x MASK_DIM host coefficients, copied into mask_c
 * input  - host buffer of width*height floats
 * output - host buffer of width*height floats, overwritten on success
 * width  - number of columns
 * height - number of rows
 *
 * Every CUDA call is checked. On the first failure a message is printed to
 * stderr, the remaining steps are skipped, the device buffers are released
 * and the function returns; `output` must not be relied on in that case.
 * An empty image (width or height of 0) is a no-op.
 */
void convolution_gpu(float mask[][MASK_DIM], float* input, float* output, unsigned int width, unsigned int height)
{
  // A zero-sized grid is an invalid launch configuration; there is nothing to compute anyway.
  if (width == 0 || height == 0)
  {
    return;
  }

  // Widen before multiplying so the byte count cannot wrap in 32 bits.
  const size_t numBytes = (size_t)width * height * sizeof(float);

  // Initialised to NULL so the cleanup below is valid on every path.
  float *input_d = NULL, *output_d = NULL;

  // cudaMemcpy/cudaMemcpyToSymbol block the host for these transfers, so no
  // explicit device synchronisation is needed around them.
  bool ok = cuda_succeeded(cudaMalloc((void**)&input_d, numBytes), "cudaMalloc(input_d)")
         && cuda_succeeded(cudaMalloc((void**)&output_d, numBytes), "cudaMalloc(output_d)")
         && cuda_succeeded(cudaMemcpy(input_d, input, numBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host -> device)")
         && cuda_succeeded(cudaMemcpyToSymbol(mask_c, mask, MASK_DIM*MASK_DIM*sizeof(float)), "cudaMemcpyToSymbol(mask_c)");

  if (ok)
  {
    dim3 numThreadsPerBlock(OUT_TILE_DIM,OUT_TILE_DIM);
    // Round up so partially covered tiles at the right/bottom edges still get a block.
    dim3 numBlocks((width + OUT_TILE_DIM - 1)/OUT_TILE_DIM, (height + OUT_TILE_DIM -1)/OUT_TILE_DIM);
    convolutional_kernel<<<numBlocks, numThreadsPerBlock>>>(input_d,output_d, width, height);

    // cudaGetLastError catches launch failures; the synchronise surfaces
    // errors raised while the kernel was running.
    ok = cuda_succeeded(cudaGetLastError(), "kernel launch")
      && cuda_succeeded(cudaDeviceSynchronize(), "kernel execution")
      && cuda_succeeded(cudaMemcpy(output, output_d, numBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device -> host)");
  }

  // cudaFree accepts a null pointer, so this is safe even after a failed allocation.
  cuda_succeeded(cudaFree(input_d), "cudaFree(input_d)");
  cuda_succeeded(cudaFree(output_d), "cudaFree(output_d)");
}
/*
 * Driver: builds a 1024 x 1024 image of pseudo-random values in [0, 1],
 * fills a MASK_DIM x MASK_DIM averaging mask, and runs the GPU convolution.
 *
 * Returns 0 on completion, or EXIT_FAILURE when the host buffers cannot be
 * allocated.
 */
int main()
{
  const unsigned int width = 1024;
  const unsigned int height = 1024;
  // Element count in size_t so the size arithmetic below cannot wrap.
  const size_t numElements = (size_t)width * height;

  float* input = (float*)malloc(numElements*sizeof(float));
  float* output = (float*)malloc(numElements*sizeof(float));
  if (input == NULL || output == NULL)
  {
    fprintf(stderr, "Failed to allocate host buffers\n");
    // free(NULL) is a no-op, so both calls are safe whichever allocation failed.
    free(input);
    free(output);
    return EXIT_FAILURE;
  }

  // Fixed seed keeps the generated image identical from run to run.
  srand(42);
  for (size_t i = 0; i < numElements; i++) {
    input[i] = float(rand())/RAND_MAX;
  }

  float mask[MASK_DIM][MASK_DIM];
  for (int i = 0; i < MASK_DIM; i++) {
    for (int j = 0; j < MASK_DIM; j++) {
        mask[i][j] = 1.0f / (MASK_DIM * MASK_DIM);  // simple averaging kernel
    }
  }

  convolution_gpu(mask,input,output,width,height);

  // Release the host buffers instead of leaking them until process exit.
  free(input);
  free(output);

  return 0;
}

'''

# Step 2: Save to file
with open('vec_add.cu', 'w') as f:
    f.write(code)

# Step 3: Compile using nvcc
# Ref: https://stackoverflow.com/questions/73361454/i-am-getting-zeros-as-a-result-of-vector-additon-in-cuda-and-no-errors
!nvcc -arch=sm_75 vec_add.cu -o vec_add

# Step 4: Run the binary
!./vec_add


In [2]:
!nvprof ./vec_add --profile-from-start off

==692== NVPROF is profiling process 692, command: ./vec_add --profile-from-start off
==692== Profiling application: ./vec_add --profile-from-start off
==692== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   67.80%  2.0711ms         1  2.0711ms  2.0711ms  2.0711ms  [CUDA memcpy DtoH]
                   29.45%  899.67us         2  449.83us     672ns  899.00us  [CUDA memcpy HtoD]
                    2.75%  83.967us         1  83.967us  83.967us  83.967us  convolutional_kernel(float*, float*, unsigned int, unsigned int)
      API calls:   94.62%  122.73ms         2  61.367ms  158.44us  122.58ms  cudaMalloc
                    3.76%  4.8736ms         2  2.4368ms  1.0862ms  3.7874ms  cudaMemcpy
                    0.96%  1.2395ms         1  1.2395ms  1.2395ms  1.2395ms  cudaMemcpyToSymbol
                    0.25%  330.63us         2  165.32us  127.90us  202.73us  cudaFree
                    0.20%  259.92us       114  2.