In [None]:
# Show the GPU, driver version and CUDA version available to this runtime.
!nvidia-smi

Mon Apr 21 12:53:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   66C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Step 1: Define the CUDA source as a raw string (it is written to a file in Step 2)
code = r'''
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256
#define BLOCK_DIM 8

// Error checking macro.
// Evaluates `call` exactly once; if it does not yield cudaSuccess, prints the
// source file, line and CUDA error string to stderr and terminates the program.
//
// The body is wrapped in do { ... } while (0) so the macro behaves as a single
// statement (safe in an unbraced if/else followed by a semicolon). The local is
// given an unlikely name so it cannot shadow a variable used inside `call`.
#define CHECK_CUDA_ERROR(call) do { \
    const cudaError_t check_cuda_error_status_ = (call); \
    if (check_cuda_error_status_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error in %s at line %d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(check_cuda_error_status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// 7-point stencil over an n x n x n grid stored as a flat array, where element
// (i, j, k) lives at index i*n*n + j*n + k.
//
// Launch layout: a 3D grid of 3D blocks with one thread per element.
// threadIdx/blockIdx .x maps to k (the contiguous dimension), .y to j, .z to i.
// The grid must cover at least n threads in each dimension; surplus threads
// are masked out by the bounds check.
//
//   in  - device buffer of n*n*n floats, read only
//   out - device buffer of n*n*n floats; must not alias `in`
//   n   - grid edge length
//
// Only interior elements (1 <= i, j, k <= n-2) are written:
//   out = 0.5 * centre + 0.25 * (sum of the six face neighbours)
// Boundary elements of `out` are left untouched.
__global__ void stencil_kernel(const float* __restrict__ in, float* __restrict__ out, unsigned int n)
{
  // Widen before multiplying so that neither the coordinates nor the flat
  // index wrap around in 32-bit arithmetic for large n.
  const size_t i = (size_t)blockIdx.z * blockDim.z + threadIdx.z;
  const size_t j = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
  const size_t k = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t edge = n;

  const float C0 = 0.5f;
  const float C1 = 0.25f;

  // "x + 1 < edge" instead of "x < n - 1": the latter underflows when n == 0
  // and would let every thread through the guard.
  if (i >= 1 && i + 1 < edge &&
      j >= 1 && j + 1 < edge &&
      k >= 1 && k + 1 < edge)
  {
    const size_t plane = edge * edge;              // elements per i-slice
    const size_t centre = i * plane + j * edge + k;

    out[centre] = C0*in[centre] +
                  C1*(in[centre - 1] +
                      in[centre + 1] +
                      in[centre - edge] +
                      in[centre + edge] +
                      in[centre - plane] +
                      in[centre + plane]);
  }
}




// Host wrapper: runs one stencil pass over an n x n x n grid on the GPU.
//
//   in  - host buffer holding n*n*n floats (input grid)
//   out - host buffer with room for n*n*n floats (receives the result)
//   n   - grid edge length; n == 0 is a no-op
//
// The device output buffer is zero-filled before the launch, so every element
// the kernel does not write comes back as 0.0f instead of whatever the freshly
// allocated device memory happened to contain.
// Any CUDA failure is reported through CHECK_CUDA_ERROR, which exits.
void stencil_gpu(float* in, float* out, unsigned int n)
{
  // Nothing to do, and a grid dimension of zero is an invalid launch configuration.
  if (n == 0) {
    return;
  }

  float *in_d = NULL, *out_d = NULL;
  // Widen before multiplying: n*n*n in unsigned int wraps for large n.
  const size_t size = (size_t)n * n * n * sizeof(float);

  // Allocate device memory
  CHECK_CUDA_ERROR(cudaMalloc((void**) &in_d, size));
  CHECK_CUDA_ERROR(cudaMalloc((void**) &out_d, size));

  // Everything below is issued on the default stream, which runs it in issue
  // order, so no explicit synchronisation is needed before the launch.
  CHECK_CUDA_ERROR(cudaMemcpy(in_d, in, size, cudaMemcpyHostToDevice));

  // cudaMalloc does not clear memory; give unwritten output cells a defined value.
  CHECK_CUDA_ERROR(cudaMemset(out_d, 0, size));

  // One thread per element, rounded up to whole blocks in each dimension.
  dim3 numThreadsPerBlock(BLOCK_DIM,BLOCK_DIM,BLOCK_DIM);
  dim3 numBlocks((n + BLOCK_DIM - 1)/ BLOCK_DIM,
                 (n + BLOCK_DIM - 1)/ BLOCK_DIM,
                 (n + BLOCK_DIM - 1)/ BLOCK_DIM);
  stencil_kernel<<< numBlocks, numThreadsPerBlock>>>(in_d,out_d,n);
  // Check for kernel launch errors
  CHECK_CUDA_ERROR(cudaGetLastError());

  // Wait for kernel to finish (also surfaces errors raised during execution)
  CHECK_CUDA_ERROR(cudaDeviceSynchronize());

  CHECK_CUDA_ERROR(cudaMemcpy(out, out_d, size, cudaMemcpyDeviceToHost));

  CHECK_CUDA_ERROR(cudaFree(in_d));
  CHECK_CUDA_ERROR(cudaFree(out_d));
}
// Driver: fills an n x n x n grid with reproducible pseudo-random values in
// [0, 1], runs one stencil pass on the GPU and prints all n*n*n results on a
// single line, separated by spaces.
int main()
{
  const unsigned int n = 5;
  const size_t total = (size_t)n * n * n;   // number of grid elements

  float* input = (float*)malloc(total*sizeof(float));
  float* output = (float*)malloc(total*sizeof(float));
  if (input == NULL || output == NULL) {
    fprintf(stderr, "Host allocation failed\n");
    free(input);
    free(output);
    return EXIT_FAILURE;
  }

  // Fixed seed so every run produces the same input.
  srand(42);
  for (size_t i = 0; i < total; i++) {
    input[i] = float(rand())/RAND_MAX;
  }

  stencil_gpu(input,output,n);

  // Print exactly the elements that exist; the buffer holds `total` floats,
  // so reading past that is out of bounds.
  for (size_t i = 0; i < total; i++) {
      printf("%f ",output[i]);
  }
  printf("\n");

  free(input);
  free(output);

  return 0;
}

'''

# Step 2: Save the CUDA source held in `code` to disk.
# NOTE(review): the file is called vec_add.cu although the source is a 3D
# stencil; the name is left alone because the later nvprof cell runs ./vec_add.
with open('vec_add.cu', 'w') as f:
    f.write(code)

# Step 3: Compile using nvcc, building explicitly for the sm_75 architecture
# (presumably chosen to match the Tesla T4 reported by nvidia-smi -- confirm
# if the runtime GPU changes). The linked question describes getting all-zero
# results with no reported errors, and is the stated reason for passing -arch.
# Ref: https://stackoverflow.com/questions/73361454/i-am-getting-zeros-as-a-result-of-vector-additon-in-cuda-and-no-errors
!nvcc -arch=sm_75 vec_add.cu -o vec_add

# Step 4: Run the binary; it prints the stencil output to stdout.
!./vec_add


0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.1867841.5431821.5390590.0000000.0000000.9090681.3587691.3430370.0000000.0000000.9922820.8961750.9508780.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.9098191.1887191.2978150.0000000.0000001.1072141.2153571.1597020.0000000.0000001.1631460.8053410.7817090.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.2150720.8880370.8582670.0000000.0000001.2518991.0316911.0300660.0000000.0000001.3965850.9069270.7756220.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000

In [None]:
!nvprof ./vec_add --profile-from-start off

==21718== NVPROF is profiling process 21718, command: ./vec_add --profile-from-start off
0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.1867841.5431821.5390590.0000000.0000000.9090681.3587691.3430370.0000000.0000000.9922820.8961750.9508780.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.9098191.1887191.2978150.0000000.0000001.1072141.2153571.1597020.0000000.0000001.1631460.8053410.7817090.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.2150720.8880370.8582670.0000000.0000001.2518991.0316911.0300660.0000000.0000001.3965850.9069270.7756220.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00000