In [None]:
!nvidia-smi

Fri Apr 18 15:55:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
# Step 1: Write CUDA code to a file
code = r'''
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__device__ __forceinline__ float get__value(const float* data, int index, int n)
{
    if(index < n)
    {
        return data[index];
    }
    else
    {
        return 0.0f;
    }
}

__global__ void reduce_kernel(const float* data, float* result, int n)
{
    int d_i = threadIdx.x + blockIdx.x*blockDim.x;

    result[d_i] = get__value(data, 2*d_i, n) + get__value(data, 2*d_i + 1, n);

    if (d_i == 0 && n % 2 != 0)
    {
        result[d_i] += data[n-1];
    }
}

int main()
{

    int n = 10000000;


    float* data_h   = (float*)calloc(n, sizeof(float));

    srand(42);
    for (int i = 0; i < n; i++)
    {
        data_h[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float result_h = 0.0;

    float* data_d;
    cudaMalloc((void**)&data_d, n*sizeof(float));
    cudaMemcpy(data_d, data_h, n*sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = n/2/BLOCK_SIZE + 1;

    float* result1_d;
    float* result2_d;
    cudaMalloc((void**)&result1_d, n*sizeof(float));
    cudaMalloc((void**)&result2_d, n*sizeof(float));

    reduce_kernel<<<numBlocks, threadsPerBlock>>>(data_d, result1_d, n);
    for (int n_c = n/2; n_c > 1; n_c = n_c/2)
    {
        int n_c_b = n_c/2/BLOCK_SIZE + 1;
        reduce_kernel<<<n_c_b, threadsPerBlock>>>(result1_d, result2_d, n);
        std::swap(result1_d, result2_d);
    }

    cudaMemcpy(&result_h, result1_d, 1*sizeof(float), cudaMemcpyDeviceToHost);

    free(data_h);
    cudaFree(data_d);
    cudaFree(result1_d);
    cudaFree(result2_d);

    return 0;
}

'''

# Step 2: Save to file
with open('vec_add.cu', 'w') as f:
    f.write(code)

# Step 3: Compile using nvcc
# Ref: https://stackoverflow.com/questions/73361454/i-am-getting-zeros-as-a-result-of-vector-additon-in-cuda-and-no-errors
!nvcc -arch=sm_75 vec_add.cu -o vec_add

# Step 4: Run the binary
!./vec_add


In [7]:
!nvprof ./vec_add --profile-from-start off

==2761== NVPROF is profiling process 2761, command: ./vec_add --profile-from-start off
==2761== Profiling application: ./vec_add --profile-from-start off
==2761== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   94.49%  8.8481ms         1  8.8481ms  8.8481ms  8.8481ms  [CUDA memcpy HtoD]
                    5.49%  513.85us        23  22.341us  2.6550us  228.96us  reduce_kernel(float const *, float*, int)
                    0.02%  2.0810us         1  2.0810us  2.0810us  2.0810us  [CUDA memcpy DtoH]
      API calls:   93.52%  174.96ms         3  58.322ms  88.880us  174.74ms  cudaMalloc
                    5.07%  9.4925ms         2  4.7463ms  464.63us  9.0279ms  cudaMemcpy
                    1.23%  2.2922ms         3  764.08us  128.88us  1.1023ms  cudaFree
                    0.11%  196.78us        23  8.5550us  3.0730us  114.33us  cudaLaunchKernel
                    0.07%  125.07us       114  1.0970us     107ns  51.