In [None]:
!nvidia-smi

Fri Apr 18 15:55:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Step 1: Write CUDA code to a file
code = r'''

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

// CUDA kernel for reduction using atomicAdd
__global__ void reduce_sum_kernel(double* A, double* B, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        atomicAdd(B, A[i]);
    }
}

// CPU version of array sum
void reduce_sum_cpu(double* A, double* B, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += A[i];
    }
    *B = sum;
}

int main() {
    int N = 10000000;
    int threadsPerBlock = 256;
    int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    size_t size = N * sizeof(double);

    // Allocate host memory
    double* A_h = (double*)malloc(size);
    double* B_h = (double*)malloc(sizeof(double));
    double* B_h_CPU = (double*)malloc(sizeof(double));

    // Initialize data
    srand(time(NULL));
    for (int i = 0; i < N; i++) {
        A_h[i] = (double)rand() / RAND_MAX;
    }

    // Allocate device memory
    double* A_d;
    double* B_d;
    cudaMalloc((void**)&A_d, size);
    cudaMalloc((void**)&B_d, sizeof(double));

    // Copy data to device
    cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    cudaMemset(B_d, 0, sizeof(double));  // initialize sum to 0

    // GPU timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Launch kernel
    reduce_sum_kernel<<<blocks, threadsPerBlock>>>(A_d, B_d, N);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    // Copy result back
    cudaMemcpy(B_h, B_d, sizeof(double), cudaMemcpyDeviceToHost);

    // CPU sum
    reduce_sum_cpu(A_h, B_h_CPU, N);

    // Report results
    printf("CPU Sum: %f\n", *B_h_CPU);
    printf("GPU Sum: %f\n", *B_h);

    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    printf("GPU Time: %.3f ms\n", ms);

    // Cleanup
    cudaFree(A_d);
    cudaFree(B_d);
    free(A_h);
    free(B_h);
    free(B_h_CPU);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

'''

# Step 2: Save to file
with open('vec_add.cu', 'w') as f:
    f.write(code)

# Step 3: Compile using nvcc
# Ref: https://stackoverflow.com/questions/73361454/i-am-getting-zeros-as-a-result-of-vector-additon-in-cuda-and-no-errors
!nvcc -arch=sm_75 vec_add.cu -o vec_add

# Step 4: Run the binary
!./vec_add


CPU Sum: 4998851.832672
GPU Sum: 4998851.832672
GPU Time: 35.228 ms


In [None]:
!nvprof ./vec_add --profile-from-start off

==8311== NVPROF is profiling process 8311, command: ./vec_add --profile-from-start off
GPU: Tesla T4
Compute Capability: 7.5
4.99942e+06CPU
4.98867e-310GPU==8311== Profiling application: ./vec_add --profile-from-start off
==8311== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  35.091ms         1  35.091ms  35.091ms  35.091ms  reduceSumKernel(double*, double*, int)
                    0.00%     992ns         1     992ns     992ns     992ns  [CUDA memcpy HtoD]
      API calls:   92.79%  82.484ms         2  41.242ms     926ns  82.483ms  cudaEventCreate
                    5.02%  4.4586ms         1  4.4586ms  4.4586ms  4.4586ms  cudaEventSynchronize
                    1.45%  1.2924ms         2  646.19us  169.36us  1.1230ms  cudaFree
                    0.27%  236.85us         2  118.42us  100.37us  136.48us  cudaMalloc
                    0.15%  133.47us       114  1.1700us     103ns  54.271us  cuDeviceGetAttr