<a href="https://colab.research.google.com/github/yooian/saxpy_test/blob/main/SAXPY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Tue Aug 19 21:38:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [12]:
%%writefile saxpy_cpu.cpp

#include <iostream>
#include <vector>
#include <chrono> // for timing

// Simple single-thread CPU version of SAXPY
void saxpy_cpu(int n, float a, float* x, float* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a * x[i] + y[i];
  }
}

int main() {
  int n = 1 << 24; // large vector for meaningful perf

  std::vector<float> x(n, 1.0f);
  std::vector<float> y(n, 2.0f);
  float a = 3.0f;

  // timing
  auto start = std::chrono::high_resolution_clock::now();
  saxpy_cpu(n, a, x.data(), y.data());
  auto stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> duration = stop - start;

  std::cout << "CPU time: " << duration.count() << " ms\n";

  // check
  // each y element should be 3.0 * 1.0 + 2.0 = 5.0
  float maxError = 0.0f;
  for (int i = 0; i < n; ++i) {
    maxError = std::max(maxError, std::abs(y[i] - 5.0f));
  }

  std::cout << "Max error: " << maxError << std::endl;

  return 0;
}

Overwriting saxpy_cpu.cpp


In [13]:
!g++ -O3 -o saxpy_cpu saxpy_cpu.cpp && ./saxpy_cpu

CPU time: 11.4273 ms
Max error: 0


In [15]:
%%writefile saxpy_gpu.cu

#include <iostream>
#include <vector>

__global__ void saxpy_kernel(int n, float a, float* x, float* y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x; // offset

  // boundary check
  if (i < n) {
    y[i] = a * x[i] + y[i];
  }
}

int main() {
  int n = 1 << 24;
  size_t bytes = n * sizeof(float);

  // Allocate Host Memory
  // create vectors on cpu (host)
  std::vector<float> h_x(n, 1.0f);
  std::vector<float> h_y(n, 2.0f);

  // Allocate Device Memory
  float *d_x, *d_y;
  cudaMalloc(&d_x, bytes);
  cudaMalloc(&d_y, bytes);

  // Copy Data from Host to Device (CPU to GPU)
  // happens on PCIe bus == sloww
  cudaMemcpy(d_x, h_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, h_y.data(), bytes, cudaMemcpyHostToDevice);

  // Configure and Launch Kernel
  // need # of threads, and arrange it in grid of blocks
  int blockSize = 256;
  int gridSize = (n + blockSize - 1) / blockSize;

  // kernel launch
  saxpy_kernel<<<gridSize, blockSize>>>(n, 3.0f, d_x, d_y);

  // Copy results back
  std::vector<float> h_result_y(n);
  cudaMemcpy(h_result_y.data(), d_y, bytes, cudaMemcpyDeviceToHost);

  // Verify
  float maxError = 0.0f;
  for (int i = 0; i < n; ++i) {
    maxError = std::max(maxError, std::abs(h_result_y[i] - 5.0f));
  }
  std::cout << "Max GPU error: " << maxError << std::endl;

  // Free Memory
  cudaFree(d_x);
  cudaFree(d_y);

  return 0;
}

Writing saxpy_gpu.cu


In [17]:
!nvcc -o saxpy_gpu saxpy_gpu.cu && ./saxpy_gpu

Max GPU error: 3


In [18]:
!nvprof ./saxpy_gpu

==12221== NVPROF is profiling process 12221, command: ./saxpy_gpu
Max GPU error: 3
==12221== Profiling application: ./saxpy_gpu
==12221== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   67.49%  28.707ms         2  14.353ms  14.147ms  14.559ms  [CUDA memcpy HtoD]
                   32.51%  13.831ms         1  13.831ms  13.831ms  13.831ms  [CUDA memcpy DtoH]
      API calls:   78.92%  211.22ms         2  105.61ms  104.06us  211.12ms  cudaMalloc
                   16.18%  43.298ms         3  14.433ms  14.196ms  14.740ms  cudaMemcpy
                    4.28%  11.458ms         1  11.458ms  11.458ms  11.458ms  cudaLaunchKernel
                    0.55%  1.4689ms         2  734.47us  288.39us  1.1805ms  cudaFree
                    0.07%  176.21us       114  1.5450us     198ns  70.123us  cuDeviceGetAttribute
                    0.01%  14.471us         1  14.471us  14.471us  14.471us  cuDeviceGetName
                    0.0