<a href="https://colab.research.google.com/github/tobthecreator/parallel/blob/main/vector_add.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
%%writefile vector_add.cu
#include <stdio.h>

__global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    int n = 1000;
    size_t size = n * sizeof(float);

    // Allocate host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Initialize host arrays
    for (int i = 0; i < n; i++) {
        h_A[i] = i;
        h_B[i] = i * 2.0f;
    }

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Copy inputs to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch kernel
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    vectorAdd<<<numBlocks, blockSize>>>(d_A, d_B, d_C, n);

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify result
    for (int i = 0; i < 10; i++) {
        printf("%.0f + %.0f = %.0f\n", h_A[i], h_B[i], h_C[i]);
    }

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Overwriting vector_add.cu


In [44]:
!nvcc vector_add.cu -o vector_add

In [95]:
%%writefile vector_add.cu
#include <stdio.h>
#include <time.h>

// Function to print elements of a float array
void printFloatArray(float arr[], int size) {
    for (int i = 0; i < size; i++) {
        printf("Element %d: %.3f\n", i, arr[i]);  // Print with 3 decimal places
    }

    printf("\n\n");
}

// Operate directly in RAM
void vectorAddCPU(float* x, float* y, float* z, int N) {
  for (unsigned int i=0; i<N; ++i) {
    z[i] = x[i] + y[i];
  }
}

__global__ void vectorAddGPUKernal(float *x, float* y, float* z, int N) {
  // gridDim.x -> num blocks in grid
  // blockIdx.x -> position of block in grid
  // threadIdx.x -> position of thread in block
  // blockDim.x -> number of threads in a block

  // Each thread has to compute it's global index so it can backtrack to to the index in the arrays
  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < N) {
    z[i] = x[i] + y[i];;
  }
}


// Copy memory over to the GPU, then operate
void vectorAddGPU(float* x, float* y, float* z, int N) {
  // Allocate GPU memory, "_d" is just "x-on-the-device (GPU)"
  float *x_d, *y_d, *z_d;

  // Pass a pointer for the input vectors so that we can copy them
  cudaMalloc((void**)&x_d, N*sizeof(float));
  cudaMalloc((void**)&y_d, N*sizeof(float));
  cudaMalloc((void**)&z_d, N*sizeof(float));

  // Copy to the GPU
  cudaMemcpy(x_d, x, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(y_d, y, N*sizeof(float), cudaMemcpyHostToDevice);

  // Run the GPU
  // z_d[i] = x_d[i] + y_d[i];
  // One GPU thread per vector element

  // Arrays of Threads are called Grids.
  // Grids are organized into Blocks.
  // We need to launch a Grid of Threads.
  // All Threads in the same Grid execute the same function, known as a kernal.

  // Call a GPU Kernal fn (launch a Grid of Threads)
  const unsigned int numThreadsPerBlock = 512;
  const unsigned int numBlocks = (N + numThreadsPerBlock - 1)/numThreadsPerBlock;
  vectorAddGPUKernal<<< numBlocks, numThreadsPerBlock >>>(x_d, y_d, z_d, N);
  cudaDeviceSynchronize();


  // Copy from the GPU
  cudaMemcpy(z, z_d, N*sizeof(float), cudaMemcpyDeviceToHost);

  // Deallocate the GPU memory

  cudaFree(x_d);
  cudaFree(y_d);
  cudaFree(z_d);
}

void vectorAdd(unsigned int N) {
  clock_t cpu_start, cpu_end, gpu_start, gpu_end;
  double cpu_time, gpu_time;

  float* x = (float*) malloc(N*sizeof(float));
  float* y = (float*) malloc(N*sizeof(float));
  float* z = (float*) malloc(N*sizeof(float));

  for (unsigned int i=0; i<N; ++i) {
    x[i] = rand();
    y[i] = rand();

  }

  // printFloatArray(x, N);
  // printFloatArray(y, N);

  cpu_start = clock();
  vectorAddCPU(x, y, z, N);
  cpu_end = clock();
  // printFloatArray(z, N);

  gpu_start = clock();
  vectorAddGPU(x, y, z, N);
  gpu_end = clock();

  // printFloatArray(z, N);

  cpu_time = ((double)(cpu_end - cpu_start)) / CLOCKS_PER_SEC;
  gpu_time = ((double)(gpu_end - gpu_start)) / CLOCKS_PER_SEC;
  printf("Array size: %i\n", N);
  printf("CPU execution time: %f seconds\n", cpu_time);
  printf("GPU execution time: %f seconds\n", gpu_time);

  printf("\n\n");

  free(x);
  free(y);
  free(z);

}

int main() {
  unsigned int numOrdersOfMagnitude = 10;  // You can tweak this value

    for (unsigned int i = 1; i < numOrdersOfMagnitude; i++) {
        unsigned int n = pow(10, i);  // 10^i for each order of magnitude
        vectorAdd(n);
    }

    return 0;
}

Writing vector_add.cu


In [99]:
!nvcc vector_add.cu -o vector_add

In [100]:
!./vector_add

Array size: 10
CPU execution time: 0.000001 seconds
GPU execution time: 0.197281 seconds


Array size: 100
CPU execution time: 0.000001 seconds
GPU execution time: 0.000258 seconds


Array size: 1000
CPU execution time: 0.000003 seconds
GPU execution time: 0.000213 seconds


Array size: 10000
CPU execution time: 0.000044 seconds
GPU execution time: 0.000303 seconds


Array size: 100000
CPU execution time: 0.000453 seconds
GPU execution time: 0.000609 seconds


Array size: 1000000
CPU execution time: 0.005924 seconds
GPU execution time: 0.004025 seconds


Array size: 10000000
CPU execution time: 0.047994 seconds
GPU execution time: 0.029569 seconds


Array size: 100000000
CPU execution time: 0.468434 seconds
GPU execution time: 0.265127 seconds


^C
