### Hello world CUDA

In [30]:
%%writefile prog.c

#include <stdio.h>

/* Entry point: writes a fixed test message to standard output. */
int main(void) {
  /* The message has no format specifiers, so it is emitted verbatim. */
  fputs("This is testing ... ", stdout);
  return 0;
}

Overwriting prog.c


In [31]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [32]:
!nvcc prog.c -o prog

In [33]:
!./prog

This is testing ... 

In [34]:
%%writefile cuda_prog.cu

#include <stdio.h>

// Kernel: each thread that executes it prints one greeting line through the
// device-side printf. It takes no arguments and writes no memory.
__global__ void cuda_hello(void)
{
    printf("Hello World from GPU!\n");
}

// Host entry point: launches cuda_hello on one block of one thread and waits
// for the device to finish before exiting.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    cuda_hello<<<1,1>>>();

    // A kernel launch has no return value; a rejected launch has to be
    // queried explicitly.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "cuda_hello launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Block until the kernel has completed; errors raised while it was
    // running are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "cuda_hello execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

Overwriting cuda_prog.cu


In [35]:
!nvcc cuda_prog.cu -o prog
!./prog

Hello World from GPU!


### GPU based vector addition

#### CPU vector add

In [50]:
%%writefile vector_add.c

#include<stdio.h>
#include <stdlib.h>

/*
 * Element-wise sum of two float arrays.
 *
 *   a, b  input arrays, each read at indices 0 .. n-1
 *   out   destination array, written at indices 0 .. n-1
 *   n     number of elements to process; nothing is written when n <= 0
 *
 * Elements are processed in increasing index order.
 */
void add(float *a, float *b, float *out, int n) {
    int idx = 0;
    while (idx < n) {
        const float lhs = a[idx];
        const float rhs = b[idx];
        out[idx] = lhs + rhs;
        ++idx;
    }
}

/*
 * Builds two N-element input vectors (a[i] = i, b[i] = 3*i), sums them with
 * add(), and prints each result with two decimals followed by a space.
 * Returns 0 on success, 1 if a host allocation fails.
 */
int main() {
  const int N = 10;
  float *a = (float*) malloc(N * sizeof(float));
  float *b = (float*) malloc(N * sizeof(float));
  float *out = (float*) malloc(N * sizeof(float));

  /* malloc may return NULL; free(NULL) is a no-op, so a partial failure can
     release whatever was obtained without tracking which call failed. */
  if (a == NULL || b == NULL || out == NULL) {
    fprintf(stderr, "vector_add: host allocation failed\n");
    free(a);
    free(b);
    free(out);
    return 1;
  }

  for (int i = 0; i < N; i++) {
    a[i] = (float) i;
    b[i] = (float) (3 * i);
  }

  add(a, b, out, N);

  for (int i = 0; i < N; i++) {
    printf("%.2f ", out[i]);
  }

  /* Release the buffers instead of leaving them to process teardown. */
  free(a);
  free(b);
  free(out);
  return 0;
}

Overwriting vector_add.c


In [51]:
!nvcc vector_add.c -o vector_add
!./vector_add

0.00 4.00 8.00 12.00 16.00 20.00 24.00 28.00 32.00 36.00 

#### GPU vector add

In [57]:
%%writefile vector_add.cu

#include<stdio.h>
#include <stdlib.h>

// This function is executed on the GPU; a, b and out are dereferenced in
// device code, so they must point to device-accessible memory.
//
// Computes out[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads (grid-stride
// loop), so the elements are split across threads rather than every thread
// repeating the whole range. With <<<1,1>>> the single thread still visits
// every element in order. Nothing is written when n <= 0.
__global__ void add(float *a, float *b, float *out, int n) {
    // Do the index arithmetic in size_t so blockIdx.x * blockDim.x cannot
    // wrap around in 32 bits on very large grids.
    const size_t count  = (n > 0) ? (size_t) n : 0;
    const size_t stride = (size_t) gridDim.x * blockDim.x;
    const size_t first  = (size_t) blockIdx.x * blockDim.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride) {
        out[i] = a[i] + b[i];
    }
}

// Stops the program with a diagnostic when a CUDA runtime call did not
// return cudaSuccess. `what` names the failing step in the message.
static void check_cuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver for the GPU vector addition: fills two N-element host vectors
// (a[i] = i, b[i] = 3*i), copies them to the device, runs the add kernel,
// copies the result back and prints each value with two decimals followed by
// a space. Returns 0 on success; exits with a failure status on any host
// allocation or CUDA error.
int main() {
  const int N = 10;
  const size_t bytes = N * sizeof(float);

  float *a = (float*) malloc(bytes);
  float *b = (float*) malloc(bytes);
  float *out = (float*) malloc(bytes);
  if (a == NULL || b == NULL || out == NULL) {
    fprintf(stderr, "vector_add: host allocation failed\n");
    free(a);
    free(b);
    free(out);
    return 1;
  }

  for (int i = 0; i < N; i++) {
    a[i] = (float) i;
    b[i] = (float) (3 * i);
  }

  // Allocate device buffers and copy the inputs to the GPU.
  float *d_a = NULL, *d_b = NULL, *d_out = NULL;
  check_cuda(cudaMalloc((void**) &d_a, bytes), "cudaMalloc(d_a)");
  check_cuda(cudaMalloc((void**) &d_b, bytes), "cudaMalloc(d_b)");
  check_cuda(cudaMalloc((void**) &d_out, bytes), "cudaMalloc(d_out)");

  check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
  check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

  // Execute the function on the GPU: one block containing one thread.
  add<<<1,1>>>(d_a, d_b, d_out, N);
  // The launch itself returns nothing; a rejected launch is reported here.
  check_cuda(cudaGetLastError(), "add kernel launch");

  // Copy the result back to the CPU. This cudaMemcpy blocks until the kernel
  // has finished, and an error raised while the kernel ran is returned by it.
  check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "copy out to host");

  for (int i = 0; i < N; i++) {
    printf("%.2f ", out[i]);
  }

  // Release device and host memory.
  check_cuda(cudaFree(d_a), "cudaFree(d_a)");
  check_cuda(cudaFree(d_b), "cudaFree(d_b)");
  check_cuda(cudaFree(d_out), "cudaFree(d_out)");
  free(a);
  free(b);
  free(out);
  return 0;
}

Overwriting vector_add.cu


In [58]:
!nvcc vector_add.cu -o vector_add_cu
!./vector_add_cu

0.00 4.00 8.00 12.00 16.00 20.00 24.00 28.00 32.00 36.00 

#### Profiling

In [59]:
!nvprof ./vector_add_cu

==14471== NVPROF is profiling process 14471, command: ./vector_add_cu
==14471== Profiling application: ./vector_add_cu
0.00 4.00 8.00 12.00 16.00 20.00 24.00 28.00 32.00 36.00 ==14471== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   53.81%  4.7350us         1  4.7350us  4.7350us  4.7350us  add(float*, float*, float*, int)
                   26.18%  2.3040us         1  2.3040us  2.3040us  2.3040us  [CUDA memcpy DtoH]
                   20.00%  1.7600us         2     880ns     640ns  1.1200us  [CUDA memcpy HtoD]
      API calls:   99.52%  85.776ms         3  28.592ms  5.8860us  85.762ms  cudaMalloc
                    0.23%  198.49us         1  198.49us  198.49us  198.49us  cudaLaunchKernel
                    0.16%  134.93us       114  1.1830us     136ns  53.724us  cuDeviceGetAttribute
                    0.06%  54.132us         3  18.044us  7.3870us  26.788us  cudaMemcpy
                    0.01%  12.495us         