<a href="https://colab.research.google.com/github/trefftzc/cis677/blob/main/An_introduction_to_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# An introduction to CUDA
Using examples provided by NVIDIA in their online courses.

In [22]:
!nvidia-smi

Mon Mar 10 16:50:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# A simple example

In [1]:
%%writefile hello_gpu.cu
#include <stdio.h>

// Host-side greeting: an ordinary C function that writes one line to stdout.
void helloCPU() {
  fputs("Hello from the CPU.\n", stdout);
}

// Kernel (`__global__`): launched from the host, executed by every thread
// of the launch configuration. Each thread prints one greeting line.
__global__ void helloGPU() {
  printf("Hello also from the GPU.\n");
}

int main()
{
  // Host work first: a plain, synchronous function call.
  helloCPU();

  // Execution configuration: a grid of one block containing one thread.
  const dim3 blocksInGrid(1);
  const dim3 threadsInBlock(1);
  helloGPU<<<blocksInGrid, threadsInBlock>>>();

  // Block the CPU thread until the `helloGPU` kernel has completed,
  // so its output appears before the program exits.
  cudaDeviceSynchronize();

  return 0;
}

Writing hello_gpu.cu


In [29]:
!nvcc -o hello_gpu hello_gpu.cu -arch=sm_75


In [30]:
!./hello_gpu

Hello from the CPU.
Hello also from the GPU.


# Another simple example

In [4]:
%%writefile very_simple.cu
#include <stdio.h>
// Runs on the host: prints a single line identifying where it executes.
void CPUFunction() {
  fputs("This function is defined to run on the CPU.\n", stdout);
}

// Kernel: the `__global__` qualifier makes this callable from the host
// and executed on the device. Every launched thread prints one line.
__global__ void GPUFunction() {
  printf("This function is defined to run on the GPU.\n");
}

int main()
{
  // Ordinary host call.
  CPUFunction();

  // Kernel launch with one block of one thread.
  const dim3 blocksInGrid(1);
  const dim3 threadsInBlock(1);
  GPUFunction<<<blocksInGrid, threadsInBlock>>>();

  // Wait for the kernel to finish before leaving main.
  cudaDeviceSynchronize();

  return 0;
}

Writing very_simple.cu


In [32]:
!nvcc very_simple.cu -o very_simple -arch=sm_75

In [33]:
!./very_simple

This function is defined to run on the CPU.
This function is defined to run on the GPU.


# Now executing a function in parallel

In [8]:
%%writefile first_parallel.cu
#include <stdio.h>

/*
 * `firstParallel` is declared `__global__` so that it runs on the GPU
 * as a kernel; every launched thread executes its body once.
 */

// Kernel: every thread of the launch prints the same line once, so the
// number of lines printed equals blocks * threads-per-block.
__global__ void firstParallel() {
  printf("This should be running in parallel.\n");
}

int main()
{
  // 5 blocks of 5 threads each: the kernel body runs 25 times.
  const dim3 blocksInGrid(5);
  const dim3 threadsInBlock(5);
  firstParallel<<<blocksInGrid, threadsInBlock>>>();

  // Make the CPU wait for the GPU kernel to complete before proceeding.
  cudaDeviceSynchronize();

  return 0;
}

Writing first_parallel.cu


In [34]:
!nvcc -o first_parallel first_parallel.cu -arch=sm_75

In [35]:
!./first_parallel

This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.
This should be running in parallel.


# Thread and block id

In [39]:
%%writefile thread_and_block_id.cu
#include <stdio.h>

// Kernel: exactly one thread - thread 1023 of block 255 - prints
// "Success!"; every other thread stays silent. The message therefore only
// appears when the launch has at least 256 blocks of at least 1024 threads.
__global__ void printSuccessForCorrectExecutionConfiguration()
{
  const bool isThread1023 = (threadIdx.x == 1023);
  const bool isBlock255 = (blockIdx.x == 255);

  if (isThread1023 && isBlock255)
  {
    printf("Success!\n");
  }
}

int main()
{
  // 256 blocks (blockIdx.x 0..255) of 1024 threads (threadIdx.x 0..1023):
  // this configuration contains the block/thread pair the kernel looks for.
  const dim3 blocksInGrid(256);
  const dim3 threadsInBlock(1024);

  printSuccessForCorrectExecutionConfiguration<<<blocksInGrid, threadsInBlock>>>();

  // Wait for the kernel so its output is visible before exit.
  cudaDeviceSynchronize();

  return 0;
}

Writing thread_and_block_id.cu


In [40]:
!nvcc -o thread_and_block_id thread_and_block_id.cu -arch=sm_75

In [41]:
!./thread_and_block_id

Success!


# A single loop

One block. This single block has 10 threads, and each thread performs one "iteration" of the loop.

In [42]:
%%writefile single_loop.cu
#include <stdio.h>

/*
 * `loop` is a CUDA kernel. Each thread does the work of only
 * 1 iteration of the original serial loop.
 */

/*
 * Kernel replacing the serial loop
 *   for (int i = 0; i < N; ++i) printf("This is iteration number %d\n", i);
 * Each thread performs one iteration, using its index within the block as
 * the iteration number. Intended for a single-block launch.
 *
 * N: number of iterations to perform. Threads whose index is >= N do
 *    nothing, so launching more threads than N prints no extra lines.
 */
__global__ void loop(int N)
{
  // Convert to int so the value matches the %d format specifier.
  const int i = threadIdx.x;

  if (i < N)
  {
    printf("This is iteration number %d\n", i);
  }
}

int main()
{
  // The execution configuration controls how many "iterations" run:
  // a single block with one thread per iteration.
  const int N = 10;
  const dim3 blocksInGrid(1);
  const dim3 threadsInBlock(N);

  loop<<<blocksInGrid, threadsInBlock>>>(N);

  // Wait for all threads to finish printing.
  cudaDeviceSynchronize();

  return 0;
}

Writing single_loop.cu


In [43]:
!nvcc -o single_loop single_loop.cu -arch=sm_75

In [44]:
!./single_loop

This is iteration number 0
This is iteration number 1
This is iteration number 2
This is iteration number 3
This is iteration number 4
This is iteration number 5
This is iteration number 6
This is iteration number 7
This is iteration number 8
This is iteration number 9


# Multi Block

Now, two blocks. Each block has five threads, so the global index of a thread is `blockIdx.x * blockDim.x + threadIdx.x`.

In [45]:
%%writefile multi_block.cu
#include <stdio.h>

/*
 * `loop` is a CUDA kernel. Each thread does the work of only
 * 1 iteration of the original serial loop.
 */

/*
 * Kernel replacing a serial loop of N iterations. Each thread performs one
 * iteration; the iteration number is the thread's global index across all
 * blocks of a 1D launch.
 *
 * N: number of iterations to perform. Threads whose global index is >= N
 *    do nothing, so a grid larger than N prints no extra lines.
 */
__global__ void loop(int N)
{
  // Global index: offset of this block plus the thread's index inside it.
  const int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < N)
  {
    printf("This is interation number %d\n", i);
  }
}

int main()
{
  // More than one block: 2 blocks * 5 threads = 10 threads, one per
  // "iteration".
  const int N = 10;
  const dim3 blocksInGrid(2);
  const dim3 threadsInBlock(5);

  loop<<<blocksInGrid, threadsInBlock>>>(N);

  // Wait for the kernel to complete before exiting.
  cudaDeviceSynchronize();

  return 0;
}

Writing multi_block.cu


In [46]:
!nvcc -o multi_block multi_block.cu -arch=sm_75

In [47]:
!./multi_block

This is interation number 0
This is interation number 1
This is interation number 2
This is interation number 3
This is interation number 4
This is interation number 5
This is interation number 6
This is interation number 7
This is interation number 8
This is interation number 9


# Another example: Doubling the elements of an array
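We will double every element of an array in parallel on the GPU. The array is allocated with `cudaMallocManaged`, so the same pointer can be used on both the host and the device.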

In [36]:
%%writefile double.cu
#include <stdio.h>

/*
 * Initialize array values on the host.
 */

// Host-side initialisation: stores k in a[k] for every k in [0, N).
void init(int *a, int N)
{
  int value = 0;
  while (value < N)
  {
    a[value] = value;
    ++value;
  }
}

/*
 * Double elements in parallel on the GPU.
 */

// Kernel: each thread doubles the single element of `a` at its global
// index. Threads whose index falls at or beyond N return without touching
// memory.
__global__
void doubleElements(int *a, int N)
{
  const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (globalIdx >= N)
  {
    return;
  }
  a[globalIdx] *= 2;
}

/*
 * Check all elements have been doubled on the host.
 */

// Host-side verification: returns true when a[k] == 2*k for every k in
// [0, N), false at the first element that does not match.
bool checkElementsAreDoubled(int *a, int N)
{
  for (int idx = 0; idx < N; ++idx)
  {
    if (a[idx] == idx * 2)
    {
      continue;
    }
    return false;
  }
  return true;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` signals a failure, false on cudaSuccess.
static bool cudaFailed(cudaError_t err, const char *what)
{
  if (err == cudaSuccess)
  {
    return false;
  }
  fprintf(stderr, "Error (%s): %s\n", what, cudaGetErrorString(err));
  return true;
}

/*
 * Allocates a managed array of N ints, fills it on the host, doubles every
 * element on the GPU, and verifies the result on the host.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main()
{
  const int N = 100;
  int *a = NULL;

  const size_t size = N * sizeof(int);

  // Managed memory: the pointer `a` can be used on both the host and the
  // device. Stop here on failure, since `init` would dereference `a`.
  if (cudaFailed(cudaMallocManaged(&a, size), "cudaMallocManaged"))
  {
    return 1;
  }

  init(a, N);

  // 10 blocks * 10 threads = 100 threads, one per element.
  const size_t threads_per_block = 10;
  const size_t number_of_blocks = 10;

  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);

  // A kernel launch returns nothing: launch errors are read with
  // cudaGetLastError, errors raised while the kernel runs are returned by
  // the synchronizing call.
  if (cudaFailed(cudaGetLastError(), "doubleElements launch") ||
      cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
  {
    cudaFree(a);
    return 1;
  }

  bool areDoubled = checkElementsAreDoubled(a, N);
  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");

  // Free memory that was allocated for access by both host and device.
  if (cudaFailed(cudaFree(a), "cudaFree"))
  {
    return 1;
  }

  return 0;
}

Writing double.cu


In [37]:
!nvcc double.cu -o double -arch=sm_75


In [38]:
!./double

All elements were doubled? TRUE


# How to handle arrays whose size does not exactly match the grid?

In [58]:
%%writefile mismatch_grid.cu
#include <stdio.h>

/*
 * `initializeElementsTo` may be executed in a thread whose `i` is
 * calculated to be greater than or equal to `N`, because the grid is
 * rounded up to a whole number of blocks.
 *
 * The `i < N` guard in the kernel prevents those threads from accessing
 * a value outside the range of `a`.
 */

// Kernel: each thread writes `initialValue` into the element of `a` at its
// global index. Threads whose index is at or beyond N do nothing.
__global__ void initializeElementsTo(int initialValue, int *a, int N)
{
  const int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
  if (globalIdx >= N)
  {
    return;
  }
  a[globalIdx] = initialValue;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` signals a failure, false on cudaSuccess.
static bool cudaFailed(cudaError_t err, const char *what)
{
  if (err == cudaSuccess)
  {
    return false;
  }
  fprintf(stderr, "Error (%s): %s\n", what, cudaGetErrorString(err));
  return true;
}

/*
 * Initializes a managed array of N ints on the GPU with a grid whose size
 * is not a multiple of N, then verifies every element on the host.
 * Returns 0 on success, 1 on a CUDA failure or a wrong element value.
 */
int main()
{
  /*
   * Do not modify `N`.
   */

  const int N = 1000;

  int *a = NULL;
  const size_t size = N * sizeof(int);

  // Stop on allocation failure: the check loop below dereferences `a`.
  if (cudaFailed(cudaMallocManaged(&a, size), "cudaMallocManaged"))
  {
    return 1;
  }

  /*
   * Assume we have reason to want the number of threads
   * fixed at `256`: do not modify `threads_per_block`.
   */

  const size_t threads_per_block = 256;

  /*
   * Round up so the grid covers all N elements:
   * (1000 + 255) / 256 = 4 blocks = 1024 threads >= N.
   */

  const size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

  const int initialValue = 6;

  initializeElementsTo<<<number_of_blocks, threads_per_block>>>(initialValue, a, N);

  // Launch errors are read with cudaGetLastError; errors raised while the
  // kernel runs are returned by the synchronizing call.
  if (cudaFailed(cudaGetLastError(), "initializeElementsTo launch") ||
      cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
  {
    cudaFree(a);
    return 1;
  }

  /*
   * Check to make sure all values in `a`, were initialized.
   */

  for (int i = 0; i < N; ++i)
  {
    if (a[i] != initialValue)
    {
      printf("FAILURE: target value: %d\t a[%d]: %d\n", initialValue, i, a[i]);
      cudaFree(a);
      return 1;
    }
  }
  printf("SUCCESS!\n");

  if (cudaFailed(cudaFree(a), "cudaFree"))
  {
    return 1;
  }

  return 0;
}

Overwriting mismatch_grid.cu


In [59]:
!nvcc -o mismatch_grid mismatch_grid.cu -arch=sm_75

In [60]:
!./mismatch_grid

SUCCESS!


# Grid smaller than array

What can we do if the grid is actually smaller than the array we are working on?

In [50]:
%%writefile small_grid.cu
#include <stdio.h>

// Host-side initialisation: writes 0, 1, ..., N-1 into consecutive
// elements of `a`.
void init(int *a, int N)
{
  int *slot = a;
  for (int value = 0; value < N; ++value, ++slot)
  {
    *slot = value;
  }
}

/*
 * In the current application, `N` is larger than the grid.
 * This kernel uses a grid-stride loop so that each parallel
 * thread can work on more than one element of the array.
 */

// Kernel: doubles elements of `a` with a grid-stride loop. Each thread
// starts at its global index and advances by the total number of threads
// in the grid until it reaches N, so a grid smaller than N still covers
// every element.
__global__
void doubleElements(int *a, int N)
{
  const int firstIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = gridDim.x * blockDim.x;

  int idx = firstIdx;
  while (idx < N)
  {
    a[idx] *= 2;
    idx += step;
  }
}

// Host-side verification: true when a[k] == 2*k for every k in [0, N).
bool checkElementsAreDoubled(int *a, int N)
{
  int idx = 0;
  // Advance past every correctly doubled element; stop at the first
  // mismatch or at the end of the array.
  while (idx < N && a[idx] == idx * 2)
  {
    ++idx;
  }
  return idx >= N;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` signals a failure, false on cudaSuccess.
static bool cudaFailed(cudaError_t err, const char *what)
{
  if (err == cudaSuccess)
  {
    return false;
  }
  fprintf(stderr, "Error (%s): %s\n", what, cudaGetErrorString(err));
  return true;
}

/*
 * Doubles a managed array of N ints on the GPU using a grid that has fewer
 * threads than elements, then verifies the result on the host.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main()
{
  /*
   * `N` is greater than the size of the grid (see below).
   */

  const int N = 10000;
  int *a = NULL;

  const size_t size = N * sizeof(int);

  // Stop on allocation failure: `init` dereferences `a`.
  if (cudaFailed(cudaMallocManaged(&a, size), "cudaMallocManaged"))
  {
    return 1;
  }

  init(a, N);

  /*
   * The size of this grid is 256*32 = 8192.
   */

  const size_t threads_per_block = 256;
  const size_t number_of_blocks = 32;

  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);

  // Launch errors are read with cudaGetLastError; errors raised while the
  // kernel runs are returned by the synchronizing call.
  if (cudaFailed(cudaGetLastError(), "doubleElements launch") ||
      cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
  {
    cudaFree(a);
    return 1;
  }

  bool areDoubled = checkElementsAreDoubled(a, N);
  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");

  if (cudaFailed(cudaFree(a), "cudaFree"))
  {
    return 1;
  }

  return 0;
}

Overwriting small_grid.cu


In [51]:
!nvcc small_grid.cu -o small_grid -arch=sm_75

In [52]:
!./small_grid

All elements were doubled? TRUE


# How to handle errors?

In [53]:
%%writefile handling_errors.cu
#include <stdio.h>

// Host-side initialisation: element k of `a` receives the value k,
// for k in [0, N).
void init(int *a, int N)
{
  for (int idx = 0; idx < N; idx++)
  {
    *(a + idx) = idx;
  }
}

/*
 * Kernel: doubles every element of `a` using a grid-stride loop.
 *
 * a: array of at least N ints, accessible from the device.
 * N: number of elements to double.
 *
 * Each thread starts at its global index and advances by the total number
 * of threads in the grid. The loop bound is exactly N, so no thread reads
 * or writes past the end of `a`, whatever the launch configuration.
 */
__global__
void doubleElements(int *a, int N)
{

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;

  for (int i = idx; i < N; i += stride)
  {
    a[i] *= 2;
  }
}

// Host-side verification: returns false as soon as some a[k] differs from
// 2*k (k in [0, N)), true when every element matches.
bool checkElementsAreDoubled(int *a, int N)
{
  for (int idx = 0; idx < N; ++idx)
  {
    const bool doubled = (a[idx] == idx * 2);
    if (!doubled)
    {
      return false;
    }
  }
  return true;
}

/*
 * Doubles a managed array of N ints on the GPU and verifies the result,
 * checking every CUDA call for errors. Each failure is reported as
 * "Error: <description>" and ends the program.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main()
{
  const int N = 10000;
  int *a = NULL;

  const size_t size = N * sizeof(int);

  // Runtime API calls return their status directly. Stop on failure,
  // since `init` would otherwise dereference an invalid pointer.
  cudaError_t err = cudaMallocManaged(&a, size);
  if (err != cudaSuccess)
  {
    printf("Error: %s\n", cudaGetErrorString(err));
    return 1;
  }

  init(a, N);

  // The exercise originally used 2048 threads per block, which exceeds the
  // per-block thread limit (1024 on current GPUs) and makes the launch fail.
  const size_t threads_per_block = 1024;
  const size_t number_of_blocks = 32;

  doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);

  // A kernel launch returns nothing: an invalid execution configuration
  // is reported through cudaGetLastError.
  err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("Error: %s\n", cudaGetErrorString(err));
    cudaFree(a);
    return 1;
  }

  // Errors raised while the kernel executes are returned by the
  // synchronizing call.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
  {
    printf("Error: %s\n", cudaGetErrorString(err));
    cudaFree(a);
    return 1;
  }

  bool areDoubled = checkElementsAreDoubled(a, N);
  printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");

  err = cudaFree(a);
  if (err != cudaSuccess)
  {
    printf("Error: %s\n", cudaGetErrorString(err));
    return 1;
  }

  return 0;
}

Writing handling_errors.cu


In [54]:
!nvcc handling_errors.cu -o handling_errors -arch=sm_75

In [55]:
!./handling_errors

All elements were doubled? TRUE
