**Lab-1: count_threads.cu and hello.cu**

03-11-25, v1.0, (c) Incubera AI Labs

**This is an introduction to GPU parallelism.**

Here's what you'll learn:

*   Count number of threads
*   We write the code **once** and run many copies of it
*   Code Runs in Multiple Places at Once
*   We can concurrently launch multiple ("n") copies of a function (e.g. n=1000)
*   GPU executes all copies in **parallel**
*   Each thread gets a **unique ID** (0-(n-1)) automatically
*   The copies of the function run **at the same time**, not one after another

In [None]:
!nvidia-smi


In [None]:
!nvcc --version


In [None]:
%%writefile count_threads.cu
#include <stdio.h>

// Cardinality (number of threads).
//
// Each thread prints one line with its block index, its index within the
// block, its global thread ID, and the total number of threads in the grid.
//
// Launch layout: only the .x components of threadIdx/blockIdx/blockDim/gridDim
// are read, so the arithmetic is meaningful for a 1-D grid of 1-D blocks.
// Takes no arguments and uses no shared memory.
__global__ void countThreads() {
    // Thread's position within its block
    const int thread_in_block = threadIdx.x;

    // Which block am I in?
    const int my_block = blockIdx.x;

    // My unique global thread ID: skip over all threads of the preceding
    // blocks, then add this thread's offset inside its own block.
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Total threads in the grid (blocks in the grid * threads per block)
    const int total_threads = gridDim.x * blockDim.x;

    // The arrow in the format string was previously stored as mis-decoded
    // bytes; it is written here as the intended UTF-8 character.
    printf("Block %d, Thread %d → Global ID: %d (Total: %d threads)\n",
           my_block, thread_in_block, global_id, total_threads);
}

// Host entry point for count_threads.cu.
//
// Launches countThreads with 2 blocks of 5 threads each and waits for the GPU
// to finish so the kernel's printf output appears before the program exits.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    printf("Launching with <<<2, 5>>> (2 blocks, 5 threads each)\n\n");
    countThreads<<<2, 5>>>();

    // A kernel launch has no return value; launch failures (bad configuration,
    // binary built for an architecture the GPU cannot run, ...) only become
    // visible through cudaGetLastError(). Without this check the program
    // would print nothing and still exit with 0.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess) {
        printf("Kernel launch error: %s\n", cudaGetErrorString(launchErr));
        return 1;
    }

    // Wait for the kernel to finish; errors raised while it ran surface here.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess) {
        printf("Kernel execution error: %s\n", cudaGetErrorString(syncErr));
        return 1;
    }

    return 0;
}

In [None]:
!nvcc -arch=sm_75 count_threads.cu -o count_threads


In [None]:
!./count_threads

In [None]:
%%writefile hello.cu
#include <stdio.h>

// Greeting kernel: every thread prints one line containing its own index
// within the block (threadIdx.x). blockIdx is not consulted, so threads with
// the same threadIdx.x in different blocks would print identical lines.
// Takes no arguments and uses no shared memory.
__global__ void helloKernel()
{
    const unsigned int indexInBlock = threadIdx.x;
    printf("Hello from thread %d!\n", indexInBlock);
}

// Host entry point for hello.cu.
//
// Verifies that a CUDA device is present, enlarges the device-side printf
// buffer, launches helloKernel with 1 block of 1000 threads, waits for it to
// finish and then resets the device.
// Returns 0 on success and 1 on any CUDA error or when no device is found.
int main() {
    // Check if CUDA device is available. deviceCount starts at 0 so it is
    // never read uninitialized when the query itself fails.
    int deviceCount = 0;
    cudaError_t countErr = cudaGetDeviceCount(&deviceCount);
    if (countErr != cudaSuccess) {
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(countErr));
        return 1;
    }
    printf("Found %d CUDA devices\n", deviceCount);

    if (deviceCount == 0) {
        printf("No CUDA devices found!\n");
        return 1;
    }

    // Set larger printf buffer (10 MiB) BEFORE launching kernel.
    // The result is checked here so that a failure is reported for what it is
    // instead of being picked up later by the post-launch cudaGetLastError().
    const size_t printfFifoBytes = (size_t)10 * 1024 * 1024;
    cudaError_t limitErr = cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);
    if (limitErr != cudaSuccess) {
        printf("cudaDeviceSetLimit failed: %s\n", cudaGetErrorString(limitErr));
        return 1;
    }

    printf("Launching kernel from CPU...\n");

    // 1 block, 1000 threads
    helloKernel<<<1, 1000>>>();

    // Check for launch errors
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess) {
        printf("Kernel launch error: %s\n", cudaGetErrorString(launchErr));
        return 1;
    }

    // Wait and check for execution errors. Synchronizing is also what makes
    // the kernel's buffered printf output appear on the host.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess) {
        printf("Kernel execution error: %s\n", cudaGetErrorString(syncErr));
        return 1;
    }

    // Release all resources held on the device before exiting. The output was
    // already flushed by the synchronize above; this is cleanup.
    cudaError_t resetErr = cudaDeviceReset();
    if (resetErr != cudaSuccess) {
        printf("cudaDeviceReset failed: %s\n", cudaGetErrorString(resetErr));
        return 1;
    }

    printf("GPU finished!\n");
    return 0;
}

In [None]:
!nvcc -arch=sm_75 hello.cu -o hello


In [None]:
!./hello