<a href="https://colab.research.google.com/github/theatharvagai/Basics-of-CUDA/blob/main/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
%%writefile menu_driven_cuda.cu
#include <math.h>
#include <stdio.h>
#include <cuda_runtime.h>

#define N 1000000  // Array size (1 Million elements)
#define BLOCK_SIZE 256

// --- KERNELS ---

// Element-wise addition kernel: c[i] = a[i] + b[i] for every i in [0, n).
// Indexing uses only the x dimension, so it expects a 1D launch; threads whose
// global index is at or beyond n exit without touching memory.
__global__ void addArrays(float *a, float *b, float *c, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}

// Element-wise subtraction kernel: c[i] = a[i] - b[i] for every i in [0, n).
// Indexing uses only the x dimension, so it expects a 1D launch; threads whose
// global index is at or beyond n exit without touching memory.
__global__ void subArrays(float *a, float *b, float *c, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] - b[gid];
}

// Element-wise multiplication kernel: c[i] = a[i] * b[i] for every i in [0, n).
// Indexing uses only the x dimension, so it expects a 1D launch; threads whose
// global index is at or beyond n exit without touching memory.
__global__ void mulArrays(float *a, float *b, float *c, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] * b[gid];
}

// --- HELPER FUNCTION ---
// Prints the leading elements of both inputs and of the output, then verifies
// that each of those leading output elements equals `val`.
//
//   a, b    input arrays (host-readable), at least min(n, 10) elements each
//   out     output array (host-readable), at least min(n, 10) elements
//   val     value every inspected output element is expected to hold
//   n       number of valid elements in the arrays
//   opName  label used in the printed report
//
// At most the first 10 elements are inspected; when n is smaller (or zero) only
// the n valid elements are touched, so short arrays are never read out of bounds.
void checkResult(float *a, float *b, float *out, float val, int n, const char* opName) {
    // Number of elements to print and verify: min(n, 10).
    const int limit = (n < 10) ? n : 10;

    printf("\n--- %s Results ---\n", opName);
    printf("Input A (first 10): ");
    for (int i = 0; i < limit; i++) {
        printf("%.2f ", a[i]);
    }
    printf("\nInput B (first 10): ");
    for (int i = 0; i < limit; i++) {
        printf("%.2f ", b[i]);
    }
    printf("\nOutput C (first 10): ");
    for (int i = 0; i < limit; i++) {
        printf("%.2f ", out[i]);
    }
    printf("\n");

    // Check just the first few elements to verify. fabsf keeps the comparison
    // in floating point; an unqualified abs() may bind to the integer overload
    // and truncate the difference.
    bool correct = true;
    for (int i = 0; i < limit; i++) {
        if (fabsf(out[i] - val) > 1e-5f) {
            correct = false;
            printf("Mismatch at index %d: Expected %.2f, got %.2f\n", i, val, out[i]);
            break;
        }
    }
    if (correct) printf("SUCCESS: %s kernel executed correctly.\n", opName);
    else printf("FAILURE: %s kernel failed.\n", opName);
}

// --- HOST CODE ---
// Common signature of the element-wise kernels launched from the menu.
typedef void (*ArrayKernel)(float *, float *, float *, int);

// Returns true when `status` is cudaSuccess; otherwise reports the failure on
// stderr (tagged with `what`) and returns false.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches `kernel` over all N elements with BLOCK_SIZE threads per block,
// waits for it to finish, and then prints/verifies the result via checkResult.
// Returns false if the launch or the execution reported a CUDA error, in which
// case the output buffer is not inspected.
static bool runOperation(ArrayKernel kernel, float *a, float *b, float *c,
                         int numBlocks, float expected, const char *opName) {
    printf("Launching %s Kernel...\n", opName);
    kernel<<<numBlocks, BLOCK_SIZE>>>(a, b, c, N);
    // Launch-configuration errors are only visible through cudaGetLastError().
    if (!cudaOk(cudaGetLastError(), "kernel launch")) return false;
    // Wait for GPU to finish; execution faults surface here.
    if (!cudaOk(cudaDeviceSynchronize(), "kernel execution")) return false;
    checkResult(a, b, c, expected, N, opName);
    return true;
}

// Menu-driven driver: allocates three managed arrays, fills A with 10 and B
// with 2, then repeatedly runs the add/subtract/multiply kernel selected on
// stdin until the user picks "Exit", input ends, or a CUDA error occurs.
// Returns 0 on a clean exit and 1 if any required CUDA call failed.
int main() {
    float *a = NULL, *b = NULL, *c = NULL;
    int choice;
    int exitCode = 0;
    size_t bytes = N * sizeof(float);

    // 1. Allocation using Unified Memory (Accessible by CPU and GPU)
    if (!cudaOk(cudaMallocManaged(&a, bytes), "cudaMallocManaged(a)") ||
        !cudaOk(cudaMallocManaged(&b, bytes), "cudaMallocManaged(b)") ||
        !cudaOk(cudaMallocManaged(&c, bytes), "cudaMallocManaged(c)")) {
        // cudaFree accepts null pointers, so partial allocations are released safely.
        cudaFree(a);
        cudaFree(b);
        cudaFree(c);
        return 1;
    }

    // 2. Initialize Arrays
    for (int i = 0; i < N; i++) {
        a[i] = 10.0f;
        b[i] = 2.0f;
    }

    // Calculate Grid and Block dimensions
    int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // Prefetch data to GPU (Performance Optimization for Unified Memory).
    // This is best-effort: a failure here only costs performance, so it is not
    // fatal. The error is cleared so it cannot be mistaken for a launch failure
    // by the cudaGetLastError() check that follows each kernel launch.
    int device = -1;
    if (cudaGetDevice(&device) != cudaSuccess ||
        cudaMemPrefetchAsync(a, bytes, device, NULL) != cudaSuccess ||
        cudaMemPrefetchAsync(b, bytes, device, NULL) != cudaSuccess) {
        cudaGetLastError();
    }

    while (exitCode == 0) {
        printf("\n--- CUDA Array Operations Menu ---\n");
        printf("1. Add Arrays\n");
        printf("2. Subtract Arrays\n");
        printf("3. Multiply Arrays\n");
        printf("4. Exit\n");
        printf("Enter your choice: ");

        // Note: Colab handles inputs differently, ensure you enter number in the box
        if (scanf("%d", &choice) != 1) break;

        if (choice == 4) break;

        bool ok = true;
        switch (choice) {
            case 1:
                ok = runOperation(addArrays, a, b, c, numBlocks, 12.0f, "Addition"); // 10 + 2 = 12
                break;
            case 2:
                ok = runOperation(subArrays, a, b, c, numBlocks, 8.0f, "Subtraction"); // 10 - 2 = 8
                break;
            case 3:
                ok = runOperation(mulArrays, a, b, c, numBlocks, 20.0f, "Multiplication"); // 10 * 2 = 20
                break;
            default:
                printf("Invalid choice!\n");
        }
        // Stop after a CUDA failure instead of continuing to launch kernels.
        if (!ok) exitCode = 1;
    }

    // Free Unified Memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    return exitCode;
}


Overwriting menu_driven_cuda.cu


In [8]:
!nvcc -o menu_cuda menu_driven_cuda.cu
!./menu_cuda


--- CUDA Array Operations Menu ---
1. Add Arrays
2. Subtract Arrays
3. Multiply Arrays
4. Exit
Enter your choice: 1
Launching Addition Kernel...

--- Addition Results ---
Input A (first 10): 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 
Input B (first 10): 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 
Output C (first 10): 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 
SUCCESS: Addition kernel executed correctly.

--- CUDA Array Operations Menu ---
1. Add Arrays
2. Subtract Arrays
3. Multiply Arrays
4. Exit
Enter your choice: ^C


In [1]:
%%writefile tiled_matrix_mul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>

#define TILE_WIDTH 16  // Block size (16x16 threads)
#define N 1024         // Matrix dimensions (1024 x 1024)

// Macro for checking CUDA errors: evaluates a runtime call and hands its status,
// together with the call site's file and line, to gpuAssert.
// The do { } while (0) wrapper makes the expansion a single statement, so
// `if (x) cudaCheckError(call); else ...` parses as intended.
#define cudaCheckError(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call.
//   code   status returned by the call
//   file   source file of the call site
//   line   source line of the call site
//   abort  when true (default), terminate the process with `code` as exit status
// Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// --- KERNEL: Tiled Matrix Multiplication ---
// Computes C = A * B for square, row-major width x width matrices.
// Uses Shared Memory to reduce Global Memory access: each block stages one
// TILE_WIDTH x TILE_WIDTH tile of A and of B at a time.
//
// Launch requirements:
//   - blockDim must be (TILE_WIDTH, TILE_WIDTH, 1); the shared tiles are
//     indexed directly by threadIdx.
//   - the grid must provide at least ceil(width / TILE_WIDTH) blocks in x and y
//     for every element of C to be produced.
//   - static shared memory: 2 * TILE_WIDTH * TILE_WIDTH floats per block.
// `width` does not have to be a multiple of TILE_WIDTH: tile entries that fall
// outside the matrices are loaded as 0 and out-of-range threads do not store.
__global__ void matrixMulTiled(float *A, float *B, float *C, int width) {
    // 1. Define Shared Memory for the Tile
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    // 2. Calculate Row and Column indices
    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;

    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    float Pvalue = 0.0f;

    // 3. Loop over the A and B tiles required to compute the C element.
    // Ceiling division so the trailing partial tile is not dropped.
    const int numTiles = (width + TILE_WIDTH - 1) / TILE_WIDTH;
    for (int p = 0; p < numTiles; ++p) {

        // Collaborative loading of tiles into shared memory
        // Each thread loads one element of A and one element of B.
        // Out-of-range positions are zero-filled: they add nothing to the dot
        // product and keep every global read inside the matrices.
        const int aCol = p * TILE_WIDTH + tx;
        const int bRow = p * TILE_WIDTH + ty;
        ds_A[ty][tx] = (Row < width && aCol < width) ? A[Row * width + aCol] : 0.0f;
        ds_B[ty][tx] = (bRow < width && Col < width) ? B[bRow * width + Col] : 0.0f;

        // 4. Synchronization barrier
        // Wait for all threads in block to finish loading shared memory.
        // Kept outside any bounds check so every thread in the block reaches it.
        __syncthreads();

        // 5. Multiply the tile elements
        for (int k = 0; k < TILE_WIDTH; ++k) {
            Pvalue += ds_A[ty][k] * ds_B[k][tx];
        }

        // Synchronization barrier
        // Wait for all threads to finish using the current tile before loading the next
        __syncthreads();
    }

    // 6. Write final result to global memory
    if(Row < width && Col < width) {
        C[Row * width + Col] = Pvalue;
    }
}

// --- HOST CODE ---
// Benchmarks and spot-checks the tiled matrix multiplication kernel on two
// random N x N matrices: allocates host/device buffers, times a single kernel
// launch with CUDA events, copies the product back, and compares the top-left
// 10x10 corner against a CPU reference.
// Returns 0 when the corner matches, EXIT_FAILURE when it does not or when a
// host allocation fails; CUDA failures terminate through cudaCheckError.
int main() {
    // Byte count kept in size_t so it cannot overflow int for larger N.
    const size_t size = (size_t)N * N * sizeof(float);
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;

    // Allocate Host Memory
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_A); free(h_B); free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize Matrices with values in [0.0, 9.9]
    for(int i = 0; i < N * N; i++) {
        h_A[i] = (float)(rand() % 100) / 10.0f;
        h_B[i] = (float)(rand() % 100) / 10.0f;
    }

    // Allocate Device Memory
    cudaCheckError(cudaMalloc(&d_A, size));
    cudaCheckError(cudaMalloc(&d_B, size));
    cudaCheckError(cudaMalloc(&d_C, size));

    // Copy Data to Device
    cudaCheckError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Setup Grid and Blocks.
    // Ceiling division is done in integers: ceil(N/TILE_WIDTH) would divide
    // (and truncate) in integer arithmetic before ceil() ever ran.
    const unsigned int tilesPerDim = (N + TILE_WIDTH - 1) / TILE_WIDTH;
    dim3 dimGrid(tilesPerDim, tilesPerDim, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    printf("Matrix Size: %dx%d\n", N, N);
    printf("Grid Size: %ux%u\n", dimGrid.x, dimGrid.y);
    printf("Block Size: %ux%u\n", dimBlock.x, dimBlock.y);

    // Create CUDA Events for Timing
    cudaEvent_t start, stop;
    cudaCheckError(cudaEventCreate(&start));
    cudaCheckError(cudaEventCreate(&stop));

    // --- Launch Kernel ---
    printf("Launching Tiled Kernel...\n");
    cudaCheckError(cudaEventRecord(start));
    matrixMulTiled<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    cudaCheckError(cudaEventRecord(stop));

    // Launch errors show up immediately; execution errors at the synchronize.
    cudaCheckError(cudaPeekAtLastError());
    cudaCheckError(cudaDeviceSynchronize());

    // Calculate Time
    cudaCheckError(cudaEventSynchronize(stop));
    float milliseconds = 0;
    cudaCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Kernel Execution Time: %f ms\n", milliseconds);
    // 2*N^3 floating-point operations (one multiply and one add per inner step).
    printf("Effective GFLOPs: %f\n", (2.0 * N * N * N) / (milliseconds * 1e6));

    // Copy result back to Host
    cudaCheckError(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Verification (Compute a small corner on CPU to check correctness)
    // Doing full NxN CPU verify is too slow for demo
    printf("Verifying result...\n");
    bool correct = true;
    for(int i = 0; i < 10; i++) { // Check first 10 rows
        for(int j = 0; j < 10; j++) { // Check first 10 cols
            float sum = 0;
            for(int k = 0; k < N; k++) {
                sum += h_A[i * N + k] * h_B[k * N + j];
            }
            // Absolute plus relative tolerance: each entry is a sum of N
            // products, so the acceptable rounding difference between the GPU
            // and CPU results grows with the magnitude of the sum.
            const float diff = fabsf(h_C[i * N + j] - sum);
            if(diff > 1e-2f + 1e-5f * fabsf(sum)) {
                printf("Mismatch at [%d][%d]: GPU %f, CPU %f\n", i, j, h_C[i*N+j], sum);
                correct = false;
                break;
            }
        }
        if(!correct) break;
    }

    if(correct) printf("TEST PASSED: Results match CPU verification!\n");
    else printf("TEST FAILED: Results do not match.\n");

    // Free Memory and timing events
    cudaCheckError(cudaEventDestroy(start));
    cudaCheckError(cudaEventDestroy(stop));
    free(h_A); free(h_B); free(h_C);
    cudaCheckError(cudaFree(d_A));
    cudaCheckError(cudaFree(d_B));
    cudaCheckError(cudaFree(d_C));

    // Let the exit status reflect the verification outcome.
    return correct ? 0 : EXIT_FAILURE;
}

Writing tiled_matrix_mul.cu


In [2]:
!nvcc -o tiled_matrix_mul tiled_matrix_mul.cu
!./tiled_matrix_mul

Matrix Size: 1024x1024
Grid Size: 64x64
Block Size: 16x16
Launching Tiled Kernel...
Kernel Execution Time: 107.296188 ms
Effective GFLOPs: 20.014538
Verifying result...
TEST PASSED: Results match CPU verification!
