**(3a) Simplified Vector Addition with Debug Output**

*   Only 10 elements (easy to print all elements)
*   Prints from both CPU and GPU
*   Shows each step clearly

In [None]:
%%writefile vector_add_debug.cu
#include <stdio.h>
#include <cuda_runtime.h>

// GPU KERNEL - Runs on Device
//
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
// Each thread handles the single element selected by its flat global index,
// so the launch has to supply at least n threads in total; threads whose
// index is >= n do nothing.
//
//   a, b : input arrays (only read, hence const)
//   c    : output array; element i is written by the thread with index i
//   n    : number of elements
//
// The threads handling the first five elements also print their operands and
// result with device-side printf. This is a debugging aid only.
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
        // Debug: print from first few threads
        if (i < 5) {
            printf("GPU Thread %d: %f + %f = %f\n", i, a[i], b[i], c[i]);
        }
    }
}

#include <stdlib.h>  // malloc, free, exit, EXIT_FAILURE

// Evaluate a CUDA runtime call; if it fails, report the error text together
// with the source location and terminate the program.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "ERROR: %s (%s:%d)\n",                          \
                    cudaGetErrorString(cudaCheckErr_), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Debug walk-through of vector addition on 10 elements: initialise on the
// CPU, copy to the GPU, add on the GPU, copy back, and verify on the CPU,
// printing every step. Returns 0 when all results match, non-zero otherwise.
int main() {
    const int n = 10;  // Start with just 10 elements for easy debugging
    const size_t bytes = n * sizeof(float);

    // 1. Allocate and initialize on CPU
    //    The CPU creates two arrays in its own memory (RAM)
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "ERROR: host memory allocation failed\n");
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    printf("=== CPU Initialization ===\n");
    for (int i = 0; i < n; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
        printf("CPU: h_a[%d] = %f, h_b[%d] = %f\n", i, h_a[i], i, h_b[i]);
    }

    // 2. Allocate GPU memory
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // 3. Copy to GPU
    // Data is transferred from CPU RAM → GPU VRAM
    // This happens over the PCI-Express bus
    // (the connection between CPU and GPU)
    // Now the GPU has copies of arrays `a` and `b` in its own memory
    printf("\n=== Copying to GPU ===\n");
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // 4. Launch kernel: one block of n threads, one thread per element.
    printf("\n=== GPU Computation ===\n");
    vectorAdd<<<1, n>>>(d_a, d_b, d_c, n);
    // A launch returns no status itself: cudaGetLastError() reports a bad
    // launch configuration, and the synchronize call reports faults raised
    // while the kernel was running (it also flushes the device-side printf).
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // 5. Copy back
    printf("\n=== Copying from GPU ===\n");
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // 6. Verify results
    printf("\n=== CPU Verification ===\n");
    int mismatches = 0;
    for (int i = 0; i < n; i++) {
        float expected = h_a[i] + h_b[i];
        bool ok = (h_c[i] == expected);
        if (!ok) {
            mismatches++;
        }
        printf("c[%d] = %f (expected %f) %s\n",
               i, h_c[i], expected,
               ok ? "✓" : "✗");
    }

    // 7. Cleanup
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a); free(h_b); free(h_c);

    return (mismatches == 0) ? 0 : EXIT_FAILURE;
}


In [None]:
!nvcc -arch=sm_75 -o vector_debug vector_add_debug.cu


In [None]:
!./vector_debug

### ***(3b) Large Data Set (Million Elements) Vector Addition***

*   We will now try with **1 Million elements** (10 Lakhs)
*   Data set is initialized in CPU and transferred to GPU for addition
*   Results copied back to CPU


In [None]:
%%writefile vector_add_million.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <time.h>

// GPU KERNEL - Runs on Device
//
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
// One element per thread, chosen by the thread's flat global index. The
// launch must provide at least n threads; the bounds check makes the extra
// threads of the last block do nothing.
//
//   a, b : input arrays (only read, hence const)
//   c    : output array
//   n    : number of elements
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

// CPU version for comparison
//
// Sequential host-side addition: c[i] = a[i] + b[i] for i in [0, n).
//
//   a, b : input arrays (only read, hence const)
//   c    : output array, must have room for n floats
//   n    : number of elements; nothing is written when n <= 0
void vectorAddCPU(const float *a, const float *b, float *c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

#include <stdlib.h>  // malloc, free, exit, EXIT_FAILURE

// Evaluate a CUDA runtime call; if it fails, report the error text together
// with the source location and terminate the program.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "   ERROR: %s (%s:%d)\n",                       \
                    cudaGetErrorString(cudaCheckErr_), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Adds two 1-million-element vectors on the CPU and on the GPU, times both,
// verifies the GPU result against the CPU result and prints a summary.
// Returns 0 when every element matches, non-zero otherwise.
int main() {
    const int n = 1000000;  // 1 million elements
    const int last = n - 1; // index of the final element, used for samples
    const size_t bytes = n * sizeof(float);

    printf("=== Vector Addition: %d elements ===\n\n", n);

    // 1. Allocate and initialize on CPU
    printf("1. Allocating CPU memory...\n");
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    float *h_c_cpu = (float*)malloc(bytes);  // CPU reference result
    if (h_a == NULL || h_b == NULL || h_c == NULL || h_c_cpu == NULL) {
        fprintf(stderr, "   ERROR: host memory allocation failed\n");
        free(h_a); free(h_b); free(h_c); free(h_c_cpu);
        return EXIT_FAILURE;
    }

    printf("2. Initializing arrays...\n");
    for (int i = 0; i < n; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
    }

    // Show a few sample values
    printf("   Sample: a[0]=%.0f, b[0]=%.0f\n", h_a[0], h_b[0]);
    printf("   Sample: a[%d]=%.0f, b[%d]=%.0f\n\n", last, h_a[last], last, h_b[last]);

    // === CPU TIMING ===
    printf("3. Running CPU version...\n");
    clock_t cpu_start = clock();
    vectorAddCPU(h_a, h_b, h_c_cpu, n);
    clock_t cpu_end = clock();
    double cpu_time = ((double)(cpu_end - cpu_start)) / CLOCKS_PER_SEC * 1000.0;
    printf("   CPU Time: %.2f ms\n\n", cpu_time);

    // === GPU SETUP ===
    printf("4. Allocating GPU memory...\n");
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    printf("5. Copying data to GPU...\n");
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // === GPU TIMING ===
    printf("6. Running GPU version...\n");
    const int threadsPerBlock = 256;
    // Ceiling division so the last, partially filled block is still launched.
    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    printf("   Grid: %d blocks x %d threads = %d total threads\n",
           blocksPerGrid, threadsPerBlock, blocksPerGrid * threadsPerBlock);

    // Warm-up run (kept out of the measurement)
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Timed run: the two events bracket only the kernel, so the host<->device
    // copies above and below are NOT part of the reported GPU time.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaEventRecord(stop));
    // Check the launch and the execution before trusting the measurement.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));

    float gpu_time = 0;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, start, stop));
    printf("   GPU Time: %.2f ms\n\n", gpu_time);

    printf("7. Copying results back to CPU...\n");
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // === VERIFICATION ===
    // Compare against the CPU reference computed in step 3.
    printf("8. Verifying results...\n");
    int errors = 0;
    for (int i = 0; i < n; i++) {
        float expected = h_c_cpu[i];
        if (h_c[i] != expected) {
            errors++;
            if (errors <= 5) {  // Show only first 5 errors
                printf("   Error at index %d: got %f, expected %f\n",
                       i, h_c[i], expected);
            }
        }
    }

    if (errors == 0) {
        printf("   ✓ All %d values correct!\n\n", n);
    } else {
        printf("   ✗ Found %d errors out of %d values\n\n", errors, n);
    }

    // Show sample results
    printf("Sample Results:\n");
    printf("   c[0] = %.0f (expected %.0f)\n", h_c[0], h_c_cpu[0]);
    printf("   c[1] = %.0f (expected %.0f)\n", h_c[1], h_c_cpu[1]);
    printf("   c[%d] = %.0f (expected %.0f)\n\n", last, h_c[last], h_c_cpu[last]);

    // === PERFORMANCE COMPARISON ===
    printf("=== Performance Summary ===\n");
    printf("CPU Time: %.2f ms\n", cpu_time);
    printf("GPU Time: %.2f ms\n", gpu_time);
    if (gpu_time > 0.0f) {
        double ratio = cpu_time / gpu_time;
        if (ratio >= 1.0) {
            printf("Speedup: %.1fx faster on GPU! 🚀\n\n", ratio);
        } else {
            printf("Speedup: %.2fx (GPU kernel was slower than the CPU loop)\n\n", ratio);
        }
    } else {
        // Avoid dividing by zero when the kernel is below the timer resolution.
        printf("Speedup: n/a (GPU time too small to measure)\n\n");
    }

    // Cleanup
    printf("9. Cleaning up...\n");
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_c_cpu);

    printf("Done! ✓\n");

    return (errors == 0) ? 0 : EXIT_FAILURE;
}


In [None]:
!nvcc -arch=sm_75 vector_add_million.cu -o vector_add_million


In [None]:
!./vector_add_million

### ***(3c) Large Data Set (Million Elements) Vector Addition (Data Initialized Directly on the GPU)***

*   We will now try with **1 Million elements** (10 Lakhs)
*   Data set is initialized directly in GPU
*   Results copied back to CPU


In [None]:
%%writefile vector_add_million_gpu_init.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <time.h>

// GPU KERNEL - Initialize arrays on GPU
//
// Fills a[i] = i and b[i] = 2*i for every i in [0, n). One element per
// thread, chosen by the thread's flat global index; the launch must provide
// at least n threads and threads with index >= n do nothing.
//
//   a, b : output arrays, each with room for n floats
//   n    : number of elements
__global__ void initArrays(float *a, float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        const float value = (float)i;
        a[i] = value;
        // Double in floating point: `i * 2` as an int overflows once i
        // reaches 2^30, while multiplying a float by 2 is exact.
        b[i] = 2.0f * value;
    }
}

// GPU KERNEL - Add vectors
//
// Computes c[i] = a[i] + b[i] for every i in [0, n), one element per thread.
// The element is chosen by the thread's flat global index, so the launch has
// to cover at least n threads; the bounds check idles any thread past the end.
//
//   a, b : input arrays (only read, hence const)
//   c    : output array
//   n    : number of elements
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

// CPU version for comparison
//
// Host-side, sequential element-wise sum: stores a[k] + b[k] into c[k] for
// each k from 0 up to n - 1, in ascending order. Writes nothing if n <= 0.
void vectorAddCPU(float *a, float *b, float *c, int n) {
    int k = 0;
    while (k < n) {
        const float sum = a[k] + b[k];
        c[k] = sum;
        ++k;
    }
}

#include <stdlib.h>  // malloc, free, exit, EXIT_FAILURE

// Evaluate a CUDA runtime call; if it fails, report the error text together
// with the source location and terminate the program.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "   ERROR: %s (%s:%d)\n",                       \
                    cudaGetErrorString(cudaCheckErr_), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Compares c[i] with the expected value i + 2*i for every i in [begin, end).
// `errors` is the running mismatch count; only mismatches numbered up to
// `max_reports` are printed. Returns the updated count.
static int checkRange(const float *c, int begin, int end, int errors, int max_reports) {
    for (int i = begin; i < end; i++) {
        float expected = (float)i + 2.0f * (float)i;  // a[i] + b[i]
        if (c[i] != expected) {
            errors++;
            if (errors <= max_reports) {
                printf("   Error at index %d: got %f, expected %f\n",
                       i, c[i], expected);
            }
        }
    }
    return errors;
}

// Initialises two 1-million-element vectors directly on the GPU, adds them on
// the GPU, copies only the result back and spot-checks it on the CPU.
// Returns 0 when all checked values match, non-zero otherwise.
int main() {
    const int n = 1000000;  // 1 million elements
    const int last = n - 1; // index of the final element, used for samples
    const size_t bytes = n * sizeof(float);

    printf("=== Vector Addition: %d elements ===\n", n);
    printf("*** Initializing arrays ON GPU ***\n\n");

    // Allocate CPU memory (only for results and verification)
    float *h_c = (float*)malloc(bytes);
    if (h_c == NULL) {
        fprintf(stderr, "   ERROR: host memory allocation failed\n");
        return EXIT_FAILURE;
    }

    // === GPU MEMORY ALLOCATION ===
    printf("1. Allocating GPU memory...\n");
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // === GPU INITIALIZATION ===
    printf("2. Initializing arrays ON GPU (not on CPU!)...\n");
    const int threadsPerBlock = 256;
    // Ceiling division so the last, partially filled block is still launched.
    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    cudaEvent_t init_start, init_stop;
    CUDA_CHECK(cudaEventCreate(&init_start));
    CUDA_CHECK(cudaEventCreate(&init_stop));

    CUDA_CHECK(cudaEventRecord(init_start));
    initArrays<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, n);
    CUDA_CHECK(cudaEventRecord(init_stop));
    // A launch returns no status itself, so query it explicitly.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(init_stop));

    float init_time = 0;
    CUDA_CHECK(cudaEventElapsedTime(&init_time, init_start, init_stop));
    printf("   GPU Initialization Time: %.2f ms\n", init_time);
    printf("   Grid: %d blocks x %d threads\n\n", blocksPerGrid, threadsPerBlock);

    // Verify initialization (copy a few values back to check)
    float test_a[3], test_b[3];
    CUDA_CHECK(cudaMemcpy(test_a, d_a, 3 * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(test_b, d_b, 3 * sizeof(float), cudaMemcpyDeviceToHost));
    printf("   Verification: a[0]=%.0f, a[1]=%.0f, a[2]=%.0f\n", test_a[0], test_a[1], test_a[2]);
    printf("   Verification: b[0]=%.0f, b[1]=%.0f, b[2]=%.0f\n\n", test_b[0], test_b[1], test_b[2]);

    // === GPU COMPUTATION ===
    printf("3. Running GPU vector addition...\n");

    // Warm-up run (kept out of the measurement)
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Timed run
    cudaEvent_t compute_start, compute_stop;
    CUDA_CHECK(cudaEventCreate(&compute_start));
    CUDA_CHECK(cudaEventCreate(&compute_stop));

    CUDA_CHECK(cudaEventRecord(compute_start));
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaEventRecord(compute_stop));
    // Check the launch and the execution before trusting the measurement.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(compute_stop));

    float compute_time = 0;
    CUDA_CHECK(cudaEventElapsedTime(&compute_time, compute_start, compute_stop));
    printf("   GPU Computation Time: %.2f ms\n\n", compute_time);

    // === COPY RESULTS ===
    printf("4. Copying results back to CPU...\n");
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // === VERIFICATION ===
    printf("5. Verifying results...\n");
    const int samples_to_check = 1000;  // Check first and last 1000
    // Clamp both windows so they stay inside [0, n) and never overlap, even
    // if n is ever lowered below 2 * samples_to_check.
    const int head_end = (n < samples_to_check) ? n : samples_to_check;
    int tail_start = n - samples_to_check;
    if (tail_start < head_end) {
        tail_start = head_end;
    }

    int errors = 0;
    errors = checkRange(h_c, 0, head_end, errors, 3);    // first 1000
    errors = checkRange(h_c, tail_start, n, errors, 3);  // last 1000

    if (errors == 0) {
        printf("   ✓ All checked values correct!\n\n");
    } else {
        printf("   ✗ Found %d errors\n\n", errors);
    }

    // Show sample results
    printf("Sample Results:\n");
    printf("   c[0] = %.0f (expected 0)\n", h_c[0]);
    printf("   c[1] = %.0f (expected 3)\n", h_c[1]);
    printf("   c[2] = %.0f (expected 6)\n", h_c[2]);
    printf("   c[%d] = %.0f (expected %.0f)\n\n",
           last, h_c[last], (float)last + 2.0f * (float)last);

    // === PERFORMANCE SUMMARY ===
    printf("=== Performance Summary ===\n");
    printf("GPU Initialization: %.2f ms\n", init_time);
    printf("GPU Computation:    %.2f ms\n", compute_time);
    printf("Total GPU Time:     %.2f ms\n\n", init_time + compute_time);

    printf("💡 Benefits of GPU initialization:\n");
    printf("   ✓ No CPU initialization needed\n");
    printf("   ✓ No CPU→GPU transfer of input data\n");
    printf("   ✓ Data lives entirely on GPU\n");
    printf("   ✓ Only transfer results back\n\n");

    // Cleanup
    printf("6. Cleaning up...\n");
    CUDA_CHECK(cudaEventDestroy(init_start));
    CUDA_CHECK(cudaEventDestroy(init_stop));
    CUDA_CHECK(cudaEventDestroy(compute_start));
    CUDA_CHECK(cudaEventDestroy(compute_stop));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_c);

    printf("Done! ✓\n");

    return (errors == 0) ? 0 : EXIT_FAILURE;
}


In [None]:
!nvcc -arch=sm_75 vector_add_million_gpu_init.cu -o vector_add_million_gpu_init


In [None]:
!./vector_add_million_gpu_init

### Key Benefits of GPU Initialization


*   No CPU initialization loop - saves CPU time
*   No CPU→GPU transfer - saves PCIe bandwidth
*   Data lives entirely on GPU - more efficient
*   Only transfer results back - minimal data movement
