<a href="https://colab.research.google.com/github/thetushargoyal/Learning-CUDA/blob/main/CUDA_on_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version
!pip install nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpkjek01eu".


In [None]:
%%cuda
#include <stdio.h>

// Debug kernel: each launched thread prints one line identifying itself by
// the x components of its block index and its thread index within the block.
// Takes no arguments and writes no memory; device-side printf is its only effect.
__global__ void hello(){
    const unsigned int block_id = blockIdx.x;
    const unsigned int thread_id = threadIdx.x;
    printf("Hello from block: %u, thread: %u\n", block_id, thread_id);
}

// Launches the hello kernel on 2 blocks of 2 threads (4 threads total) and
// waits for it to finish before exiting.
//
// Returns 0 on success, 1 if the launch or the kernel execution failed
// (the CUDA error string is written to stderr).
int main(){
    hello<<<2, 2>>>();

    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Blocks until the kernel has finished; errors raised while the
        // kernel was running are reported here.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Hello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1



In [None]:
%%cuda

#include <stdio.h>
#include <ctime>
#include <cuda_runtime.h>

// Number of elements in each vector; also sizes the managed arrays below.
constexpr int kVectorSize = 10000000;

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Element-wise vector addition on the GPU: c[i] = a[i] + b[i] for i in [0, n).
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid must
// supply at least n threads (ceil(n / blockDim.x) blocks); surplus threads in
// the last block are masked off by the bounds guard. No shared memory is used.
__global__ void add(const int* a, const int* b, int* c, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

// Managed (unified-memory) vectors, addressable from both host and device code.
__managed__ int vector_a[kVectorSize], vector_b[kVectorSize], vector_c[kVectorSize];

// Element-wise vector addition on the CPU: c[i] = a[i] + b[i] for i in [0, size).
void cpu_add(int* a, int* b, int* c, int size) {
    for (int i = 0; i < size; i++) {
        c[i] = a[i] + b[i];
    }
}

// Fills two vectors, adds them once on the GPU and once on the CPU, and prints
// the sum of the result vector and the elapsed time for each.
//
// Returns 0 on success, 1 if any CUDA call fails or the GPU and CPU sums differ.
int main() {
    const int size = kVectorSize;

    // Initialize vectors; every a[i] + b[i] equals `size`.
    for (int i = 0; i < size; i++) {
        vector_a[i] = i;
        vector_b[i] = size - i;
        vector_c[i] = 0;
    }

    // Measure GPU time
    cudaEvent_t start, stop;
    if (!cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)")) return 1;
    if (!cuda_ok(cudaEventCreate(&stop), "cudaEventCreate(stop)")) return 1;

    // One thread per element: round the block count up so the whole vector is
    // covered even though `size` is not a multiple of the block size.
    const int threads_per_block = 256;
    const int blocks = (size + threads_per_block - 1) / threads_per_block;

    // NOTE(review): the vectors were last written by the host, so the measured
    // interval may include on-demand migration of managed pages to the device,
    // not just the arithmetic — confirm with a profiler before comparing numbers.
    if (!cuda_ok(cudaEventRecord(start), "cudaEventRecord(start)")) return 1;
    add<<<blocks, threads_per_block>>>(vector_a, vector_b, vector_c, size);
    // Launch-configuration errors are only visible through cudaGetLastError().
    if (!cuda_ok(cudaGetLastError(), "add kernel launch")) return 1;
    if (!cuda_ok(cudaEventRecord(stop), "cudaEventRecord(stop)")) return 1;

    if (!cuda_ok(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) return 1;
    // Make sure the device is fully idle before the host reads managed memory;
    // this also reports any error raised while the kernel was running.
    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;

    float milliseconds = 0;
    if (!cuda_ok(cudaEventElapsedTime(&milliseconds, start, stop),
                 "cudaEventElapsedTime")) return 1;

    // Print GPU result sum. The total is size * size = 1e14, which does not fit
    // in a 32-bit int, so accumulate in 64 bits.
    long long gpu_result_sum = 0;
    for (int i = 0; i < size; i++) {
        gpu_result_sum += vector_c[i];
    }
    printf("GPU Result = %lld\n", gpu_result_sum);
    printf("GPU Time = %f ms\n", milliseconds);

    // Clear the output so the CPU sum below reflects only what cpu_add wrote,
    // not values left behind by the GPU pass.
    for (int i = 0; i < size; i++) {
        vector_c[i] = 0;
    }

    // Measure CPU time
    // NOTE(review): the first host access to the input vectors after the kernel
    // may page them back from the device, inflating this figure — confirm.
    clock_t cpu_start = clock();
    cpu_add(vector_a, vector_b, vector_c, size);
    clock_t cpu_end = clock();

    double cpu_time = 1000.0 * (cpu_end - cpu_start) / CLOCKS_PER_SEC;

    // Print CPU result sum
    long long cpu_result_sum = 0;
    for (int i = 0; i < size; i++) {
        cpu_result_sum += vector_c[i];
    }
    printf("CPU Result = %lld\n", cpu_result_sum);
    printf("CPU Time = %f ms\n", cpu_time);

    int exit_code = 0;
    if (gpu_result_sum != cpu_result_sum) {
        fprintf(stderr, "Result mismatch: GPU sum %lld != CPU sum %lld\n",
                gpu_result_sum, cpu_result_sum);
        exit_code = 1;
    }

    // Cleanup
    if (!cuda_ok(cudaEventDestroy(start), "cudaEventDestroy(start)")) exit_code = 1;
    if (!cuda_ok(cudaEventDestroy(stop), "cudaEventDestroy(stop)")) exit_code = 1;

    return exit_code;
}


GPU Result = 1650065408
GPU Time = 0.253952 ms
CPU Result = 276447232
CPU Time = 31.604000 ms

