In [1]:
# check python version to verify automatically installed
!python --version

# check nvcc (CUDA compiler driver) version to verify the automatically installed
!nvcc --version

# since google colab runs jupyter notebook, we need to install nvcc4jupyter: cuda c++ plugin for jupyter notebook
!pip install nvcc4jupyter

# after installing, load the package (or extension)
%load_ext nvcc4jupyter

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmptrvxwa8c".


In [3]:
%%cuda

#include <curand_kernel.h>
#include <stdio.h>

/**
 * Monte Carlo sampling kernel for estimating pi.
 *
 * Each thread draws `stride` random points in the square spanning -1..1 on
 * both axes, counts how many fall inside the unit circle, and adds its
 * private tally to the shared total with a single atomic operation.
 *
 * @param count   Device pointer to the global hit counter; it is only ever
 *                incremented here, so the caller must initialise it.
 * @param seed    Base seed; each thread offsets it by its global thread index.
 * @param stride  Number of points sampled by each thread.
 */
__global__ void generate_random_numbers(unsigned long long int *count, unsigned long seed, unsigned long long int stride) {
    // Global thread index. The built-in index variables are unsigned, so keep
    // the result unsigned instead of narrowing it into a signed int.
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Per-thread generator state: the thread index is used both to perturb
    // the seed and as the cuRAND sequence number.
    curandState state;
    curand_init(seed + tid, tid, 0, &state);

    // Accumulate locally so that each thread issues only one atomicAdd.
    unsigned long long int local_counter = 0;

    // The index has the same type as `stride`: a signed int index would
    // overflow (undefined behaviour) for any stride above INT_MAX.
    for (unsigned long long int i = 0; i < stride; i++) {
        // Rescale each uniform sample from the unit interval to -1..1.
        float x = curand_uniform(&state) * 2.0f - 1.0f;
        float y = curand_uniform(&state) * 2.0f - 1.0f;

        float distance_squared = x * x + y * y;

        // Comparing the squared distance avoids a sqrt per sample.
        if (distance_squared <= 1.0f) local_counter++;
    }

    atomicAdd(count, local_counter);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_succeeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Estimates pi by Monte Carlo sampling on the GPU and prints the estimate
 * with 12 decimal places.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    // Author's device notes (compute capability 7.5) -- NOTE(review): confirm
    // against the actual device properties:
    //   - 16 resident blocks per multiprocessor
    //   - 16 blocks x 64 threads = 1024, the maximum threads per multiprocessor
    //   - warp size is 32

    int block = 32;
    int thread = 256;
    unsigned long long int count = 0, *cuda_count = NULL, n = 1e12;

    // n is generally not an exact multiple of the thread count, so the number
    // of points actually sampled is stride * total_threads, which can be
    // slightly below n. The estimate must be normalised by that real sample
    // count; dividing by n would bias the result low.
    unsigned long long int total_threads = (unsigned long long int)block * thread;
    unsigned long long int stride = n / total_threads;
    unsigned long long int samples = stride * total_threads;

    // Time-based seed: every run produces a different estimate.
    unsigned long seed = time(NULL);

    if (!cuda_succeeded(cudaMalloc((void**)&cuda_count, sizeof(unsigned long long int)), "cudaMalloc")) {
        return 1;
    }

    bool ok = cuda_succeeded(
        cudaMemcpy(cuda_count, &count, sizeof(unsigned long long int), cudaMemcpyHostToDevice),
        "copy of counter to device");

    if (ok) {
        generate_random_numbers<<<block, thread>>>(cuda_count, seed, stride);

        // cudaGetLastError reports launch failures; cudaDeviceSynchronize
        // waits for the kernel and reports errors raised while it ran.
        ok = cuda_succeeded(cudaGetLastError(), "kernel launch")
          && cuda_succeeded(cudaDeviceSynchronize(), "kernel execution")
          && cuda_succeeded(
                 cudaMemcpy(&count, cuda_count, sizeof(unsigned long long int), cudaMemcpyDeviceToHost),
                 "copy of counter to host");
    }

    // Release the device buffer on both the success and the failure path.
    cudaFree(cuda_count);

    if (!ok) return 1;

    // Circle area / square area = pi / 4, hence the factor of 4.
    double pi = 4.0 * (double)count / (double)samples;
    printf("%0.12f\n", pi);

    return 0;
}

// Example result from one run (the seed is time-based, so the output varies between runs): 3.141593035940

3.141590227624

