In [2]:
# Check the Python version to verify that Python is already installed
!python --version

# Check the NVCC (CUDA compiler driver) version to verify that the CUDA toolkit is already installed
!nvcc --version

# Since Google Colab runs a Jupyter Notebook, we need to install nvcc4jupyter, the CUDA C++ plugin for Jupyter Notebook
!pip install nvcc4jupyter

# After installing, load the package as a notebook extension
%load_ext nvcc4jupyter

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpqve_6oyy".


In [3]:
%%cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand_kernel.h>

#include <stdio.h>
#include <stdlib.h>
#include <time.h>


/**
 * Monte Carlo estimate of pi: counts random points of the square [-1, 1]^2
 * that fall inside the unit circle.
 *
 * Each thread draws the samples i = idx, idx + stride, idx + 2*stride, ...
 * below n, where idx is its global thread index and stride is the total
 * number of threads in the grid, so any grid size covers all n samples.
 *
 * Launch layout: indexing uses only the x dimension of the grid and block.
 * Shared memory: one statically allocated int per block.
 *
 * @param d_insideCircle  Device counter the hit count is atomically added to.
 *                        The caller must zero it before the launch. It is an
 *                        int, so the total hit count must fit in an int.
 * @param n               Total number of sample points to draw.
 * @param seed            cuRAND seed shared by all threads; every thread uses
 *                        its global index as its own subsequence number.
 */
__global__ void monteCarloPiEstimate(int* d_insideCircle, long long int n, unsigned long seed) {
    // 64-bit index math: n is long long, so an int index/stride could overflow
    // for large sample counts or large grids.
    const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long stride = (long long)blockDim.x * gridDim.x;

    // Per-block partial sum, so that only one global atomic is issued per
    // block instead of one per thread.
    __shared__ int blockCount;
    const bool isBlockLeader = (threadIdx.x | threadIdx.y | threadIdx.z) == 0;
    if (isBlockLeader)
        blockCount = 0;
    __syncthreads();

    int local_count = 0;

    // Threads with no samples to draw skip the (costly) generator setup.
    if (idx < n) {
        curandState state;
        curand_init(seed, idx, 0, &state);

        for (long long i = idx; i < n; i += stride) {
            // curand_uniform returns a value in (0, 1]; map it to (-1, 1].
            float x = curand_uniform(&state) * 2.0f - 1.0f;
            float y = curand_uniform(&state) * 2.0f - 1.0f;
            float distance_squared = x * x + y * y;
            if (distance_squared <= 1.0f)
                local_count++;
        }
    }

    if (local_count > 0)
        atomicAdd(&blockCount, local_count);

    // Both barriers sit outside any divergent branch, so every thread of the
    // block reaches them.
    __syncthreads();

    if (isBlockLeader && blockCount > 0)
        atomicAdd(d_insideCircle, blockCount);
}


/** Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Estimates pi with a Monte Carlo simulation on the GPU.
 *
 * Allocates and zeroes a device counter, launches monteCarloPiEstimate over
 * n = 2^24 sample points, copies the hit count back and prints
 * 4 * hits / n. Every CUDA runtime call and the kernel launch are checked.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any CUDA error.
 */
int main() {
    // Same type as the kernel's sample-count parameter.
    const long long n = 1LL << 24;
    int h_insideCircle = 0;
    int* d_insideCircle = NULL;

    checkCuda(cudaMalloc(&d_insideCircle, sizeof(int)), "cudaMalloc");

    // Zero the device counter by copying the zero-initialized host value.
    checkCuda(cudaMemcpy(d_insideCircle, &h_insideCircle, sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    const int blockSize = 1024;
    const int numBlocks = (int)((n + blockSize - 1) / blockSize);  // ceil(n / blockSize)
    printf("%d,%d", blockSize, numBlocks);
    monteCarloPiEstimate<<<numBlocks, blockSize>>>(d_insideCircle, n, (unsigned long)time(NULL));
    // A launch does not return a status; configuration errors are reported here.
    checkCuda(cudaGetLastError(), "kernel launch");

    // Blocking copy: waits for the kernel to finish and reports execution errors.
    checkCuda(cudaMemcpy(&h_insideCircle, d_insideCircle, sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    // Computed in double: 4 * count exceeds the 24 bits a float represents exactly.
    const double piEstimate = 4.0 * h_insideCircle / (double)n;
    printf("Estimated Pi = %f\n", piEstimate);

    checkCuda(cudaFree(d_insideCircle), "cudaFree");

    return 0;
}

1024,16384Estimated Pi = 3.141793

