# Check NVIDIA CUDA Compiler Version
This cell runs `!nvcc --version` to check which version of the NVIDIA CUDA Compiler (`nvcc`) is installed in the environment. `nvcc` is part of the NVIDIA CUDA Toolkit and is required to compile CUDA programs.

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


# Check GCC Version
This cell runs `!gcc --version` to display the installed version of the GNU Compiler Collection (GCC).



In [2]:
!gcc --version

gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



# Install and Load NVCC4Jupyter
This cell installs the nvcc4jupyter package and loads the NVCC extension for Jupyter notebooks. The commands are:


*   !pip install nvcc4jupyter: Installs the nvcc4jupyter package, which allows you to write and run CUDA C/C++ code directly in Jupyter notebooks.

*   %load_ext nvcc4jupyter: Loads the NVCC extension to enable the execution of CUDA code within notebook cells.

In [3]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp1iajzhkx".


# List NVIDIA GPUs
This cell runs the command !nvidia-smi -L to list all NVIDIA GPUs available in the environment. The nvidia-smi (NVIDIA System Management Interface) tool provides information about the NVIDIA driver and hardware, including the GPU model, memory, and other specifications. This command is useful for confirming the presence and details of GPU resources available for your computations.

In [4]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-386a70d0-db04-4e05-2413-e2b15eba27ac)


# Display GPU Properties with CUDA

In [None]:
%%cuda
#include <cuda_runtime.h>
#include <stdio.h>

// Express a byte count in gigabytes, using the binary convention
// (1 GB = 1024 * 1024 * 1024 bytes). The result is single precision.
float bytesToGB(size_t bytes) {
    const float bytesPerGB = 1024.0f * 1024.0f * 1024.0f;
    const float byteCount = static_cast<float>(bytes);
    return byteCount / bytesPerGB;
}

// Express a byte count in kilobytes, using the binary convention
// (1 KB = 1024 bytes). The result is single precision.
float bytesToKB(size_t bytes) {
    const float bytesPerKB = 1024.0f;
    const float byteCount = static_cast<float>(bytes);
    return byteCount / bytesPerKB;
}

// Print a human-readable summary of one device's properties to stdout.
//
// deviceProp: a property structure already filled in by the caller
//             (e.g. via cudaGetDeviceProperties); it is only read here.
//
// Byte-valued fields are converted with bytesToGB / bytesToKB so every size
// in the report uses the same conversion.
void displayGPUProperties(const cudaDeviceProp& deviceProp) {
    printf("Device Name: %s\n", deviceProp.name);
    printf("Total Global Memory: %.2f GB\n", bytesToGB(deviceProp.totalGlobalMem));
    // Use the shared helper instead of repeating the /1024 conversion inline.
    printf("Shared Memory per Block: %.2f KB\n", bytesToKB(deviceProp.sharedMemPerBlock));
    printf("Registers per Block: %d\n", deviceProp.regsPerBlock);
    printf("Warp Size: %d\n", deviceProp.warpSize);
    printf("Max Threads per Block: %d\n", deviceProp.maxThreadsPerBlock);
    printf("Max Threads per Multi-Processor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
    printf("Max Grid Dimensions: [%d, %d, %d]\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
    printf("Max Block Dimensions: [%d, %d, %d]\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
    printf("Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor);
    printf("Multi-Processor Count: %d\n", deviceProp.multiProcessorCount);
    printf("Max Texture 1D Size: %d\n", deviceProp.maxTexture1D);
    printf("Max Texture 2D Size: [%d, %d]\n", deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1]);
    printf("Concurrent Kernels: %s\n", deviceProp.concurrentKernels ? "Supported" : "Not Supported");
    printf("ECC Memory: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");

    // Cache and memory-subsystem information
    printf("\nCache Configuration:\n");
    printf("L2 Cache Size: %.2f KB\n", bytesToKB(deviceProp.l2CacheSize));
    printf("Global Memory Bus Width: %d bits\n", deviceProp.memoryBusWidth);
    // The CUDA runtime documents memoryClockRate in kHz, so /1e6 yields GHz.
    printf("Memory Clock Rate: %.2f GHz\n", deviceProp.memoryClockRate / 1e6f);
}

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for (int i = 0; i < deviceCount; ++i) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, i);

        printf("Device %d\n", i);
        displayGPUProperties(deviceProp);
        printf("\n");
    }

    return 0;
}


Device 0
Device Name: Tesla T4
Total Global Memory: 14.75 GB
Shared Memory per Block: 48.00 KB
Registers per Block: 65536
Warp Size: 32
Max Threads per Block: 1024
Max Threads per Multi-Processor: 1024
Max Grid Dimensions: [2147483647, 65535, 65535]
Max Block Dimensions: [1024, 1024, 64]
Compute Capability: 7.5
Multi-Processor Count: 40
Max Texture 1D Size: 131072
Max Texture 2D Size: [131072, 65536]
Concurrent Kernels: Supported
ECC Memory: Enabled

Cache Configuration:
L2 Cache Size: 4096.00 KB
Global Memory Bus Width: 256 bits
Memory Clock Rate: 5.00 GHz




In [5]:
# Mount Google Drive at /content/drive; the later %cd cells switch into
# project folders underneath this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Serial

In [15]:
%cd "/content/drive/MyDrive/MultiCore/N_Body_Project/serial"

/content/drive/MyDrive/MultiCore/N_Body_Project/serial


## Number of Bodies 600

In [17]:
!g++ -o nbody_serial n_body.cpp -lm
!./nbody_serial

Time: 1.945232


## Number of Bodies 2880

In [16]:
!g++ -o nbody_serial n_body.cpp -lm
!./nbody_serial

Time: 49.670639


## Number of Bodies 11520

In [14]:
!g++ -o nbody_serial n_body.cpp -lm
!./nbody_serial

Time: 779.031543


## Number of Bodies 23040

In [4]:
!g++ -o nbody_serial n_body.cpp -lm
!./nbody_serial

# OpenMP

In [None]:
%cd "/content/drive/MyDrive/MultiCore/N_Body_Project/openMp"

/content/drive/MyDrive/MultiCore/N_Body_Project/openMp


In [None]:
!g++ -o nbody_omp n_body_omp.cpp -fopenmp

In [None]:
!./nbody_omp

Time: 40.573866


In [None]:
# Print the number of CPUs reported for this runtime.
# NOTE(review): presumably recorded to put the OpenMP timing above in context — confirm.
import multiprocessing
print(multiprocessing.cpu_count())

2


# CUDA

In [7]:
%cd "/content/drive/MyDrive/MultiCore/N_Body_Project/cuda"

/content/drive/MyDrive/MultiCore/N_Body_Project/cuda


## Number of Bodies 600

### BlockSize = 16

In [13]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 0.673287


### BlockSize = 32

In [14]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 0.511666


### BlockSize = 64

In [15]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 0.555038


### BlockSize = 128

In [16]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 0.596572


## Number of Bodies 2880

### BlockSize = 16

In [8]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 2.484351


### BlockSize = 32

In [9]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 1.627396


### BlockSize = 64

In [10]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 1.904166


### BlockSize = 128

In [11]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 1.864272


## Number of Bodies 11520

### BlockSize = 16

In [23]:
!nvcc -o nbody_cuda n_body_cuda.cu

In [24]:
!./nbody_cuda

Time: 30.405216


### BlockSize = 32

In [17]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 15.161601


### BlockSize = 64

In [14]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 15.208548


### BlockSize = 128

In [13]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 18.241501


## Number of Bodies 23040

### BlockSize = 16




In [26]:
!nvcc -o nbody_cuda n_body_cuda.cu

In [27]:
!./nbody_cuda

Time: 115.496223


### BlockSize = 32

In [9]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 61.749649


### BlockSize = 64

In [11]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 60.048935


### BlockSize = 128

In [12]:
!nvcc -o nbody_cuda n_body_cuda.cu
!./nbody_cuda

Time: 60.095977
