<a href="https://colab.research.google.com/github/thomaswalsh86/cuda/blob/main/CUDA_ACTUAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the nvcc4jupyter extension. The URL is pinned to the commit this
# notebook's recorded run resolved to, so a fresh "Run all" gets the same code.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install git+https://github.com/andreinechaev/nvcc4jupyter.git@28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-853mucdc
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-853mucdc
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [None]:
# Load the nvcc4jupyter extension installed above; the %%cuda cell magic used
# in the next cell presumably comes from it.
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmps1yajyfa".


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Device kernel: every launched thread prints one greeting line that carries
// its own thread index and the index of the block it belongs to.
__global__ void hello_from_gpu() {
    const unsigned int thread = threadIdx.x;
    const unsigned int block = blockIdx.x;
    printf("Hello World from GPU! Thread %d in Block %d\n", thread, block);
}

// Host entry point: launches hello_from_gpu on 2 blocks of 3 threads, waits
// for the device to finish, and reports any CUDA error.
//
// Returns 0 on success and 1 if the launch or the execution of the kernel
// failed, so the success message is only printed when nothing went wrong.
int main(void) {
    printf("Starting CUDA program on Tesla T4...\n");
    fflush(stdout);

    // 2 blocks x 3 threads -> six greeting lines in total.
    hello_from_gpu<<<2, 3>>>();

    // A kernel launch returns no status itself; an invalid launch is reported
    // through cudaGetLastError(), so query it right after the launch and
    // before anything else can overwrite it.
    cudaError_t kernelError = cudaGetLastError();
    if (kernelError != cudaSuccess) {
        printf("Kernel launch error: %s\n", cudaGetErrorString(kernelError));
        fflush(stdout);
        return 1;
    }

    // Block until the kernel has finished. Errors raised while the kernel was
    // running surface here, and the device-side printf output becomes visible.
    cudaError_t syncError = cudaDeviceSynchronize();
    if (syncError != cudaSuccess) {
        printf("CUDA sync error: %s\n", cudaGetErrorString(syncError));
        fflush(stdout);
        return 1;
    }

    printf("Program completed successfully!\n");
    fflush(stdout);
    return 0;
}

Starting CUDA program on Tesla T4...
Hello World from GPU! Thread 0 in Block 0
Hello World from GPU! Thread 1 in Block 0
Hello World from GPU! Thread 2 in Block 0
Hello World from GPU! Thread 0 in Block 1
Hello World from GPU! Thread 1 in Block 1
Hello World from GPU! Thread 2 in Block 1
Program completed successfully!



In [None]:
%%writefile simulation_test.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

// N-body simulation (simplified)
// State of a single point mass. Arrays of Body are copied verbatim between
// host and device, so the field order below is the memory layout both sides
// rely on.
struct Body {
    // Position components.
    float x;
    float y;
    float z;

    // Velocity components.
    float vx;
    float vy;
    float vz;

    // Mass of the body.
    float mass;
};

// Advance every body by one time step under mutual gravity (all-pairs, with
// the gravitational constant taken as 1). One thread handles one body.
//
// Parameters:
//   bodies   - device array of n bodies; receives the updated state.
//   n        - number of bodies.
//   dt       - length of the time step.
//   snapshot - optional read-only device array of n bodies holding the state
//              at the start of the step. When given, all reads come from it
//              and all writes go to `bodies`, so no thread can observe a
//              position that another thread has already advanced. When left
//              as nullptr the kernel reads and writes `bodies` in place (the
//              legacy behaviour), which lets threads race on positions.
//
// Integration scheme: velocity is updated from the force first, then the
// position is advanced with the new velocity.
__global__ void nbody_simulation(Body *bodies, int n, float dt,
                                 const Body *snapshot = nullptr) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;  // threads past the end of the array have no body

    const Body *src = (snapshot != nullptr) ? snapshot : bodies;

    // Work on a private copy of this thread's body; it is written back once
    // at the end instead of being re-read from memory on every use.
    Body self = src[i];

    float fx = 0.0f, fy = 0.0f, fz = 0.0f;

    // Accumulate the gravitational force exerted by every other body.
    for (int j = 0; j < n; j++) {
        if (i == j) continue;

        const Body other = src[j];

        float dx = other.x - self.x;
        float dy = other.y - self.y;
        float dz = other.z - self.z;

        // The small constant keeps dist_sq non-zero when two bodies coincide.
        float dist_sq = dx*dx + dy*dy + dz*dz + 1e-10f;
        float dist = sqrtf(dist_sq);
        float force = self.mass * other.mass / dist_sq;

        // Project the force magnitude onto the unit vector towards body j.
        fx += force * dx / dist;
        fy += force * dy / dist;
        fz += force * dz / dist;
    }

    // Update velocity from acceleration (force / mass), then position.
    self.vx += fx / self.mass * dt;
    self.vy += fy / self.mass * dt;
    self.vz += fz / self.mass * dt;

    self.x += self.vx * dt;
    self.y += self.vy * dt;
    self.z += self.vz * dt;

    bodies[i] = self;
}

// Abort with a readable message when a CUDA runtime call did not succeed.
// `what` names the operation that was attempted.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: builds a random system of bodies, advances it for a fixed
// number of steps on the GPU, and prints timing plus the centre of mass.
// Returns 0 on success; any CUDA failure terminates the process with a
// non-zero exit status via check_cuda().
int main() {
    printf("=== REAL-WORLD SIMULATION TEST ===\n\n");

    const int n_bodies = 4096;
    const int iterations = 10;
    const float dt = 0.01f;
    const int blockSize = 256;
    // Round up so that every body is covered by a thread.
    const int numBlocks = (n_bodies + blockSize - 1) / blockSize;
    const size_t bytes = (size_t)n_bodies * sizeof(Body);

    printf("Running N-body simulation with %d bodies for %d iterations\n\n",
           n_bodies, iterations);

    Body *h_bodies = new Body[n_bodies];

    // Initialize random bodies: positions in [-50, 50], velocities in [-1, 1],
    // masses in [1, 11]. rand() is left unseeded, so every run starts from the
    // same system.
    for (int i = 0; i < n_bodies; i++) {
        h_bodies[i].x = (float)rand() / RAND_MAX * 100.0f - 50.0f;
        h_bodies[i].y = (float)rand() / RAND_MAX * 100.0f - 50.0f;
        h_bodies[i].z = (float)rand() / RAND_MAX * 100.0f - 50.0f;
        h_bodies[i].vx = (float)rand() / RAND_MAX * 2.0f - 1.0f;
        h_bodies[i].vy = (float)rand() / RAND_MAX * 2.0f - 1.0f;
        h_bodies[i].vz = (float)rand() / RAND_MAX * 2.0f - 1.0f;
        h_bodies[i].mass = (float)rand() / RAND_MAX * 10.0f + 1.0f;
    }

    // Two device buffers: each step reads the current state from one and
    // writes the next state into the other, then the roles are swapped.
    Body *d_curr = nullptr;
    Body *d_next = nullptr;
    check_cuda(cudaMalloc(&d_curr, bytes), "cudaMalloc (current state)");
    check_cuda(cudaMalloc(&d_next, bytes), "cudaMalloc (next state)");

    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate (start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate (stop)");

    check_cuda(cudaEventRecord(start), "cudaEventRecord (start)");

    // The host never touches the bodies between steps, so the state is
    // uploaded once here and downloaded once after the loop; the timed region
    // therefore contains one transfer in each direction.
    check_cuda(cudaMemcpy(d_curr, h_bodies, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy (host to device)");

    for (int iter = 0; iter < iterations; iter++) {
        nbody_simulation<<<numBlocks, blockSize>>>(d_next, n_bodies, dt, d_curr);
        check_cuda(cudaGetLastError(), "nbody_simulation launch");
        // Waiting here surfaces execution errors at the step that caused
        // them and makes the progress message below truthful.
        check_cuda(cudaDeviceSynchronize(), "nbody_simulation execution");

        // The buffer just written becomes the input of the next step.
        Body *swap = d_curr;
        d_curr = d_next;
        d_next = swap;

        if (iter % 2 == 0) {
            printf("Iteration %d/%d completed\n", iter + 1, iterations);
        }
    }

    check_cuda(cudaMemcpy(h_bodies, d_curr, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy (device to host)");

    check_cuda(cudaEventRecord(stop), "cudaEventRecord (stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize (stop)");

    float milliseconds = 0.0f;
    check_cuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    printf("\nSimulation Results:\n");
    printf("Total time: %.2f ms\n", milliseconds);
    printf("Time per iteration: %.2f ms\n", milliseconds / iterations);
    printf("Bodies processed per second: %.2f million\n",
           (n_bodies * iterations) / (milliseconds / 1000.0) / 1e6);

    // Calculate the mass-weighted mean position as a sanity check on the
    // final state.
    float total_mass = 0.0f;
    float com_x = 0.0f, com_y = 0.0f, com_z = 0.0f;

    for (int i = 0; i < n_bodies; i++) {
        total_mass += h_bodies[i].mass;
        com_x += h_bodies[i].x * h_bodies[i].mass;
        com_y += h_bodies[i].y * h_bodies[i].mass;
        com_z += h_bodies[i].z * h_bodies[i].mass;
    }

    com_x /= total_mass;
    com_y /= total_mass;
    com_z /= total_mass;

    printf("Center of mass: (%.2f, %.2f, %.2f)\n", com_x, com_y, com_z);

    // Cleanup
    delete[] h_bodies;
    check_cuda(cudaFree(d_curr), "cudaFree (current state)");
    check_cuda(cudaFree(d_next), "cudaFree (next state)");
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy (start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy (stop)");

    printf("\n=== SIMULATION TEST COMPLETE ===\n");
    return 0;
}

Overwriting simulation_test.cu


In [None]:
# Compile the file written by the previous cell and, only if compilation
# succeeds (&&), run the resulting binary.
# NOTE(review): -arch=sm_75 hard-codes one GPU architecture (presumably the
# Tesla T4 named in the first CUDA cell) — confirm it matches the runtime's GPU.
!nvcc -arch=sm_75 -o simulation_test simulation_test.cu && ./simulation_test

=== REAL-WORLD SIMULATION TEST ===

Running N-body simulation with 4096 bodies for 10 iterations

Iteration 1/10 completed
Iteration 3/10 completed
Iteration 5/10 completed
Iteration 7/10 completed
Iteration 9/10 completed

Simulation Results:
Total time: 37.48 ms
Time per iteration: 3.75 ms
Bodies processed per second: 1.09 million
Center of mass: (0.18, -0.94, -0.70)

=== SIMULATION TEST COMPLETE ===


In [None]:
# Install the CUDA 12.4 toolkit packages without prompting (-y).
# NOTE(review): this cell sits after the cells that already compile and run
# CUDA code — confirm whether it is still needed or should move to the top.
!sudo apt-get install cuda-toolkit-12-4 -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  ca-certificates-java cuda-cccl-12-4 cuda-command-line-tools-12-4
  cuda-compiler-12-4 cuda-crt-12-4 cuda-cudart-12-4 cuda-cudart-dev-12-4
  cuda-cuobjdump-12-4 cuda-cupti-12-4 cuda-cupti-dev-12-4 cuda-cuxxfilt-12-4
  cuda-documentation-12-4 cuda-driver-dev-12-4 cuda-gdb-12-4
  cuda-libraries-12-4 cuda-libraries-dev-12-4 cuda-nsight-12-4
  cuda-nsight-compute-12-4 cuda-nsight-systems-12-4 cuda-nvcc-12-4
  cuda-nvdisasm-12-4 cuda-nvml-dev-12-4 cuda-nvprof-12-4 cuda-nvprune-12-4
  cuda-nvrtc-12-4 cuda-nvrtc-dev-12-4 cuda-nvtx-12-4 cuda-nvvm-12-4
  cuda-nvvp-12-4 cuda-opencl-12-4 cuda-opencl-dev-12-4 cuda-profiler-api-12-4
  cuda-sanitizer-12-4 cuda-toolkit-12-4-config-common cuda-tools-12-4
  cuda-visual-tools-12-4 default-jre default-jre-headless fonts-dejavu-core
  fonts-dejavu-extra gds-tools-12-4 java-common libatk-wrapper-java
  libat

In [None]:
# Print the release of the nvcc compiler that is found on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:18:24_PDT_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0


In [None]:
%%writefile