<a href="https://colab.research.google.com/github/vanigupta123/nbody_sim/blob/main/nbody_sim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%%writefile main.cu
#include <stdio.h>
#include <curand_kernel.h>
#include <cuda_runtime.h>
#include <math.h>

__global__ void initParticles(int N, double *pos, double *vel, double *mass) {
  // this is a kernel and will be called with params<<<grid_dimensions, block_dimensions>>>
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
  // printf("%d\n", idx);
  if (idx >= N) return;

  curandState state;
  curand_init(1234ULL, idx, 0, &state);

  pos[3*idx] = curand_uniform_double(&state);
  pos[3*idx + 1] = curand_uniform_double(&state);
  pos[3*idx + 2] = curand_uniform_double(&state);

  vel[3*idx] = 0.0;
  vel[3*idx + 1] = 0.0;
  vel[3*idx + 2] = 0.0;

  mass[idx] = 1.0 + curand_uniform_double(&state);
}

__global__ void getAcceleration(int N, double *pos, double *vel, double *mass, double *acc) {
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (idx >= N) return;

  double G = 6.6743e-11;
  // F = Gm1m2/|r|^2 * r/|r|
  // a1 = Gm2r/|r|^3
  // with softening -> a1 = Gm2r/(|r|^2 + eps^2)^(3/2)
  double sum[3];
  sum[0] = 0.0;
  sum[1] = 0.0;
  sum[2] = 0.0;
  double p1 = pos[3*idx];
  double r[3];
  double m2;
  double magSquared;
  double val;
  double pos_i_x = pos[3*idx];
  double pos_i_y = pos[(3*idx)+1];
  double pos_i_z = pos[(3*idx)+2];
  // TODO: get tile and implement shared memory here
  for (int i = 0; i < N; i++) {
    m2 = mass[i];
    r[0] = pos[3*i] - pos_i_x;
    r[1] = pos[(3*i)+1] - pos_i_y;
    r[2] = pos[(3*i)+2] - pos_i_z;
    magSquared = (r[0]*r[0]) + (r[1]*r[1]) + (r[2]*r[2]);
    val = m2 / sqrt(pow(magSquared + EPS2, 3));
    sum[0] += r[0] * val;
    sum[1] += r[1] * val;
    sum[2] += r[2] * val;
  }
  acc[3*idx] = G * sum[0];
  acc[3*idx + 1] = G * sum[1];
  acc[3*idx + 2] = G * sum[2];
}

int main() {
  printf("hello from colab GPU!\n");
  int N = 1000;
  double *d_pos, *d_vel, *d_mass, *d_acc;
  cudaMalloc(&d_pos, 3*N*sizeof(double));
  cudaMalloc(&d_vel, 3*N*sizeof(double));
  cudaMalloc(&d_mass, N*sizeof(double));
  cudaMalloc(&d_acc, 3*N*sizeof(double));

  // one thread per particle
  int blockSize = 256;
  int numBlocks = (N + blockSize - 1)/blockSize; // same as ceil(N/blockSize)
  initParticles<<<numBlocks, blockSize>>>(N, d_pos, d_vel, d_mass);
  getAcceleration<<<numBlocks, blockSize>>>(N, d_pos, d_vel, d_mass, d_acc);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(err));
  }
  cudaDeviceSynchronize();

  // sample of initial velocities and positions
  double s_pos[15], s_vel[15], s_mass[5]; // for 5 particles
  cudaMemcpy(s_pos, d_pos, 15*sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(s_vel, d_vel, 15*sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(s_mass, d_mass, 5*sizeof(double), cudaMemcpyDeviceToHost);

  for (int i = 0; i < 5; i++) {
    printf("particle with mass %.4f at position (%.2f, %.2f, %.2f) with velocity (%.2f, %.2f, %.2f)\n",
    s_mass[i],
    s_pos[3*i], s_pos[(3*i)+1], s_pos[(3*i)+2],
    s_vel[3*i], s_vel[(3*i)+1], s_vel[(3*i)+2]
    );
  }

  return 0;
}

Overwriting main.cu


In [4]:
!nvcc -arch=sm_75 main.cu -o main

[01m[0m[01mmain.cu(46)[0m: [01;31merror[0m: identifier "[01mEPS2[0m" is undefined
      val = m2 / pow(magSquared + EPS2, 3/2);
                                  ^

1 error detected in the compilation of "main.cu".


In [None]:
!./main

hello from colab GPU!
particle with mass 1.7867 at position (0.43, 0.46, 0.58) with velocity (0.00, 0.00, 0.00)
particle with mass 1.7993 at position (0.93, 0.54, 0.02) with velocity (0.00, 0.00, 0.00)
particle with mass 1.0938 at position (0.81, 0.74, 0.20) with velocity (0.00, 0.00, 0.00)
particle with mass 1.9946 at position (0.31, 0.54, 0.90) with velocity (0.00, 0.00, 0.00)
particle with mass 1.9569 at position (0.56, 0.06, 0.71) with velocity (0.00, 0.00, 0.00)
