<a href="https://colab.research.google.com/github/trefftzc/cis677/blob/main/Combining_nvcc_and_mpicc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A brief example of how to combine nvcc and mpicc

Taken from https://docs.ccv.brown.edu/oscar/gpu-computing/mpi-cuda

In [None]:
%%writefile main.c
/* main.c */
#include <mpi.h>
#include <stdlib.h>

void launchMultiply(float *a, float *b,int n);

/*
 * Entry point of the MPI side of the demo.
 *
 * Initialises MPI, builds two n-element host arrays with a[i] = b[i] = i,
 * passes them to launchMultiply() (implemented in multiply.cu), then frees
 * the arrays and shuts MPI down.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the host arrays cannot be
 * allocated.
 */
int main (int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init (&argc, &argv);
    /* Rank and size are queried but not used further: every process runs
       the same computation below. */
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &nprocs);

    int n = 128;
    int status = EXIT_SUCCESS;

    /* The casts are required because this notebook builds main.c with mpic++. */
    float *a = (float *) malloc (sizeof(float)*n);
    float *b = (float *) malloc (sizeof(float)*n);

    if (a != NULL && b != NULL) {
        for (int i = 0; i < n; i++) {
            a[i] = (float) i;
            b[i] = (float) i;
        }
        launchMultiply (a, b, n);
    } else {
        status = EXIT_FAILURE;
    }

    /* free(NULL) is a no-op, so this is safe on the failure path too. */
    free (a);
    free (b);

    MPI_Finalize();
    return status;
}

Overwriting main.c


In [None]:
%%writefile multiply.cu
#include <cstdio>

#include <cuda.h>

/*
 * Element-wise product kernel: b[i] = b[i] * a[i] for every i in [0, n).
 *
 * The launch grid is rounded up to whole blocks, so threads whose global
 * index falls at or beyond n must do nothing.
 *
 * NOTE(review): identifiers containing a double underscore are reserved in
 * C++; the name is kept here so existing references keep working.
 */
__global__ void __multiply__ (const float *a, float *b, int n)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        b[i] *= a[i];
    }
}

/* Print a diagnostic for a failed CUDA runtime call; returns true on failure. */
static bool cudaCallFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return false;
    }
    fprintf(stderr, "launchMultiply: %s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

/*
 * Multiply two host arrays element-wise on the GPU.
 *
 *   a  host array of n floats (read only)
 *   b  host array of n floats; on success b[i] is replaced by a[i] * b[i]
 *   n  number of elements; nothing is done when n <= 0 or a pointer is null
 *
 * Any CUDA failure is reported on stderr and b is left as the caller passed it.
 */
extern "C++" void launchMultiply(float *a, float *b,int n)
{
    if (n <= 0 || !a || !b) {
        return;   /* also avoids an invalid zero-block launch */
    }

    const size_t bytes = static_cast<size_t>(n) * sizeof(float);
    const int threadsPerBlock = 32;
    /* Ceiling division written so that it cannot overflow for large n. */
    const int blocks = (n - 1) / threadsPerBlock + 1;

    float *device_a = nullptr;
    float *device_b = nullptr;

    /* Allocate device buffers device_a / device_b and load the host data into them. */
    if (!cudaCallFailed(cudaMalloc(&device_a, bytes), "cudaMalloc(device_a)") &&
        !cudaCallFailed(cudaMalloc(&device_b, bytes), "cudaMalloc(device_b)") &&
        !cudaCallFailed(cudaMemcpy(device_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device") &&
        !cudaCallFailed(cudaMemcpy(device_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device")) {

        __multiply__ <<< blocks, threadsPerBlock >>> (device_a, device_b, n);

        /* A kernel launch returns no status; cudaGetLastError() exposes launch errors. */
        if (!cudaCallFailed(cudaGetLastError(), "kernel launch")) {
            /* Only b is copied back: the kernel takes a as const and never writes it. */
            cudaCallFailed(cudaMemcpy(b, device_b, bytes, cudaMemcpyDeviceToHost), "copy of b to host");
        }
    }

    /* cudaFree on a null pointer performs no operation, so this covers every path. */
    cudaFree(device_a);
    cudaFree(device_b);
}

Writing multiply.cu


In [None]:
%%writefile Makefile
# Build the MPI + CUDA demo: main.c is compiled with the MPI C++ wrapper,
# multiply.cu with nvcc, and the wrapper links both objects against the
# CUDA runtime library.
#
# Recipes use the one-line "target: prerequisites ; recipe" form so the file
# does not rely on tab characters surviving the %%writefile cell.

# Root of the CUDA toolkit; override with e.g. `make CUDA_HOME=/usr/local/cuda-12.4`.
CUDA_HOME ?= /usr/local/cuda

combined: main.o multiply.o ; mpic++ main.o multiply.o -L$(CUDA_HOME)/lib64 -lcudart -o combined

main.o: main.c ; mpic++ -c main.c -o main.o

# make has no built-in rule for .cu files, so the nvcc step must be spelled out.
multiply.o: multiply.cu ; nvcc -c multiply.cu -o multiply.o


Overwriting Makefile


In [None]:
# Manual build: nvcc compiles the CUDA source, the MPI C++ wrapper compiles
# the MPI driver, and the wrapper performs the final link.
!nvcc -c multiply.cu -o multiply.o
!mpic++ -c main.c -o main.o
# Link against the CUDA runtime through the version-independent
# /usr/local/cuda location instead of a hard-coded toolkit version.
!mpic++ main.o multiply.o -o combined -L/usr/local/cuda/lib64 -lcudart



In [None]:
# Run the linked binary directly as a single process (no mpiexec launcher).
# A successful run prints nothing.
!./combined

# An alternative, using nvcc only

In [1]:
%%writefile test.cu
#include<unistd.h>
#include<cuda.h>
#include"mpi.h"
#include <iostream>

using namespace std;

// Minimal MPI program built with nvcc: each process prints its own rank.
int main(int argc, char *argv[])
{
    const int rootRank = 0;
    int worldSize = 0;
    int myRank = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    // The root process prints a fixed message; every other process prints its rank.
    if (myRank != rootRank) {
        cout << "My Rank is " << myRank << endl;
    } else {
        cout << "My Rank is 0." << endl;
    }

    MPI_Finalize();
    return 0;
}

Writing test.cu


In [3]:
# Build test.cu with nvcc alone: the -I and -L flags point at the OpenMPI
# headers and libraries under /usr/lib/x86_64-linux-gnu/openmpi, and -lmpi
# links the MPI library, so no mpicc/mpic++ wrapper is involved.
# NOTE(review): -DUSE_MPI=1 defines a macro that test.cu, as written in this
# notebook, never reads.
!nvcc test.cu -o test -I/usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib -lmpi -DUSE_MPI=1

In [6]:
# Launch two MPI processes. Each `!` line runs in its own subshell, so a
# variable assigned on a separate `!` line never reaches mpiexec; running as
# root is enabled by the --allow-run-as-root flag instead.
!mpiexec --allow-run-as-root -n 2 --oversubscribe ./test

My Rank is 1
My Rank is 0.
