## **How to run a CUDA code on GOOGLE COLAB**


1. In the Runtime menu, choose the GPU acceleration option (Runtime → Change runtime type → GPU)

2. The next cell verifies whether a GPU was assigned

In [1]:
# Show if a CUDA Toolkit was installed
!ls /usr/local
# Show it the  nvcc command can be called
!which nvcc
# Show which NVIDIA card was assigned
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

bin    cuda	cuda-11.8  games	       include	lib64	   man	 share
colab  cuda-11	etc	   _gcs_config_ops.so  lib	licensing  sbin  src
/usr/local/cuda/bin/nvcc
Mon Jul 24 13:52:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
            

3. Paste your CUDA code in the next cell, below the line
%%writefile RunCode.cu

In [2]:
# Write the .cu file that will be compiled and run

%%writefile RunCode.cu

// Adding two vectors (c=a+b) on CUDA
#include <iostream>
#include <fstream>
#include <cmath>
using namespace std;

// Number of elements in each vector (size of every host and device array)
#define Lx 16
// Number of threads per block used for the kernel launch
#define Nx 8
// Number of blocks in the grid: Lx/Nx rounded up, so Mx*Nx >= Lx
const int Mx=(Lx+Nx-1)/Nx;

//--------------------KERNELS----------------
// Element-wise vector addition on the device: d_c[i] = d_a[i] + d_b[i].
//   d_a, d_b : input arrays of Lx floats
//   d_c      : output array of Lx floats
// Expects a 1-D grid of 1-D blocks holding at least Lx threads in total;
// each thread handles at most one element.
__global__ void AddVectors(float *d_a,float *d_b,float *d_c){
  // Global index of the element this thread is responsible for
  int ix=blockIdx.x*blockDim.x+threadIdx.x;
  // The block count is rounded up, so when Lx is not a multiple of the block
  // size the last block has surplus threads; they must not touch memory
  // beyond the end of the arrays.
  if(ix<Lx)
    d_c[ix]=d_a[ix]+d_b[ix];
}


// Reports a failed CUDA runtime call on stderr.
//   status : value returned by the runtime call
//   what   : short label naming the call, used in the message
// Returns true when status is cudaSuccess, false otherwise.
static bool CudaOk(cudaError_t status,const char *what){
  if(status==cudaSuccess) return true;
  cerr<<"CUDA error in "<<what<<": "<<cudaGetErrorString(status)<<endl;
  return false;
}

// Adds two Lx-element vectors on the GPU (c=a+b) and prints one
// "index value" pair per line.
// Returns 0 on success, 1 if any CUDA call fails (nothing is printed to
// stdout in that case, because h_c would hold uninitialised data).
int main(void){
  int ix;
  const size_t bytes=Lx*sizeof(float);

  //DECLARE
  //Declare arrays in the Host
  float h_a[Lx],h_b[Lx],h_c[Lx];
  //Declare arrays in the Device; they start as null so that the final
  //cudaFree calls are harmless even if an allocation failed
  float *d_a=nullptr,*d_b=nullptr,*d_c=nullptr;
  bool ok=CudaOk(cudaMalloc((void**) &d_a,bytes),"cudaMalloc(d_a)")
       && CudaOk(cudaMalloc((void**) &d_b,bytes),"cudaMalloc(d_b)")
       && CudaOk(cudaMalloc((void**) &d_c,bytes),"cudaMalloc(d_c)");

  //INPUT DATA
  //Set data in the Host
  for(ix=0;ix<Lx;ix++){
    h_a[ix]=ix; h_b[ix]=2*ix;}

  //Send data to the Device
  ok=ok
     && CudaOk(cudaMemcpy(d_a,h_a,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(h_a->d_a)")
     && CudaOk(cudaMemcpy(d_b,h_b,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(h_b->d_b)");

  //PROCESS
  //Run parallel on the Device
  if(ok){
    dim3 ThreadsPerBlock(Nx,1,1);
    dim3 BlocksPerGrid(Mx,1,1);
    AddVectors<<<BlocksPerGrid,ThreadsPerBlock>>>(d_a,d_b,d_c);
    //The launch itself returns no status: launch failures are read back with
    //cudaGetLastError, failures during execution surface at the synchronisation
    ok=CudaOk(cudaGetLastError(),"AddVectors launch")
       && CudaOk(cudaDeviceSynchronize(),"AddVectors execution");
  }

  //SHOW RESULTS
  //Bring back to the Host
  ok=ok && CudaOk(cudaMemcpy(h_c,d_c,bytes,cudaMemcpyDeviceToHost),"cudaMemcpy(d_c->h_c)");
  if(ok){
    for(ix=0;ix<Lx;ix++)
      cout<<ix<<" "<<h_c[ix]<<endl;
  }

  //Free dynamic memory (done on every path so a failure does not leak)
  cudaFree(d_a);  cudaFree(d_b);  cudaFree(d_c);

  return ok?0:1;
}


Writing RunCode.cu


4. Let us compile the .cu code with nvcc

Notes:
*   The flags are necessary if the GPU is an NVIDIA K80 card.
*   If your program writes an output file (e.g. "Waves_CUDA.dat"), it will appear in the file browser on the left; this example only prints its results to the screen.

In [3]:
# Compile the .cu file into the executable "Ejecutar". The flag is needed when using a Tesla K80 card
# NOTE(review): sm_37 targets the K80 — confirm these flags suit the GPU that was actually assigned
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 RunCode.cu -o Ejecutar
# Run the compiled program; its output is shown below the cell
!./Ejecutar

0 0
1 3
2 6
3 9
4 12
5 15
6 18
7 21
8 24
9 27
10 30
11 33
12 36
13 39
14 42
15 45
