Step 1: Go to https://colab.research.google.com in Browser and Click on New Python 3 Notebook

Step 2: Click to Runtime > Change runtime type > Hardware Accelerator, choose GPU .

Step 3: Check the Version of CUDA by : running the command below to get the following output (if not available, install cuda-10.1: 
!apt-get update
!apt-get install cuda-10.1)

In [1]:
!nvcc --version
!gcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [2]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-z9yx_918
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-z9yx_918
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4308 sha256=9669e15e192925c8ebe92fdede210eb782eafea2181c31f2541d53df823603f2
  Stored in directory: /tmp/pip-ephem-wheel-cache-olnpai0g/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


## C on Google Colab

#### Without file operation

In [None]:
%%cu
/* CPU version */ 

#include <stdio.h>
#include <stdlib.h>

void addOne (char* val){
    *val = *val + 1; 
}

int main(){
    
    char* cc = (char*)malloc(sizeof(char));
    *cc = 'A'; 

    printf("Before: %c\n", *cc);

    //call the addOne function 
    addOne(cc);

    printf("After: %c\n", *cc);
    free(cc); 
    
    return 0; 
}

Before: A
After: B



#### With file operation

In [None]:
code = """
/* CPU version */ 

#include <stdio.h>
#include <stdlib.h>

void addOne (char* val){
    *val = *val + 1; 
}

int main(){
    
    char* cc = (char*)malloc(sizeof(char));
    *cc = 'B'; 

    printf("Before: %c \\n", *cc);

    //call the addOne function 
    addOne(cc);

    printf("After: %c \\n", *cc);
    free(cc); 
    
    return 0; 
}
"""

In [None]:
text_file = open("addOne.c", "w")
text_file.write(code)
text_file.close()
!gcc addOne.c -o addOne
!./addOne

Before: B 
After: C 


## OpenMP C on Google Colab

In [None]:
ompcode = """
// OpenMP program to print
  
// OpenMP header 
#include <omp.h>   
#include <stdio.h>  
#include <stdlib.h>

void addOne (char* val){
    #pragma omp parallel
    {
      *val = *val + 1; 
      printf("%c from tid = %d \\n", *val, omp_get_thread_num());
    }
}

int main(){
    
    char* cc = (char*)malloc(sizeof(char));
    *cc = 'A'; 

    printf("Before: %c \\n", *cc);

    //call the addOne function 
    addOne(cc);

    // Beginning of parallel region 
    printf("After: %c \\n", *cc);

    free(cc); 
    return 0; 
}
"""


In [None]:
text_file = open("addOne_omp.c", "w")
text_file.write(ompcode)
text_file.close()

In [None]:
# Test after changing the thread number 
%env OMP_NUM_THREADS=3

env: OMP_NUM_THREADS=3


In [None]:
!gcc -fopenmp addOne_omp.c -o addOne_omp

In [None]:
!./addOne_omp

Before: A 
B from tid = 1 
C from tid = 0 
D from tid = 2 
After: D 


[link text](https://)## CUDA C on Google Colab

#### Without file operation

In [None]:
%%cu
/* Cuda (GPU) version */

#include <stdio.h>

__global__ void addOne (char* val){
    *val = *val + 1; 
}

int main(){
    
    char* cc = (char*)malloc(sizeof(char)); 
    *cc = 'A';
    printf("Before: %c\n", *cc);

    char* gc; 
    cudaMalloc((void**)&gc, sizeof(char));
    cudaMemcpy(gc, cc, sizeof(char), cudaMemcpyHostToDevice);

    //call the addOne kernel 
    addOne<<<1,1>>>(gc);

    cudaMemcpy(cc, gc, sizeof(char), cudaMemcpyDeviceToHost); 
    printf("After: %c\n", *cc);

    free(cc);
    cudaFree(gc);
    
    return 0; 
}

Before: A
After: B



*italicized text*#### With file operation

In [None]:
code_cuda = """
/* Cuda (GPU) version */

#include <stdio.h>

__global__ void addOne (char* val){
    *val = *val + 1; 
}

int main(){
    
    char* cc = (char*)malloc(sizeof(char)); 
    *cc = 'A';
    printf("Before: %c \\n", *cc);

    char* gc; 
    cudaMalloc((void**)&gc, sizeof(char));
    cudaMemcpy(gc, cc, sizeof(char), cudaMemcpyHostToDevice);

    //call the addOne kernel 
    addOne<<<1,1>>>(gc);

    cudaMemcpy(cc, gc, sizeof(char), cudaMemcpyDeviceToHost); 
    printf("After: %c \\n", *cc);

    free(cc);
    cudaFree(gc);
    
    return 0; 
}
"""

In [None]:
text_file = open("addOne.cu", "w")
text_file.write(code_cuda)
text_file.close()

In [None]:
!nvcc addOne.cu -o addOne_cuda

In [None]:
!./addOne_cuda

Before: A 
After: B 


##MY POSIX THREADS

In [None]:
# POSIX THREAD
code = """
/* CPU version */ 
#include <unistd.h>
#include <stdio.h>
#include <pthread.h>
#define N 10
#define THN 4
#define SLICE_SIZE (N+THN-1)/THN

int A[N],B[N];
pthread_t tid[THN];


void verifyArr(int A[], int val){
  for(int i=0;i<N;i++){
    if(A[i] != val){
      printf("Array is not verified\\n");
      return;
    }
  }
  printf("Array is verified\\n");
}

void thread_func(void* arg){  
  int id = *(int*)arg;
  printf("This is thread %d\\n",id);
  for(int i = id*SLICE_SIZE;i < (id+1)*SLICE_SIZE && i < N; i++)
    A[i]+=B[i];
}
void print(int A[]){
  for(int i=0;i<N;i++)
    printf("%d ",A[i]);
  printf("\\n");
}

int main(){
  int ids[N];  
  for(int i=0;i<N;i++){
    ids[i]=i;
    A[i]=0;
    B[i]=1;
  }
  for(int i=0;i<THN;i++)
    pthread_create(&tid[i],NULL,(void*)&thread_func,(void*)&ids[i]);
  for(int i=0;i<THN && i<N;i++)
    pthread_join(tid[i],NULL);
  verifyArr(A,1);
  print(A);
  for(int i=0;i<THN;i++)
    pthread_create(&tid[i],NULL,(void*)&thread_func,(void*)&ids[i]);
  for(int i=0;i<THN && i<N;i++)
    pthread_join(tid[i],NULL);
  verifyArr(A,2);
  print(A);
}

"""
text_file = open("multi_thread.c", "w")
text_file.write(code)
text_file.close()
!gcc -pthread multi_thread.c
!./a.out

This is thread 0
This is thread 1
This is thread 2
This is thread 3
Array is verified
1 1 1 1 1 1 1 1 1 1 
This is thread 0
This is thread 1
This is thread 2
This is thread 3
Array is verified
2 2 2 2 2 2 2 2 2 2 


##MY OPEN MP

In [None]:
ompcode = """
// OpenMP program to print
  
// OpenMP header 
#include <omp.h>   
#include <stdio.h>  
#include <stdlib.h>

#define N 10000000
#define THN 5
#define SLICE_SIZE (N+THN-1)/THN

int A[N],B[N];
void doParallel(){
#pragma omp parallel num_threads(THN)
  {
    int threads = omp_get_num_threads();
    int id=omp_get_thread_num();
    for(int i=id*SLICE_SIZE; i<(id+1)*SLICE_SIZE && i < N; i++)
      A[i]+=B[i];
    //for(int i=id*(N+threads-1)/threads; i<(id+1)*(N+threads-1)/threads && i < N; i++)
      //A[i]+=B[i];
  }
}
void verifyArr(int A[], int val){
  for(int i=0;i<N;i++){
    if(A[i] != val){
      printf("Array is not verified\\n");
      return;
    }
  }
  printf("Array is verified\\n");
}

int main(){
for(int i=0;i<N;i++){
  A[i]=0;
  B[i]=1;
}
double start,end;
start = omp_get_wtime();
doParallel();
end = omp_get_wtime();
printf("%lf s\\n",end-start);
verifyArr(A,1);
//doParallel();
//verifyArr(A,2);
}
"""
text_file = open("openmp.c", "w")
text_file.write(ompcode)
text_file.close()
!gcc -fopenmp openmp.c -o openmp
!./openmp

0.028868 s
Array is verified


In [None]:
ompcode = """
// OpenMP program to print
  
// OpenMP header 
#include <omp.h>   
#include <stdio.h>  
#include <stdlib.h>

#define N 1000000
#define THN 2
#define SLICE_SIZE (N+THN-1)/THN


int *A,*B;

void Add(int id){
  for(int i=id*SLICE_SIZE; i<(id+1)*SLICE_SIZE && i < N; i++)
    A[i]+=B[i];
}

void doParallel(){
#pragma omp parallel num_threads(THN)
  {
    #pragma omp for
    for(int id=0;id<THN;id++){
      Add(id);
    }
  }
}
void verifyArr(int A[], int val){
  for(int i=0;i<N;i++){
    if(A[i] != val){
      printf("Array is not verified\\n");
      return;
    }
  }
  printf("Array is verified\\n");
}

int main(){
// omp_set_num_threads(THN); 
A=malloc(N*sizeof(int));
B=malloc(N*sizeof(int));
for(int i=0;i<N;i++){
  A[i]=2;
  B[i]=1;
}
double start,end;
start = omp_get_wtime();
doParallel();
end = omp_get_wtime();
printf("\\n%lf sec\\n",end-start);
verifyArr(A,3);
//doParallel();
//verifyArr(A,4);
free(A);
free(B);
}
"""
text_file = open("openmp.c", "w")
text_file.write(ompcode)
text_file.close()
!gcc -fopenmp openmp.c -o openmp
!./openmp


0.002564 sec
Array is verified


##MY CUDA

In [None]:
%%cu
#include <stdio.h>
#define N 100
#define THB 100

__global__ void Add(int * A, int * B){
    int i=threadIdx.x + blockIdx.x * blockDim.x;
    if(i<N){
        A[i]=A[i]+B[i];
        A[0]=blockDim.x;
    }
}
int A[N];
int B[N];

int main(){
    int *gA;
    int *gB;
    for(int i=0;i<N;i++){
      A[i]=0;
      B[i]=i;
    }
    cudaMalloc((void**)&gA,N*sizeof(int));
    cudaMalloc((void**)&gB,N*sizeof(int));
    cudaMemcpy(gA,A,N*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(gB,B,N*sizeof(int),cudaMemcpyHostToDevice);
    //dim3 blocks(2,1,1);
    //dim3 threads(2,1,1);
    Add<<<(N+THB-1)/THB,THB>>>(gA,gB);
    
    cudaMemcpy(A,gA,N*sizeof(int),cudaMemcpyDeviceToHost);
    cudaMemcpy(B,gB,N*sizeof(int),cudaMemcpyDeviceToHost);
    cudaFree(gA);
    cudaFree(gB);
    for(int i=0;i<N;i++)
      printf("%d ",A[i]);
    printf("\\n%d",(N+THB-1)/THB);
}

100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 
1
