<a href="https://colab.research.google.com/github/wappints/CversusCUDA/blob/main/CversusCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile runs.c
const int runs = 30;

Overwriting runs.c


In [None]:
%%writefile dotproductC.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "N.c"
#include "runs.c"
void dotproduct(int n, float *dotp, float *h_in, float *h_in2) {
    for(int i = 0; i < n; i++) {
      *dotp += (h_in[i] * h_in2[i]) ; // summation of products 
    }

}
int main() {
  const int ARRAY_BYTES = ARRAY_SIZE*sizeof(float);
  double total = 0;
  float *in_x, *in_y, *dotp;

  in_x = (float*) malloc(ARRAY_BYTES);
  in_y = (float*) malloc(ARRAY_BYTES);
  dotp = (float*) malloc(ARRAY_BYTES);

  for (int x = 0; x < ARRAY_SIZE; x++) { // initialize array in_x to 2.0 for all indices 
    in_x[x] = float(2);
  }

  for (int y = 0; y < ARRAY_SIZE; y++) { // initialize array in_y to 3.0 for all indices 
    in_y[y] = float(3);
  }

  clock_t start, end;
  for (int i = 0; i < runs; i++) {
    start = clock();
    dotproduct(ARRAY_SIZE, dotp, in_x, in_y); // dotp should have summation of 6's in 1...N=ARRAY_SIZE 
    end = clock();

    double time_taken = ((double)(end-start))*1e6/CLOCKS_PER_SEC;
    total += time_taken;

    if (i < 1)
      printf("DotProduct = %f of every run", *dotp); // print once the dot product result
   *dotp = 0; // clear the dot product result from the dotp variable
  }
  total = total/runs;
  printf(" | Array Size = %d | Execution Time = %f μs | No. of Runs: %d", ARRAY_SIZE, total, runs);
  return 0;
}

Overwriting dotproductC.c


In [None]:
%%writefile dotproductCu.cu

#include <stdio.h>
#include <stdlib.h>
#include "N.c"
#include "runs.c"
#include "blocksize.c"

__global__ 
void dotproduct(float *dotp, int n, float *h_in, float *h_in2) { //grid-stride dotproduct 
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i =index; i<n; i+=stride) {
    atomicAdd(dotp, h_in[i] * h_in2[i]); //mutex lock
  }


}

int main() {
  const int ARRAY_BYTES = ARRAY_SIZE*sizeof(float);
  float *in_x, *in_y, *dotp;
  // allocate memory CUDA 
  cudaMallocManaged(&in_x, ARRAY_BYTES);
  cudaMallocManaged(&in_y, ARRAY_BYTES);
  cudaMallocManaged(&dotp, ARRAY_BYTES);

  for (int x = 0; x < ARRAY_SIZE; x++) { // initialize array in_x to 2.0 for all indices 
    in_x[x] = float(2);
  }
  for (int y = 0; y < ARRAY_SIZE; y++) { // initialize array in_y to 3.0 for all indices 
    in_y[y] = float(3);
  }

  int numblocks = (ARRAY_SIZE+blocksize-1)/blocksize; // get the right numblocks 
  for (int i = 0; i < runs; i++) {
    dotproduct<<<numblocks , blocksize>>> (dotp, ARRAY_SIZE, in_x, in_y);      
    cudaDeviceSynchronize();      
    if (i < 1)
      printf("Dot Product = %f ", *dotp); // print only once to prevent repetitive prints of dot product output 
    *dotp = 0; // reset dotp variable 
  }
   printf("| Array Size = %d | No. of Runs: %d\n", ARRAY_SIZE, runs);

   // free memory 
  cudaFree(in_x);
  cudaFree(in_y);
  cudaFree(dotp);

  return 0;
}

Overwriting dotproductCu.cu


In [None]:
%%writefile blocksize.c
const int blocksize = 1024;

Overwriting blocksize.c


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
g++ dotproductC.c -o dotproductC
./dotproductC

DotProduct = 6291456.000000 of every run | Array Size = 1048576 | Execution Time = 4132.766667 μs | No. of Runs: 30



Execution Time = 4132.766667 μs

In [None]:
%%shell
nvcc dotproductCu.cu -o dotproductCu
nvprof ./dotproductCu

==6457== NVPROF is profiling process 6457, command: ./dotproductCu
Dot Product = 6291456.000000 | Array Size = 1048576 | No. of Runs: 30
==6457== Profiling application: ./dotproductCu
==6457== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  117.45ms        30  3.9151ms  3.7713ms  6.3513ms  dotproduct(float*, int, float*, float*)
      API calls:   73.95%  340.74ms         3  113.58ms  12.234us  340.67ms  cudaMallocManaged
                   25.54%  117.69ms        30  3.9229ms  3.7772ms  6.3747ms  cudaDeviceSynchronize
                    0.21%  955.82us         3  318.61us  106.15us  450.58us  cudaFree
                    0.13%  619.71us        30  20.657us  13.967us  73.469us  cudaLaunchKernel
                    0.12%  548.06us         1  548.06us  548.06us  548.06us  cuDeviceTotalMem
                    0.04%  184.92us       101  1.8300us     130ns  79.990us  cuDeviceGetAttribute
                    0.01



Execution Time = 3915.1 μs

In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
g++ dotproductC.c -o dotproductC
./dotproductC

DotProduct = 25165824.000000 of every run | Array Size = 4194304 | Execution Time = 13562.666667 μs | No. of Runs: 30



Execution Time = 13562.666667 μs

In [None]:
%%shell
nvcc dotproductCu.cu -o dotproductCu
nvprof ./dotproductCu

==6512== NVPROF is profiling process 6512, command: ./dotproductCu
Dot Product = 25165824.000000 | Array Size = 4194304 | No. of Runs: 30
==6512== Profiling application: ./dotproductCu
==6512== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  390.88ms        30  13.029ms  8.4491ms  24.415ms  dotproduct(float*, int, float*, float*)
      API calls:   53.16%  391.15ms        30  13.038ms  8.4579ms  24.428ms  cudaDeviceSynchronize
                   46.34%  341.02ms         3  113.67ms  24.081us  340.94ms  cudaMallocManaged
                    0.27%  2.0060ms         3  668.67us  66.459us  1.0285ms  cudaFree
                    0.14%  999.14us        30  33.304us  17.799us  57.935us  cudaLaunchKernel
                    0.06%  413.48us         1  413.48us  413.48us  413.48us  cuDeviceTotalMem
                    0.03%  217.20us       101  2.1500us     137ns  104.36us  cuDeviceGetAttribute
                    0.0



Execution Time = 13029.0 μs

In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
g++ dotproductC.c -o dotproductC
./dotproductC

DotProduct = 123032912.000000 of every run | Array Size = 16777216 | Execution Time = 55287.533333 μs | No. of Runs: 30



Execution Time = 55287.533333 μs

In [None]:
%%shell
nvcc dotproductCu.cu -o dotproductCu
nvprof ./dotproductCu

==6564== NVPROF is profiling process 6564, command: ./dotproductCu
Dot Product = 123032912.000000 | Array Size = 16777216 | No. of Runs: 30
==6564== Profiling application: ./dotproductCu
==6564== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.14746s        30  38.249ms  33.428ms  94.426ms  dotproduct(float*, int, float*, float*)
      API calls:   72.87%  1.15649s        30  38.550ms  33.442ms  94.432ms  cudaDeviceSynchronize
                   26.49%  420.37ms         3  140.12ms  27.070us  420.24ms  cudaMallocManaged
                    0.49%  7.8375ms         3  2.6125ms  103.38us  4.0028ms  cudaFree
                    0.09%  1.4173ms        30  47.244us  34.744us  80.032us  cudaLaunchKernel
                    0.04%  629.78us         1  629.78us  629.78us  629.78us  cuDeviceTotalMem
                    0.02%  280.23us       101  2.7740us     180ns  122.27us  cuDeviceGetAttribute
                    0



Execution Time = 38249 μs




| # of elements | C execution time (μs) |CUDA execution time (μs)|
|  -----------  |      -----------      |      -----------       |
| 1<<20         | 4132.766667           |   3915.1                |
| 1<<22         | 13562.666667          |   13029.0                |
| 1<<24         | 55287.533333          |   38249.0                  | 
