<a href="https://colab.research.google.com/github/wappints/CversusCUDA/blob/main/CvCUDA(square).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CPU execution time


In [None]:
%%writefile runs.c
// Number of repetitions: the benchmark sources #include this file and loop `runs` times.
const int runs = 30;

Overwriting runs.c


In [None]:
%%writefile Q1_SquareTime.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "N.c"
#include "runs.c"
// Element-wise square: writes h_in[k] * h_in[k] into h_out[k] for every k in [0, n).
// Nothing is written when n <= 0.
void square(int n, float *h_out, float *h_in) {
  int k = 0;
  while (k < n) {
    const float value = h_in[k];
    h_out[k] = value * value;
    ++k;
  }
}

// Benchmark driver: fills an ARRAY_SIZE-element input, calls square() `runs`
// times, and prints the average time per call in microseconds.
// Returns 0 on success, 1 if the buffers cannot be allocated.
int main() {
  // Size of each buffer in bytes; size_t matches malloc's parameter type.
  const size_t ARRAY_BYTES = (size_t)ARRAY_SIZE * sizeof(float);
  double total = 0;
  float *in, *out;

  in = (float*) malloc(ARRAY_BYTES);
  out = (float*) malloc(ARRAY_BYTES);
  // malloc returns NULL on failure; stop here instead of dereferencing it below.
  if (in == NULL || out == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes per buffer\n", ARRAY_BYTES);
    free(in);   // free(NULL) is a no-op, so this is safe for either pointer
    free(out);
    return 1;
  }

  // Input values are 0, 1, 2, ...
  for (int i = 0; i < ARRAY_SIZE; i++) {
    in[i] = (float)i;   // C-style cast is valid in both C and C++ (this .c file is built with g++)
  }

  // Time each call with clock() and accumulate the elapsed microseconds.
  clock_t start, end;
  for (int j = 0; j < runs; j++) {
    start = clock();
    square(ARRAY_SIZE, out, in);
    end = clock();
    double time_taken = ((double)(end-start))*1e6/CLOCKS_PER_SEC;
    total += time_taken;
  }
  // Average over all runs.
  total = total/runs;
  printf("Array Size = %d | Execution Time = %f μs | No. of Runs: %d", ARRAY_SIZE,  total,runs);

  // Release the buffers allocated above.
  free(in);
  free(out);
  return 0;
}

Overwriting Q1_SquareTime.c


In [None]:
%%writefile N.c
// Number of float elements in the input/output arrays; the benchmark sources #include this file.
const int ARRAY_SIZE = 256;

Overwriting N.c


In [None]:
%%shell
g++ Q1_SquareTime.c -o Q1_SquareTime
./Q1_SquareTime

Array Size = 256 | Execution Time = 1.700000 μs | No. of Runs: 30



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 512;

Overwriting N.c


In [None]:
%%shell
g++ Q1_SquareTime.c -o Q1_SquareTime
./Q1_SquareTime

Array Size = 512 | Execution Time = 2.933333 μs | No. of Runs: 30



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1024;

Overwriting N.c


In [None]:
%%shell
g++ Q1_SquareTime.c -o Q1_SquareTime
./Q1_SquareTime

Array Size = 1024 | Execution Time = 4.700000 μs | No. of Runs: 30



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1048576;

Overwriting N.c


In [None]:
%%shell
g++ Q1_SquareTime.c -o Q1_SquareTime
./Q1_SquareTime

Array Size = 1048576 | Execution Time = 4437.700000 μs | No. of Runs: 30



# CUDA execution time

In [None]:
%%writefile Q2_CSquareTime.cu

#include <stdio.h>
#include <stdlib.h>
#include "N.c"
#include "runs.c"
// Prints a message and exits if a CUDA runtime call did not return cudaSuccess.
// `what` names the failing call in the message.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Kernel: each thread squares one element, selected by its global thread index.
// The bounds check lets the grid be rounded up past ARRAY_SIZE safely.
// ARRAY_SIZE is the compile-time constant from N.c.
__global__
void square(float *h_out, float *h_in) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < ARRAY_SIZE) {
    h_out[i] = h_in[i] * h_in[i];
  }
}

// Benchmark driver: allocates managed buffers, fills the input, launches the
// kernel `runs` times (synchronizing after each launch), and prints the setup.
int main() {
  const int ARRAY_BYTES = ARRAY_SIZE*sizeof(float);
  float *in, *out;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  for (int i = 0; i < ARRAY_SIZE; i++) {
    in[i] = float(i);
  }

  // A single block cannot hold more threads than the device limit, so the old
  // <<<1, ARRAY_SIZE>>> launch failed silently for large arrays. Use one block
  // while ARRAY_SIZE fits (same configuration as before), otherwise split the
  // work across as many full-size blocks as needed.
  int device = 0;
  int maxThreads = 0;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  checkCuda(cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, device),
            "cudaDeviceGetAttribute");
  int threads = ARRAY_SIZE < maxThreads ? ARRAY_SIZE : maxThreads;
  if (threads < 1) {
    threads = 1;   // avoid dividing by zero below
  }
  const int blocks = (ARRAY_SIZE + threads - 1) / threads;

  for (int j = 0; j < runs; j++) {
    square<<<blocks, threads>>> (out, in);
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "square kernel launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  printf("Array Size = %d | No. of Runs: %d\n", ARRAY_SIZE, runs);
  return 0;
}



Overwriting Q2_CSquareTime.cu


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 256;

Overwriting N.c


In [None]:
%%shell
nvcc Q2_CSquareTime.cu -o Q2_CSquareTime
nvprof ./Q2_CSquareTime

==19009== NVPROF is profiling process 19009, command: ./Q2_CSquareTime
Array Size = 256 | No. of Runs: 30
==19009== Profiling application: ./Q2_CSquareTime
==19009== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  302.33us        30  10.077us  3.9680us  184.70us  square(float*, float*)
      API calls:   99.60%  410.18ms         2  205.09ms  18.970us  410.17ms  cudaMallocManaged
                    0.15%  612.37us         1  612.37us  612.37us  612.37us  cuDeviceTotalMem
                    0.11%  435.73us        30  14.524us  7.6990us  195.84us  cudaDeviceSynchronize
                    0.06%  250.08us        30  8.3360us  5.9790us  52.078us  cudaLaunchKernel
                    0.05%  217.32us       101  2.1510us     257ns  87.720us  cuDeviceGetAttribute
                    0.02%  100.38us         2  50.190us  29.340us  71.040us  cudaFree
                    0.01%  36.347us         1  36.347us  36.347us  3



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 512;

Overwriting N.c


In [None]:
%%shell
nvcc Q2_CSquareTime.cu -o Q2_CSquareTime
nvprof ./Q2_CSquareTime

==19053== NVPROF is profiling process 19053, command: ./Q2_CSquareTime
Array Size = 512 | No. of Runs: 30
==19053== Profiling application: ./Q2_CSquareTime
==19053== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  291.90us        30  9.7290us  4.0960us  170.97us  square(float*, float*)
      API calls:   99.59%  414.17ms         2  207.09ms  21.385us  414.15ms  cudaMallocManaged
                    0.15%  608.02us         1  608.02us  608.02us  608.02us  cuDeviceTotalMem
                    0.11%  457.98us        30  15.266us  8.0580us  181.42us  cudaDeviceSynchronize
                    0.06%  247.67us        30  8.2550us  6.0620us  46.871us  cudaLaunchKernel
                    0.06%  231.27us       101  2.2890us     176ns  103.96us  cuDeviceGetAttribute
                    0.03%  107.93us         2  53.962us  31.020us  76.905us  cudaFree
                    0.01%  37.008us         1  37.008us  37.008us  3



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1024;

Overwriting N.c


In [None]:
%%shell
nvcc Q2_CSquareTime.cu -o Q2_CSquareTime
nvprof ./Q2_CSquareTime

==19099== NVPROF is profiling process 19099, command: ./Q2_CSquareTime
Array Size = 1024 | No. of Runs: 30
==19099== Profiling application: ./Q2_CSquareTime
==19099== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  324.35us        30  10.811us  4.2880us  197.02us  square(float*, float*)
      API calls:   99.75%  696.11ms         2  348.06ms  31.527us  696.08ms  cudaMallocManaged
                    0.09%  600.19us         1  600.19us  600.19us  600.19us  cuDeviceTotalMem
                    0.07%  465.32us        30  15.510us  6.9080us  206.58us  cudaDeviceSynchronize
                    0.04%  265.10us        30  8.8360us  6.1490us  47.103us  cudaLaunchKernel
                    0.03%  235.29us       101  2.3290us     228ns  91.501us  cuDeviceGetAttribute
                    0.02%  133.38us         2  66.689us  41.450us  91.929us  cudaFree
                    0.01%  36.863us         1  36.863us  36.863us  



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1048576;

Overwriting N.c


In [None]:
%%shell
nvcc Q2_CSquareTime.cu -o Q2_CSquareTime
nvprof ./Q2_CSquareTime

==19145== NVPROF is profiling process 19145, command: ./Q2_CSquareTime
Array Size = 1048576 | No. of Runs: 30
==19145== Profiling application: ./Q2_CSquareTime
==19145== Profiling result:
No kernels were profiled.
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
      API calls:   99.66%  381.50ms         2  190.75ms  48.856us  381.45ms  cudaMallocManaged
                    0.15%  575.75us         1  575.75us  575.75us  575.75us  cuDeviceTotalMem
                    0.08%  308.01us         2  154.00us  25.849us  282.16us  cudaFree
                    0.07%  275.91us       101  2.7310us     281ns  134.66us  cuDeviceGetAttribute
                    0.02%  60.377us        30  2.0120us  1.5750us  12.983us  cudaDeviceSynchronize
                    0.01%  37.727us         1  37.727us  37.727us  37.727us  cuDeviceGetName
                    0.00%  14.107us        30     470ns     368ns  2.6660us  cudaLaunchKernel
                    0.00%  6.6300us         1



# Grid-stride execution time

## CPU


In [None]:
%%writefile Q3_CPUGridStride.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "N.c"
#include "runs.c"
// Squares the first n elements of h_in into h_out, one element at a time.
// Nothing is written when n <= 0.
void square(int n, float *h_out, float *h_in) {
  const float *src = h_in;
  float *dst = h_out;
  for (int remaining = n; remaining > 0; --remaining) {
    const float value = *src++;
    *dst++ = value * value;
  }
}

// Benchmark driver: fills an ARRAY_SIZE-element input, calls square() `runs`
// times, and prints the average time per call in microseconds.
// Returns 0 on success, 1 if the buffers cannot be allocated.
int main() {
  // Size of each buffer in bytes; size_t matches malloc's parameter type.
  const size_t ARRAY_BYTES = (size_t)ARRAY_SIZE * sizeof(float);
  double total = 0;
  float *in, *out;

  in = (float*) malloc(ARRAY_BYTES);
  out = (float*) malloc(ARRAY_BYTES);
  // malloc returns NULL on failure; stop here instead of dereferencing it below.
  if (in == NULL || out == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes per buffer\n", ARRAY_BYTES);
    free(in);   // free(NULL) is a no-op, so this is safe for either pointer
    free(out);
    return 1;
  }

  // Input values are 0, 1, 2, ...
  for (int i = 0; i < ARRAY_SIZE; i++) {
    in[i] = (float)i;   // C-style cast is valid in both C and C++ (this .c file is built with g++)
  }

  // Time each call with clock() and accumulate the elapsed microseconds.
  clock_t start, end;
  for (int j = 0; j < runs; j++) {
    start = clock();
    square(ARRAY_SIZE, out, in);
    end = clock();
    double time_taken = ((double)(end-start))*1e6/CLOCKS_PER_SEC;
    total += time_taken;
  }
  // Average over all runs.
  total = total/runs;
  printf("Array Size = %d | Execution Time = %f μs | No. of Runs: %d", ARRAY_SIZE,  total,runs);

  // Release the buffers allocated above.
  free(in);
  free(out);
  return 0;
}


Overwriting Q3_CPUGridStride.c


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
g++ Q3_CPUGridStride.c -o Q3_CPUGridStride
./Q3_CPUGridStride

Array Size = 1048576 | Execution Time = 4513.966667 μs | No. of Runs: 30



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
g++ Q3_CPUGridStride.c -o Q3_CPUGridStride
./Q3_CPUGridStride

Array Size = 4194304 | Execution Time = 18915.666667 μs | No. of Runs: 30



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
g++ Q3_CPUGridStride.c -o Q3_CPUGridStride
./Q3_CPUGridStride

Array Size = 16777216 | Execution Time = 74964.300000 μs | No. of Runs: 30



## GPU

In [None]:
%%writefile Q3_GPUGridStride.cu


#include <stdio.h>
#include <stdlib.h>
#include "N.c"
#include "runs.c"
#include "blocksize.c"

// Kernel: squares the first n elements of h_in into h_out. Each thread starts
// at its global index and advances by the total number of launched threads
// (blockDim.x * gridDim.x) until it reaches n.
__global__
void square(float *h_out, float *h_in, int n) {
  const int step = gridDim.x * blockDim.x;
  int i = threadIdx.x + blockDim.x * blockIdx.x;

  while (i < n) {
    const float value = h_in[i];
    h_out[i] = value * value;
    i += step;
  }
}

// Prints a message and exits if a CUDA runtime call did not return cudaSuccess.
// `what` names the failing call in the message.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: allocates managed buffers, fills the input, launches the
// kernel `runs` times (synchronizing after each launch), and prints the setup.
int main() {
  const int ARRAY_BYTES = ARRAY_SIZE*sizeof(float);
  // One thread per element, rounded up to whole blocks. This must use the
  // element count; using the byte count launched sizeof(float) times too many blocks.
  int numBlocks = (ARRAY_SIZE+numThreads-1)/numThreads;
  float *in, *out;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  for (int i = 0; i < ARRAY_SIZE; i++) {
      in[i] = float(i);
  }

  for (int j = 0; j < runs; j++) {
    square<<<numBlocks, numThreads>>>(out, in, ARRAY_SIZE);
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "square kernel launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  // free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  printf("Array Size = %d |  No. of Runs: %d\n", ARRAY_SIZE, runs);
  return 0;
}

Overwriting Q3_GPUGridStride.cu


In [None]:
%%writefile blocksize.c
// Threads per block: the second launch parameter in <<<numBlocks, numThreads>>>.
const int numThreads = 256;

Overwriting blocksize.c


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19207== NVPROF is profiling process 19207, command: ./Q3_GPUGridStride
Array Size = 1048576 |  No. of Runs: 30
==19207== Profiling application: ./Q3_GPUGridStride
==19207== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  5.7304ms        30  191.01us  109.28us  2.5528ms  square(float*, float*, int)
      API calls:   98.18%  411.82ms         2  205.91ms  53.679us  411.76ms  cudaMallocManaged
                    1.40%  5.8748ms        30  195.83us  113.42us  2.5621ms  cudaDeviceSynchronize
                    0.15%  619.29us         2  309.65us  172.59us  446.70us  cudaFree
                    0.14%  581.81us         1  581.81us  581.81us  581.81us  cuDeviceTotalMem
                    0.06%  268.26us        30  8.9410us  5.9430us  48.526us  cudaLaunchKernel
                    0.05%  220.65us       101  2.1840us     187ns  93.487us  cuDeviceGetAttribute
                    0.01%  50.334us         1  50.334u



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19251== NVPROF is profiling process 19251, command: ./Q3_GPUGridStride
Array Size = 4194304 |  No. of Runs: 30
==19251== Profiling application: ./Q3_GPUGridStride
==19251== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  25.636ms        30  854.53us  423.83us  13.328ms  square(float*, float*, int)
      API calls:   92.89%  397.06ms         2  198.53ms  61.523us  396.99ms  cudaMallocManaged
                    6.27%  26.790ms        30  893.00us  419.65us  14.314ms  cudaDeviceSynchronize
                    0.54%  2.3280ms         2  1.1640ms  780.87us  1.5471ms  cudaFree
                    0.14%  603.96us         1  603.96us  603.96us  603.96us  cuDeviceTotalMem
                    0.09%  375.82us        30  12.527us  6.1920us  58.169us  cudaLaunchKernel
                    0.05%  217.34us       101  2.1510us     282ns  88.701us  cuDeviceGetAttribute
                    0.01%  51.445us         1  51.445u



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19295== NVPROF is profiling process 19295, command: ./Q3_GPUGridStride
Array Size = 16777216 |  No. of Runs: 30
==19295== Profiling application: ./Q3_GPUGridStride
==19295== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  107.25ms        30  3.5749ms  1.6808ms  58.469ms  square(float*, float*, int)
      API calls:   76.51%  387.58ms         2  193.79ms  51.139us  387.52ms  cudaMallocManaged
                   21.46%  108.70ms        30  3.6233ms  1.6847ms  58.483ms  cudaDeviceSynchronize
                    1.77%  8.9777ms         2  4.4888ms  3.1962ms  5.7815ms  cudaFree
                    0.12%  590.50us         1  590.50us  590.50us  590.50us  cuDeviceTotalMem
                    0.10%  488.16us        30  16.271us  7.6650us  72.336us  cudaLaunchKernel
                    0.04%  225.26us       101  2.2300us     276ns  95.072us  cuDeviceGetAttribute
                    0.01%  32.963us         1  32.963



In [None]:
%%writefile blocksize.c
const int numThreads = 512;

Overwriting blocksize.c


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19341== NVPROF is profiling process 19341, command: ./Q3_GPUGridStride
Array Size = 1048576 |  No. of Runs: 30
==19341== Profiling application: ./Q3_GPUGridStride
==19341== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  5.7875ms        30  192.92us  110.05us  2.5674ms  square(float*, float*, int)
      API calls:   98.03%  375.21ms         2  187.61ms  43.710us  375.17ms  cudaMallocManaged
                    1.55%  5.9203ms        30  197.34us  113.94us  2.5760ms  cudaDeviceSynchronize
                    0.14%  549.58us         1  549.58us  549.58us  549.58us  cuDeviceTotalMem
                    0.14%  544.13us         2  272.07us  153.30us  390.84us  cudaFree
                    0.07%  266.40us        30  8.8800us  5.5990us  47.442us  cudaLaunchKernel
                    0.05%  208.19us       101  2.0610us     266ns  85.127us  cuDeviceGetAttribute
                    0.01%  35.264us         1  35.264u



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19385== NVPROF is profiling process 19385, command: ./Q3_GPUGridStride
Array Size = 4194304 |  No. of Runs: 30
==19385== Profiling application: ./Q3_GPUGridStride
==19385== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  25.534ms        30  851.12us  464.34us  11.911ms  square(float*, float*, int)
      API calls:   93.16%  395.05ms         2  197.52ms  40.100us  395.01ms  cudaMallocManaged
                    6.05%  25.663ms        30  855.43us  468.71us  11.922ms  cudaDeviceSynchronize
                    0.51%  2.1626ms         2  1.0813ms  767.67us  1.3949ms  cudaFree
                    0.15%  619.29us         1  619.29us  619.29us  619.29us  cuDeviceTotalMem
                    0.07%  313.36us        30  10.445us  6.2900us  44.454us  cudaLaunchKernel
                    0.05%  223.10us       101  2.2080us     293ns  91.105us  cuDeviceGetAttribute
                    0.01%  33.162us         1  33.162u



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19429== NVPROF is profiling process 19429, command: ./Q3_GPUGridStride
Array Size = 16777216 |  No. of Runs: 30
==19429== Profiling application: ./Q3_GPUGridStride
==19429== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  97.798ms        30  3.2599ms  1.8491ms  43.571ms  square(float*, float*, int)
      API calls:   78.31%  391.31ms         2  195.66ms  62.109us  391.25ms  cudaMallocManaged
                   19.61%  97.966ms        30  3.2655ms  1.8534ms  43.586ms  cudaDeviceSynchronize
                    1.83%  9.1425ms         2  4.5713ms  3.2609ms  5.8817ms  cudaFree
                    0.12%  576.94us         1  576.94us  576.94us  576.94us  cuDeviceTotalMem
                    0.08%  402.21us        30  13.406us  7.6880us  73.063us  cudaLaunchKernel
                    0.05%  241.62us       101  2.3920us     282ns  109.87us  cuDeviceGetAttribute
                    0.01%  33.693us         1  33.693



In [None]:
%%writefile blocksize.c
const int numThreads = 1024;

Overwriting blocksize.c


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19473== NVPROF is profiling process 19473, command: ./Q3_GPUGridStride
Array Size = 1048576 |  No. of Runs: 30
==19473== Profiling application: ./Q3_GPUGridStride
==19473== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  5.7811ms        30  192.70us  116.89us  2.3769ms  square(float*, float*, int)
      API calls:   98.07%  389.28ms         2  194.64ms  52.677us  389.22ms  cudaMallocManaged
                    1.49%  5.9087ms        30  196.96us  120.77us  2.3847ms  cudaDeviceSynchronize
                    0.15%  602.11us         2  301.06us  163.57us  438.54us  cudaFree
                    0.15%  596.49us         1  596.49us  596.49us  596.49us  cuDeviceTotalMem
                    0.07%  269.30us        30  8.9760us  6.3600us  43.585us  cudaLaunchKernel
                    0.06%  245.27us       101  2.4280us     289ns  104.51us  cuDeviceGetAttribute
                    0.01%  34.380us         1  34.380u



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19517== NVPROF is profiling process 19517, command: ./Q3_GPUGridStride
Array Size = 4194304 |  No. of Runs: 30
==19517== Profiling application: ./Q3_GPUGridStride
==19517== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  26.867ms        30  895.58us  552.44us  10.781ms  square(float*, float*, int)
      API calls:   93.01%  405.17ms         2  202.59ms  41.826us  405.13ms  cudaMallocManaged
                    6.20%  27.022ms        30  900.75us  557.05us  10.790ms  cudaDeviceSynchronize
                    0.53%  2.2962ms         2  1.1481ms  786.97us  1.5093ms  cudaFree
                    0.12%  543.86us         1  543.86us  543.86us  543.86us  cuDeviceTotalMem
                    0.07%  326.17us        30  10.872us  6.3470us  57.614us  cudaLaunchKernel
                    0.05%  231.99us       101  2.2960us     193ns  89.327us  cuDeviceGetAttribute
                    0.01%  35.754us         1  35.754u



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
nvcc Q3_GPUGridStride.cu -o Q3_GPUGridStride
nvprof ./Q3_GPUGridStride

==19563== NVPROF is profiling process 19563, command: ./Q3_GPUGridStride
Array Size = 16777216 |  No. of Runs: 30
==19563== Profiling application: ./Q3_GPUGridStride
==19563== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  119.62ms        30  3.9874ms  2.2016ms  55.671ms  square(float*, float*, int)
      API calls:   75.98%  411.53ms         2  205.77ms  66.614us  411.46ms  cudaMallocManaged
                   22.12%  119.83ms        30  3.9943ms  2.2067ms  55.685ms  cudaDeviceSynchronize
                    1.66%  9.0126ms         2  4.5063ms  3.3274ms  5.6852ms  cudaFree
                    0.10%  558.18us         1  558.18us  558.18us  558.18us  cuDeviceTotalMem
                    0.08%  441.78us        30  14.725us  7.4360us  62.502us  cudaLaunchKernel
                    0.04%  217.18us       101  2.1500us     272ns  92.016us  cuDeviceGetAttribute
                    0.01%  32.251us         1  32.251



# Memory Prefetching

In [None]:
%%writefile Q4_MemPrefetch.cu

#include <stdio.h>
#include <stdlib.h>
#include "N.c"
#include "runs.c"
#include "blocksize.c"

// Kernel: squares the first n elements of h_in into h_out. A thread handles
// element `first` (its global index) and then every `step`-th element after it,
// where step is the total number of launched threads.
__global__
void square(float *h_out, float *h_in, int n) {
  const int first = threadIdx.x + blockIdx.x * blockDim.x;
  const int step = gridDim.x * blockDim.x;

  for (int pos = first; pos < n; pos += step) {
    const float value = h_in[pos];
    h_out[pos] = value * value;
  }
}

// Prints a message and exits if a CUDA runtime call did not return cudaSuccess.
// `what` names the failing call in the message.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver (prefetch variant): allocates managed buffers, fills the
// input, prefetches both buffers to the current device, then launches the
// kernel `runs` times, synchronizing after each launch.
int main() {
  const int ARRAY_BYTES = ARRAY_SIZE*sizeof(float);
  float *in, *out;

  // One thread per element, rounded up to whole blocks.
  int numBlocks = (ARRAY_SIZE+numThreads-1)/numThreads;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  for (int i = 0; i < ARRAY_SIZE; i++) {
    in[i] = float(i);
  }

  // Prefetch both buffers to the current device before the first launch.
  // (To prefetch to the host instead, pass cudaCpuDeviceId as the device.)
  int device = -1;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  checkCuda(cudaMemPrefetchAsync(in, ARRAY_BYTES, device, NULL), "cudaMemPrefetchAsync(in)");
  checkCuda(cudaMemPrefetchAsync(out, ARRAY_BYTES, device, NULL), "cudaMemPrefetchAsync(out)");

  for (int j = 0; j < runs; j++) {
    square<<<numBlocks, numThreads>>> (out, in, ARRAY_SIZE);        // launch CUDA kernel.
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "square kernel launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  // free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");

  printf("Array Size = %d | No. of Runs: %d\n", ARRAY_SIZE ,runs);
  return 0;
}

Overwriting Q4_MemPrefetch.cu


In [None]:
%%writefile Q4_NoMemPrefetch.cu

#include <stdio.h>
#include <stdlib.h>
#include "N.c"
#include "runs.c"
#include "blocksize.c"

// Kernel: squares the first n elements of h_in into h_out. Each thread begins
// at its global index and jumps ahead by the total thread count of the launch
// until it runs past n.
__global__
void square(float *h_out, float *h_in, int n) {
  int pos = blockDim.x * blockIdx.x + threadIdx.x;
  const int jump = blockDim.x * gridDim.x;

  while (pos < n) {
    const float value = h_in[pos];
    h_out[pos] = value * value;
    pos += jump;
  }
}

// Prints a message and exits if a CUDA runtime call did not return cudaSuccess.
// `what` names the failing call in the message.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver (no-prefetch variant): allocates managed buffers, fills the
// input on the host, then launches the kernel `runs` times, synchronizing
// after each launch.
int main() {
  const int ARRAY_BYTES = ARRAY_SIZE*sizeof(float);
  float *in, *out;

  // One thread per element, rounded up to whole blocks.
  int numBlocks = (ARRAY_SIZE+numThreads-1)/numThreads;
  checkCuda(cudaMallocManaged(&in, ARRAY_BYTES), "cudaMallocManaged(in)");
  checkCuda(cudaMallocManaged(&out, ARRAY_BYTES), "cudaMallocManaged(out)");

  for (int i = 0; i < ARRAY_SIZE; i++) {
    in[i] = float(i);
  }

  for (int j = 0; j < runs; j++) {
    square<<<numBlocks, numThreads>>> (out, in, ARRAY_SIZE);    // launch CUDA kernel.
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "square kernel launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  // free memory
  checkCuda(cudaFree(in), "cudaFree(in)");
  checkCuda(cudaFree(out), "cudaFree(out)");
  printf("Array Size = %d No. of Runs: %d\n", ARRAY_SIZE, runs);
  return 0;
}

Overwriting Q4_NoMemPrefetch.cu


In [None]:
%%writefile blocksize.c
const int numThreads = 256;

Overwriting blocksize.c


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch
nvprof ./Q4_MemPrefetch

==19607== NVPROF is profiling process 19607, command: ./Q4_MemPrefetch
Array Size = 1048576 | No. of Runs: 30
==19607== Profiling application: ./Q4_MemPrefetch
==19607== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.2820ms        30  42.731us  41.535us  43.104us  square(float*, float*, int)
      API calls:   96.73%  407.40ms         2  203.70ms  58.028us  407.34ms  cudaMallocManaged
                    2.75%  11.590ms        30  386.35us  44.428us  10.239ms  cudaDeviceSynchronize
                    0.14%  597.55us         1  597.55us  597.55us  597.55us  cuDeviceTotalMem
                    0.14%  575.35us         2  287.67us  118.14us  457.21us  cudaMemPrefetchAsync
                    0.12%  492.64us         2  246.32us  68.010us  424.63us  cudaFree
                    0.06%  260.62us        30  8.6870us  6.2930us  42.358us  cudaLaunchKernel
                    0.05%  220.03us       101  2.1780us    



In [None]:
%%shell
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch
nvprof ./Q4_NoMemPrefetch

==19652== NVPROF is profiling process 19652, command: ./Q4_NoMemPrefetch
Array Size = 1048576 No. of Runs: 30
==19652== Profiling application: ./Q4_NoMemPrefetch
==19652== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  4.2259ms        30  140.86us  54.975us  2.6233ms  square(float*, float*, int)
      API calls:   98.51%  399.74ms         2  199.87ms  38.030us  399.70ms  cudaMallocManaged
                    1.08%  4.3840ms        30  146.13us  58.913us  2.6501ms  cudaDeviceSynchronize
                    0.14%  582.84us         2  291.42us  153.28us  429.55us  cudaFree
                    0.14%  557.57us         1  557.57us  557.57us  557.57us  cuDeviceTotalMem
                    0.07%  280.97us        30  9.3650us  6.0980us  69.680us  cudaLaunchKernel
                    0.05%  206.30us       101  2.0420us     200ns  84.497us  cuDeviceGetAttribute
                    0.01%  34.964us         1  34.964us  



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch
nvprof ./Q4_MemPrefetch

==19696== NVPROF is profiling process 19696, command: ./Q4_MemPrefetch
Array Size = 4194304 | No. of Runs: 30
==19696== Profiling application: ./Q4_MemPrefetch
==19696== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  6.1858ms        30  206.19us  204.64us  206.81us  square(float*, float*, int)
      API calls:   94.60%  396.01ms         2  198.01ms  50.039us  395.96ms  cudaMallocManaged
                    4.28%  17.921ms        30  597.37us  209.95us  11.810ms  cudaDeviceSynchronize
                    0.40%  1.6623ms         2  831.14us  122.13us  1.5401ms  cudaMemPrefetchAsync
                    0.39%  1.6454ms         2  822.69us  174.64us  1.4708ms  cudaFree
                    0.20%  822.61us         1  822.61us  822.61us  822.61us  cuDeviceTotalMem
                    0.06%  268.01us        30  8.9330us  5.7600us  50.697us  cudaLaunchKernel
                    0.05%  215.97us       101  2.1380us    



In [None]:
%%shell
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch
nvprof ./Q4_NoMemPrefetch

==19741== NVPROF is profiling process 19741, command: ./Q4_NoMemPrefetch
Array Size = 4194304 No. of Runs: 30
==19741== Profiling application: ./Q4_NoMemPrefetch
==19741== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  19.189ms        30  639.65us  205.15us  13.230ms  square(float*, float*, int)
      API calls:   94.61%  399.79ms         2  199.89ms  33.398us  399.75ms  cudaMallocManaged
                    4.57%  19.326ms        30  644.19us  208.24us  13.239ms  cudaDeviceSynchronize
                    0.53%  2.2333ms         2  1.1166ms  766.66us  1.4666ms  cudaFree
                    0.14%  598.98us         1  598.98us  598.98us  598.98us  cuDeviceTotalMem
                    0.07%  311.32us        30  10.377us  6.3190us  63.624us  cudaLaunchKernel
                    0.05%  230.94us       101  2.2860us     268ns  98.664us  cuDeviceGetAttribute
                    0.01%  47.031us         1  47.031us  



In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch
nvprof ./Q4_MemPrefetch

==19785== NVPROF is profiling process 19785, command: ./Q4_MemPrefetch
Array Size = 16777216 | No. of Runs: 30
==19785== Profiling application: ./Q4_MemPrefetch
==19785== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  24.245ms        30  808.18us  806.45us  809.65us  square(float*, float*, int)
      API calls:   91.17%  410.24ms         2  205.12ms  57.242us  410.19ms  cudaMallocManaged
                    5.46%  24.553ms        30  818.43us  810.88us  970.27us  cudaDeviceSynchronize
                    1.58%  7.1168ms         2  3.5584ms  1.1573ms  5.9596ms  cudaFree
                    1.52%  6.8337ms         2  3.4169ms  409.87us  6.4239ms  cudaMemPrefetchAsync
                    0.13%  605.50us         1  605.50us  605.50us  605.50us  cuDeviceTotalMem
                    0.07%  315.23us        30  10.507us  6.7250us  49.050us  cudaLaunchKernel
                    0.05%  232.86us       101  2.3050us   



In [None]:
%%shell
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch
nvprof ./Q4_NoMemPrefetch

==19831== NVPROF is profiling process 19831, command: ./Q4_NoMemPrefetch
Array Size = 16777216 No. of Runs: 30
==19831== Profiling application: ./Q4_NoMemPrefetch
==19831== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  79.318ms        30  2.6439ms  804.88us  55.932ms  square(float*, float*, int)
      API calls:   81.42%  393.50ms         2  196.75ms  51.151us  393.45ms  cudaMallocManaged
                   16.46%  79.550ms        30  2.6517ms  809.81us  55.945ms  cudaDeviceSynchronize
                    1.86%  8.9996ms         2  4.4998ms  3.2207ms  5.7789ms  cudaFree
                    0.12%  587.42us         1  587.42us  587.42us  587.42us  cuDeviceTotalMem
                    0.08%  391.84us        30  13.061us  6.6870us  76.483us  cudaLaunchKernel
                    0.05%  241.93us       101  2.3950us     258ns  101.31us  cuDeviceGetAttribute
                    0.01%  40.101us         1  40.101us 



In [None]:
%%writefile blocksize.c
const int numThreads = 512;

Overwriting blocksize.c


In [None]:
%%writefile N.c
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch
nvprof ./Q4_MemPrefetch

==19875== NVPROF is profiling process 19875, command: ./Q4_MemPrefetch
Array Size = 1048576 | No. of Runs: 30
==19875== Profiling application: ./Q4_MemPrefetch
==19875== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.8003ms        30  60.010us  58.911us  60.447us  square(float*, float*, int)
      API calls:   97.38%  400.46ms         2  200.23ms  48.482us  400.41ms  cudaMallocManaged
                    2.09%  8.6090ms        30  286.97us  61.122us  6.7463ms  cudaDeviceSynchronize
                    0.14%  584.71us         2  292.36us  125.50us  459.21us  cudaMemPrefetchAsync
                    0.13%  546.86us         1  546.86us  546.86us  546.86us  cuDeviceTotalMem
                    0.12%  491.82us         2  245.91us  70.479us  421.34us  cudaFree
                    0.06%  259.31us        30  8.6430us  5.8970us  39.000us  cudaLaunchKernel
                    0.05%  210.94us       101  2.0880us    



In [None]:
%%shell
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch
nvprof ./Q4_NoMemPrefetch

==19920== NVPROF is profiling process 19920, command: ./Q4_NoMemPrefetch
Array Size = 1048576 No. of Runs: 30
==19920== Profiling application: ./Q4_NoMemPrefetch
==19920== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  4.1436ms        30  138.12us  60.031us  2.3830ms  square(float*, float*, int)
      API calls:   98.47%  387.80ms         2  193.90ms  35.604us  387.77ms  cudaMallocManaged
                    1.10%  4.3154ms        30  143.85us  63.720us  2.4167ms  cudaDeviceSynchronize
                    0.15%  596.27us         1  596.27us  596.27us  596.27us  cuDeviceTotalMem
                    0.14%  558.96us         2  279.48us  149.45us  409.52us  cudaFree
                    0.07%  286.03us        30  9.5340us  5.8770us  50.403us  cudaLaunchKernel
                    0.06%  224.04us       101  2.2180us     280ns  89.219us  cuDeviceGetAttribute
                    0.01%  34.531us         1  34.531us  



In [None]:
%%writefile N.c
// Problem size for the next runs: 1<<22 = 4,194,304 elements.
// This is a compile-time constant, so every program that uses it has to be
// recompiled after this cell for the new size to take effect.
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
# Rebuild the prefetch variant so it reflects the compile-time constants
# last written by the %%writefile cells (e.g. ARRAY_SIZE in N.c), then profile it.
# `&&` aborts the cell when nvcc fails; otherwise the shell would carry on and
# profile a stale ./Q4_MemPrefetch left behind by an earlier build.
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch && nvprof ./Q4_MemPrefetch

==19964== NVPROF is profiling process 19964, command: ./Q4_MemPrefetch
Array Size = 4194304 | No. of Runs: 30
==19964== Profiling application: ./Q4_MemPrefetch
==19964== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  6.7892ms        30  226.31us  224.28us  226.94us  square(float*, float*, int)
      API calls:   95.35%  408.20ms         2  204.10ms  47.586us  408.16ms  cudaMallocManaged
                    3.55%  15.208ms        30  506.94us  229.22us  8.3663ms  cudaDeviceSynchronize
                    0.43%  1.8266ms         2  913.32us  120.52us  1.7061ms  cudaMemPrefetchAsync
                    0.40%  1.6944ms         2  847.20us  173.69us  1.5207ms  cudaFree
                    0.14%  588.44us         1  588.44us  588.44us  588.44us  cuDeviceTotalMem
                    0.07%  282.72us        30  9.4230us  6.0700us  41.532us  cudaLaunchKernel
                    0.05%  234.36us       101  2.3200us    



In [None]:
%%shell
# Rebuild the no-prefetch variant against the current constants, then profile it.
# The two steps are chained with `&&` so that a failed build stops the cell
# instead of letting nvprof measure an out-of-date ./Q4_NoMemPrefetch binary.
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch && nvprof ./Q4_NoMemPrefetch

==20009== NVPROF is profiling process 20009, command: ./Q4_NoMemPrefetch
Array Size = 4194304 No. of Runs: 30
==20009== Profiling application: ./Q4_NoMemPrefetch
==20009== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  18.930ms        30  630.99us  225.92us  12.352ms  square(float*, float*, int)
      API calls:   94.33%  391.27ms         2  195.63ms  57.105us  391.21ms  cudaMallocManaged
                    4.85%  20.135ms        30  671.18us  228.55us  13.424ms  cudaDeviceSynchronize
                    0.54%  2.2339ms         2  1.1170ms  766.28us  1.4677ms  cudaFree
                    0.14%  566.57us         1  566.57us  566.57us  566.57us  cuDeviceTotalMem
                    0.07%  290.04us        30  9.6670us  5.8320us  57.149us  cudaLaunchKernel
                    0.06%  239.18us       101  2.3680us     260ns  108.26us  cuDeviceGetAttribute
                    0.01%  35.050us         1  35.050us  



In [None]:
%%writefile N.c
// Problem size for the next runs: 1<<24 = 16,777,216 elements.
// This is a compile-time constant, so every program that uses it has to be
// recompiled after this cell for the new size to take effect.
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
# Rebuild the prefetch variant against the current constants, then profile it.
# The two steps are chained with `&&` so that a failed build stops the cell
# instead of letting nvprof measure an out-of-date ./Q4_MemPrefetch binary.
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch && nvprof ./Q4_MemPrefetch

==20053== NVPROF is profiling process 20053, command: ./Q4_MemPrefetch
Array Size = 16777216 | No. of Runs: 30
==20053== Profiling application: ./Q4_MemPrefetch
==20053== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  26.797ms        30  893.22us  891.22us  894.99us  square(float*, float*, int)
      API calls:   90.61%  408.08ms         2  204.04ms  57.609us  408.02ms  cudaMallocManaged
                    6.02%  27.124ms        30  904.14us  895.86us  1.0400ms  cudaDeviceSynchronize
                    1.58%  7.0956ms         2  3.5478ms  1.1829ms  5.9126ms  cudaFree
                    1.50%  6.7574ms         2  3.3787ms  640.44us  6.1170ms  cudaMemPrefetchAsync
                    0.13%  602.48us         1  602.48us  602.48us  602.48us  cuDeviceTotalMem
                    0.09%  409.76us        30  13.658us  7.0480us  51.428us  cudaLaunchKernel
                    0.06%  259.04us       101  2.5640us   



In [None]:
%%shell
# Compile, then profile, the no-prefetch variant.
# `&&` makes profiling conditional on a successful build: the binary name is
# reused for every configuration, so running nvprof after a failed nvcc would
# silently report timings for the previous configuration.
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch && nvprof ./Q4_NoMemPrefetch

==20099== NVPROF is profiling process 20099, command: ./Q4_NoMemPrefetch
Array Size = 16777216 No. of Runs: 30
==20099== Profiling application: ./Q4_NoMemPrefetch
==20099== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  70.705ms        30  2.3568ms  888.82us  44.864ms  square(float*, float*, int)
      API calls:   83.91%  428.63ms         2  214.32ms  51.452us  428.58ms  cudaMallocManaged
                   13.97%  71.340ms        30  2.3780ms  884.99us  45.348ms  cudaDeviceSynchronize
                    1.88%  9.5790ms         2  4.7895ms  3.9030ms  5.6760ms  cudaFree
                    0.11%  578.96us         1  578.96us  578.96us  578.96us  cuDeviceTotalMem
                    0.08%  410.74us        30  13.691us  7.0360us  58.405us  cudaLaunchKernel
                    0.04%  213.18us       101  2.1100us     278ns  84.708us  cuDeviceGetAttribute
                    0.01%  32.216us         1  32.216us 



In [None]:
%%writefile blocksize.c
// Compile-time constant consumed by sources that #include blocksize.c.
// NOTE(review): presumably the threads-per-block used for the kernel launch in
// the Q4_*.cu programs — confirm against the source that includes this file.
// Programs must be recompiled after this cell for the new value to apply.
const int numThreads = 1024;

Overwriting blocksize.c


In [None]:
%%writefile N.c
// Problem size for the next runs: 1<<20 = 1,048,576 elements.
// This is a compile-time constant, so every program that uses it has to be
// recompiled after this cell for the new size to take effect.
const int ARRAY_SIZE = 1<<20;

Overwriting N.c


In [None]:
%%shell
# Compile, then profile, the prefetch variant.
# `&&` makes profiling conditional on a successful build: the binary name is
# reused for every configuration, so running nvprof after a failed nvcc would
# silently report timings for the previous configuration.
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch && nvprof ./Q4_MemPrefetch

==20143== NVPROF is profiling process 20143, command: ./Q4_MemPrefetch
Array Size = 1048576 | No. of Runs: 30
==20143== Profiling application: ./Q4_MemPrefetch
==20143== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.1347ms        30  71.157us  69.951us  72.479us  square(float*, float*, int)
      API calls:   96.80%  379.31ms         2  189.66ms  48.316us  379.26ms  cudaMallocManaged
                    2.62%  10.262ms        30  342.08us  73.705us  8.0663ms  cudaDeviceSynchronize
                    0.17%  651.23us         1  651.23us  651.23us  651.23us  cuDeviceTotalMem
                    0.14%  551.80us         2  275.90us  68.476us  483.33us  cudaFree
                    0.14%  550.72us         2  275.36us  117.25us  433.47us  cudaMemPrefetchAsync
                    0.07%  258.03us        30  8.6010us  5.8010us  36.975us  cudaLaunchKernel
                    0.05%  207.53us       101  2.0540us    



In [None]:
%%shell
# Rebuild the no-prefetch variant so it reflects the compile-time constants
# last written by the %%writefile cells, then profile it with nvprof.
# Chained with `&&`: if the build fails the cell stops, rather than profiling
# a stale ./Q4_NoMemPrefetch from an earlier configuration.
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch && nvprof ./Q4_NoMemPrefetch

==20188== NVPROF is profiling process 20188, command: ./Q4_NoMemPrefetch
Array Size = 1048576 No. of Runs: 30
==20188== Profiling application: ./Q4_NoMemPrefetch
==20188== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  4.4049ms        30  146.83us  71.327us  2.3057ms  square(float*, float*, int)
      API calls:   98.47%  403.30ms         2  201.65ms  36.912us  403.27ms  cudaMallocManaged
                    1.11%  4.5372ms        30  151.24us  63.135us  2.3307ms  cudaDeviceSynchronize
                    0.14%  574.39us         2  287.20us  151.53us  422.86us  cudaFree
                    0.14%  562.53us         1  562.53us  562.53us  562.53us  cuDeviceTotalMem
                    0.07%  280.70us        30  9.3560us  6.0470us  50.900us  cudaLaunchKernel
                    0.06%  250.14us       101  2.4760us     260ns  114.88us  cuDeviceGetAttribute
                    0.01%  31.855us         1  31.855us  



In [None]:
%%writefile N.c
// Problem size for the next runs: 1<<22 = 4,194,304 elements.
// Compile-time constant: recompile the programs that use it after running
// this cell, otherwise they keep the previously compiled size.
const int ARRAY_SIZE = 1<<22;

Overwriting N.c


In [None]:
%%shell
# Rebuild the prefetch variant so it reflects the compile-time constants
# last written by the %%writefile cells, then profile it with nvprof.
# Chained with `&&`: if the build fails the cell stops, rather than profiling
# a stale ./Q4_MemPrefetch from an earlier configuration.
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch && nvprof ./Q4_MemPrefetch

==20232== NVPROF is profiling process 20232, command: ./Q4_MemPrefetch
Array Size = 4194304 | No. of Runs: 30
==20232== Profiling application: ./Q4_MemPrefetch
==20232== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  8.0235ms        30  267.45us  266.01us  268.41us  square(float*, float*, int)
      API calls:   96.70%  392.02ms         2  196.01ms  39.569us  391.98ms  cudaMallocManaged
                    2.02%  8.1723ms        30  272.41us  269.94us  286.09us  cudaDeviceSynchronize
                    0.55%  2.2207ms         2  1.1103ms  694.78us  1.5259ms  cudaFree
                    0.46%  1.8554ms         2  927.71us  114.13us  1.7413ms  cudaMemPrefetchAsync
                    0.15%  609.61us         1  609.61us  609.61us  609.61us  cuDeviceTotalMem
                    0.06%  259.10us        30  8.6360us  6.3550us  43.000us  cudaLaunchKernel
                    0.05%  219.35us       101  2.1710us    



In [None]:
%%shell
# Build and profile the no-prefetch variant for the current configuration.
# nvprof only runs when nvcc succeeds (`&&`); a plain two-line script would
# keep going after a compile error and measure the previously built binary.
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch && nvprof ./Q4_NoMemPrefetch

==20276== NVPROF is profiling process 20276, command: ./Q4_NoMemPrefetch
Array Size = 4194304 No. of Runs: 30
==20276== Profiling application: ./Q4_NoMemPrefetch
==20276== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  18.497ms        30  616.58us  270.52us  10.582ms  square(float*, float*, int)
      API calls:   94.79%  400.85ms         2  200.42ms  43.780us  400.81ms  cudaMallocManaged
                    4.41%  18.634ms        30  621.13us  274.42us  10.590ms  cudaDeviceSynchronize
                    0.54%  2.2973ms         2  1.1486ms  774.95us  1.5223ms  cudaFree
                    0.13%  550.70us         1  550.70us  550.70us  550.70us  cuDeviceTotalMem
                    0.07%  290.01us        30  9.6660us  5.9350us  58.044us  cudaLaunchKernel
                    0.05%  218.23us       101  2.1600us     262ns  86.527us  cuDeviceGetAttribute
                    0.01%  36.385us         1  36.385us  



In [None]:
%%writefile N.c
// Problem size for the next runs: 1<<24 = 16,777,216 elements.
// Compile-time constant: recompile the programs that use it after running
// this cell, otherwise they keep the previously compiled size.
const int ARRAY_SIZE = 1<<24;

Overwriting N.c


In [None]:
%%shell
# Build and profile the prefetch variant for the current configuration.
# nvprof only runs when nvcc succeeds (`&&`); a plain two-line script would
# keep going after a compile error and measure the previously built binary.
nvcc Q4_MemPrefetch.cu -o Q4_MemPrefetch && nvprof ./Q4_MemPrefetch

==20320== NVPROF is profiling process 20320, command: ./Q4_MemPrefetch
Array Size = 16777216 | No. of Runs: 30
==20320== Profiling application: ./Q4_MemPrefetch
==20320== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  32.547ms        30  1.0849ms  1.0816ms  1.0903ms  square(float*, float*, int)
      API calls:   88.00%  401.76ms         2  200.88ms  47.871us  401.71ms  cudaMallocManaged
                    9.10%  41.550ms        30  1.3850ms  1.0856ms  9.9416ms  cudaDeviceSynchronize
                    1.34%  6.1324ms         2  3.0662ms  550.36us  5.5820ms  cudaFree
                    1.29%  5.9100ms         2  2.9550ms  131.99us  5.7780ms  cudaMemPrefetchAsync
                    0.13%  577.94us         1  577.94us  577.94us  577.94us  cuDeviceTotalMem
                    0.07%  332.18us        30  11.072us  7.5320us  43.854us  cudaLaunchKernel
                    0.05%  234.71us       101  2.3230us   



In [None]:
%%shell
# Build and profile the no-prefetch variant for the current configuration.
# The commands are joined with `&&` so a compile failure ends the cell here;
# without it nvprof would still run and report numbers from a stale binary.
nvcc Q4_NoMemPrefetch.cu -o Q4_NoMemPrefetch && nvprof ./Q4_NoMemPrefetch

==20367== NVPROF is profiling process 20367, command: ./Q4_NoMemPrefetch
Array Size = 16777216 No. of Runs: 30
==20367== Profiling application: ./Q4_NoMemPrefetch
==20367== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  76.110ms        30  2.5370ms  1.0681ms  44.947ms  square(float*, float*, int)
      API calls:   82.55%  410.15ms         2  205.08ms  48.444us  410.10ms  cudaMallocManaged
                   15.35%  76.255ms        30  2.5418ms  1.0726ms  44.961ms  cudaDeviceSynchronize
                    1.84%  9.1482ms         2  4.5741ms  3.3094ms  5.8388ms  cudaFree
                    0.12%  613.83us         1  613.83us  613.83us  613.83us  cuDeviceTotalMem
                    0.08%  381.12us        30  12.703us  7.8960us  69.177us  cudaLaunchKernel
                    0.05%  233.88us       101  2.3150us     265ns  88.360us  cuDeviceGetAttribute
                    0.01%  36.480us         1  36.480us 

