In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-2vx3mk2k
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-2vx3mk2k
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=0abaeb55270cc3c61a3735ee73cf40067c7da30469c2613312adfb08fe67e432
  Stored in directory: /tmp/pip-ephem-wheel-cache-9gzi_8ax/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin


In [None]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
%%cuda --name dotproduct.cu
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_DIM 4

using namespace std;
typedef unsigned int ui;

/*
 * Print a short summary of a CUDA device to stdout.
 *
 * devProp: properties structure previously filled in by the CUDA runtime.
 *
 * The three memory-size fields are size_t, so they are printed with %zu;
 * the remaining fields are plain ints.
 */
void printGPUproperties(cudaDeviceProp &devProp)
{
    printf("+++ Name : %s\n", devProp.name);
    printf("    Total global memory : %zu bytes\n", devProp.totalGlobalMem);
    printf("    Total shared memory per block : %zu bytes\n", devProp.sharedMemPerBlock);
    printf("    Total shared memory per SM : %zu bytes\n", devProp.sharedMemPerMultiprocessor);
    printf("    Memory bus width : %d bits\n", devProp.memoryBusWidth);
    printf("    Memory clock rate : %d kHz\n", devProp.memoryClockRate);
    printf("    Warp size : %d\n", devProp.warpSize);
}

/*
 * Write a header line with the element count, then every element of A
 * formatted with two decimals on a single indented line.
 *
 * A: host array to print.
 * n: number of elements of A to print.
 */
void print_array(float *A, ui n)
{
    cout << "+++ Printing float array of length = " << n << endl << "    ";

    ui idx = 0;
    while(idx != n)
    {
        printf("%.2f ", A[idx]);
        ++idx;
    }

    cout << endl;
}

/*
 * Sequential reference dot product computed on the host.
 *
 * A, B: host arrays holding at least n elements each.
 * n:    number of element pairs to multiply and accumulate.
 *
 * Returns the single-precision sum of A[i] * B[i] for i in [0, n).
 * A non-positive n yields 0.
 */
float cpudot(float *A, float *B, int n)
{
    float res = 0.0f;

    // Signed index: comparing an unsigned index against a negative n would
    // convert n to a huge unsigned bound and read far past the arrays.
    for(int i = 0; i < n; i++)
        res = res + (A[i] * B[i]);

    return res;
}

/*
 * Naive per-block dot product.
 *
 * Each thread stores one product d_A[i] * d_B[i] (or 0 when its index is
 * not below n) into a shared tile; after the barrier, thread 0 of the block
 * adds up the tile serially and writes the block's partial sum to
 * d_C[blockIdx.x].
 *
 * The tile holds BLOCK_DIM floats and is indexed by threadIdx.x, so the
 * launch is expected to use blockDim.x <= BLOCK_DIM.
 */
__global__ void dotproduct(float *d_A, float *d_B, float *d_C, int n)
{
    __shared__ float tile[BLOCK_DIM];

    const ui lane = threadIdx.x;
    const ui gid = blockIdx.x * blockDim.x + lane;

    float product = 0;
    if(gid < n)
        product = d_A[gid] * d_B[gid];
    tile[lane] = product;
    __syncthreads();

    // Every thread has passed the barrier; only thread 0 has work left.
    if(lane != 0)
        return;

    float blockSum = 0.0f;
    ui k = 0;
    while(k < blockDim.x)
    {
        blockSum += tile[k];
        ++k;
    }

    d_C[blockIdx.x] = blockSum;
}

/*
 * Per-block dot product using a shared-memory tree reduction.
 *
 * Each thread stores one product d_A[i] * d_B[i] (or 0 when its index is
 * outside [0, n)) into a shared tile. The tile is then folded in halves
 * until a single value remains, which thread 0 writes to d_C[blockIdx.x].
 *
 * d_A, d_B: device input vectors with at least n elements.
 * d_C:      device output with at least gridDim.x elements.
 * n:        number of valid input elements.
 *
 * Precondition: blockDim.x <= BLOCK_DIM (the tile is indexed by threadIdx.x).
 *
 * The previous version hard-coded the last steps as tile[tid + 32],
 * tile[tid + 16], ... which index past the BLOCK_DIM-sized tile whenever
 * blockDim.x < 64, and it called __syncthreads() inside an `if(tid < 32)`
 * branch that not every thread of a larger block would enter.
 */
__global__ void dotproduct1(float *d_A, float *d_B, float *d_C, int n)
{
    __shared__ float tile[BLOCK_DIM];
    ui i = blockIdx.x * blockDim.x + threadIdx.x;
    ui tid = threadIdx.x;

    tile[tid] = (n > 0 && i < (ui)n) ? (d_A[i] * d_B[i]) : 0.0f;
    __syncthreads();

    // Fold the upper part of the live range onto the lower part. Rounding
    // `half` up keeps the middle element when `active` is odd, so any
    // block size works, not just powers of two. The loop bounds depend
    // only on blockDim.x, so every thread reaches every barrier.
    for(ui active = blockDim.x; active > 1; )
    {
        ui half = (active + 1) / 2;
        if(tid + half < active)
            tile[tid] += tile[tid + half];
        __syncthreads();
        active = half;
    }

    if(tid == 0)
        d_C[blockIdx.x] = tile[0];
}

/*
 * One pass of a multi-pass sum reduction.
 *
 * Each block consumes 2 * blockDim.x consecutive elements of d_B: every
 * thread adds two elements that are blockDim.x apart while loading them
 * into a shared tile, the tile is folded in halves, and thread 0 writes the
 * block's sum to d_C[blockIdx.x].
 *
 * d_B: device input with at least k elements.
 * d_C: device output with at least gridDim.x elements.
 * k:   number of valid elements in d_B; indices at or beyond k contribute 0.
 *
 * Precondition: blockDim.x <= BLOCK_DIM (the tile is indexed by threadIdx.x).
 * To cover the whole input, launch ceil(k / (2 * blockDim.x)) blocks.
 *
 * The previous version ignored k (reading past the end of d_B when the
 * input was not a multiple of 2 * BLOCK_DIM), indexed tile[tid + 32] and
 * friends past the BLOCK_DIM-sized tile, and called __syncthreads() inside
 * a branch that not every thread of a larger block would enter.
 */
__global__ void reduce(float *d_B, float *d_C, int k)
{
    __shared__ float tile[BLOCK_DIM];
    ui i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    ui tid = threadIdx.x;
    const ui len = (k > 0) ? (ui)k : 0u;

    // Guard both loads so a partially filled last block pads with zeros.
    float lo = (i < len) ? d_B[i] : 0.0f;
    float hi = (i + blockDim.x < len) ? d_B[i + blockDim.x] : 0.0f;
    tile[tid] = lo + hi;
    __syncthreads();

    // Fold the upper part of the live range onto the lower part. Rounding
    // `half` up keeps the middle element when `active` is odd. The loop
    // bounds depend only on blockDim.x, so every thread reaches every barrier.
    for(ui active = blockDim.x; active > 1; )
    {
        ui half = (active + 1) / 2;
        if(tid + half < active)
            tile[tid] += tile[tid + half];
        __syncthreads();
        active = half;
    }

    if(tid == 0)
        d_C[blockIdx.x] = tile[0];
}

/*
 * Sum the first n elements of a on the host, front to back, and store the
 * total in a[0]. The other elements are left untouched.
 *
 * a: host array; a[0] is overwritten with the sum.
 * n: number of elements to add up.
 */
void cpuReduce(float *a, ui n)
{
    float total = 0.0f;
    float *cursor = a;

    for(ui remaining = n; remaining != 0; --remaining)
        total += *cursor++;

    *a = total;
}

// Report a failed CUDA runtime call and terminate; used through CUDA_CHECK.
static void checkCuda(cudaError_t status, const char *expr, int line)
{
    if(status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error at line %d: %s failed: %s\n", line, expr, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

// Allocate `count` floats on the host, terminating with a message on failure.
static float *allocHostFloats(size_t count)
{
    float *p = (float *)malloc(count * sizeof(float));
    if(p == NULL)
    {
        fprintf(stderr, "Host allocation of %zu floats failed\n", count);
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Driver: computes the dot product of two length-n vectors of ones on the
 * CPU and on the GPU (naive kernel, then tree-reduction kernel followed by
 * repeated `reduce` passes) and prints timings and intermediate arrays.
 *
 * Usage: dotproduct <num_elements>
 *
 * Returns 0 on success and 1 on a usage error; terminates with a failure
 * status if any CUDA call, kernel launch or host allocation fails.
 */
int main(int argc, char* argv[])
{
    if(argc < 2)
    {
        fprintf(stderr, "Usage: %s <num_elements>\n", (argc > 0) ? argv[0] : "dotproduct");
        return 1;
    }

    // The kernels take the length as an int, so it must be in [1, INT_MAX].
    char *parseEnd = NULL;
    long parsed = strtol(argv[1], &parseEnd, 10);
    if(parseEnd == argv[1] || *parseEnd != '\0' || parsed <= 0 || parsed > 2147483647L)
    {
        fprintf(stderr, "num_elements must be an integer in [1, 2147483647], got '%s'\n", argv[1]);
        return 1;
    }

    cudaDeviceProp devprop;
    CUDA_CHECK(cudaGetDeviceProperties(&devprop , 0));
    printGPUproperties(devprop);

    ui n = (ui)parsed; //num_elements in each vector
    const size_t size = n * sizeof(float);
    cout<<"+++ n = "<<n<<endl;
    cout<<"    size = "<<size<<" bytes"<<endl;

    float *A, *B; //for cpu computations
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *d_tmp; //for gpu computations

    // One partial sum per block of the dot-product kernels.
    ui n1 = (n + BLOCK_DIM - 1) / BLOCK_DIM;
    const size_t size1 = n1 * sizeof(float);
    cout<<"    n1 = "<<n1<<endl;

    //Allocate Memory to cpu variables
    h_A = allocHostFloats(n);
    h_B = allocHostFloats(n);
    h_C = allocHostFloats(n1);
    A = allocHostFloats(n);
    B = allocHostFloats(n);

    //Allocate Memory to gpu variables
    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size1));
    // Second partial-sum buffer so the reduce passes can ping-pong on the
    // device instead of reallocating and bouncing through the host.
    CUDA_CHECK(cudaMalloc((void **)&d_tmp, size1));

    //Initialise CPU and GPU variables
    for(ui i=0; i<n; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 1.0f;

        A[i] = h_A[i];
        B[i] = h_B[i];
    }
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    print_array(A, n);

    //CPU dot product time
    clock_t t1, t2;
    t1 = clock();
    float result = cpudot(A, B, n);
    t2 = clock();
    printf("+++ n = %u, CPU Result = %.2f, CPU Dot Product time taken = %lf ms\n", n, result, ((double)(t2-t1)/(double)CLOCKS_PER_SEC) * 1000);

    //GPU dotproduct
    float ms;
    dim3 block(BLOCK_DIM, 1, 1);
    dim3 grid(n1, 1, 1);
    cout<<"+++ Block Dim = ("<<block.x<<", "<<block.y<<")"<<endl;
    cout<<"    Grid Dim = ("<<grid.x<<", "<<grid.y<<")"<<endl;

    cudaEvent_t c1, c2;
    CUDA_CHECK(cudaEventCreate(&c1));
    CUDA_CHECK(cudaEventCreate(&c2));

    CUDA_CHECK(cudaEventRecord(c1));
    dotproduct<<<grid, block>>>(d_A, d_B, d_C, n);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(c2));

    CUDA_CHECK(cudaEventSynchronize(c2)); // surfaces in-kernel faults
    CUDA_CHECK(cudaEventElapsedTime(&ms, c1, c2));
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size1, cudaMemcpyDeviceToHost));

    printf("+++ n = %u, GPU Dot Product Reduce naive time taken = %lf ms\n", n, ms);
    print_array(h_C, n1);

    // Clear the partial sums so the second kernel cannot appear to succeed
    // by leaving the first kernel's results in place.
    CUDA_CHECK(cudaMemset(d_C, 0, size1));

    //GPU efficient dot reduce
    CUDA_CHECK(cudaEventRecord(c1));
    dotproduct1<<<grid, block>>>(d_A, d_B, d_C, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(c2));

    CUDA_CHECK(cudaEventSynchronize(c2));
    CUDA_CHECK(cudaEventElapsedTime(&ms, c1, c2));
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size1, cudaMemcpyDeviceToHost));

    printf("+++ n = %u, GPU Dot Product Reduce efficient time taken = %lf ms\n", n, ms);
    print_array(h_C, n1);

    // Repeatedly shrink the partial sums on the device until at most
    // BLOCK_DIM remain; those are then added up on the host.
    float *d_in = d_C;
    float *d_out = d_tmp;
    ui curlen = n1;
    while(curlen > BLOCK_DIM)
    {
        printf("In while for len = %u\n", curlen);

        // Each block consumes 2 * BLOCK_DIM inputs. Rounding up keeps the
        // tail: halving ceil(len / BLOCK_DIM) dropped elements whenever
        // that quotient was odd.
        grid.x = (curlen + 2 * BLOCK_DIM - 1) / (2 * BLOCK_DIM);

        cout<<"+++ Block Dim = ("<<block.x<<", "<<block.y<<")"<<endl;
        cout<<"    Grid Dim = ("<<grid.x<<", "<<grid.y<<")"<<endl;
        reduce<<<grid, block>>>(d_in, d_out, curlen);
        CUDA_CHECK(cudaGetLastError());
        // Blocking copy: also reports any fault raised inside the kernel.
        // h_C was sized for n1 floats, which bounds every later pass.
        CUDA_CHECK(cudaMemcpy(h_C, d_out, grid.x * sizeof(float), cudaMemcpyDeviceToHost));

        // This pass's output is the next pass's input.
        float *swapTmp = d_in;
        d_in = d_out;
        d_out = swapTmp;

        curlen = grid.x;
        printf("curlen = %u\n", curlen);
        print_array(h_C, curlen);
    }
    cpuReduce(h_C, curlen);
    printf("Final ans = ");
    printf("%.2f\n", h_C[0]);

    // Release everything that was acquired above.
    CUDA_CHECK(cudaEventDestroy(c1));
    CUDA_CHECK(cudaEventDestroy(c2));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaFree(d_tmp));
    free(h_A);
    free(h_B);
    free(h_C);
    free(A);
    free(B);

    return 0;
}

'File written in /content/src/dotproduct.cu'

In [None]:
!nvcc /content/src/dotproduct.cu -o /content/src/dotproduct

In [None]:
!/content/src/dotproduct 256 # 33554432

+++ Name : Tesla T4
    Total global memory : 15812263936 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Memory bus width : 256 bits
    Memory clock rate : 5001000 kHz
    Warp size : 32
+++ n = 256
    size = 1024 bytes
    n1 = 64
+++ Printing float array of length = 256
    1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 

In [None]:
!nvprof --events warps_launched,local_load --metrics all /content/src/dotproduct 8192

==513== NVPROF is profiling process 513, command: /content/src/dotproduct 8192
+++ Name : Tesla P4
    Total global memory : 7981694976 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 98304 bytes
    Memory bus width : 256 bits
    Memory clock rate : 3003000 kHz
    Warp size : 32
+++ n = 8192
    size = 32768 bytes
    n1 = 8
+++ Printing float array of length = 8192
    1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 

In [None]:
!cuda-memcheck /content/src/dotproduct 256

+++ Name : Tesla T4
    Total global memory : 15812263936 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Memory bus width : 256 bits
    Memory clock rate : 5001000 kHz
    Warp size : 32
+++ n = 256
    size = 1024 bytes
    n1 = 64
+++ n = 256, CPU Result = 256.00, CPU Dot Product time taken = 0.003000 ms
+++ Block Dim = (4, 1)
    Grid Dim = (64, 1)
+++ n = 256, GPU Dot Product Reduce naive time taken = 3.104224 ms
+++ Printing float array of length = 64
    4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 
+++ n = 256, GPU Dot Product Reduce efficient time taken = 3.104224 ms
+++ Printing float array of length = 64
    0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.

In [None]:
!/content/src/dotproduct 2048 # 33554432

+++ Name : Tesla P4
    Total global memory : 7981694976 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 98304 bytes
    Memory bus width : 256 bits
    Memory clock rate : 3003000 kHz
    Warp size : 32
+++ n = 2048
    size = 8192 bytes
    n1 = 2
+++ Printing float array of length = 2048
    1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00 13.00 14.00 15.00 16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00 31.00 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.00 46.00 47.00 48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00 61.00 62.00 63.00 64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00 80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00 96.00 97.00 98.00 99.00 100.00 101.00 102.00 103.00 104.00 105.00 106.00 107.00 108.00 109.00 110.00 111

In [None]:
!/content/src/dotproduct 21 # 33554432

+++ Name : Tesla T4
    Total global memory : 15812263936 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Memory bus width : 256 bits
    Memory clock rate : 5001000 kHz
    Warp size : 32
+++ n = 21
    size = 84 bytes
    n1 = 6
+++ Printing float array of length = 21
    1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00 13.00 14.00 15.00 16.00 17.00 18.00 19.00 20.00 21.00 
+++ Printing float array of length = 21
    1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00 13.00 14.00 15.00 16.00 17.00 18.00 19.00 20.00 21.00 
+++ n = 21, CPU Result = 3311.00, CPU Dot Product time taken = 0.002000 ms
+++ Block Dim = (4, 1)
    Grid Dim = (6, 1)
+++ Printing float array of length = 6
    30.00 174.00 446.00 846.00 1374.00 441.00 
