In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ac2ggrdy
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ac2ggrdy
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=1707424a068a39cb5561ee9a1a8831dc97c2adb4c8c265da59d47c07c4223493
  Stored in directory: /tmp/pip-ephem-wheel-cache-d2simn5_/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu

#include <cstdlib>
#include <iostream>
#include <time.h>
using namespace std;


// Computes result = mat1 * mat2 for row-major integer matrices.
//
//   mat1   - device pointer, r x m elements
//   mat2   - device pointer, m x c elements
//   result - device pointer, r x c elements (fully overwritten)
//   r,m,c  - matrix dimensions
//
// Launch layout: any 1D/2D/3D grid and block shape is valid. Each block is
// flattened to a single block number and walks over rows; each thread is
// flattened to a single number inside its block and walks over columns.
// Both walks are strided, so matrices larger than the launch are still
// fully covered. No shared memory is used.
__global__ void multiplyParallel(int *mat1, int *mat2, int *result, int r, int m, int c)
{
    // Flattened sizes and positions of this thread within the launch.
    const int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    const int blocksInGrid = gridDim.x * gridDim.y * gridDim.z;
    const int threadInBlock = (threadIdx.z * blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    const int blockInGrid = (blockIdx.z * gridDim.x * gridDim.y) + (gridDim.x * blockIdx.y) + blockIdx.x;

    // Rows are distributed over blocks, columns over the threads of a block.
    // Neighbouring threads therefore touch neighbouring elements of mat2 and
    // result, while all threads of a block read the same element of mat1.
    for(int i = blockInGrid; i < r; i += blocksInGrid)
    {
        for(int j = threadInBlock; j < c; j += threadsPerBlock)
        {
            int sum = 0;

            for(int k = 0; k < m; k++)
                sum += (mat1[(i * m) + k] * mat2[(k * c) + j]);

            result[(i * c) + j] = sum;
        }
    }
}

// Element-wise sum of two integer vectors: result[e] = a[e] + b[e].
//
// Launch layout: the element handled is selected by blockIdx.x alone, so the
// grid's x size decides how many elements are written. There is no length
// parameter and no bounds check; all three buffers must hold at least
// gridDim.x elements.
__global__ void vectorAddition(int *a, int *b, int *result)
{
    const int element = blockIdx.x;

    const int lhs = a[element];
    const int rhs = b[element];

    result[element] = lhs + rhs;
}


// Writes an r x c row-major matrix to standard output.
// A line break is emitted before the first element of every row and each
// element is followed by a single space.
void printMatrix(int *arr, int r, int c)
{
    const int total = r * c;
    int idx = 0;

    while(idx < total)
    {
        // Index is a multiple of the column count: a new row starts here.
        const bool startsRow = (idx % c) == 0;
        if(startsRow)
            cout<<endl;

        cout<<arr[idx]<<" ";
        ++idx;
    }
}

// Allocates an r x c integer matrix on the host heap with every element 0.
// The caller owns the returned array and must release it with delete[].
int* declareMatrix(int r, int c)
{
    // The trailing () value-initialises the array, i.e. zero-fills it.
    return new int[r * c]();
}

// Fills the first r * c elements of arr, in ascending index order, with
// values drawn from rand() and reduced to the range 0..9.
void assignMatrix(int *arr, int r, int c)
{
    const int count = r * c;
    int pos = 0;

    while(pos < count)
    {
        arr[pos] = rand() % 10;
        ++pos;
    }
}

// Host reference implementation of the matrix product.
//
//   arr1 - r x m row-major matrix
//   arr2 - m x c row-major matrix
//
// Returns a newly allocated r x c row-major matrix holding arr1 * arr2.
// The caller owns the result and must release it with delete[].
int* multiply(int *arr1, int *arr2, int r, int m, int c)
{
    int *product = declareMatrix(r, c);

    for(int row = 0; row < r; ++row)
    {
        for(int col = 0; col < c; ++col)
        {
            // Dot product of one row of arr1 with one column of arr2.
            int dot = 0;
            for(int inner = 0; inner < m; ++inner)
                dot += arr1[(row * m) + inner] * arr2[(inner * c) + col];

            product[(row * c) + col] = dot;
        }
    }

    return product;
}

// Stops the program with a readable message when a CUDA runtime call failed.
//   status - value returned by the CUDA runtime call
//   what   - short description of the operation, used in the message
static void checkCuda(cudaError_t status, const char *what)
{
    if(status != cudaSuccess)
    {
        cerr<<"\nCUDA error ("<<what<<"): "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
}

// Multiplies two random matrices (r x m times m x c) on the host and on the
// GPU, prints how long each took, and reports whether both results agree.
//   printResults - when true, both result matrices are printed as well
static void runMultiplyBenchmark(int r, int m, int c, bool printResults)
{
    cout<<"\nr: "<<r<<" m: "<<m<<" c: "<<c<<endl;

    int *mat1 = declareMatrix(r, m), *mat2 = declareMatrix(m, c);
    int *resultParallel = declareMatrix(r, c);
    int *d_mat1 = NULL, *d_mat2 = NULL, *d_result = NULL;

    const size_t bytesMat1 = sizeof(int) * r * m;
    const size_t bytesMat2 = sizeof(int) * m * c;
    const size_t bytesResult = sizeof(int) * r * c;

    assignMatrix(mat1, r, m);
    assignMatrix(mat2, m, c);

    // ---- Host reference ----
    clock_t t = clock();

    int *resultSerial = multiply(mat1, mat2, r, m, c);

    t = clock() - t;

    cout<<"\nIt took "<<(double)t / CLOCKS_PER_SEC<<" seconds - Serial"<<endl;

    if(printResults)
        printMatrix(resultSerial, r, c);
    cout<<endl;

    // ---- GPU ----
    checkCuda(cudaMalloc(&d_mat1, bytesMat1), "allocating mat1");
    checkCuda(cudaMalloc(&d_mat2, bytesMat2), "allocating mat2");
    checkCuda(cudaMalloc(&d_result, bytesResult), "allocating result");

    checkCuda(cudaMemcpy(d_mat1, mat1, bytesMat1, cudaMemcpyHostToDevice), "copying mat1 to device");
    checkCuda(cudaMemcpy(d_mat2, mat2, bytesMat2, cudaMemcpyHostToDevice), "copying mat2 to device");

    // 4096 blocks of 1024 threads: both counts exceed the largest dimension
    // generated by the callers below.
    dim3 GridDimensions(16, 16, 16);
    dim3 BlockDimensions(16, 8, 8);

    t = clock();

    multiplyParallel<<<GridDimensions, BlockDimensions>>>(d_mat1, d_mat2, d_result, r, m, c);
    checkCuda(cudaGetLastError(), "launching multiplyParallel");

    // A kernel launch returns immediately; wait for the kernel to finish so
    // the measured interval covers the computation, not just the launch.
    checkCuda(cudaDeviceSynchronize(), "running multiplyParallel");

    t = clock() - t;

    cout<<"\n\nIt took "<<(double)t / CLOCKS_PER_SEC<<" seconds - Parallel"<<endl;

    checkCuda(cudaMemcpy(resultParallel, d_result, bytesResult, cudaMemcpyDeviceToHost), "copying result to host");

    if(printResults)
        printMatrix(resultParallel, r, c);

    // ---- Verification ----
    bool matches = true;
    for(int i = 0; i < (r * c); i++)
    {
        if(resultSerial[i] != resultParallel[i])
        {
            matches = false;
            break;
        }
    }

    if(matches)
      cout<<"\n\nResult is correct\n";
    else
      cout<<"\n\nResult is incorrect\n";

    delete[] mat1;
    delete[] mat2;
    delete[] resultSerial;
    delete[] resultParallel;

    // Release each device buffer exactly once.
    checkCuda(cudaFree(d_mat1), "freeing mat1");
    checkCuda(cudaFree(d_mat2), "freeing mat2");
    checkCuda(cudaFree(d_result), "freeing result");
}

// Benchmarks the product of two random matrices whose dimensions are each
// drawn from 1..1000. Results are compared but not printed.
void multiply2mats()
{
    // The 1 + ... keeps every dimension non-zero, so no matrix is empty and
    // no zero-byte device allocation is requested.
    int r = 1 + rand() % 1000, m = 1 + rand() % 1000, c = 1 + rand() % 1000;

    runMultiplyBenchmark(r, m, c, false);
}

// Benchmarks a row vector (1 x m) times a matrix (m x c), with m and c drawn
// from 1..10, and prints both results.
void multiplyMatrixVec()
{
    int r = 1, m = 1 + rand() % 10, c = 1 + rand() % 10;

    runMultiplyBenchmark(r, m, c, true);
}

// Adds two small random integer vectors on the GPU and prints the operands
// and their sum. If any CUDA call fails, the error is reported and no sum is
// printed.
void add2mats()
{
    int n = 3;//rand() % 100;

    // c is zero-filled so it never holds indeterminate values.
    int *a = new int[n], *b = new int[n], *c = new int[n]();
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    const size_t bytes = sizeof(int) * n;

    for(int i = 0; i < n; i++)
    {
        a[i] = rand() % 20;
        b[i] = rand() % 20;
    }

    // Run the CUDA steps in order, remembering the first failure and
    // skipping everything after it.
    cudaError_t status = cudaMalloc(&d_a, bytes);
    if(status == cudaSuccess)
        status = cudaMalloc(&d_b, bytes);
    if(status == cudaSuccess)
        status = cudaMalloc(&d_c, bytes);

    if(status == cudaSuccess)
        status = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    if(status == cudaSuccess)
        status = cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);

    if(status == cudaSuccess)
    {
        // One block per element, one thread per block.
        vectorAddition<<<n, 1>>>(d_a, d_b, d_c);

        // A launch does not return a status itself; query it explicitly.
        status = cudaGetLastError();
    }

    // This blocking copy also waits for the kernel to finish.
    if(status == cudaSuccess)
        status = cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);

    if(status != cudaSuccess)
    {
        cerr<<"\nCUDA error in add2mats: "<<cudaGetErrorString(status)<<endl;
    }
    else
    {
        if(n < 20)
        {
            cout<<"\nA: ";
            for(int i = 0; i < n; i++)
              cout<<a[i]<<" ";
            cout<<endl;

            cout<<"\nB: ";
            for(int i = 0; i < n; i++)
              cout<<b[i]<<" ";
            cout<<endl;
        }

        cout<<"\nSum: ";
        for(int i = 0; i < n; i++)
          cout<<c[i]<<" ";
        cout<<endl;
    }

    delete[] a;
    delete[] b;
    delete[] c;

    // Pointers that were never allocated are still NULL here.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}




// Entry point: seeds the random generator from the current time, then runs
// the three demos in sequence, separated by blank lines.
int main()
{
    const char *wideGap = "\n\n\n\n";
    const char *gap = "\n\n\n";

    srand(time(NULL));

    // Matrix x matrix benchmark.
    multiply2mats();
    cout<<wideGap;

    // Row vector x matrix benchmark.
    multiplyMatrixVec();
    cout<<gap;

    // Vector addition demo.
    add2mats();
    cout<<gap;

    return 0;
}


r: 814 m: 816 c: 201

It took 0.697051 seconds - Serial



It took 1.8e-05 seconds - Parallel


Result is correct





r: 1 m: 9 c: 8

It took 1.6e-05 seconds - Serial

296 356 247 136 227 296 233 169 


It took 1.7e-05 seconds - Parallel

296 356 247 136 227 296 233 169 

Result is correct




A: 16 2 15 

B: 8 19 17 

Sum: 24 21 32 




