In [None]:
%%writefile lab3_ex2.cu

#include  <math.h>
#include  <stdio.h>
#include  <stdlib.h>
#include  <sys/time.h>
#include  <random>
//#define DataType double
#define DataType float
// Current time of day in seconds, as reported by gettimeofday().
// The whole-second and microsecond fields are combined into one double so
// that two readings can be subtracted to obtain an elapsed host-side time.
double cpuSeconds() {
  struct timeval now;
  gettimeofday(&now, NULL);

  const double wholeSeconds = (double)now.tv_sec;
  const double fractionalSeconds = (double)now.tv_usec * 1.e-6;
  return (wholeSeconds + fractionalSeconds);
}
// Compute C = A * B for dense matrices stored in row-major order.
//
// Launch layout: a 2D grid of 2D blocks. The x dimension of the global
// thread index selects the column of C and the y dimension selects the row;
// each in-range thread produces exactly one element of C. Threads that fall
// outside the (numARows x numBColumns) output return immediately, so the grid
// may over-cover the matrix. No shared memory is used.
//
// Parameters:
//   A            numARows x numAColumns input matrix
//   B            numAColumns x numBColumns input matrix
//   C            numARows x numBColumns output matrix
//   numARows     rows of A (and of C)
//   numAColumns  columns of A; also used as the length of the dot product
//   numBRows     not read by the kernel; the caller is expected to have
//                verified numBRows == numAColumns
//   numBColumns  columns of B (and of C)
__global__ void gemm(DataType *A, DataType *B, DataType *C, int numARows,
                      int numAColumns, int numBRows, int numBColumns){
  //@@ Insert code to implement matrix multiplication here
  const int row = blockDim.y * blockIdx.y + threadIdx.y;
  const int col = blockDim.x * blockIdx.x + threadIdx.x;
  if ((col >= numBColumns) || (row >= numARows)) return;

  // Flat offsets are formed in size_t: row * numColumns can exceed INT_MAX
  // for large matrices even when each dimension fits in an int.
  const size_t rowOffsetA = (size_t)row * (size_t)numAColumns;
  const size_t strideB = (size_t)numBColumns;

  // Accumulate in the element type rather than a hard-coded double, so a
  // float build does not pay for a float->double conversion and an FP64 add
  // on every iteration, and so the kernel uses the same precision as the
  // host reference computation.
  DataType sum = 0;
  for (int k = 0; k < numAColumns; k++) {
    sum += A[rowOffsetA + k] * B[(size_t)k * strideB + col];
  }
  C[(size_t)row * strideB + col] = sum;
}

// Evaluate a CUDA runtime call and terminate with a diagnostic if it did not
// return cudaSuccess. Guarded so an existing definition is not redefined.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
  do {                                                                      \
    cudaError_t cudaCheckErr_ = (call);                                     \
    if (cudaCheckErr_ != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(cudaCheckErr_));                           \
      exit(EXIT_FAILURE);                                                   \
    }                                                                       \
  } while (0)
#endif

// Driver: multiplies two random matrices on the GPU and checks the result
// against a CPU reference.
//
// Usage: <program> numARows numAColumns numBRows numBColumns
//
// Steps: parse and validate the dimensions, fill A and B with uniform random
// values in [0, 1) from a fixed seed, compute the reference product on the
// host, run the gemm kernel, time the host->device copy / kernel / device->host
// copy with cpuSeconds(), and compare the two results.
//
// Returns 0 when the GPU result matches the reference, -1 on a mismatch, and
// 1 on bad arguments or host allocation failure. A failing CUDA call
// terminates the process through CUDA_CHECK.
int main(int argc, char **argv) {

  DataType *hostA = NULL; // The A matrix
  DataType *hostB = NULL; // The B matrix
  DataType *hostC = NULL; // The output C matrix
  DataType *resultRef = NULL; // The reference result
  DataType *deviceA = NULL;
  DataType *deviceB = NULL;
  DataType *deviceC = NULL;
  int numARows;    // number of rows in the matrix A
  int numAColumns; // number of columns in the matrix A
  int numBRows;    // number of rows in the matrix B
  int numBColumns; // number of columns in the matrix B
  int numCRows;
  int numCColumns;

  //@@ Insert code below to read in numARows, numAColumns, numBColumns from args
  // argv[1..4] are dereferenced below, so refuse to run without them.
  if (argc < 5) {
    fprintf(stderr, "Usage: %s numARows numAColumns numBRows numBColumns\n",
            (argc > 0) ? argv[0] : "lab3_ex2");
    return 1;
  }
  numARows = atoi(argv[1]);
  numAColumns = atoi(argv[2]);
  numBRows = atoi(argv[3]);
  numBColumns = atoi(argv[4]);
  numCRows = numARows;
  numCColumns = numBColumns;
  printf("Input matrix dim (%d x %d) (%d x %d) (%d x %d)\n", numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
  // atoi yields 0 for non-numeric text; zero or negative sizes would lead to
  // empty allocations and an invalid launch configuration.
  if (numARows <= 0 || numAColumns <= 0 || numBRows <= 0 || numBColumns <= 0) {
    fprintf(stderr, "ERROR: all matrix dimensions must be positive integers\n");
    return 1;
  }
  if (numAColumns != numBRows) {
    printf("ERROR: the matrix could not be multiplied\n");
    return 1;
  }

  // Element and byte counts are formed in size_t so the products cannot
  // overflow int before being widened.
  const size_t elemsA = (size_t)numARows * (size_t)numAColumns;
  const size_t elemsB = (size_t)numBRows * (size_t)numBColumns;
  const size_t elemsC = (size_t)numCRows * (size_t)numCColumns;
  const size_t bytesA = elemsA * sizeof(DataType);
  const size_t bytesB = elemsB * sizeof(DataType);
  const size_t bytesC = elemsC * sizeof(DataType);

  //@@ Insert code below to allocate Host memory for input and output
  hostA = (DataType*)malloc(bytesA);
  hostB = (DataType*)malloc(bytesB);
  hostC = (DataType*)malloc(bytesC);
  resultRef = (DataType*)malloc(bytesC);
  if (hostA == NULL || hostB == NULL || hostC == NULL || resultRef == NULL) {
    fprintf(stderr, "ERROR: host memory allocation failed\n");
    free(hostA);
    free(hostB);
    free(hostC);
    free(resultRef);
    return 1;
  }

  //@@ Insert code below to initialize hostA and hostB to random numbers, and create reference result in CPU
  // Fixed seed: every run with the same dimensions sees the same inputs.
  std::uniform_real_distribution<DataType> distribution(0.0, 1.0);
  std::default_random_engine gen(1145);
  for (size_t i = 0; i < elemsA; ++i) {
    hostA[i] = distribution(gen);
  }
  for (size_t i = 0; i < elemsB; ++i) {
    hostB[i] = distribution(gen);
  }

  // CPU reference: plain triple loop over row-major data, accumulating in
  // DataType in increasing k order.
  for (int i = 0; i < numARows; i++) {
    for (int j = 0; j < numBColumns; j++) {
      DataType acc = 0.0;
      for (int k = 0; k < numAColumns; k++) {
        acc += hostA[(size_t)i * numAColumns + k] * hostB[(size_t)k * numBColumns + j];
      }
      resultRef[(size_t)i * numBColumns + j] = acc;
    }
  }

  //@@ Insert code below to allocate GPU memory here
  CUDA_CHECK(cudaMalloc(&deviceA, bytesA));
  CUDA_CHECK(cudaMalloc(&deviceB, bytesB));
  CUDA_CHECK(cudaMalloc(&deviceC, bytesC));

  //@@ Insert code to below to Copy memory to the GPU here
  double start = cpuSeconds();
  CUDA_CHECK(cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice));
  double duration = cpuSeconds() - start;
  printf("float value\nHost -> Device: %f\n", duration);

  //@@ Initialize the grid and block dimensions here
  // 32x32 threads per block; the grid is rounded up so partial tiles at the
  // right and bottom edges are covered (the kernel bounds-checks).
  const int tile = 32;
  int Dg_x = (numCColumns + tile - 1) / tile;
  int Dg_y = (numCRows + tile - 1) / tile;
  dim3 Dg(Dg_x, Dg_y, 1);
  dim3 Db(tile, tile, 1);

  //@@ Launch the GPU Kernel here
  start = cpuSeconds();
  gemm<<<Dg,Db>>>(deviceA,deviceB,deviceC,numARows,numAColumns,numBRows,numBColumns);
  // A bad launch configuration is only visible through cudaGetLastError();
  // faults during execution surface at the synchronize.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());
  duration = cpuSeconds() - start;
  printf("Kernel: %f\n", duration);

  //@@ Copy the GPU memory back to the CPU here
  // cudaMemcpy is blocking here, so no extra synchronize is needed before
  // stopping the timer or reading hostC.
  start = cpuSeconds();
  CUDA_CHECK(cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost));
  duration = cpuSeconds() - start;
  printf("Device->Host: %f\n", duration);

  //@@ Insert code below to compare the output with the reference
  // Mixed absolute/relative tolerance: host and device round differently, and
  // the rounding error grows with the magnitude of the sum. The absolute term
  // keeps the original 1e-4 threshold. The test is written as !(diff <= tol)
  // so a NaN result is reported as a mismatch instead of slipping through.
  const double absTol = 0.0001;
  const double relTol = (sizeof(DataType) == sizeof(float)) ? 1e-4 : 1e-9;
  int status = 0;
  for (size_t i = 0; i < elemsC; i++) {
    const double expected = (double)resultRef[i];
    const double actual = (double)hostC[i];
    const double diff = fabs(expected - actual);
    if (!(diff <= absTol + relTol * fabs(expected))) {
      printf("error %f - %f\n", hostC[i], resultRef[i]);
      status = -1;
      break;
    }
  }
  if (status == 0) {
    printf("the commdan is correct");
  }

  //@@ Free the GPU memory here
  CUDA_CHECK(cudaFree(deviceA));
  CUDA_CHECK(cudaFree(deviceB));
  CUDA_CHECK(cudaFree(deviceC));

  //@@ Free the CPU memory here
  free(hostA);
  free(hostB);
  free(hostC);
  free(resultRef);

  return status;
}


Overwriting lab3_ex2.cu


In [None]:
!nvcc -arch=sm_75 ./lab3_ex2.cu -o lab3_ex2

In [None]:
!./lab3_ex2 128 128 128 128

Input matrix dim (128 x 128) (128 x 128) (128 x 128)
Host -> Device: 0.000099
Kernel: 0.000147
Device->Host: 0.000134
the commdan is correct

In [None]:
!./lab3_ex2 256 128 128 256

Input matrix dim (256 x 128) (128 x 256) (256 x 256)
Host -> Device: 0.000167
Kernel: 0.000267
Device->Host: 0.000380
the commdan is correct

In [None]:
!./lab3_ex2 512 128 128 512

Input matrix dim (512 x 128) (128 x 512) (512 x 512)
Host -> Device: 0.000291
Kernel: 0.000869
Device->Host: 0.001448
the commdan is correct

In [None]:
!./lab3_ex2 1024 128 128 1024

Input matrix dim (1024 x 128) (128 x 1024) (1024 x 1024)
Host -> Device: 0.000625
Kernel: 0.003164
Device->Host: 0.005803
the commdan is correct

In [None]:
!./lab3_ex2 2048 128 128 2048

Input matrix dim (2048 x 128) (128 x 2048) (2048 x 2048)
Host -> Device: 0.001006
Kernel: 0.012258
Device->Host: 0.021445
the commdan is correct

In [None]:
!./lab3_ex2 4096 128 128 4096

Input matrix dim (4096 x 128) (128 x 4096) (4096 x 4096)
Host -> Device: 0.001966
Kernel: 0.048724
Device->Host: 0.086303
the commdan is correct

In [None]:
!./lab3_ex2 128 128 128 128

Input matrix dim (128 x 128) (128 x 128) (128 x 128)
float value
Host -> Device: 0.000077
Kernel: 0.000289
Device->Host: 0.000076
the commdan is correct

In [None]:
!./lab3_ex2 256 128 128 256

Input matrix dim (256 x 128) (128 x 256) (256 x 256)
float value
Host -> Device: 0.000095
Kernel: 0.000548
Device->Host: 0.000201
the commdan is correct

In [None]:
!./lab3_ex2 512 128 128 512

Input matrix dim (512 x 128) (128 x 512) (512 x 512)
float value
Host -> Device: 0.000170
Kernel: 0.001849
Device->Host: 0.000734
the commdan is correct

In [None]:
!./lab3_ex2 1024 128 128 1024

Input matrix dim (1024 x 128) (128 x 1024) (1024 x 1024)
float value
Host -> Device: 0.000319
Kernel: 0.006803
Device->Host: 0.002786
the commdan is correct

In [None]:
!./lab3_ex2 2048 128 128 2048

Input matrix dim (2048 x 128) (128 x 2048) (2048 x 2048)
float value
Host -> Device: 0.000555
Kernel: 0.026844
Device->Host: 0.010799
the commdan is correct

In [None]:
!./lab3_ex2 4096 128 128 4096

Input matrix dim (4096 x 128) (128 x 4096) (4096 x 4096)
float value
Host -> Device: 0.001014
Kernel: 0.106622
Device->Host: 0.043184
the commdan is correct

In [None]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./lab3_ex2 128 128 128 128

Input matrix dim (128 x 128) (128 x 128) (128 x 128)
==PROF== Connected to process 319 (/content/lab3_ex2)
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
the commdan is correct==PROF== Disconnected from process 319
[319] lab3_ex2@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2022-Dec-15 13:19:08, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.99
    SM Frequency                                                             cycle/usecond                         584.02
    Elapsed Cycles                                                                   cycle                         71,433
    Memory [%]                                                                           %                           9.28
    SOL 

In [None]:
!nvprof ./lab3_ex2 128 128 128 128

Input matrix dim (128 x 128) (128 x 128) (128 x 128)
==264== NVPROF is profiling process 264, command: ./lab3_ex2 128 128 128 128
the commdan is correct==264== Profiling application: ./lab3_ex2 128 128 128 128
==264== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   75.67%  123.61us         1  123.61us  123.61us  123.61us  gemm(double*, double*, double*, int, int, int, int)
                   16.91%  27.616us         2  13.808us  13.600us  14.016us  [CUDA memcpy HtoD]
                    7.42%  12.128us         1  12.128us  12.128us  12.128us  [CUDA memcpy DtoH]
      API calls:   99.63%  298.32ms         3  99.439ms  3.0130us  298.31ms  cudaMalloc
                    0.12%  359.20us         1  359.20us  359.20us  359.20us  cuDeviceTotalMem
                    0.09%  255.00us         3  84.999us  53.853us  129.81us  cudaMemcpy
                    0.05%  146.14us         3  48.712us  3.9110us  129.52us  cudaFree
     

In [None]:
!./lab3_ex2 511 1023 1023 4094

Input matrix dim (511 x 1023) (1023 x 4094) (511 x 4094)
the commdan is correct

In [None]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli ./lab3_ex2 511 1023 1023 4094

Input matrix dim (511 x 1023) (1023 x 4094) (511 x 4094)
==PROF== Connected to process 347 (/content/lab3_ex2)
==PROF== Profiling "gemm" - 1: 0%....50%....100% - 8 passes
the commdan is correct==PROF== Disconnected from process 347
[347] lab3_ex2@127.0.0.1
  gemm(double*, double*, double*, int, int, int, int), 2022-Dec-15 13:23:05, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           5.00
    SM Frequency                                                             cycle/usecond                         584.99
    Elapsed Cycles                                                                   cycle                     28,127,262
    Memory [%]                                                                           %                          25.29
    

In [None]:
!nvprof ./lab3_ex2 2048 1536 1536 4096

Input matrix dim (2048 x 1536) (1536 x 4096) (2048 x 4096)
==284== NVPROF is profiling process 284, command: ./lab3_ex2 2048 1536 1536 4096
the commdan is correct==284== Profiling application: ./lab3_ex2 2048 1536 1536 4096
==284== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   79.03%  222.78ms         1  222.78ms  222.78ms  222.78ms  gemm(double*, double*, double*, int, int, int, int)
                   15.00%  42.291ms         1  42.291ms  42.291ms  42.291ms  [CUDA memcpy DtoH]
                    5.96%  16.804ms         2  8.4022ms  6.3323ms  10.472ms  [CUDA memcpy HtoD]
      API calls:   51.89%  309.06ms         3  103.02ms  136.62us  308.78ms  cudaMalloc
                   37.42%  222.84ms         1  222.84ms  222.84ms  222.84ms  cudaDeviceSynchronize
                   10.21%  60.822ms         3  20.274ms  6.5959ms  43.537ms  cudaMemcpy
                    0.36%  2.1447ms         3  714.90us  231.90us  1.142