In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-brn2a3cw
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-brn2a3cw
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=173eac38c0d5f517efa8e0cc131fb004e2c8392b4538e6cb235250c9a8e5b6bb
  Stored in directory: /tmp/pip-ephem-wheel-cache-nq_86zln/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cuda --name transpose.cu
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

// Side length of the square shared-memory tile used by the square-tile kernels;
// main() also launches those kernels with TILE_DIM x TILE_DIM thread blocks.
#define TILE_DIM 32
// Row count of the rectangular shared-memory tile in the gpuTransposeRect*
// kernels, which hold W rows of 2*W data columns.
#define W 16

using namespace std;

// Prints a short summary of the device described by devProp to stdout:
// name, memory sizes, memory bus width, memory clock rate and warp size.
void printGPUproperties(cudaDeviceProp &devProp)
{
    printf("+++ Name : %s\n", devProp.name);
    // The three memory-size fields are size_t, so they need %zu; %ld is only
    // correct on platforms where long happens to have the same width.
    printf("    Total global memory : %zu bytes\n", devProp.totalGlobalMem);
    printf("    Total shared memory per block : %zu bytes\n", devProp.sharedMemPerBlock);
    printf("    Total shared memory per SM : %zu bytes\n", devProp.sharedMemPerMultiprocessor);
    printf("    Memory bus width : %d bits\n", devProp.memoryBusWidth);
    printf("    Memory clock rate : %d kHz\n", devProp.memoryClockRate);
    printf("    Warp size : %d\n", devProp.warpSize);
}

// Prints a row-major matrix to stdout with two decimals per element.
//   a : matrix data
//   m : row count of the ORIGINAL (untransposed) matrix
//   n : column count of the ORIGINAL matrix
//   c : 'i' prints `a` as the m x n input; any other value prints `a` as the
//       n x m transposed matrix.
void print_matrix(float *a, int m, int n, char c)
{
    int rows = m;
    int cols = n;

    if(c != 'i')
    {
        std::cout << "Printing transposed matrix AT = " << std::endl;
        // The transposed matrix has the dimensions exchanged.
        rows = n;
        cols = m;
    }
    else
    {
        std::cout << "Printing input matrix A = " << std::endl;
    }

    for(int r = 0; r < rows; ++r)
    {
        const int base = cols * r;
        for(int k = 0; k < cols; ++k)
            printf("%.2f ", a[base + k]);
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Compares the m*n elements of the reference AT with h_out.
// Returns 1 when every element agrees within an absolute tolerance of 1e-6,
// otherwise prints the first mismatching pair and returns 0.
// Non-positive dimensions compare zero elements and return 1.
int verify_result(float *AT, float *h_out, int m, int n)
{
    if(m <= 0 || n <= 0)
        return 1;

    // Both buffers are visited as one flat array; size_t keeps the element
    // count from overflowing int for large matrices.
    const size_t total = (size_t)m * (size_t)n;
    for(size_t k = 0; k < total; ++k)
    {
        // Written as !(diff <= tol) so a NaN produced by a faulty kernel is
        // reported as a mismatch instead of silently passing.
        if(!(std::fabs(AT[k] - h_out[k]) <= 1e-6))
        {
            std::cout << "AT val = " << AT[k] << std::endl;
            std::cout << "h_out val = " << h_out[k] << std::endl;
            return 0;
        }
    }
    return 1;
}

// Host reference transpose.
//   A  : row-major m x n input
//   AT : row-major n x m output, AT[col][row] = A[row][col]
void cpuTranspose(float *A, float *AT, int m, int n)
{
    for(int row = 0; row < m; ++row)
    {
        const int srcBase = n * row;   // start of this input row
        for(int col = 0; col < n; ++col)
        {
            const int dst = m * col + row;
            AT[dst] = A[srcBase + col];
        }
    }
}

// Naive transpose, one element per thread: d_out[col][row] = d_in[row][col].
// d_in is row-major m x n, d_out is row-major n x m.
// Threads adjacent in threadIdx.x read adjacent d_in elements and write d_out
// elements that are m floats apart.
// The grid must span n in x and m in y; threads outside the matrix do nothing.
__global__ void gpuTransposeRow(float *d_in, float *d_out, int m, int n)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if(row >= m || col >= n)
        return;

    d_out[m*col + row] = d_in[n*row + col];
}


// Naive transpose indexed from the output side: d_out[r][c] = d_in[c][r].
// d_in is row-major m x n, d_out is row-major n x m.
// Threads adjacent in threadIdx.x write adjacent d_out elements and read d_in
// elements that are n floats apart.
// The output row combines blockDim.x with threadIdx.y (and the column combines
// blockDim.y with threadIdx.x); main() launches this with a square block.
__global__ void gpuTransposeCol(float *d_in, float *d_out, int m, int n)
{
    const int outRow = blockIdx.x * blockDim.x + threadIdx.y;
    const int outCol = blockIdx.y * blockDim.y + threadIdx.x;

    if(outRow >= n || outCol >= m)
        return;

    d_out[m*outRow + outCol] = d_in[n*outCol + outRow];
}


// Copies the m x n matrix d_in to d_out unchanged, staging every element
// through a TILE_DIM x TILE_DIM shared-memory tile. main() reports its time
// as the reference for the transpose kernels.
// The tile is indexed with threadIdx, so the block may be at most
// TILE_DIM x TILE_DIM threads.
__global__ void gpuCopy(float *d_in, float *d_out, int m, int n)
{
    __shared__ float tile[TILE_DIM][TILE_DIM];

    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inside = (row < m) && (col < n);

    if(inside)
        tile[threadIdx.y][threadIdx.x] = d_in[n*row + col];

    // The barrier stays outside the bounds check so every thread reaches it.
    __syncthreads();

    if(inside)
        d_out[n*row + col] = tile[threadIdx.y][threadIdx.x];
}

// Tiled transpose through a TILE_DIM x TILE_DIM shared-memory tile.
// d_in is row-major m x n, d_out is row-major n x m.
// Phase 1 loads the tile with threads adjacent in threadIdx.x reading adjacent
// d_in elements; phase 2 writes with threads adjacent in threadIdx.x writing
// adjacent d_out elements. The swap happens in the shared-memory read
// tile[threadIdx.x][threadIdx.y], which walks a tile column.
// main() launches this with a TILE_DIM x TILE_DIM block.
__global__ void gpuTransposeCoalesced(float *d_in, float *d_out, int m, int n)
{
    __shared__ float tile[TILE_DIM][TILE_DIM];

    const int inRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int inCol = blockIdx.x * blockDim.x + threadIdx.x;

    if(inRow < m && inCol < n)
        tile[threadIdx.y][threadIdx.x] = d_in[n*inRow + inCol];

    // All loads must be visible before any thread reads another thread's slot.
    __syncthreads();

    const int outRow = blockIdx.x * blockDim.x + threadIdx.y;
    const int outCol = blockIdx.y * blockDim.y + threadIdx.x;

    if(outRow < n && outCol < m)
        d_out[m*outRow + outCol] = tile[threadIdx.x][threadIdx.y];
}

// Tiled transpose identical to gpuTransposeCoalesced except that each row of
// the shared tile has one extra padding float (TILE_DIM+1 columns), so the
// column-wise read tile[threadIdx.x][threadIdx.y] steps through memory with a
// stride of TILE_DIM+1 floats instead of TILE_DIM.
// d_in is row-major m x n, d_out is row-major n x m.
// main() launches this with a TILE_DIM x TILE_DIM block.
__global__ void gpuTransposeCoalescedNoBC(float *d_in, float *d_out, int m, int n)
{
    __shared__ float tile[TILE_DIM][TILE_DIM+1];

    const int inRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int inCol = blockIdx.x * blockDim.x + threadIdx.x;

    if(inRow < m && inCol < n)
        tile[threadIdx.y][threadIdx.x] = d_in[n*inRow + inCol];

    // All loads must be visible before any thread reads another thread's slot.
    __syncthreads();

    const int outRow = blockIdx.x * blockDim.x + threadIdx.y;
    const int outCol = blockIdx.y * blockDim.y + threadIdx.x;

    if(outRow < n && outCol < m)
        d_out[m*outRow + outCol] = tile[threadIdx.x][threadIdx.y];
}

// Tiled transpose using a dynamically sized, flat shared-memory buffer that is
// addressed as a TILE_DIM-wide 2D tile.
// d_in is row-major m x n, d_out is row-major n x m.
// The launch must supply at least TILE_DIM * TILE_DIM * sizeof(float) bytes of
// dynamic shared memory (main() passes exactly that, with a TILE_DIM x TILE_DIM
// block).
__global__ void gpuTransposeCoalesced1D(float *d_in, float *d_out, int m, int n)
{
    extern __shared__ float shared[];

    const int inRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int inCol = blockIdx.x * blockDim.x + threadIdx.x;

    // Slot this thread fills, and the mirrored slot it later reads.
    const int storeSlot = TILE_DIM*threadIdx.y + threadIdx.x;
    const int loadSlot  = TILE_DIM*threadIdx.x + threadIdx.y;

    if(inRow < m && inCol < n)
        shared[storeSlot] = d_in[n*inRow + inCol];

    // All loads must be visible before any thread reads another thread's slot.
    __syncthreads();

    const int outRow = blockIdx.x * blockDim.x + threadIdx.y;
    const int outCol = blockIdx.y * blockDim.y + threadIdx.x;

    if(outRow < n && outCol < m)
        d_out[m*outRow + outCol] = shared[loadSlot];
}

// Tiled transpose using a dynamically sized, flat shared-memory buffer that is
// addressed as a 2D tile with a padded row pitch of TILE_DIM+1 floats.
// d_in is row-major m x n, d_out is row-major n x m.
// The launch must supply at least TILE_DIM * (TILE_DIM+1) * sizeof(float) bytes
// of dynamic shared memory (main() passes exactly that, with a
// TILE_DIM x TILE_DIM block).
__global__ void gpuTransposeCoalesced1DNoBC(float *d_in, float *d_out, int m, int n)
{
    extern __shared__ float shared2[];

    const int inRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int inCol = blockIdx.x * blockDim.x + threadIdx.x;

    // Slot this thread fills, and the mirrored slot it later reads.
    const int storeSlot = (TILE_DIM+1)*threadIdx.y + threadIdx.x;
    const int loadSlot  = (TILE_DIM+1)*threadIdx.x + threadIdx.y;

    if(inRow < m && inCol < n)
        shared2[storeSlot] = d_in[n*inRow + inCol];

    // All loads must be visible before any thread reads another thread's slot.
    __syncthreads();

    const int outRow = blockIdx.x * blockDim.x + threadIdx.y;
    const int outCol = blockIdx.y * blockDim.y + threadIdx.x;

    if(outRow < n && outCol < m)
        d_out[m*outRow + outCol] = shared2[loadSlot];
}


// Tiled transpose through a rectangular W x 2*W shared-memory tile.
// d_in is row-major m x n, d_out is row-major n x m.
// Each thread loads the element at its own (threadIdx.y, threadIdx.x). For the
// store, the block's threads are renumbered linearly and laid out again as a
// grid that is blockDim.y wide, giving each thread a position in the
// transposed tile.
// The tile is indexed with threadIdx, so the block may be at most 2*W wide and
// W high; main() launches this with a 32 x 16 block.
__global__ void gpuTransposeRect(float *d_in, float *d_out, int m, int n)
{
    __shared__ float rect[W][2*W];

    const int inRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int inCol = blockIdx.x * blockDim.x + threadIdx.x;

    if(inRow < m && inCol < n)
        rect[threadIdx.y][threadIdx.x] = d_in[n*inRow + inCol];

    // All loads must be visible before any thread reads another thread's slot.
    __syncthreads();

    // Linear thread id within the block, re-split for the transposed tile.
    const int linear = threadIdx.y * blockDim.x + threadIdx.x;
    const int tileOutRow = linear / blockDim.y;
    const int tileOutCol = linear % blockDim.y;

    const int outRow = blockIdx.x * blockDim.x + tileOutRow;
    const int outCol = blockIdx.y * blockDim.y + tileOutCol;

    if(outRow < n && outCol < m)
        d_out[m*outRow + outCol] = rect[tileOutCol][tileOutRow];
}

// Tiled transpose through a rectangular shared-memory tile of W rows by 2*W
// data columns, with padded rows so the transposed read does not serialize on
// shared-memory banks.
// d_in is row-major m x n, d_out is row-major n x m.
// Each thread loads the element at its own (threadIdx.y, threadIdx.x). For the
// store, the block's threads are renumbered linearly and laid out again as a
// grid that is blockDim.y wide, giving each thread a position in the
// transposed tile.
// The tile is indexed with threadIdx, so the block may be at most 2*W wide and
// W high; main() launches this with a 32 x 16 block.
__global__ void gpuTransposeRectNoBC(float *d_in, float *d_out, int m, int n)
{
    // Row pitch is 2*W+2 floats rather than 2*W+1. With the 32 x 16 block a
    // warp reads rect[c][r] for c = 0..15 and two adjacent values of r.
    // A pitch of 33 maps that to bank (c + r) mod 32, so the two groups of 16
    // threads overlap on 15 banks (2-way conflicts). A pitch of 34 maps it to
    // bank (2*c + r) mod 32, which is distinct for all 32 threads.
    __shared__ float rect[W][2*W+2];

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    if(i < m && j < n)
        rect[threadIdx.y][threadIdx.x] = d_in[n*i + j];

    // All loads must be visible before any thread reads another thread's slot.
    __syncthreads();

    // Linear thread id within the block, re-split for the transposed tile.
    int threadnum = threadIdx.y * blockDim.x + threadIdx.x;
    int tidy_o = threadnum / blockDim.y;
    int tidx_o = threadnum % blockDim.y;

    int i_o = blockIdx.x * blockDim.x + tidy_o;
    int j_o = blockIdx.y * blockDim.y + tidx_o;

    if(i_o < n && j_o < m)
        d_out[m*i_o + j_o] = rect[tidx_o][tidy_o];
}


// Stops the program with file/line context when a CUDA runtime call fails, so
// an early error cannot silently invalidate every later measurement.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckErr_ = (call);                                   \
        if(cudaCheckErr_ != cudaSuccess)                                      \
        {                                                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaCheckErr_));                       \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while(0)

// Signature shared by every kernel benchmarked in main().
typedef void (*BenchKernel)(float *, float *, int, int);

// Runs one kernel once, times it with CUDA events, copies the result back,
// checks it against `expected` and prints one report line.
//   kernel, grid, block, sharedBytes : launch configuration
//   d_in, d_out : device buffers of m*n floats
//   h_out       : host buffer of m*n floats receiving the result
//   expected    : host reference of m*n floats for verify_result
//   start, stop : already-created events, reused across calls
//   label       : text placed between the "m = .. n = .." prefix and "Time taken"
//   suffix      : text appended after "ms" (may be "")
//   showResult  : when true, prints h_out as the transposed matrix
static void timeKernel(BenchKernel kernel, dim3 grid, dim3 block, size_t sharedBytes,
                       float *d_in, float *d_out, float *h_out, float *expected,
                       int m, int n, cudaEvent_t start, cudaEvent_t stop,
                       const char *label, const char *suffix, bool showResult)
{
    const size_t bytes = (size_t)m * (size_t)n * sizeof(float);
    float ms = 0.0f;

    // Clear the output first: the inputs are all >= 1, so results left behind
    // by the previous kernel can never make a kernel that wrote nothing pass.
    CUDA_CHECK(cudaMemset(d_out, 0, bytes));

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<grid, block, sharedBytes>>>(d_in, d_out, m, n);
    CUDA_CHECK(cudaGetLastError());   // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost));

    if(showResult)
        print_matrix(h_out, m, n, 'o');

    if(verify_result(expected, h_out, m, n))
        printf("+++ m = %d n = %d, %s Time taken = %lf ms%s\n", m, n, label, ms, suffix);
    else
        printf("--- m = %d n = %d, %s FAILED verification\n", m, n, label);
}

// Benchmarks a CPU transpose and several GPU transpose kernels on an m x n
// matrix filled with 1, 2, 3, ... in row-major order.
// Usage: transpose <num_rows> <num_cols>
int main(int argc, char* argv[])
{
    if(argc < 3)
    {
        fprintf(stderr, "Usage: %s <num_rows> <num_cols>\n", argc > 0 ? argv[0] : "transpose");
        return 1;
    }

    const int m = atoi(argv[1]); //num_rows
    const int n = atoi(argv[2]); //num_cols

    // The kernels and host helpers index with int, so the element count must
    // fit in one.
    const size_t maxElems = (size_t)(~0u >> 1);
    if(m <= 0 || n <= 0 || (size_t)m > maxElems / (size_t)n)
    {
        fprintf(stderr, "Invalid matrix size %s x %s\n", argv[1], argv[2]);
        return 1;
    }

    cudaDeviceProp devprop;
    CUDA_CHECK(cudaGetDeviceProperties(&devprop, 0));
    printGPUproperties(devprop);

    // Widen before multiplying so the byte count cannot overflow int.
    const size_t count = (size_t)m * (size_t)n;
    const size_t bytes = count * sizeof(float);

    // Dumping whole matrices is only useful for small debugging runs; for
    // benchmark sizes it would flood the output.
    const bool showMatrices = count <= 1024;

    //Allocate Memory to cpu variables
    float *h_in  = (float *)malloc(bytes); //GPU input
    float *h_out = (float *)malloc(bytes); //GPU result
    float *A     = (float *)malloc(bytes); //CPU input
    float *AT    = (float *)malloc(bytes); //CPU reference result
    if(h_in == NULL || h_out == NULL || A == NULL || AT == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_in);
        free(h_out);
        free(A);
        free(AT);
        return 1;
    }

    //Allocate Memory to gpu variables
    float *d_in = NULL, *d_out = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_in, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_out, bytes));

    for(int i=0; i<m; i++)
    {
        for(int j=0; j<n; j++)
        {
            const size_t idx = (size_t)n * i + j;
            h_in[idx] = (float)(idx + 1);
            A[idx] = h_in[idx];
        }
    }

    //CPU transpose time
    clock_t t1, t2;
    t1 = clock();
    cpuTranspose(A, AT, m, n);
    t2 = clock();
    if(showMatrices)
    {
        print_matrix(A, m, n, 'i');
        print_matrix(AT, m, n, 'o');
    }
    printf("+++ m = %d n = %d, CPU Time taken = %lf ms\n", m, n, ((double)(t2-t1)/(double)CLOCKS_PER_SEC) * 1000);

    CUDA_CHECK(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice));

    // One event pair is reused for every measurement and destroyed at the end.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    //Square-tile kernels
    dim3 block (TILE_DIM, TILE_DIM);
    dim3 grid ((n + block.x - 1) / block.x, (m + block.y - 1) / block.y);

    std::cout<<"+++ Block Dim = ("<<block.x<<", "<<block.y<<")"<<std::endl;
    std::cout<<"    Grid Dim = ("<<grid.x<<", "<<grid.y<<")"<<std::endl;

    timeKernel(gpuTransposeRow, grid, block, 0, d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (Read Rows, Store Cols)", "", false);

    timeKernel(gpuTransposeCol, grid, block, 0, d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (Read Cols, Store Rows)", "", false);

    // The copy kernel does not transpose, so its reference is the input itself.
    timeKernel(gpuCopy, grid, block, 0, d_in, d_out, h_out, h_in, m, n, start, stop,
               "GPU simple copy (square sh. mem. no bank conflicts)", " [Reference Time]", false);

    timeKernel(gpuTransposeCoalesced, grid, block, 0, d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (square sh. mem. with bank conflicts)", "", false);

    timeKernel(gpuTransposeCoalescedNoBC, grid, block, 0, d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (square sh. mem. no bank conflicts)", "", false);

    timeKernel(gpuTransposeCoalesced1D, grid, block, TILE_DIM * TILE_DIM * sizeof(float),
               d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (1D sh. mem. with bank conflicts)", "", false);

    timeKernel(gpuTransposeCoalesced1DNoBC, grid, block, TILE_DIM * (TILE_DIM+1) * sizeof(float),
               d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (padded 1D sh. mem. no bank conflicts)", "", false);

    //Rectangular-tile kernels: the block shape follows the W x 2*W shared tile
    dim3 blockr (2*W, W);
    dim3 gridr ((n + blockr.x - 1) / blockr.x, (m + blockr.y - 1) / blockr.y);

    std::cout<<"+++ Block Dim = ("<<blockr.x<<", "<<blockr.y<<")"<<std::endl;
    std::cout<<"    Grid Dim = ("<<gridr.x<<", "<<gridr.y<<")"<<std::endl;

    timeKernel(gpuTransposeRect, gridr, blockr, 0, d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (rectangular sh. mem. with bank conflicts)", "", false);

    timeKernel(gpuTransposeRectNoBC, gridr, blockr, 0, d_in, d_out, h_out, AT, m, n, start, stop,
               "GPU Transpose (rectangular sh. mem. with no bank conflicts)", "", showMatrices);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    free(h_in);
    free(h_out);
    free(A);
    free(AT);

    return 0;
}

'File written in /content/src/transpose.cu'

In [None]:
!nvcc /content/src/transpose.cu -o /content/src/transpose

In [None]:
!/content/src/transpose 27 29

+++ Name : Tesla P100-PCIE-16GB
    Total global memory : 17071734784 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Memory bus width : 4096 bits
    Memory clock rate : 715000 kHz
    Warp size : 32
Printing input matrix A = 
1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00 13.00 14.00 15.00 16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 
30.00 31.00 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00 41.00 42.00 43.00 44.00 45.00 46.00 47.00 48.00 49.00 50.00 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 
59.00 60.00 61.00 62.00 63.00 64.00 65.00 66.00 67.00 68.00 69.00 70.00 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00 80.00 81.00 82.00 83.00 84.00 85.00 86.00 87.00 
88.00 89.00 90.00 91.00 92.00 93.00 94.00 95.00 96.00 97.00 98.00 99.00 100.00 101.00 102.00 103.00 104.00 105.00 106.00 107.00 108.00 109.00 110.00 111.00 112.00 113.00 114.00 115.00 116.00 
117.00 11

In [None]:
!/content/src/transpose 8192 8192

+++ Name : Tesla P100-PCIE-16GB
    Total global memory : 17071734784 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Memory bus width : 4096 bits
    Memory clock rate : 715000 kHz
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 2006.693000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (256, 256)
+++ m = 8192 n = 8192, GPU Transpose (Read Rows, Store Cols) Time taken = 4.361120 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Cols, Store Rows) Time taken = 1.262272 ms
+++ m = 8192 n = 8192, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 1.287872 ms [Reference Time]
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 1.887552 ms
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 1.262528 ms
+++ m = 8192 n = 8192, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 1.892704 ms
+++ m = 8192 n = 8192, GPU Transpose (padded 1D sh. mem

In [None]:
!/content/src/transpose 8192 8192

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 2178.484000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (256, 256)
+++ m = 8192 n = 8192, GPU Transpose (Read Rows, Store Cols) Time taken = 12.237888 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Cols, Store Rows) Time taken = 21.925728 ms
+++ m = 8192 n = 8192, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 5.525760 ms [Reference Time]
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 9.915456 ms
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 6.010240 ms
+++ m = 8192 n = 8192, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 9.919840 ms
+++ m = 8192 n = 8192, GPU Transpose (padded 1D sh. mem. no ban

In [None]:
!cuda-memcheck /content/src/transpose 843 94949

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 843 n = 94949, CPU Time taken = 2615.322000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (2968, 27)
+++ m = 843 n = 94949, GPU Transpose (Read Rows, Store Cols) Time taken = 139.082169 ms
+++ m = 843 n = 94949, GPU Transpose (Read Cols, Store Rows) Time taken = 122.883652 ms
+++ m = 843 n = 94949, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 213.194687 ms [Reference Time]
+++ m = 843 n = 94949, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 219.416473 ms
+++ m = 843 n = 94949, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 221.048218 ms
+++ m = 843 n = 94949, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 220.823364 ms
+++ m = 843 n = 94949, GPU Transpose (padded 1D sh. m

In [None]:
!/content/src/transpose 8192 8192

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 2436.365000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (256, 256)
+++ m = 8192 n = 8192, GPU Transpose (Read Rows, Store Cols) Time taken = 12.265568 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Cols, Store Rows) Time taken = 21.971775 ms
+++ m = 8192 n = 8192, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 5.544192 ms [Reference Time]
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 9.974976 ms
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 6.020640 ms
+++ m = 8192 n = 8192, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 9.979776 ms
+++ m = 8192 n = 8192, GPU Transpose (padded 1D sh. mem. no ban

In [None]:
!cuda-memcheck /content/src/transpose 843 94949

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 843 n = 94949, CPU Time taken = 2724.855000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (2968, 27)
+++ m = 843 n = 94949, GPU Transpose (Read Rows, Store Cols) Time taken = 139.099716 ms
+++ m = 843 n = 94949, GPU Transpose (Read Cols, Store Rows) Time taken = 122.914818 ms
+++ m = 843 n = 94949, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 213.377563 ms [Reference Time]
+++ m = 843 n = 94949, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 220.063431 ms
+++ m = 843 n = 94949, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 221.479034 ms
+++ m = 843 n = 94949, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 220.827423 ms
+++ m = 843 n = 94949, GPU Transpose (padded 1D sh. m

In [None]:
!cat file.txt

cat: file.txt: No such file or directory


In [None]:
!/content/src/transpose 10 1

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 10 n = 1, CPU Time taken = 0.001000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (1, 1)
+++ m = 10 n = 1, GPU Transpose (Read Rows, Store Cols) Time taken = 0.122080 ms
+++ m = 10 n = 1, GPU Transpose (Read Cols, Store Rows) Time taken = 0.023456 ms
+++ m = 10 n = 1, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.022880 ms [Reference Time]
+++ m = 10 n = 1, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 0.020576 ms
+++ m = 10 n = 1, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.022208 ms
+++ m = 10 n = 1, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 0.040960 ms
+++ m = 10 n = 1, GPU Transpose (padded 1D sh. mem. no bank conflicts) Time taken = 0.022304 ms
+++ Block D

In [None]:
!/content/src/transpose 32 32

+++ Name : Tesla T4
    Total global memory : 15812263936 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Total registers per block : 65536 bytes
    Memory bus width : 256 bits
    Memory clock rate : 5001000 kHz
    Warp size : 32
+++ m = 32 n = 32, CPU Time taken = 0.006000 ms
+++ m = 32 n = 32, GPU Transpose (Read Rows, Store Cols) Time taken = 0.024064 ms
+++ m = 32 n = 32, GPU Transpose (Read Cols, Store Rows) Time taken = 0.015488 ms
+++ m = 32 n = 32, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.012096 ms [Reference Time]
+++ m = 32 n = 32, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 0.013472 ms
+++ m = 32 n = 32, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.011328 ms
+++ m = 32 n = 32, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 0.012096 ms
+++ m = 32 n = 32, GPU Transpose (padded 1D sh. mem. no bank conflicts) Time taken = 0.010016 ms
Block Di

In [None]:
!/content/src/transpose 8192 8192

+++ Name : Tesla P4
    Total global memory : 7981694976 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 98304 bytes
    Total registers per block : 65536 bytes
    Memory bus width : 256 bits
    Memory clock rate : 3003000 kHz
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 1954.697000 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Rows, Store Cols) Time taken = 9.793824 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Cols, Store Rows) Time taken = 6.728000 ms
+++ m = 8192 n = 8192, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 4.287712 ms [Reference Time]
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 6.149280 ms
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 4.586656 ms
+++ m = 8192 n = 8192, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 6.158496 ms
+++ m = 8192 n = 8192, GPU Transpose (padded 1D sh. mem. no bank conflicts)

In [None]:
!/content/src/transpose 10000 10000

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 10000 n = 10000, CPU Time taken = 3091.290000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (313, 313)
+++ m = 10000 n = 10000, GPU Transpose (Read Rows, Store Cols) Time taken = 19.008160 ms
+++ m = 10000 n = 10000, GPU Transpose (Read Cols, Store Rows) Time taken = 33.899361 ms
+++ m = 10000 n = 10000, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 9.110240 ms [Reference Time]
+++ m = 10000 n = 10000, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 15.127936 ms
+++ m = 10000 n = 10000, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 10.157280 ms
+++ m = 10000 n = 10000, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 15.119616 ms
+++ m = 10000 n = 10000, GPU Transpose (padded

In [None]:
!/content/src/transpose 8192 8192

+++ Name : Tesla P100-PCIE-16GB
    Total global memory : 17071734784 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Total registers per block : 65536 bytes
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 2216.960000 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Rows, Store Cols) Time taken = 4.330208 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Cols, Store Rows) Time taken = 1.257792 ms
+++ m = 8192 n = 8192, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 1.149312 ms [Reference Time]
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 1.902656 ms
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 1.244512 ms
+++ m = 8192 n = 8192, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 1.859136 ms
+++ m = 8192 n = 8192, GPU Transpose (padded 1D sh. mem. no bank conflicts) Time taken = 1.245984 ms


In [None]:
!/content/src/transpose 4096 4096

+++ Name : Tesla T4
    Total global memory : 15812263936 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 65536 bytes
    Total registers per block : 65536 bytes
    Memory bus width : 256 bits
    Memory clock rate : 5001000 kHz
    Warp size : 32
+++ m = 4096 n = 4096, CPU Time taken = 355.001000 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Rows, Store Cols) Time taken = 2.933312 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Cols, Store Rows) Time taken = 1.963744 ms
+++ m = 4096 n = 4096, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.819424 ms [Reference Time]
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 2.084960 ms
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.865824 ms
+++ m = 4096 n = 4096, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 2.084224 ms
+++ m = 4096 n = 4096, GPU Transpose (padded 1D sh. mem. no bank conflicts)

In [None]:
!/content/src/transpose 4096 4096

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Total registers per block : 65536 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 4096 n = 4096, CPU Time taken = 553.292000 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Rows, Store Cols) Time taken = 3.076288 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Cols, Store Rows) Time taken = 5.397504 ms
+++ m = 4096 n = 4096, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 1.315744 ms [Reference Time]
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 2.455616 ms
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 1.463104 ms
+++ m = 4096 n = 4096, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 2.457504 ms
+++ m = 4096 n = 4096, GPU Transpose (padded 1D sh. mem. no bank conflict

In [None]:
!nvprof --events warps_launched,local_load --metrics all /content/src/transpose 8192 8192 #Tesla K-80

==240== NVPROF is profiling process 240, command: /content/src/transpose 8192 8192
+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Total registers per block : 65536 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 2340.207000 ms
==240== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==240== Replaying kernel "gpuTransposeRow(float*, float*, int, int)" (1 of 60)... 
	fb_subp1_write_sectors
	fb_subp1_read_sectors
	l2_subp1_read_sector_misses
	l2_subp1_write_sector_misses
	active_cycles
	10 internal events
==240== [1A
[K[7A[K
[K
[K
[K
[K
[K
[K
[7A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (2 of 60)... 
	l2_subp2_total_read_sector_queries
	4 internal events
==240== [1A
[K[3A[K
[K
[K
[3A[KReplaying kernel "gpuTransposeRo

In [None]:
!nvprof --events warps_launched,local_load --metrics all /content/src/transpose 8192 8192 #time mismatch in row and col

==598== NVPROF is profiling process 598, command: /content/src/transpose 8192 8192
+++ Name : Tesla P100-PCIE-16GB
    Total global memory : 17071734784 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM :65536 bytes
    Total registers per block : 65536 bytes
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 1950.756000 ms
==598== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==598== Replaying kernel "gpuTransposeRow(float*, float*, int, int)" (1 of 60)... 
	active_cycles
	elapsed_cycles_sm
	inst_executed
	inst_issued1
	inst_issued2
	active_warps
	shared_ld_transactions
	shared_st_transactions
==598== [1A
[K[9A[K
[K
[K
[K
[K
[K
[K
[K
[K
[9A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (2 of 60)... 
	elapsed_cycles_pm
	fb_subp1_read_sectors
	fb1_subp1_read_sectors
	fb_subp1_write_sectors
	fb1_subp1_write_sectors
	inst_executed
	not_predicated_off_thread_inst_executed
	inst_is

In [None]:
!nvprof --events warps_launched,local_load --metrics all /content/src/transpose 8192 8192 #all four, no time mismatch in row and col

==1588== NVPROF is profiling process 1588, command: /content/src/transpose 8192 8192
+++ m = 8192 n = 8192, CPU Time taken = 2339.930000 ms
grid dim = 256
==1588== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==1588== Replaying kernel "gpuTransposeRow(float*, float*, int, int)" (1 of 60)... 
	4 internal events
==1588== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (2 of 60)... 
	1 internal events
==1588== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (3 of 60)... 
	tex1_cache_sector_misses
	tex3_cache_sector_misses
	l2_subp3_read_sector_misses
	l2_subp3_write_sector_misses
	l1_local_store_hit
	global_store_transaction
	6 internal events
==1588== [1A
[K[8A[K
[K
[K
[K
[K
[K
[K
[K
[8A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (4 of 60)... 
	4 internal events
==1588== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, 

In [None]:
!nvprof --events warps_launched,local_load --metrics all /content/src/transpose 4096 4096 # when in transpose col x, y are interchanged


==266== NVPROF is profiling process 266, command: /content/src/transpose 4096 4096
+++ m = 4096 n = 4096, CPU Time taken = 427.366000 ms
grid dim = 128
==266== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==266== Replaying kernel "gpuTransposeRow(float*, float*, int, int)" (1 of 60)... 
	4 internal events
==266== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (2 of 60)... 
	warps_launched
==266== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (3 of 60)... 
	2 internal events
==266== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (4 of 60)... 
	elapsed_cycles_pm
	fb_subp1_read_sectors
	fb1_subp1_read_sectors
	fb_subp1_write_sectors
	fb1_subp1_write_sectors
	inst_executed
	not_predicated_off_thread_inst_executed
	inst_issued1
	inst_issued2
	l2_subp0_write_sysmem_sector_queries
	l2_subp1_write_sysmem_sector_queries
	4 internal ev

In [None]:
!/content/src/transpose 16 32

+++ m = 16 n = 32, CPU Time taken = 0.034000 ms
grid dim = 1
+++ m = 16 n = 32, GPU Transpose (Read Rows, Store Cols) Time taken = 0.113472 ms
+++ m = 16 n = 32, GPU Transpose (Read Cols, Store Rows) Time taken = 0.025088 ms


In [None]:
!/content/src/transpose 2048 2048 old

+++ m = 2048 n = 2048, CPU Time taken = 75.434000 ms
grid dim = 64
+++ m = 2048 n = 2048, GPU Transpose (Read Rows, Store Cols) Time taken = 0.297600 ms
+++ m = 2048 n = 2048, GPU Transpose (Read Cols, Store Rows) Time taken = 0.298752 ms


In [None]:
!/content/src/transpose 2048 2048

+++ m = 2048 n = 2048, CPU Time taken = 123.055000 ms
grid dim = 64
+++ m = 2048 n = 2048, GPU Transpose (Read Rows, Store Cols) Time taken = 0.941664 ms
+++ m = 2048 n = 2048, GPU Transpose (Read Cols, Store Rows) Time taken = 1.393664 ms


In [None]:
!/content/src/transpose 512 512 old

+++ m = 512 n = 512, CPU Time taken = 2.600000 ms
grid dim = 16
+++ m = 512 n = 512, GPU Transpose (Read Rows, Store Cols) Time taken = 0.032192 ms
+++ m = 512 n = 512, GPU Transpose (Read Cols, Store Rows) Time taken = 0.018080 ms


In [None]:
!/content/src/transpose 512 512 old

+++ m = 512 n = 512, CPU Time taken = 6.556000 ms
grid dim = 16
+++ m = 512 n = 512, GPU Transpose (Read Rows, Store Cols) Time taken = 0.142432 ms
+++ m = 512 n = 512, GPU Transpose (Read Cols, Store Rows) Time taken = 0.114848 ms


In [None]:
!/content/src/transpose 4096 4096 old

+++ m = 4096 n = 4096, CPU Time taken = 511.065000 ms
grid dim = 128
+++ m = 4096 n = 4096, GPU Transpose (Read Rows, Store Cols) Time taken = 3.163520 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Cols, Store Rows) Time taken = 5.492896 ms


In [None]:
!/content/src/transpose 4096 4096

+++ Name : Tesla P4
    Total global memory : 7981694976 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM :98304 bytes
    Total registers per block : 65536 bytes
    Warp size : 32
+++ m = 4096 n = 4096, CPU Time taken = 371.211000 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Rows, Store Cols) Time taken = 2.497568 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Cols, Store Rows) Time taken = 1.671328 ms
+++ m = 4096 n = 4096, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.937568 ms
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 1.523840 ms
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.993024 ms


In [None]:
!nvidia-smi

Thu Apr 16 14:12:47 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
!nvidia-smi

Tue Apr 14 06:03:38 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
!nvidia-smi

Mon Apr 13 05:57:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
!/content/src/transpose 512 512

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 512 n = 512, CPU Time taken = 6.842000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (16, 16)
+++ m = 512 n = 512, GPU Transpose (Read Rows, Store Cols) Time taken = 0.182336 ms
+++ m = 512 n = 512, GPU Transpose (Read Cols, Store Rows) Time taken = 0.118592 ms
+++ m = 512 n = 512, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.045952 ms [Reference Time]
+++ m = 512 n = 512, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 0.068128 ms
+++ m = 512 n = 512, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.046784 ms
+++ m = 512 n = 512, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 0.062048 ms
+++ m = 512 n = 512, GPU Transpose (padded 1D sh. mem. no bank conflicts) Time taken

In [None]:
!/content/src/transpose 1024 1024

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 1024 n = 1024, CPU Time taken = 36.665000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (32, 32)
+++ m = 1024 n = 1024, GPU Transpose (Read Rows, Store Cols) Time taken = 0.334784 ms
+++ m = 1024 n = 1024, GPU Transpose (Read Cols, Store Rows) Time taken = 0.372704 ms
+++ m = 1024 n = 1024, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.109888 ms [Reference Time]
+++ m = 1024 n = 1024, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 0.178880 ms
+++ m = 1024 n = 1024, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.127264 ms
+++ m = 1024 n = 1024, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 0.179808 ms
+++ m = 1024 n = 1024, GPU Transpose (padded 1D sh. mem. no bank conf

In [None]:
!/content/src/transpose 2048 2048

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 2048 n = 2048, CPU Time taken = 135.685000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (64, 64)
+++ m = 2048 n = 2048, GPU Transpose (Read Rows, Store Cols) Time taken = 0.869728 ms
+++ m = 2048 n = 2048, GPU Transpose (Read Cols, Store Rows) Time taken = 1.410976 ms
+++ m = 2048 n = 2048, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.383552 ms [Reference Time]
+++ m = 2048 n = 2048, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 0.650624 ms
+++ m = 2048 n = 2048, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.399296 ms
+++ m = 2048 n = 2048, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 0.655104 ms
+++ m = 2048 n = 2048, GPU Transpose (padded 1D sh. mem. no bank con

In [None]:
!/content/src/transpose 4096 4096

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 4096 n = 4096, CPU Time taken = 571.516000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (128, 128)
+++ m = 4096 n = 4096, GPU Transpose (Read Rows, Store Cols) Time taken = 3.163520 ms
+++ m = 4096 n = 4096, GPU Transpose (Read Cols, Store Rows) Time taken = 5.527008 ms
+++ m = 4096 n = 4096, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 1.385952 ms [Reference Time]
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 2.508064 ms
+++ m = 4096 n = 4096, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 1.478880 ms
+++ m = 4096 n = 4096, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 2.508960 ms
+++ m = 4096 n = 4096, GPU Transpose (padded 1D sh. mem. no bank c

In [None]:
!/content/src/transpose 8192 8192

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 8192 n = 8192, CPU Time taken = 2364.602000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (256, 256)
+++ m = 8192 n = 8192, GPU Transpose (Read Rows, Store Cols) Time taken = 12.302528 ms
+++ m = 8192 n = 8192, GPU Transpose (Read Cols, Store Rows) Time taken = 22.016672 ms
+++ m = 8192 n = 8192, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 5.528704 ms [Reference Time]
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 9.959264 ms
+++ m = 8192 n = 8192, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 6.037600 ms
+++ m = 8192 n = 8192, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 9.954592 ms
+++ m = 8192 n = 8192, GPU Transpose (padded 1D sh. mem. no ban

In [None]:
!/content/src/transpose 3200 3200

+++ Name : Tesla K80
    Total global memory : 11996954624 bytes
    Total shared memory per block : 49152 bytes
    Total shared memory per SM : 114688 bytes
    Memory bus width : 384 bits
    Memory clock rate : 2505000 kHz
    Warp size : 32
+++ m = 3200 n = 3200, CPU Time taken = 301.519000 ms
+++ Block Dim = (32, 32)
    Grid Dim = (100, 100)
+++ m = 3200 n = 3200, GPU Transpose (Read Rows, Store Cols) Time taken = 1.993088 ms
+++ m = 3200 n = 3200, GPU Transpose (Read Cols, Store Rows) Time taken = 3.364128 ms
+++ m = 3200 n = 3200, GPU simple copy (square sh. mem. no bank conflicts) Time taken = 0.923872 ms [Reference Time]
+++ m = 3200 n = 3200, GPU Transpose (square sh. mem. with bank conflicts) Time taken = 1.535904 ms
+++ m = 3200 n = 3200, GPU Transpose (square sh. mem. no bank conflicts) Time taken = 0.971904 ms
+++ m = 3200 n = 3200, GPU Transpose (1D sh. mem. with bank conflicts) Time taken = 1.541632 ms
+++ m = 3200 n = 3200, GPU Transpose (padded 1D sh. mem. no bank c

In [None]:
!nvprof --events warps_launched,local_load --metrics all /content/src/transpose 4096 4096 #Tesla K-80 Profiled

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[K
[K
[K
[K
[K
[K
[10A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (57 of 60)... 
	elapsed_cycles_sm
	fb_subp0_write_sectors
	fb_subp0_read_sectors
	l2_subp0_read_sector_misses
	l2_subp0_write_sector_misses
	inst_issued1
	inst_issued2
	active_cycles
	8 internal events
==1343== [1A
[K[10A[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[10A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (58 of 60)... 
	l2_subp2_read_l1_hit_sectors
==1343== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (59 of 60)... 
	l2_subp2_write_sysmem_sector_queries
==1343== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (60 of 60)... 
	l2_subp0_write_sysmem_sector_queries
==1343== [1A
Replaying kernel "gpuTransposeRow(float*, float*, int, int)" (done)
+++ m = 4096 n = 4096, GPU Transpose (Read Rows, Store Cols) Time taken = 1294.274

In [None]:
!nvprof --events warps_launched,local_load --metrics all /content/src/transpose 8192 8192 #Tesla K-80 Profiled

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
	rocache_gld_inst_16bit
==1365== [1A
[K[5A[K
[K
[K
[K
[K
[5A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (57 of 60)... 
	l2_subp1_write_sysmem_sector_queries
==1365== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (58 of 60)... 
	l2_subp3_read_tex_hit_sectors
==1365== [1A
[K[2A[K
[K
[2A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (59 of 60)... 
	tex0_cache_sector_misses
	tex2_cache_sector_misses
	l2_subp2_read_sector_misses
	l2_subp2_write_sector_misses
	l1_local_load_hit
	l1_local_load_miss
	l1_local_store_hit
	5 internal events
==1365== [1A
[K[9A[K
[K
[K
[K
[K
[K
[K
[K
[K
[9A[KReplaying kernel "gpuTransposeRow(float*, float*, int, int)" (60 of 60)... 
	4 internal events
==1365== [1A
Replaying kernel "gpuTransposeRow(float*, float*, int, int)" (done)
+++ m = 8192 n = 8192, GPU Transpose (Read Rows, Store Cols) Tim