In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#!/usr/bin/env python
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import time
from scipy.signal import convolve2d

In [2]:
class ConvLayerOperation:
    """
    Host-side wrapper that compiles a CUDA source file and launches the
    ``convLayer_forward_naive`` kernel from it.

    Attributes:
    - mod: the compiled ``SourceModule`` (set by ``getSourceModule``)
    - TILE_WIDTH: edge length of the square thread block used for launches
    """

    def __init__(self):
        # TILE_WIDTH fixes the block shape (TILE_WIDTH x TILE_WIDTH x 1) used in
        # forward_naive.
        # NOTE(review): the kernel presumably assumes the same tile size — confirm
        # against convkernel.cu before changing it.
        self.TILE_WIDTH = 32
        self.mod = None
        # Relative path: resolved against the current working directory.
        self.getSourceModule("convkernel.cu")

    def getSourceModule(self, path):
        """
        Read the kernel source from a .cu file, compile it and store the
        result in ``self.mod``

        Args:
        - path: the path of the kernel.cu file
        """
        # Context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(path, "r") as kernel_file:
            kernel_source = kernel_file.read()
        self.mod = SourceModule(kernel_source)

    def forward_naive(self, X, Masks, N, C, M, H, W, K):
        """
        Launch the ``convLayer_forward_naive`` kernel on X and Masks and
        return the result as a host array.

        The launch uses thread blocks of TILE_WIDTH x TILE_WIDTH x 1 and a
        grid of (N, M, Num_tiles), i.e. one grid slice per sample and per
        output channel, with the output plane split into tiles along grid z.

        Properties (as documented for the kernel; the kernel source is not
        part of this file):
        convolution layer:
        mode = valid
        stride = 1
        mask_width = K

        Parameters
        ----------
        X: input matrix with size [N, C, H, W]; converted to a C-contiguous
           float32 array before upload
        Masks: masks with size [M, C, K, K]; converted to a C-contiguous
           float32 array before upload
        N: number of samples
        C: number of channels of input matrix
        M: number of channels of output matrix
        H: height of input matrix
        W: width of input matrix
        K: width of masks

        Returns
        -------
        Y: float32 output matrix with size [N, M, H-K+1, W-K+1]

        Raises
        ------
        ValueError: if K does not fit inside the input (K < 1, K > H or
            K > W), or if X / Masks do not hold exactly the number of
            elements implied by the dimension arguments
        """
        if K < 1 or K > H or K > W:
            raise ValueError(
                "mask width K=%d must satisfy 1 <= K <= min(H, W) = %d"
                % (K, min(H, W))
            )

        # The output buffer below is float32, so the inputs are uploaded as
        # float32 as well; without this a float64 array (NumPy's default)
        # would be handed to the kernel unchanged.
        # NOTE(review): assumes the kernel takes float* for all three
        # buffers — confirm against convkernel.cu.
        X = np.ascontiguousarray(X, dtype=np.float32)
        Masks = np.ascontiguousarray(Masks, dtype=np.float32)

        # Element counts are checked (rather than exact shapes) so that
        # already-flattened buffers remain accepted, while a mismatch that
        # would let the kernel index past the allocation is rejected.
        if X.size != N * C * H * W:
            raise ValueError(
                "X holds %d elements, expected N*C*H*W = %d"
                % (X.size, N * C * H * W)
            )
        if Masks.size != M * C * K * K:
            raise ValueError(
                "Masks holds %d elements, expected M*C*K*K = %d"
                % (Masks.size, M * C * K * K)
            )

        X_d = gpuarray.to_gpu(X)
        Masks_d = gpuarray.to_gpu(Masks)

        # 'valid' output extent for stride 1.
        w_y = W - K + 1
        h_y = H - K + 1
        Y_d = gpuarray.zeros((N, M, h_y, w_y), dtype=np.float32)

        BlockDim = (self.TILE_WIDTH, self.TILE_WIDTH, 1)
        # floor(dim / TILE_WIDTH) + 1 tiles per axis. This over-allocates one
        # tile per axis when the extent is an exact multiple of TILE_WIDTH.
        # It is intentionally left as-is: the kernel only receives
        # N, C, M, H, W, K and must derive the tile layout itself, so the
        # host formula has to stay in step with it — TODO confirm in
        # convkernel.cu before switching to a ceil division.
        Num_tiles = (h_y // self.TILE_WIDTH + 1) * (w_y // self.TILE_WIDTH + 1)

        print(BlockDim)
        GridDim = (N, M, Num_tiles)
        print(GridDim)

        func = self.mod.get_function("convLayer_forward_naive")

        func(
            X_d, Masks_d, Y_d,
            np.int32(N), np.int32(C), np.int32(M),
            np.int32(H), np.int32(W), np.int32(K),
            block=BlockDim, grid=GridDim,
        )

        # Copy the result back to host memory.
        Y = Y_d.get()

        return Y

In [5]:
# Demo run of the naive forward pass on random data.
# Fixed seed so the random inputs (and hence the printed values) are the same
# on every Restart & Run All.
np.random.seed(0)

layer = ConvLayerOperation()
batch = 1000
C, M, H, W, K = 3, 16, 32, 32, 5

x_shape = (batch, C, H, W)
m_shape = (M, C, K, K)
# Expected output shape for a 'valid', stride-1 layer.
y_shape = (batch, M, H - K + 1, W - K + 1)

# np.random.rand yields float64; cast to float32 to match the float32 output
# buffer allocated inside forward_naive.
X = np.random.rand(*x_shape).astype(np.float32)
print(X.shape)
Masks = np.random.rand(*m_shape).astype(np.float32)

Y = layer.forward_naive(X, Masks, batch, C, M, H, W, K)
# Sanity check: the result must have the shape computed above.
assert Y.shape == y_shape, "unexpected output shape %s, expected %s" % (Y.shape, y_shape)
print(Y.shape)
print(Y)



(1000, 3, 32, 32)
(32, 32, 1)
(1000, 16, 1)
(1000, 16, 28, 28)
[[[[17.276     16.60843   16.041813  ... 18.710882  18.578817
    17.297611 ]
   [18.775116  17.345558  17.885307  ... 17.186623  18.57766
    18.590147 ]
   [19.207272  19.320639  19.352549  ... 18.093784  17.1438
    17.671978 ]
   ...
   [15.909266  15.864985  16.067701  ... 16.678307  15.835943
    17.6843   ]
   [15.407876  15.80098   17.40252   ... 17.196241  16.485737
    18.250763 ]
   [16.775452  18.287928  18.260826  ... 16.770344  16.495394
    16.669353 ]]

  [[18.751013  18.765388  18.04704   ... 19.036425  19.025764
    18.817335 ]
   [20.768885  21.0157    21.07103   ... 19.79534   21.205862
    18.481638 ]
   [21.638947  21.01863   20.838007  ... 19.59606   20.721313
    19.74678  ]
   ...
   [18.654358  19.312435  18.875996  ... 18.479761  17.971756
    18.30664  ]
   [16.743828  17.71812   16.670742  ... 18.678373  18.949394
    19.392136 ]
   [18.894659  20.162678  19.571497  ... 18.904978  17.96561
    2