In [8]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#!/usr/bin/env python
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import time
from scipy.signal import convolve2d

In [31]:
class ConvLayer_parallel:
    def __init__(self):
        # """
        # Attributes for instance of EncoderDecoder module
        # """
        self.mod = None
        self.getSourceModule("convkernel.cu")
        self.TILE_WIDTH = 32

        
    def getSourceModule(self, path):
        """
        Get kernel from .cu file

        Args:
        - path: the path of the kernel.cu file
        """
        self.mod = SourceModule(open(path,"r").read())
        

    def conv_naive0(self, X, Masks, C, M, H, W, K):
        """
        Naive parallel convolution without using shared or constant memory,
        the number and shape of threads blocks equals the shape of output matrix

        Properties:
        convolution layer:
        mode = valid
        stride = 1
        mask_width = K

        Args:
        - X: input matrix with size [C, H, W]
        - Masks: masks with size [M, C, K, K]
        - Y: output matrix with size [M, H-K+1, W-K+1]
        - C: number of channels of input matrix
        - M: number of channels of output matrix
        - H: height of input matrix
        - W: width of input matrix
        - K: width of masks 
        """

        X_d = gpuarray.to_gpu(X)
        Masks_d = gpuarray.to_gpu(Masks)
        Y_d = gpuarray.zeros((M,H-K+1,W-K+1), dtype=np.float32)

        blocksize = 32
        BlockDim = (blocksize, blocksize, 1)
        print(BlockDim)
        GridDim = (H//blocksize+1, W//blocksize+1, M)
        print(GridDim)

        func = self.mod.get_function("convLayer_forward_naive")

        func(X_d, Masks_d, Y_d, np.int32(C), np.int32(M), np.int32(H), np.int32(W), np.int32(K), block=BlockDim, grid = GridDim)
        
        Y = Y_d.get()

        return Y


    def conv_naive(self, X, Masks, N, C, M, H, W, K):
        """
        Naive parallel convolution without using shared or constant memory,
        the number and shape of threads blocks equals the shape of output matrix

        Properties:
        convolution layer:
        mode = valid
        stride = 1
        mask_width = K

        Parameters
        ----------
        X: input matrix with size [N, C, H, W]
        Masks: masks with size [M, C, K, K]
        N: number of samples 
        C: number of channels of input matrix
        M: number of channels of output matrix
        H: height of input matrix
        W: width of input matrix
        K: width of masks 

        Returns
        -------
        Y: output matrix with size [N, M, H-K+1, W-K+1]
        """

        X_d = gpuarray.to_gpu(X)
        Masks_d = gpuarray.to_gpu(Masks)
        w_y = W-K+1
        h_y = H-K+1
        Y_d = gpuarray.zeros((N, M,h_y,w_y), dtype=np.float32)

        
        BlockDim = (self.TILE_WIDTH, self.TILE_WIDTH, 1)
        Num_tiles = (h_y//self.TILE_WIDTH+1)*(w_y//self.TILE_WIDTH+1)

        print(BlockDim)
        GridDim = (N, M, Num_tiles)
        print(GridDim)

        func = self.mod.get_function("convLayer_forward_naive")

        func(X_d, Masks_d, Y_d, np.int32(N), np.int32(C), np.int32(M), np.int32(H), np.int32(W), np.int32(K), block=BlockDim, grid = GridDim)
        
        Y = Y_d.get()

        return Y

In [32]:
layer = ConvLayer_parallel()
batch = 10
C, M, H, W, K = 2, 4, 10, 5, 3

x_shape = (batch, C,H,W)
m_shape = (M,C,K,K)
y_shape = (batch, M,H-K+1,W-K+1)

X = np.random.rand(*x_shape).astype(np.float32)
print(X.shape)
Masks = np.random.rand(*m_shape).astype(np.float32)

Y = layer.conv_naive(X, Masks, batch, C, M, H, W, K)
print(Y.shape)
print(Y)



(10, 2, 10, 5)
(32, 32, 1)
(10, 4, 1)
(10, 4, 8, 3)
[[[[ 3.022827   2.81257    2.6293583]
   [ 2.860289   2.921743   3.0413663]
   [ 4.0335584  4.139762   4.0639334]
   [ 3.6652086  3.6018121  3.8356323]
   [ 4.445178   4.3464236  3.820384 ]
   [ 3.9790823  3.9035451  3.6917942]
   [ 3.3491101  2.7387516  3.6134596]
   [ 3.3325572  3.2540553  3.4076204]]

  [[ 3.8052843  4.3187523  3.436792 ]
   [ 3.0889552  3.7545485  4.718373 ]
   [ 4.9009676  4.9103637  4.9503593]
   [ 4.519154   5.424477   5.568035 ]
   [ 5.173346   5.4280305  4.838955 ]
   [ 3.8582077  4.384446   4.004127 ]
   [ 4.0625167  4.4568777  4.105605 ]
   [ 5.301694   5.4305925  4.251933 ]]

  [[ 3.7987194  5.2026024  5.214854 ]
   [ 3.9826903  3.9642396  4.036494 ]
   [ 4.903667   5.922354   6.090111 ]
   [ 5.022668   4.6391945  5.06235  ]
   [ 5.2406363  5.460975   4.732216 ]
   [ 5.2449746  5.8173304  5.3275123]
   [ 5.044727   4.7474837  4.8244824]
   [ 4.605649   4.250189   4.978131 ]]

  [[ 4.1379952  3.3575861  2.7