## 说明：
### 1. 实现了2D的conv操作，并与pytorch的conv2d结果对比，完成了验证
### 2. 基于实现的conv操作，对完整的多通道输入输出的场景进行了实现，并完成验证

In [41]:
import sys
import pycuda
import pycuda.driver as drv
import pycuda.autoinit
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from pycuda.elementwise import ElementwiseKernel
from pycuda.scan import InclusiveScanKernel
from pycuda.reduction import ReductionKernel
from pycuda.compiler import SourceModule

import pycuda.gpuarray as gpuarray
import numpy as np
from copy import deepcopy

import torch
import torch.nn.functional as F

In [42]:
# Report the PyCUDA and Python interpreter versions this notebook runs with.
version_lines = (
    ("pycuda版本:{}", pycuda.VERSION),
    ("python版本:{}", sys.version),
)
for template, value in version_lines:
    print(template.format(value))

pycuda版本:(2019, 1, 2)
python版本:3.6.5 (default, Sep 29 2018, 16:40:34) 
[GCC 5.4.0 20160609]


In [43]:
# Inspect one of the visible CUDA devices and dump all of its attributes.
drv.init()  # the driver API must be initialised before any other driver call
device_count = drv.Device.count()
print("当前环境可用gpu设备数量:{}".format(device_count))
if device_count < 1:
    raise RuntimeError("No CUDA-capable GPU device is available")

# Device 2 is the preferred one; fall back to the last available device so the
# cell also runs on machines that expose fewer than three GPUs.
i = min(2, device_count - 1)
gpu_device = drv.Device(i)
print("得到第{}个gpu设备: drv.Device(i)".format(i))
print("当前gpu设备的计算能力:{}".format(gpu_device.compute_capability()))
# total_memory() is divided by 1024**2 to report MB.
print("当前gpu设备的总内存:{} MB".format(gpu_device.total_memory()//(1024**2)))
print("\n gpu设备的所有属性名称及其值:")
attr_dict = gpu_device.get_attributes()
for key, val in attr_dict.items():
    print("{} : {}".format(key, val))

当前环境可用gpu设备数量:4
得到第2个gpu设备: drv.Device(i)
当前gpu设备的计算能力:(6, 1)
当前gpu设备的总内存:11178 MB

 gpu设备的所有属性名称及其值:
ASYNC_ENGINE_COUNT : 2
CAN_MAP_HOST_MEMORY : 1
CLOCK_RATE : 1620000
COMPUTE_CAPABILITY_MAJOR : 6
COMPUTE_CAPABILITY_MINOR : 1
COMPUTE_MODE : DEFAULT
CONCURRENT_KERNELS : 1
ECC_ENABLED : 0
GLOBAL_L1_CACHE_SUPPORTED : 1
GLOBAL_MEMORY_BUS_WIDTH : 352
GPU_OVERLAP : 1
INTEGRATED : 0
KERNEL_EXEC_TIMEOUT : 0
L2_CACHE_SIZE : 2883584
LOCAL_L1_CACHE_SUPPORTED : 1
MANAGED_MEMORY : 1
MAXIMUM_SURFACE1D_LAYERED_LAYERS : 2048
MAXIMUM_SURFACE1D_LAYERED_WIDTH : 32768
MAXIMUM_SURFACE1D_WIDTH : 32768
MAXIMUM_SURFACE2D_HEIGHT : 65536
MAXIMUM_SURFACE2D_LAYERED_HEIGHT : 32768
MAXIMUM_SURFACE2D_LAYERED_LAYERS : 2048
MAXIMUM_SURFACE2D_LAYERED_WIDTH : 32768
MAXIMUM_SURFACE2D_WIDTH : 131072
MAXIMUM_SURFACE3D_DEPTH : 16384
MAXIMUM_SURFACE3D_HEIGHT : 16384
MAXIMUM_SURFACE3D_WIDTH : 16384
MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS : 2046
MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH : 32768
MAXIMUM_SURFACECUBEMAP_WIDTH : 32768


## 基于cuda的单通道conv2D模块

In [48]:
# Compile the single-channel 3x3 convolution kernel.  The CUDA source is kept
# inline so the notebook is self-contained.
mod = SourceModule('''
// conv_gpu: 3x3 cross-correlation of one 2D image with one 3x3 filter,
// stride 1, zero padding of 1 pixel, so the output has the input's size.
//
//   outputs  : row-major inputs_h x inputs_w result buffer (written)
//   inputs   : row-major inputs_h x inputs_w image (read)
//   weights  : 9 filter taps, row-major 3x3 (read)
//   inputs_w : image width in pixels
//   inputs_h : image height in pixels
//
// Launch with one thread per output pixel: thread/block x spans the width,
// thread/block y spans the height.  Threads that fall outside the image do
// nothing, so the grid may overshoot the image.
__global__ void conv_gpu(float *outputs, float *inputs, float *weights, int inputs_w, int inputs_h)
{
    const int kernel_size = 3;
    const int kernel_radius = 1;  // (kernel_size - 1) / 2
    __shared__ float shared_kernel[kernel_size * kernel_size];

    // Pixel handled by this thread: x runs along the width, y along the height.
    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    const int y = threadIdx.y + blockDim.y * blockIdx.y;

    // Stage the filter taps in shared memory.  The threads of the block split
    // the copy between them, and the barrier makes every tap visible to all
    // threads before any of them reads it.  This happens before the bounds
    // check below so that every thread of the block reaches the barrier.
    const int tid = threadIdx.x + threadIdx.y * blockDim.x;
    const int block_threads = blockDim.x * blockDim.y;
    for (int k = tid; k < kernel_size * kernel_size; k += block_threads)
        shared_kernel[k] = weights[k];
    __syncthreads();

    // Ignore threads that lie outside the image.
    if (x >= inputs_w || y >= inputs_h)
        return;

    const int gLoc = x + inputs_w * y;

    float sum = 0.0f;
    for (int i = -kernel_radius; i <= kernel_radius; i++) {        // x offset
        for (int j = -kernel_radius; j <= kernel_radius; j++) {    // y offset
            const bool inside = (x + i) >= 0 && (x + i) < inputs_w &&
                                (y + j) >= 0 && (y + j) < inputs_h;
            // Moving one row down advances the address by the image WIDTH.
            // Pixels outside the image are the zero padding.
            const float value = inside ? inputs[gLoc + i + j * inputs_w] : 0.0f;
            sum += value * shared_kernel[(i + kernel_radius) + (j + kernel_radius) * kernel_size];
        }
    }
    outputs[gLoc] = sum;
}
''')


### 单通道验证

In [51]:
# Reference result for the single-channel case: one random 5x5 image and one
# random 3x3 filter, convolved by PyTorch with one pixel of zero padding so the
# output keeps the 5x5 spatial size.
inputs_torch = torch.randn((1, 1, 5, 5))
filters_torch = torch.randn((1, 1, 3, 3))
outputs_torch = F.conv2d(
    input=inputs_torch,
    weight=filters_torch,
    padding=1,
)
print(outputs_torch)

tensor([[[[-0.5344, -2.9753, -3.3508, -0.9424, -1.2960],
          [-0.6342, -0.5778,  3.5405,  2.5334,  0.7346],
          [ 0.9215, -0.7080, -3.9251, -6.9739, -1.2933],
          [ 2.1979,  2.7678,  4.5203, -1.9241, -1.8704],
          [-0.6884,  3.4865,  0.3182, -4.7315,  1.0770]]]])


In [53]:
# Run the CUDA kernel on the single-channel example.
conv_gpu = mod.get_function("conv_gpu")

# The kernel processes one 2D image and one 3x3 filter, so drop the batch and
# channel axes.  ascontiguousarray guarantees the dense float32 layout that
# the kernel's raw float pointers index into.
inputs_np = np.ascontiguousarray(inputs_torch.numpy()[0, 0], dtype=np.float32)
filters_np = np.ascontiguousarray(filters_torch.numpy()[0, 0], dtype=np.float32)
(inputs_h, inputs_w) = inputs_np.shape

# Copy the operands to the GPU.
inputs_gpu = gpuarray.to_gpu(inputs_np)
filters_gpu = gpuarray.to_gpu(filters_np)
# With the launch below there is one thread per pixel and each thread writes
# its own output, so an uninitialised buffer is enough (no upload needed).
outputs_gpu = gpuarray.empty_like(inputs_gpu)

# One thread per pixel, derived from the image size: each block is one image
# row (inputs_w threads along x) and the grid has inputs_h blocks along y.
# NOTE: inputs_w must not exceed the device's maximum threads per block.
conv_gpu(
    outputs_gpu, inputs_gpu, filters_gpu,
    np.int32(inputs_w), np.int32(inputs_h),
    block=(inputs_w, 1, 1), grid=(1, inputs_h),
)

# Pull the result back from the GPU.
outputs_np = outputs_gpu.get()
print(outputs_np)

[[-0.53436834 -2.9753168  -3.3508215  -0.9423924  -1.2960488 ]
 [-0.63421124 -0.57777417  3.5404694   2.5333502   0.7346365 ]
 [ 0.9214522  -0.708026   -3.9251387  -6.9739065  -1.2932689 ]
 [ 2.197879    2.7677586   4.5202622  -1.9240698  -1.8704407 ]
 [-0.68839467  3.4864678   0.31819493 -4.7314754   1.077028  ]]


In [55]:
# Compare the PyTorch reference (batch 0, channel 0) with the CUDA result,
# using np.allclose's default tolerances.
reference_np = outputs_torch[0, 0, :, :].numpy()
isEqual = np.allclose(reference_np, outputs_np)
print("pytorch卷积接口计算结果与自定义cuda卷积模块计算结果是否相等:{}".format(isEqual))

pytorch卷积接口计算结果与自定义cuda卷积模块计算结果是否相等:True


## 完整的多通道输入输出卷积实现及验证

In [70]:
# Multi-channel convolution built from the single-channel kernel:
# outputs[b, c_o] = sum over c_i of conv2d(inputs[b, c_i], filters[c_o, c_i]).
filters_torch = torch.randn(8, 4, 3, 3)
inputs_torch = torch.randn(2, 4, 5, 5)
outputs_torch = F.conv2d(inputs_torch, filters_torch, padding=1)  # reference result
(B, C_in, H, W) = inputs_torch.shape
C_out = filters_torch.shape[0]
outputs_np = np.zeros((B, C_out, H, W), dtype=np.float32)

# Dense float32 host views; each [b, c] / [c_o, c_i] slice is one 2D plane.
inputs_all = np.ascontiguousarray(inputs_torch.numpy(), dtype=np.float32)
filters_all = np.ascontiguousarray(filters_torch.numpy(), dtype=np.float32)

# Upload every 3x3 filter exactly once instead of once per kernel launch.
filter_bank_gpu = [
    [gpuarray.to_gpu(filters_all[c_o, c_i]) for c_i in range(C_in)]
    for c_o in range(C_out)
]
# A single scratch buffer is reused for every launch; .get() copies the result
# back to the host before the next launch overwrites it.
outputs_gpu = gpuarray.empty((H, W), np.float32)

for b in range(B):
    # Upload each input plane of this sample once; all C_out filters reuse it.
    input_planes_gpu = [gpuarray.to_gpu(inputs_all[b, c_i]) for c_i in range(C_in)]
    for c_o in range(C_out):
        for c_i in range(C_in):
            # One thread per pixel, derived from the image size: a block is one
            # image row (W threads along x), the grid has H blocks along y.
            # NOTE: W must not exceed the device's maximum threads per block.
            conv_gpu(
                outputs_gpu, input_planes_gpu[c_i], filter_bank_gpu[c_o][c_i],
                np.int32(W), np.int32(H),
                block=(W, 1, 1), grid=(1, H),
            )
            # Accumulate this input channel's contribution on the host.
            outputs_np[b, c_o, :, :] += outputs_gpu.get()

print(outputs_torch)
print(outputs_np)

tensor([[[[  2.7610,  -0.0778,  -0.4686,   3.4005,  -3.2742],
          [  5.6520,   0.9814,   9.5312,  11.2443,  -1.0960],
          [  4.3053,   4.6576,   0.8058,  -4.2460,   4.2447],
          [  3.2987,   7.4277,   1.0068,   3.6235,  11.8115],
          [ -1.7737,   5.0722,  -0.7889,  -0.5678,  -1.6542]],

         [[ -7.5778,   5.9519,   8.4655,   1.7258,   6.6949],
          [ -1.7466,   6.9836,  -0.9286,   6.9171,  -2.8638],
          [  5.8839,   4.2034, -16.9111,  -1.0687,  -0.1345],
          [  4.3121,   5.9809,  -0.9009,  13.5289,   6.2836],
          [  4.2167,   5.1541,  10.2324,  -7.6381,  -4.3288]],

         [[ -1.7280,  11.8286,  -4.3150,  -9.1788,  -0.1405],
          [  2.8384,   9.2456, -12.0575,  -1.7388,  13.9064],
          [ -9.1081,   8.4183,   9.8954,   4.7973, -11.2288],
          [ -0.3122,  -6.1656,   3.4151,  -6.0125,  -1.2578],
          [ -4.2608,  -3.1783,   1.5610,   8.8211,  -1.1834]],

         [[ -4.1348,   8.6961,  -1.0085,  -0.9764,  -3.1796],
  

In [75]:
# Report both output shapes and compare the values.  outputs_torch is the
# PyTorch reference and outputs_np is the result of the custom CUDA module, so
# each label is paired with the matching array.
print('pytorch输出的shape为:{}'.format(outputs_torch.shape))
print('自定义cuda模块输出的shape为:{}'.format(outputs_np.shape))
isEqual = np.allclose(outputs_torch.numpy(), outputs_np)
print("pytorch卷积接口计算结果与自定义cuda卷积模块计算结果是否相等:{}".format(isEqual))

pytorch输出的shape为:(2, 8, 5, 5)
自定义cuda模块输出的shape为:torch.Size([2, 8, 5, 5])
pytorch卷积接口计算结果与自定义cuda卷积模块计算结果是否相等:True


## 分析：
### 目前只实现了单通道的 2D conv kernel；对于实际场景中多通道输入、多通道输出的情况，是在 Python 端通过 for 循环逐个 (batch, 输出通道, 输入通道) 调用该 kernel，再把结果累加合并来实现的，并非最优实现。更优的实现目前尚在调试验证中。