In [2]:
import numpy as np
from scipy import signal
from scipy import ndimage
from scipy.signal import fftconvolve, oaconvolve
import cupy as cp
from cupyx.scipy.ndimage import filters

In [14]:
# Compare results from five different convolution methods on one small example.
# All methods are expected to agree; the FFT-based ones (fftconvolve, oaconvolve)
# return floats, so they may differ from the integer results by rounding error.

# 4 CPU methods
a = np.array([[1, 2, 0, 0],[5, 3, 0, 4],[0, 0, 0, 7],[9, 3, 0, 0]])
k = np.array([[1,1,1,1,1],[1,1,1,1,0],[1,1,1,0,0],[1,1,0,0,0],[1,0,0,0,0]])
cpu_results = [
    ndimage.convolve(a, k, mode='constant', cval=0.0),
    signal.convolve2d(a, k, mode='same'),
    fftconvolve(a, k, mode='same'),
    oaconvolve(a, k, mode='same'),
]
for cpu_result in cpu_results:
    print(cpu_result)

# 1 GPU method
# Copy the CPU inputs to the device instead of retyping the literals, and keep
# them under separate names so `a` / `k` always refer to the NumPy arrays.
a_gpu = cp.asarray(a)
k_gpu = cp.asarray(k)
gpu_result = filters.convolve(a_gpu, k_gpu, mode='constant', cval=0.0)
print(gpu_result)

# Make the comparison explicit rather than relying on eyeballing the printouts.
# The absolute tolerance absorbs floating-point noise from the FFT-based methods.
reference = cpu_results[0]
for other in cpu_results[1:] + [cp.asnumpy(gpu_result)]:
    np.testing.assert_allclose(other, reference, atol=1e-9)

[[11 21 14 11]
 [22 26 23 14]
 [15 23 14  7]
 [12 14  7  0]]
[[11 21 14 11]
 [22 26 23 14]
 [15 23 14  7]
 [12 14  7  0]]
[[1.10000000e+01 2.10000000e+01 1.40000000e+01 1.10000000e+01]
 [2.20000000e+01 2.60000000e+01 2.30000000e+01 1.40000000e+01]
 [1.50000000e+01 2.30000000e+01 1.40000000e+01 7.00000000e+00]
 [1.20000000e+01 1.40000000e+01 7.00000000e+00 1.77635684e-15]]
[[1.10000000e+01 2.10000000e+01 1.40000000e+01 1.10000000e+01]
 [2.20000000e+01 2.60000000e+01 2.30000000e+01 1.40000000e+01]
 [1.50000000e+01 2.30000000e+01 1.40000000e+01 7.00000000e+00]
 [1.20000000e+01 1.40000000e+01 7.00000000e+00 1.77635684e-15]]
[[11 21 14 11]
 [22 26 23 14]
 [15 23 14  7]
 [12 14  7  0]]


In [16]:
# Compare timings with different matrix size N 
# N > 100 can take significant time
N = 10
a = np.random.randint(5, size=(N, N))
k = np.random.randint(5, size=(2*N, 2*N))
%timeit -n 3 -r 4 ndimage.convolve(a, k, mode='constant', cval=0.0)
%timeit -n 3 -r 4 signal.convolve2d(a, k, mode='same')
%timeit -n 3 -r 4 fftconvolve(a, k, mode='same')
%timeit -n 3 -r 4 oaconvolve(a, k, mode='same')

a = cp.random.randint(5, size=(N, N))
k = cp.random.randint(5, size=(2*N, 2*N))
%timeit -n 3 -r 4 filters.convolve(a, k, mode='constant', cval=0.0)

1.34 ms ± 318 µs per loop (mean ± std. dev. of 4 runs, 3 loops each)
280 µs ± 68.9 µs per loop (mean ± std. dev. of 4 runs, 3 loops each)
537 µs ± 124 µs per loop (mean ± std. dev. of 4 runs, 3 loops each)
416 µs ± 97.1 µs per loop (mean ± std. dev. of 4 runs, 3 loops each)
79.2 µs ± 20.3 µs per loop (mean ± std. dev. of 4 runs, 3 loops each)


In [24]:
# Check the time for array assignments
N = 100
%timeit -n 3 -r 4 a = np.random.randint(5, size=(N, N))
%timeit -n 3 -r 4 a = cp.random.randint(5, size=(N, N))

598 µs ± 104 µs per loop (mean ± std. dev. of 4 runs, 3 loops each)
1.54 ms ± 171 µs per loop (mean ± std. dev. of 4 runs, 3 loops each)


In [None]:
# CuPy launches GPU work asynchronously, so a host-side timer can stop before
# the device has finished. Block here until all queued GPU work is complete;
# this is needed whenever GPU commands are timed from the host.
# (CuPy is imported as `cp` in this notebook, so the bare name `cupy` is undefined.)
cp.cuda.runtime.deviceSynchronize()