In [1]:
import numpy as np
from pycuda import driver, compiler, gpuarray, tools

# -- initialize the device
import pycuda.autoinit

In [2]:
# Edge length used for the square operands below and, later, for each tile of
# the blocked matrix product.
MATRIX_SIZE = 100

In [3]:
# Host-side operands: two random MATRIX_SIZE x MATRIX_SIZE float32 matrices.
a_cpu = np.asarray(np.random.randn(MATRIX_SIZE, MATRIX_SIZE), dtype=np.float32)
b_cpu = np.asarray(np.random.randn(MATRIX_SIZE, MATRIX_SIZE), dtype=np.float32)

# Materialised (contiguous) transpose of B: row j of b_cpu_T is column j of B.
b_cpu_T = np.transpose(b_cpu).copy()

In [4]:
# Upload both host operands to the device and allocate an uninitialised
# float32 result buffer of the same square shape.
a_gpu, b_gpu = gpuarray.to_gpu(a_cpu), gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), dtype=np.float32)

In [5]:
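# Direct form of C[0,0]: row 0 of A dotted with column 0 of B. Left disabled by
# the author -- presumably because the column slice b_gpu[:,0] is strided, not
# contiguous (TODO confirm); the cells below use a transposed copy of B instead.
#c_gpu[0,0] = gpuarray.dot(a_gpu[0,:], b_gpu[:,0])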

In [6]:
# Single-entry experiment. b_gpu still holds the untransposed B at this point,
# so this is row 0 of A dotted with row 0 of B -- not yet (A @ B)[0, 0].
c_gpu[0, 0] = gpuarray.dot(a_gpu[0], b_gpu[0])

In [7]:
# Re-upload the operands, this time sending the transposed copy of B so that
# row x of b_gpu is column x of B, and start again from a fresh result buffer.
a_gpu, b_gpu = gpuarray.to_gpu(a_cpu), gpuarray.to_gpu(b_cpu_T)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), dtype=np.float32)

In [8]:
# Naive matrix product: one gpuarray.dot call per output element.
# b_gpu holds B transposed, so b_gpu[col] is column `col` of B.
for col in range(MATRIX_SIZE):
    b_col = b_gpu[col]
    for row in range(MATRIX_SIZE):
        c_gpu[row, col] = gpuarray.dot(a_gpu[row], b_col)

In [9]:
# Copy the result back from the device as a host array so the notebook displays it.
c_gpu.get()

array([[-15.943892 ,   8.425479 ,   8.740665 , ...,  12.395147 ,
         20.13782  ,  -3.8797064],
       [ -6.9981346,  -2.3953748,  -8.22378  , ...,  -9.395988 ,
         16.733772 ,   2.8975835],
       [ -9.956326 ,  -1.59469  ,   2.7385802, ...,  -1.4993839,
          7.380319 , -12.53579  ],
       ...,
       [-12.122833 , -18.453775 ,  -6.8814554, ...,   3.0014193,
         13.673326 ,   2.9579916],
       [ 19.295204 ,   9.464533 ,   5.381792 , ...,  -9.062072 ,
        -12.94082  ,   5.5380535],
       [  5.358036 ,  -2.5441356,   2.0077386, ...,  -6.541136 ,
         -5.8639827,   1.4111109]], dtype=float32)

In [10]:
# Host reference: the same product computed by NumPy, for comparison with the
# GPU result displayed above.
np.matmul(a_cpu, b_cpu)

array([[-15.943894 ,   8.425481 ,   8.740665 , ...,  12.395147 ,
         20.137825 ,  -3.879705 ],
       [ -6.998134 ,  -2.3953762,  -8.223779 , ...,  -9.395988 ,
         16.733767 ,   2.8975844],
       [ -9.956324 ,  -1.5946876,   2.7385788, ...,  -1.4993857,
          7.3803167, -12.53579  ],
       ...,
       [-12.122831 , -18.453777 ,  -6.8814554, ...,   3.0014184,
         13.6733265,   2.9579895],
       [ 19.295198 ,   9.464533 ,   5.3817916, ...,  -9.062075 ,
        -12.940815 ,   5.538054 ],
       [  5.358035 ,  -2.5441353,   2.0077388, ...,  -6.5411367,
         -5.863979 ,   1.4111106]], dtype=float32)

In [11]:
# Rectangular operands assembled from MATRIX_SIZE-sized tiles (M = MATRIX_SIZE):
#   A is (3*M, 5*M), B is (5*M, 2*M), so C = A @ B is (3*M, 2*M).
a_cpu = np.random.randn(MATRIX_SIZE * 3, MATRIX_SIZE * 5).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE * 5, MATRIX_SIZE * 2).astype(np.float32)

# Contiguous transpose of B, shape (2*M, 5*M): row j of b_cpu_T is column j of B.
b_cpu_T = b_cpu.T.copy()

# Output buffer for the tiled product. Zero-initialised instead of filled with
# random noise, so a tile that is never written is obvious on inspection.
c_cpu = np.zeros((MATRIX_SIZE * 3, MATRIX_SIZE * 2), dtype=np.float32)

In [12]:
# Tiled matrix product: fill c_cpu = a_cpu @ b_cpu one
# MATRIX_SIZE x MATRIX_SIZE output tile at a time.
#
# The column operand is taken from b_cpu_T (B transposed), so that every row of
# the uploaded b_gpu is a full column of B with the same length as a row of A.
# Slicing b_cpu directly would hand gpuarray.dot rows of B (2*MATRIX_SIZE
# elements) to pair with rows of A (5*MATRIX_SIZE elements), which is not the
# matrix product.

# Number of output tiles along each axis, derived from the operand shapes.
n_row_blocks = a_cpu.shape[0] // MATRIX_SIZE
n_col_blocks = b_cpu_T.shape[0] // MATRIX_SIZE

for y_blk in range(n_row_blocks):
    for x_blk in range(n_col_blocks):

        y_start = y_blk * MATRIX_SIZE
        x_start = x_blk * MATRIX_SIZE

        y_end = y_start + MATRIX_SIZE
        x_end = x_start + MATRIX_SIZE

        # Rows of A and columns of B (as rows of B^T) that feed this tile.
        a_gpu = gpuarray.to_gpu(a_cpu[y_start:y_end])
        b_gpu = gpuarray.to_gpu(b_cpu_T[x_start:x_end])
        c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

        # One dot product per element of the tile.
        for x in range(MATRIX_SIZE):
            for y in range(MATRIX_SIZE):
                c_gpu[y, x] = gpuarray.dot(a_gpu[y], b_gpu[x])

        # Copy the finished tile into its place in the host result.
        c_cpu[y_start:y_end, x_start:x_end] = c_gpu.get()

# Validate the assembled result against the NumPy product (float32 tolerance).
np.testing.assert_allclose(c_cpu, a_cpu @ b_cpu, rtol=1e-4, atol=1e-3)

In [13]:
# Shape check: c_cpu was allocated as (3 * MATRIX_SIZE, 2 * MATRIX_SIZE).
c_cpu.shape

(300, 200)

In [14]:
# Tiled matrix product, second version: the device tile buffer is allocated
# once and the rows of A are uploaded once per row block rather than once per
# tile.
#
# The column operand is taken from b_cpu_T (B transposed), so that every row of
# the uploaded b_gpu is a full column of B with the same length as a row of A.
# Slicing b_cpu directly would pair rows of A (5*MATRIX_SIZE elements) with
# rows of B (2*MATRIX_SIZE elements), which is not the matrix product.

# Number of output tiles along each axis, derived from the operand shapes.
n_row_blocks = a_cpu.shape[0] // MATRIX_SIZE
n_col_blocks = b_cpu_T.shape[0] // MATRIX_SIZE

# Reused for every tile; each pass below overwrites all of its elements.
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

for y_blk in range(n_row_blocks):
    y_start = y_blk * MATRIX_SIZE
    y_end = y_start + MATRIX_SIZE
    # Rows of A shared by every tile in this row block.
    a_gpu = gpuarray.to_gpu(a_cpu[y_start:y_end])

    for x_blk in range(n_col_blocks):
        x_start = x_blk * MATRIX_SIZE
        x_end = x_start + MATRIX_SIZE
        # Columns of B for this tile, taken as rows of the transposed copy.
        b_gpu = gpuarray.to_gpu(b_cpu_T[x_start:x_end])

        # One dot product per element of the tile.
        for x in range(MATRIX_SIZE):
            for y in range(MATRIX_SIZE):
                c_gpu[y, x] = gpuarray.dot(a_gpu[y], b_gpu[x])

        # Copy the finished tile into its place in the host result.
        c_cpu[y_start:y_end, x_start:x_end] = c_gpu.get()

# Validate the assembled result against the NumPy product (float32 tolerance).
np.testing.assert_allclose(c_cpu, a_cpu @ b_cpu, rtol=1e-4, atol=1e-3)