In [1]:
import numpy as np
from pycuda import driver, compiler, gpuarray, tools

# -- initialize the device
import pycuda.autoinit

In [2]:
# CUDA C source for a naive square matrix multiply, C = A * B, on row-major
# float matrices. The MATRIX_SIZE placeholder is filled in with Python string
# formatting before compilation, so the edge length is a compile-time constant.
kernel_code_template = """
__global__ void MatrixMulKernel(float *a, float *b, float *c)
{
    // Global 2D coordinates of the output element owned by this thread.
    // For a single-block launch blockIdx is (0, 0) and these reduce to
    // threadIdx.x / threadIdx.y; for a multi-block grid every thread
    // still addresses a distinct element of c.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the matrix (launch larger than the matrix) must
    // neither read nor write out of bounds.
    if (row >= %(MATRIX_SIZE)s || col >= %(MATRIX_SIZE)s) {
        return;
    }

    // Dot product of one row of a with one column of b.
    float sum = 0.0f;
    for (int k = 0; k < %(MATRIX_SIZE)s; ++k) {
        sum += a[row * %(MATRIX_SIZE)s + k] * b[k * %(MATRIX_SIZE)s + col];
    }

    // Each in-range thread writes exactly one element of the result.
    c[row * %(MATRIX_SIZE)s + col] = sum;
}
"""

In [3]:
# Edge length of the square matrices; baked into the CUDA source below.
MATRIX_SIZE = 10

# Substitute the matrix size into the kernel template.
kernel_code = kernel_code_template % dict(MATRIX_SIZE=MATRIX_SIZE)

# Compile the generated CUDA source, then look up the kernel entry point
# by its name in the compiled module.
mod = compiler.SourceModule(kernel_code)
matrixmul = mod.get_function("MatrixMulKernel")

In [4]:
# Host (CPU) operands: two MATRIX_SIZE x MATRIX_SIZE float32 matrices of
# standard-normal samples, drawn in the order a_cpu, then b_cpu.
a_cpu, b_cpu = (
    np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
    for _ in range(2)
)

# Upload both operands from host memory to device (GPU) memory.
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# Device buffer that will receive the product (C = A * B); contents are
# whatever gpuarray.empty hands back until the kernel writes to it.
c_gpu = gpuarray.empty(a_cpu.shape, np.float32)

In [5]:
# Launch the kernel with one MATRIX_SIZE x MATRIX_SIZE block of threads,
# i.e. one thread per element of the result matrix.
matrixmul(
    a_gpu,
    b_gpu,
    c_gpu,
    block=(MATRIX_SIZE, MATRIX_SIZE, 1),
)

In [6]:
# Copy the product back to the host once, and verify it against NumPy's
# reference result before displaying it. Tolerances are loose because the
# float32 accumulation on the GPU may round differently from the CPU.
c_cpu = c_gpu.get()
np.testing.assert_allclose(c_cpu, np.dot(a_cpu, b_cpu), rtol=1e-4, atol=1e-4)
print(c_cpu)

[[-3.4771717   5.482945   -6.292933    4.405063    0.51688236 -3.7065926
   1.4674895  -2.6236298   0.5188382   2.271932  ]
 [-1.8849337   2.7866294   1.4785069  -2.1530461   4.509437   -0.05617753
   3.543174   -0.3682125   2.4228332   5.6167655 ]
 [-0.24747853  1.5168964  -2.2270796   0.13722806  1.0699996  -2.634846
   2.7079709  -1.625223   -2.9609628   0.20578751]
 [ 0.6587145   0.2736324   1.727121   -3.2834382   2.3084679   0.44580218
   4.0302997  -1.6936392   3.9337013   3.5753314 ]
 [-3.6179368  -0.05588313 -0.3640145   2.540673   -3.2270606   0.6209002
  -0.7559828  -1.3442806   0.06099317 -1.5197484 ]
 [-5.6789513   4.888655    1.2472756   0.30557597 -1.7830266  -0.19065507
   3.510964   -4.01609    -1.1655627   1.5731109 ]
 [ 1.0808862   3.7633176  -2.8094559   1.7364875  -3.7730677  -0.26596048
  -3.2028282   0.13750798 -3.9100955  -2.3720872 ]
 [ 4.036247   -5.00309     2.2084363   0.47176293 -4.434005   -1.0028977
  -5.2980075   3.7076604  -3.0437567  -5.400503  ]
 [ 4.

In [7]:
# call the kernel on the card
%timeit -n 1 -r 1 matrixmul( a_gpu, b_gpu, c_gpu, block = (MATRIX_SIZE, MATRIX_SIZE, 1) )

96.1 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [8]:
# 4000 x 4000 float32 matrix of standard-normal samples on the host ...
x_cpu = (
    np.random.randn(4000, 4000)
    .astype(np.float32)
)
# ... and a copy of it in device (GPU) memory.
x_gpu = gpuarray.to_gpu(x_cpu)

In [9]:
# The device array reports the same shape as the host array it was built from.
x_gpu.shape

(4000, 4000)

In [10]:
# Elementwise scaling by 2, evaluated on the device; the result is a new
# device array and x_gpu is left untouched.
x_gpu_2 = 2 * x_gpu

In [11]:
# Sanity check: x_gpu_2 was built as x_gpu * 2, so the largest elementwise
# ratio should be exactly 2. The division and the reduction both use
# gpuarray operations.
gpuarray.max(x_gpu_2 / x_gpu)

array(2., dtype=float32)

In [12]:
# Companion check: the smallest elementwise ratio should also be exactly 2,
# i.e. every element of x_gpu was doubled.
gpuarray.min(x_gpu_2 / x_gpu)

array(2., dtype=float32)

In [13]:
%timeit -n 1 -r 1 x_sum_gpu = gpuarray.sum(x_gpu)

199 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [14]:
# Baseline for comparison: x_gpu.get() copies the whole array back to the
# host and .sum() then runs in NumPy, so this timing covers the
# device-to-host transfer plus a CPU reduction (despite the "gpu2" name).
%timeit -n 1 -r 1 x_sum_gpu2 = x_gpu.get().sum()

55 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [15]:
# Two independent 1000 x 1000 float32 host matrices of standard-normal
# samples, drawn in the order w_cpu, then v_cpu.
w_cpu, v_cpu = (
    np.random.randn(1000, 1000).astype(np.float32)
    for _ in range(2)
)

In [16]:
# Upload both host matrices to device (GPU) memory, w first and then v.
w_gpu, v_gpu = (gpuarray.to_gpu(host_matrix) for host_matrix in (w_cpu, v_cpu))