In [1]:
%load_ext Cython
import numpy as np
from scipy.stats import entropy

## Matrix multiplication

### Pure Python implementation

In [8]:
def mul_py(x,y):

    m1,n1 = x.shape
    m2,n2 = y.shape
    
    z = np.zeros((m1,n2))

    for i in range(m1): 
        for j in range(n2): 
            val = 0
            for k in range(m2): 
                val += x[i][k] * y[k][j]
            z[i][j] = val
                
    return z


### Cythonized version

In [9]:
%%cython

import numpy as np
cimport numpy as np


def mul_cy(np.ndarray x, np.ndarray y):
    
    cdef int m1 = x.shape[0]
    cdef int n1 = x.shape[1]
    cdef int m2 = y.shape[0]
    cdef int n2 = y.shape[1]
    cdef np.ndarray z = np.zeros([m1, n2], dtype=float)
    
    cdef int i 
    cdef int j
    cdef int k
    
    cdef double value

    for i in range(m1): 
        for j in range(n2): 
            value = 0
            for k in range(m2): 
                value += x[i][k] * y[k][j]
            
            z[i][j] = value
            
    return z


In [10]:
N = 100
arr1 = np.random.random((N,N))
arr2 = np.random.random((N,N))

In [11]:
%timeit -n 1  mul_py(arr1,arr2)
%timeit -n 1  mul_cy(arr1,arr2)

744 ms ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
474 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## NumPy buffer  declaration

The `cnp.ndarray[double]` declares a NumPy array _buffer_ object.  Cython knows how to interact with this array-like object efficiently.  The `double` in square brackets is the (scalar) dtype of the array elements.

In [6]:
%%cython

import numpy as np
cimport numpy as np

def mul_v2(np.ndarray[np.float64_t, ndim=2] x, np.ndarray[np.float64_t, ndim=2] y):
    
    cdef int m1 = x.shape[0]
    cdef int n1 = x.shape[1]
    cdef int m2 = y.shape[0]
    cdef int n2 = y.shape[1]
    cdef np.ndarray[np.float64_t, ndim=2] z = np.zeros([m1, n2], dtype=float)
    
    cdef int i 
    cdef int j
    cdef int k
    
    cdef double value

    for i in range(m1): 
        for j in range(n2):
            value = 0
            for k in range(m2): 
                value += x[i][k] * y[k][j]
            z[i][j] = value
                
    return z


In [3]:
%load_ext cython

In [4]:
%%cython

import numpy as np
cimport numpy as np

def mul_v2(np.ndarray[double, ndim=2] x, np.ndarray[double, ndim=2] y):
    
    cdef int m1 = x.shape[0]
    cdef int n1 = x.shape[1]
    cdef int m2 = y.shape[0]
    cdef int n2 = y.shape[1]
    cdef np.ndarray[double, ndim=2] z = np.zeros([m1, n2], dtype=float)
    
    cdef int i 
    cdef int j
    cdef int k
    
    cdef double value

    for i in range(m1): 
        for j in range(n2):
            value = 0
            for k in range(m2): 
                value += x[i][k] * y[k][j]
            z[i][j] = value
                
    return z

In [7]:
%timeit -n 1 mul_v2(arr1,arr2)

469 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Typed memoryview 

- Typed memoryview allow efficient access to memory buffers, without any python overhead



![name](img/mem_view.png)

[Image source](https://github.com/kwmsmith/scipy-2015-cython-tutorial/blob/master/cython-scipy-2015-kurt-smith.pdf)

In [8]:
%%cython
import numpy as np

narr = np.arange(27, dtype=np.dtype("i")).reshape((3, 3, 3))
cdef int [:, :, :] narr_view = narr


print("NumPy sum of the NumPy array before assignments: %s" % narr.sum())

# NumPy-style syntax for assigning a single value to all elements.
narr_view[:, :, :] = 3

print(f"NumPy sum of NumPy array after assignments: %s" % narr.sum())

NumPy sum of the NumPy array before assignments: 351
NumPy sum of NumPy array after assignments: 81


In [9]:
%%cython

def sum3d(int[:, :, :] arr):
    cdef size_t i, j, k, I, J, K
    cdef int total = 0
    I = arr.shape[0]
    J = arr.shape[1]
    K = arr.shape[2]
    for i in range(I):
        for j in range(J):
            for k in range(K):
                total += arr[i, j, k]
    return total


In [10]:
%timeit -n 10000 narr.sum()

3.35 µs ± 197 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [11]:
%timeit -n 10000 sum3d(narr)

837 ns ± 47.5 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


The declaration

```python
def mul_mv(double[:,:] x, double[:,:] y)
   ...
```

Declares `x` and `y` to be two dimensional contiguous typed memoryview 

In [12]:
%%cython

import numpy as np
cimport numpy as np

DTYPE = np.float64
ctypedef np.float_t DTYPE_t

def mul_mv(double[:,:] x, double[:,:] y):
    
    cdef int m1 = x.shape[0]
    cdef int n1 = x.shape[1]
    cdef int m2 = y.shape[0]
    cdef int n2 = y.shape[1]
    
    cdef double[:,:] z = np.zeros([m1, n2], dtype=float)
    
    cdef int i 
    cdef int j
    cdef int k
    
    cdef double value

    for i in range(m1): 
        for j in range(n2):
            value = 0
            for k in range(m2): 
                value += x[i,k] * y[k,j]
            z[i,j] = value
                
    return z


In [13]:
%timeit -n 100 mul_mv(arr1,arr2)

1.06 ms ± 17.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### How fast is numpy?

In [14]:
def mul_np(x,y):
    """
    Multiply two arrays using numpy.
    """
    return np.matmul(x,y)

In [15]:
%timeit -n 100 mul_mv(arr1,arr2)
%timeit -n 100 mul_np(arr1,arr2)

1.05 ms ± 10.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
30.2 µs ± 15.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### How fast is C++?

In [16]:
!./mymatmul 100 100

checksum is 30376775.000000
N: 100, <T_avg>: 8.731600e-04 sec 
