# Cythonup: one-day Cython workshop
## Quick Prototyping for Speed (2)
### Tzer-jen Wei http://github.com/tjwei


# 用 C 的方式寫 Python

_1._ 記憶體存取效率

[knapsack_0_original.py](/edit/knapsack/knapsack_0_original.py) 的問題：

第 21 行
```python
left = [item for item in items if item.weight <= K]
```

第 39 行

```python
 v, idxs = search(left[i + 1:], K-item.weight, best_v, current_v+item.value, current_line+[item.index])
```

list copy/create 太過昂貴（雖然很方便）

解決方法：
* 非必要的東西不傳/不建立
* 一定要傳的東西，共用/還原
* 但是要很小心
* np.array 和 cython 的 typed memoryview (類似 go 的 slice)
* c array 自己追範圍
* C++ 的 vector 取代 list

In [None]:
%load_ext Cython

標準例子 from http://docs.cython.org/src/userguide/memoryviews.html

In [None]:
%%cython
# cython: boundscheck=False
from cython.view cimport array as cvarray
import numpy as np

# Memoryview on a NumPy array
# dtype "i" is NumPy's code for a C int, matching the `int` element type of the view.
narr = np.arange(27, dtype=np.dtype("i")).reshape((3, 3, 3))
cdef int [:, :, :] narr_view = narr

# Memoryview on a C array
# carr is not assigned any values here; it is filled by the copy from narr_view below.
cdef int carr[3][3][3]
cdef int [:, :, :] carr_view = carr

# Memoryview on a Cython array
# itemsize/format describe one C int per element, so the array fits an int view.
cyarr = cvarray(shape=(3, 3, 3), itemsize=sizeof(int), format="i")
cdef int [:, :, :] cyarr_view = cyarr

# Show the sum of all the arrays before altering it
print "NumPy sum of the NumPy array before assignments:", narr.sum()

# We can copy the values from one memoryview into another using a single
# statement, by either indexing with ... or (NumPy-style) with a colon.
# These copies run before narr_view is overwritten with 3 below.
carr_view[...] = narr_view
cyarr_view[:] = narr_view
# NumPy-style syntax for assigning a single value to all elements.
narr_view[:, :, :] = 3

# Just to distinguish the arrays
carr_view[0, 0, 0] = 100
cyarr_view[0, 0, 0] = 1000

# Assigning into the memoryview on the NumPy array alters the latter
print "NumPy sum of NumPy array after assignments:", narr.sum()

# A function using a memoryview does not usually need the GIL.
# Returns the sum of every element of the 3-D int view `arr`.
cpdef int sum3d(int[:, :, :] arr) nogil:
    # Declare every index and bound as a C integer. The function is nogil,
    # so none of these names may end up being handled as Python objects.
    cdef Py_ssize_t i, j, k
    cdef Py_ssize_t I = arr.shape[0]
    cdef Py_ssize_t J = arr.shape[1]
    cdef Py_ssize_t K = arr.shape[2]
    cdef int total = 0
    for i in range(I):
        for j in range(J):
            for k in range(K):
                total += arr[i, j, k]
    return total

# A function accepting a memoryview knows how to use a NumPy array,
# a C array, a Cython array...
print "Memoryview sum of NumPy array is", sum3d(narr)
print "Memoryview sum of C array is", sum3d(carr)
print "Memoryview sum of Cython array is", sum3d(cyarr)
# ... and of course, a memoryview.
print "Memoryview sum of C memoryview is", sum3d(carr_view)

Memoryview 的用法和思維接近 np.array

你可以用 memoryview 來包
* np.array
* C array `cdef int carr[3][3][3]`
* Cython array `from cython.view cimport array`
* cython 會幫你自動轉換

In [None]:
%%cython
import numpy as np
from array import array
# `a` is declared as a 1-D typed memoryview of C long, so the argument is
# converted to a view when the function is called.
def f(long[:] a):
    # Prints the view object itself (its repr), not type(a).
    print "type of a is", a
# A NumPy array can be passed directly where a memoryview is expected.
# NOTE(review): assumes NumPy's default integer dtype matches C long on this platform.
f(np.array([1,2,3]))
# Assigning a NumPy array to a typed-memoryview variable converts it the same way.
cdef long[:] b =np.array([1,2,3])
print "type of b is", b

通常不需要 gil

In [None]:
%%cython  --compile-args=-fopenmp  --link-args=-fopenmp
# cython: infer_types=True, boundscheck=False 
from cython.parallel import prange
from libc.math cimport sin

# Single-threading version
# Applies a[i] = sin(a[i]*a[i] + a[i]) 10000 times to each element of `a`,
# in place, and returns the same view.
cpdef double[:] x3(double[:] a):
    # Take the length from the view's shape (a C-level value) and keep it the
    # same integer type as the index `i`.
    cdef Py_ssize_t i, n = a.shape[0]
    cdef int j
    for i in range(n):
        for j in range(10000):
            a[i] = sin(a[i]*a[i]+a[i])
    return a

# Multi-threading version
# Same computation as the single-threaded version, but the outer loop over
# elements is a prange loop run without the GIL. Each iteration writes only
# its own a[i]. Updates `a` in place and returns the same view.
cpdef double[:] px3(double[:] a):
    # Take the length from the view's shape (a C-level value) and keep it the
    # same integer type as the index `i`.
    cdef Py_ssize_t i, n = a.shape[0]
    cdef int j
    for i in prange(n, nogil=True):
        for j in range(10000):
            a[i] = sin(a[i]*a[i]+a[i])
    return a

測試 Single-threading version

In [None]:
%%time
import numpy as np
# Build a fresh 10000-element float array so the run starts from known input
# (x3 overwrites its argument in place).
a = np.arange(10000, dtype=float)
x3(a)

測試 Multithreading version

In [None]:
%%time
import numpy as np
# Same input as the single-threaded run, rebuilt because the kernels
# overwrite their argument in place.
a = np.arange(10000, dtype=float)
px3(a)

### More about prange

某些情形下可以在 prange 裡面累加變數（例如下面的 `j`，cython 會自動分析並把它當成 reduction 變數）

In [None]:
%%cython -f --compile-args=-fopenmp  --link-args=-fopenmp
# cython: infer_types=True, boundscheck=False 
from cython.parallel import prange
import numpy as np
# Sums the elements of the 1-D long view `a` inside a prange loop and
# returns the total.
cdef long psum(long[:] a):
    cdef Py_ssize_t i
    cdef long n = a.size, j=0
    for i in prange(n, nogil=True):    
        # `j` is only updated with an in-place operator inside the prange body;
        # this is the usage Cython analyses automatically (see the Cython
        # parallelism docs on reduction variables).
        j+=a[i]
        # a[i] =  j  # try to uncomment this line
    return j
# NOTE(review): assumes np.array(range(100)) has a dtype compatible with long[:] on this platform.
print psum(np.array(range(100)))

用 parallel context 設定 thread 數量

或者 prange 裡面也能設定

`cython.parallel.prange([start,] stop[, step][, nogil=False][, schedule=None[, chunksize=None]][, num_threads=None])`

schedule 可以參考[文件](http://docs.cython.org/src/userguide/parallelism.html)

prange 裡面能放 `with gil:`，在 `with gil:` 區塊裡面可以呼叫 python 函數和使用 python 物件

In [None]:
%%cython  -f --compile-args=-fopenmp  --link-args=-fopenmp
# cython: infer_types=True, boundscheck=False 
from cython.parallel import prange, parallel
# Prints each prange index from inside a parallel region.
cdef fprint():
    cdef Py_ssize_t i
    # Release the GIL and open a parallel region; num_threads=8 is passed to
    # parallel() rather than to prange here.
    with nogil, parallel(num_threads=8):
        for i in prange(10):
            # print is a Python operation, so take the GIL back just for it.
            with gil:
                print i
fprint()

固定大小的記憶體存取通常都能 `nogil`, 但是自己要小心(注意輸出結果可能會有重複數字)

In [None]:
%%cython  -f --cplus --compile-args=-fopenmp  --link-args=-fopenmp
# cython: infer_types=True, infer_types.verbose=True, boundscheck=False 
from cython.parallel import prange
from libcpp.vector cimport vector
import numpy as np
cimport numpy as np
# Demonstrates unsynchronised access to fixed-size memory from prange.
# Fills a[0..19] with successive values of a shared counter and returns `a`.
cdef passign(vector[long] a):
    cdef Py_ssize_t i
    #cdef np.ndarray[long, ndim=1] v= np.array([0])    
    #cdef int[long] v=np.array([0])
    # One-element vector used as a counter; all iterations update the same v[0].
    cdef vector[long] v=[0]
    for i in prange(20, nogil=True, num_threads = 20):
        # The increment and the following read of v[0] are not protected by
        # any lock, which is why the accompanying text warns that the output
        # may contain repeated numbers.
        v[0] = v[0]+1
        a[i] = v[0]
    return a

# The Python list is converted to vector[long] for the call.
print passign([0]*20)

但是要改變記憶體大小就需要 lock

下面這段程式碼可以試試看把 `with gil` 註解掉(然後準備 crash)

In [None]:
%%cython  -f --cplus --compile-args=-fopenmp  --link-args=-fopenmp
# cython: infer_types=True, infer_types.verbose=True, boundscheck=False 
from cython.parallel import prange
from libcpp.vector cimport vector
# Deliberately unsafe demo: appends to one C++ vector from several prange
# threads with the `with gil:` guard commented out.
cdef pappend():
    cdef Py_ssize_t i
    cdef vector[long] v=[]
    for i in prange(10, nogil=True, num_threads = 4):
        # Re-enable the next line to serialise the push_back calls; without it
        # the accompanying text says to expect a crash.
        #with gil:
            v.push_back(i)    
    return v
print pappend()

In [None]:
# If the kernel crashed, load the Cython magic again
%load_ext Cython

不想用 gil 的話，可以改用非全域的 lock（例如 OpenMP 的 `omp_lock_t`）

In [None]:
%%cython  -f --cplus --compile-args=-fopenmp  --link-args=-fopenmp
# cython: infer_types=True, infer_types.verbose=True, boundscheck=False 
from cython.parallel import prange
from libcpp.vector cimport vector
cimport openmp as omp
# Appends 0..9 to a C++ vector from a prange loop, serialising the push_back
# calls with an OpenMP lock instead of the GIL. Returns the vector; the order
# of the elements depends on thread scheduling.
cdef pappend():
    cdef Py_ssize_t i
    cdef vector[long] v=[]
    cdef omp.omp_lock_t lock
    omp.omp_init_lock(&lock)
    for i in prange(10, nogil=True):
        # push_back may reallocate the vector, so only one thread may run it
        # at a time.
        omp.omp_set_lock(&lock)
        v.push_back(i)
        omp.omp_unset_lock(&lock)
    # Every omp_init_lock needs a matching omp_destroy_lock to release the
    # lock's resources once no thread can use it any more.
    omp.omp_destroy_lock(&lock)
    return v
print pappend()

# 用 Python 的方式寫 C

* list 可以用 vector 替代 http://www.cplusplus.com/reference/vector/vector/
* dict 可以用 map 替代（不過 map 比較像 defaultdict）http://www.cplusplus.com/reference/map/map/

In [None]:
%%cython --cplus
from libcpp.vector cimport vector
from libcpp.map cimport map
from libcpp.string cimport string
# Declare the C++ containers; no trailing semicolons are needed in Cython.
cdef map[string, long] m
cdef vector[long] v = [0, 3, 9]
# Index assignment on the map inserts the key.
m["aaa"] = 1
# Printing hands the containers to Python, so they are shown as Python objects.
print v
print m

但怎麼共存？
* len(list)   $\neq$ vector.size
* list.append $\neq$ vector.push_back

例子:  brainfuck4_merge

in bf4_merge.pxd
```cython
from vector_list cimport vector
vector[char] cells
vector[char] P
```

in bf4_merge.py
```python
# 看起來跟普通的 list 一樣
cells = [0]*1000
P = [ord(x) for x in open(sys.argv[1], "r").read()]
```
in vector_list.pxd  (`list.__len__()` 在 python 也可以用)
```cython
        void append "push_back"(T&) except +
        size_t __len__ "size"()
```

In [None]:
# A simple piece of Python code
# (kept textually identical to the body reused in the Cython cell that follows)
a = []
for i in range(10):
    a.append(i**2)
print a
# Calls the length method by its dunder name on purpose: that is the name
# the vector_list.pxd excerpt maps to the vector's size().
print a.__len__()

In [None]:
%%cython -a  -f --cplus
# Add a few simple declarations
# `vector` comes from the project's vector_list.pxd, whose excerpt maps
# append -> push_back and __len__ -> size.
from vector_list cimport vector
cdef vector[long] a
# Below is the same Python code as in the previous cell
a = []
for i in range(10):
    a.append(i**2)
print a
print a.__len__()

cdef 出來的函數在 python 中沒有怎麼辦?

例子: brainfuck5
in brainfuck5_improved.pxd
```cython
from libc.stdio cimport putchar
```
in brainfuck5_improved.py
```python
from __future__ import print_function
globals()['putchar'] = lambda x: print(chr(x), end="")
```

或者 `from xxx import putchar` 也行


In [None]:
from __future__ import print_function
# Plain Python has no putchar, so install a stand-in in the module globals
# that prints the character for a code point without a trailing newline.
globals()['putchar'] = lambda x: print(chr(x), end="")
# The integers are the character codes of "Hello world!" followed by a newline.
for x in [72, 101, 108, 108, 111,32,119,111,114,108, 100, 33, 10]:
    putchar(x)

In [None]:
%%cython -f -a
from __future__ import print_function
from libc.stdio cimport putchar
# The same Python code as in the previous cell
# NOTE(review): with putchar cimported from libc.stdio above, the call below
# presumably binds to the C function at compile time rather than to this
# globals() entry; check the annotated output (-a) to confirm.
globals()['putchar'] = lambda x: print(chr(x), end="")
for x in [72, 101, 108, 108, 111,32,119,111,114,108, 100, 33, 10]:
    putchar(x)

_2._ 指標

cython 不像 C++ 有 `p->x` 和 `p.x` 的差別，在 cython 要特別小心 