In [1]:
%load_ext cython
import numpy as np

## Mac OS X: `clang` does not support `openmp`

 - You have to use GCC
 - install from homebrew

In [2]:
!brew ls --versions gcc

gcc 6.3.0


**The `gcc` command maps back to clang. The "real" GCC is different:**

Let's find all executables beginning with "gcc":

In [3]:
!compgen -c | grep ^gcc

gcc
gcc-6
gcc-ar-6
gcc-nm-6
gcc-ranlib-6
gcc-6
gcc-ar-6
gcc-nm-6
gcc-ranlib-6
gcc


**My "real" GCC command is `gcc-5`**

You can switch the compiler by changing the `CC` environment variable:

In [5]:
import os
os.environ['CC'] = 'gcc-6'

<div style="margin-top: 100px;"></div>
# Compare `range()` and `prange()`

In [7]:
%%cython -f
# distutils: extra_compile_args = -fopenmp
# distutils: extra_link_args = -fopenmp
# cython: boundscheck = False
from libc.math cimport log
from cython.parallel cimport prange

def f1(double[:] x, double[:] out):
    cdef int i, n = x.shape[0]
    for i in range(n):
        out[i] = log(x[i])
        
def f2(double[:] x, double[:] out):
    cdef int i, n = x.shape[0]
    for i in prange(n, nogil=True):
        out[i] = log(x[i])

## Make some data

In [8]:
data = np.random.rand(10000000)
out = np.zeros_like(data)

## Timings

In [9]:
%timeit f1(data, out)
%timeit np.log(data, out=out)
%timeit f2(data, out)

10 loops, best of 3: 90.1 ms per loop
10 loops, best of 3: 81.4 ms per loop
100 loops, best of 3: 20.2 ms per loop


# Some things are tricky

In [10]:
%%cython
# distutils: extra_compile_args = -fopenmp
# distutils: extra_link_args = -fopenmp
# cython: boundscheck = False
from libc.math cimport log
from cython.parallel cimport prange

def f_single(double[:] x):
    cdef int i, n = x.shape[0]
    cdef double result = 0
    for i in range(n):
        if x[i] > 0.5:
            result += log(x[i])
        else:
            result += 1.0
    return result
        
def f_parallel(double[:] x):
    cdef int i, n = x.shape[0]
    cdef double result = 0
    for i in prange(n, nogil=True):
        if x[i] > 0.5:
            result += log(x[i])
        else:
            result += 1.0
    return result



Error compiling Cython file:
------------------------------------------------------------
...
    cdef double result = 0
    for i in prange(n, nogil=True):
        if x[i] > 0.5:
            result += log(x[i])
        else:
            result += 1.0
                  ^
------------------------------------------------------------

/Users/siuser/.ipython/cython/_cython_magic_aeefecf77f3f9e22a3a2881a69a44280.pyx:24:19: local variable 'result' referenced before assignment


## Confusing explanation from the Cython docs:

> If you assign to a variable in a prange block, it becomes lastprivate, meaning that the variable will contain the value from the last iteration. <u>**If you use an inplace operator on a variable, it becomes a reduction**</u>, meaning that the values from the thread-local copies of the variable will be reduced with the operator and assigned to the original variable **after the loop**. The index variable is always lastprivate. Variables assigned to in a parallel with block will be private and unusable after the block, as there is no concept of a sequentially last value.

## Solution: need a temporary variable

In [11]:
%%cython
# distutils: extra_compile_args = -fopenmp
# distutils: extra_link_args = -fopenmp
# cython: boundscheck = False
from libc.math cimport log
from cython.parallel cimport prange

def g_single(double[:] x):
    cdef int i, n = x.shape[0]
    cdef double result = 0
    for i in range(n):
        if x[i] > 0.5:
            result += log(x[i])
        else:
            result += 1.0
    return result
        
def g_parallel(double[:] x):
    cdef int i, n = x.shape[0]
    cdef double tmp, result = 0
    for i in prange(n, nogil=True):
        if x[i] > 0.5:
            tmp = log(x[i])
        else:
            tmp = 1.0
        result += tmp
    return result


In [12]:
print(g_single(data[:3]))
print(g_parallel(data[:3]))

0.4868732156091631
0.4868732156091631


In [13]:
%timeit g_single(data)
%timeit g_parallel(data)

10 loops, best of 3: 93.1 ms per loop
100 loops, best of 3: 20.9 ms per loop
