## Python - A Crash Course By Example
## Parallel Computing - A Simple Start
Dr. Victor Pankratius<br>
Massachusetts Institute of Technology<br>
http://www.victorpankratius.com

<hr>

### Tutorials
Python in HPC: https://hpc.nih.gov/training/handouts/171121_python_in_hpc.pdf 
<br><br>

### Multiprocessing package
<hr>

In [1]:
import time
import multiprocessing as mp  #parallel "map"

In [2]:
def long_runtime_f(x):
    """Return ``x`` raised to the power of two.

    Despite its name, the body only squares its argument; it serves as the
    function that the worker pool maps over the input range.
    """
    squared = pow(x, 2)
    return squared

# Pool of four worker processes, used by the timing cell that follows.
# NOTE(review): no close()/terminate() call for this pool is visible in this notebook.
p = mp.Pool(processes=4)

In [3]:
# Time the parallel map over 100 inputs.
# perf_counter() is the clock intended for measuring short intervals:
# it is monotonic and uses the highest resolution available, whereas
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
result = p.map(long_runtime_f, range(100))
end = time.perf_counter()
print("%s sec" % (end - start))

0.0022580623626708984 sec


In [4]:
#%% Benchmarking
import platform


def print_sysinfo():
    """Print a short report about the host: CPU count, Python build, and OS details.

    Returns:
        None; the eight report lines are written to standard output.
    """
    # Each row pairs the exact output template with a zero-argument getter.
    # Getters are invoked in order, immediately before their line is printed.
    report_rows = (
        ('CPU count     : %s', mp.cpu_count),
        ('Python version: %s', platform.python_version),
        ('Compiler      : %s', platform.python_compiler),
        ('System        : %s', platform.system),
        ('Release       : %s', platform.release),
        ('Machine       : %s', platform.machine),
        ('Processor     : %s', platform.processor),
        ('Interpreter   : %s', lambda: platform.architecture()[0]),
    )
    for template, getter in report_rows:
        print(template % (getter()))


print_sysinfo()

CPU count     : 8
Python version: 3.6.2
Compiler      : GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)
System        : Darwin
Release       : 16.7.0
Machine       : x86_64
Processor     : i386
Interpreter   : 64bit


### JobLib
https://pythonhosted.org/joblib/index.html
<hr>

#### Embarrassing Parallelism Demo

In [5]:
import math as m
import joblib as jb

def myF(x):
    """Return the cosine of the square root of ``x`` to the fourth power.

    Used as the per-element work item in the sequential and parallel
    timing cells.
    """
    fourth_power = x**4
    root = m.sqrt(fourth_power)
    return m.cos(root)

In [6]:
#sequential
# Baseline: evaluate myF for 10000 inputs in a plain list comprehension.
# perf_counter() is used because it is the monotonic, high-resolution
# clock intended for interval timing; time.time() is wall-clock time.
start = time.perf_counter()
result_seq = [myF(i) for i in range(10000)]
end = time.perf_counter()
print("%s sec" % (end - start))
#> 0.0117011070251 sec

0.009360074996948242 sec


In [7]:
# delayed(myF)(3) does not call myF: it packages the function, its positional
# arguments, and its keyword arguments as a tuple.
delayed_call = jb.delayed(myF)(3)
print(delayed_call)
#> (<function myF at 0x133658e60>, (3,), {})

(<function myF at 0x109046378>, (3,), {})


In [8]:
#parallel
# Same workload as the sequential cell, dispatched through joblib with 4 jobs.
# perf_counter() is used because it is the monotonic, high-resolution
# clock intended for interval timing; time.time() is wall-clock time.
start = time.perf_counter()
result_par = jb.Parallel(n_jobs=4, verbose=0)(
    jb.delayed(myF)(i) for i in range(10000)
)
end = time.perf_counter()
print("%s sec" % (end - start))
#> 2.59109902382 sec  -> slower than sequential; default uses multiprocessing package
#> 1.94945502281 sec  with backend="threading"

0.6531469821929932 sec


In [9]:
## Question / exercise: Why is this parallel version slower than the sequential version? Explain!

#### Fork - Join Parallelism Demo

In [10]:
import multiprocessing as mp
import random
import string

## Define the output queue that the worker processes put their results on
## (maxsize=0 is the default and means no size limit)
output = mp.Queue(maxsize=0)

In [11]:
## Define an example function
def rand_string(length, output):
    """ Generates a random string of digits, lower- and uppercase chars.

    Args:
        length: number of characters to generate.
        output: object with a ``put`` method (here a multiprocessing queue)
            that receives the finished string.

    Returns:
        None; the string is delivered through ``output.put``.
    """
    # Build the alphabet once instead of re-concatenating it for every character.
    alphabet = string.ascii_lowercase + string.ascii_uppercase + string.digits
    rand_str = ''.join(random.choice(alphabet) for _ in range(length))
    output.put(rand_str)

In [12]:
## Set up the list of processes that we want to run:
## four workers, each asked for a 5-character string and handed the shared queue
processes = [
    mp.Process(target=rand_string, args=(5, output))
    for _ in range(4)
]

In [13]:
## Run processes
# The loop variable is `proc`, not `p`: a top-level for-loop variable stays
# bound after the loop, and `p` already names the Pool created in an earlier
# cell, so reusing it would replace that Pool with a Process object.
for proc in processes:
    proc.start()

In [14]:
## Wait for every worker process to finish
# The loop variable is `proc`, not `p`, so the Pool bound to `p` in an
# earlier cell is not overwritten by a Process object.
# NOTE: the multiprocessing docs warn that joining a process before draining
# a queue it wrote to can deadlock when the queued data is large. The strings
# here are 5 characters each; for bigger payloads, call output.get() before join().
for proc in processes:
    proc.join()

In [15]:
## Get process results from the output queue:
## one get() call per process in the list
results = [output.get() for _ in processes]
print(results)

['e9C1x', 'bG3Fs', 'unt5z', 'CMfbH']


#### Parallel Map and Apply

In [16]:
def cube(x):
    """Return ``x`` raised to the third power."""
    cubed = pow(x, 3)
    return cubed

In [17]:
# Use the pool as a context manager so its worker processes are terminated
# when the block ends instead of being left running.
with mp.Pool(processes=4) as pool:
    # apply() blocks until its result is ready, so the six calls are
    # issued one after another.
    results = [pool.apply(cube, args=(x,)) for x in range(1, 7)]
print(results)

[1, 8, 27, 64, 125, 216]


In [18]:
# Use the pool as a context manager so its worker processes are terminated
# when the block ends instead of being left running.
with mp.Pool(processes=4) as pool:
    # map() blocks until all results are ready and returns them in input order.
    results = pool.map(cube, range(1, 7))
print(results)

[1, 8, 27, 64, 125, 216]


In [19]:
# Use the pool as a context manager so its worker processes are terminated
# when the block ends instead of being left running.
with mp.Pool(processes=4) as pool:
    # apply_async() returns immediately with an AsyncResult handle per call.
    results = [pool.apply_async(cube, args=(x,)) for x in range(1, 7)]
    # Collect the values inside the with-block: leaving it terminates the workers.
    # The values go into `async_output` rather than `output`, because `output`
    # names the multiprocessing Queue created in an earlier cell.
    async_output = [handle.get() for handle in results]
print(async_output)

[1, 8, 27, 64, 125, 216]


In [20]:
import timeit
# Time a single execution of cube(1000); the setup statement imports cube
# from the __main__ namespace so the timed statement can see it.
cube_timer = timeit.Timer(stmt='cube(1000)', setup='from __main__ import cube')
print(cube_timer.timeit(number=1))

9.14099655346945e-06
