Parallel Map Implementations are Common
---------------------------

Almost every parallel computing framework implements an embarrassingly parallel map (applying the same function to many inputs).  We have *many* options.  

They all perform about the same.

This notebook goes through the user interface of several of them on the same problem.

In [None]:
import os
from glob import glob

import pandas as pd
import ujson as json

In [None]:
# Glob pattern for the JSON input files: ../data/json/*.json
# (`os` comes from the notebook's import cell.)
JSON_PATTERN = os.path.join('..', 'data', 'json', '*.json')

# Sorted so the list of inputs has a deterministic order.
filenames = sorted(glob(JSON_PATTERN))

def load_parse_store(fn):
    """Convert one line-delimited JSON file into an HDF5 file next to it.

    Each non-blank line of ``fn`` is parsed as a JSON document, the parsed
    records are assembled into a ``pandas.DataFrame``, and the frame is
    written under the key ``/data`` to a file named like ``fn`` but with its
    extension replaced by ``.h5``.

    Parameters
    ----------
    fn : str
        Path to the input file (one JSON document per line).

    Returns
    -------
    None
        The function is called for its side effect of writing the ``.h5``
        file.
    """
    with open(fn) as f:
        # Skip blank lines (e.g. a stray empty line) instead of letting the
        # JSON parser raise on them.
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # splitext swaps the actual extension rather than assuming it is exactly
    # five characters long ('.json').
    out_filename = os.path.splitext(fn)[0] + '.h5'
    # `key` is passed by keyword: newer pandas releases make every to_hdf
    # argument after the path keyword-only.
    df.to_hdf(out_filename, key='/data')



### Sequential for loops

In [None]:
%%time

for fn in filenames:
    load_parse_store(fn)

### Concurrent.futures

In [None]:
%%time

from concurrent.futures import ProcessPoolExecutor
e = ProcessPoolExecutor(4)

list(e.map(load_parse_store, filenames))

### Multiprocessing

In [None]:
%%time 

from multiprocessing import Pool
p = Pool(4)

list(p.map(load_parse_store, filenames))

### Joblib

In [None]:
%%time 

from joblib import Parallel, delayed

result = Parallel(n_jobs=4, backend='multiprocessing')(delayed(load_parse_store)(fn) for fn in filenames)

### IPython Parallel

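Start an IPython cluster with four engines, either from a terminal:

    ipcluster start -n 4

or by running the next cell, which launches the same command.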

In [None]:
from subprocess import Popen

# Launch `ipcluster start -n 4` as a child process without waiting for it.
ipcluster_cmd = ['ipcluster', 'start', '-n', '4']
ipcluster = Popen(ipcluster_cmd)

In [None]:
import ipyparallel as ipp

c = ipp.Client()

# use the same serialization as everyone else
all_engines = c[:]
all_engines.use_cloudpickle()

# The map below is submitted through this load-balanced view.
view = c.load_balanced_view()

In [None]:
%%time

result = list(view.map(load_parse_store, filenames))

In [None]:
# Shell out to stop the IPython cluster used by the IPython Parallel example.
!ipcluster stop

### PySpark

In [None]:
%%time

import pyspark

sc = pyspark.SparkContext('local[4]')

In [None]:
%%time

rdd = sc.parallelize(filenames)
result = rdd.map(load_parse_store).collect()

### Dask.bag

In [None]:
%%time

import dask.bag as db

b = db.from_sequence(filenames)
b.map(load_parse_store).compute()

### Dask.delayed

In [None]:
%%time

from dask import delayed, compute
import dask.multiprocessing

delayed_values = [delayed(load_parse_store)(fn) for fn in filenames]

compute(*delayed_values, get=dask.multiprocessing.get)

### Dask.distributed

In [None]:
%%time
from distributed import Executor
e = Executor()  # creates local scheduler and workers

In [None]:
%%time

futures = e.map(load_parse_store, filenames)
e.gather(futures)