# Lesson 04: Concurrency & Parallelism

## 1. Parallelism

Compared to concurrency, parallelism is easier to use, and is _usually_ easier to think about and design.

We can explore this through a data-processing scenario: going through a very large CSV/JSON doc and filtering out columns or keys.
This is usually done when selecting a relevant subset of data from a very broad data set, usually sourced from a 3rd party.

e.g. You want to download the Wikipedia dataset and filter for actors, movie titles, and release years, so that you can make a simple and comprehensive list.

e.g. 2: You want to make a demo example for this lesson, so you have to fake the data before demonstrating the filtering.

### Example 1: Generating a text file

1. We need to generate a _very_ large NDJSON file (newline-delimited JSON). For simplicity's sake, every line is readable and follows the same schema.

    ```json
    {"a": "B"}
    {"a": "C"}
    ```

1.1. There are many language and OS-level optimisations around doing the _exact_ same thing, like performing the same calculation over the same file line data. This means that we have to randomise the values in order to make a good test file.

    > use `faker`

In [None]:
from faker import (Faker, providers)
# Shared Faker instance; gen_movie() uses it as its default generator.
F = Faker()
# Register the `misc` and `geo` provider modules on the shared instance.
F.add_provider(providers.misc)
F.add_provider(providers.geo)

In [None]:
def fkr_n(fkr, n):
    """Call the zero-argument callable ``fkr`` ``n`` times and return the results as a list."""
    values = []
    for _ in range(n):
        values.append(fkr())
    return values

In [None]:
def gen_movie(f=F):
    """Build one fake movie record as a plain dict.

    Args:
        f: Faker-style generator whose provider methods supply the values;
            defaults to the module-level ``F``.

    Returns:
        A dict with one entry per key listed below. The list-valued fields
        have fixed lengths: 5 ``types``, 5 ``attributes``, 5 ``genres``,
        2 ``directors``, 15 ``writers`` and 50 ``actors``.
    """
    return {
        "titleId":         f.uuid4(),
        "ordering":        f.random_int(),
        "title":           f.catch_phrase(),
        "region":          f.locale(),
        "language":        f.language_name(),
        "types":           fkr_n(f.name, 5),
        "attributes":      fkr_n(f.name, 5),
        "isOriginalTitle": f.boolean(),
        # "tconst" used to appear twice in this literal; the later entry
        # (hex_color) silently replaced the earlier uuid4 value. Only one entry
        # is kept, with the value and key position the old literal produced.
        "tconst":          f.hex_color(),
        "titleType":       f.domain_name(),
        "primaryTitle":    f.catch_phrase(),
        "originalTitle":   ":".join([f.company(), f.catch_phrase()]),
        "isAdult":         f.boolean(),
        # NOTE(review): startYear uses f.date() while endYear uses f.year();
        # presumably a bare year was intended here too - confirm before changing.
        "startYear":       f.date(),
        "endYear":         f.year(),
        "runtimeMinutes":  f.random_int(),
        "genres":          fkr_n(f.country, 5),
        "directors":       fkr_n(f.name, 2),
        "writers":         fkr_n(f.name, 15),
        "actors":          fkr_n(f.name, 50),
    }

In [None]:
import json
from IPython.display import JSON
# Render one generated record with IPython's JSON display to inspect it.
JSON(gen_movie(F))

Woohoo! Now we just need to write this to a file

Let's make a function that loops and `yield`s data.

In [None]:
class MovieTable:
    def records(fpath, n_records=10):
        print(f"Writing {n_records} records to {fpath}")
        with open(fpath, "w") as ostream:
            for line in MovieTable.iter(n_records):
                print(line, file=ostream)

    def iter(n_records=10):
        for _ in range(n_records):
            yield gen_movie()

In [None]:
# Time the writer on a small 20-record file.
%timeit MovieTable.records("/tmp/movies.ndjson", 20)

But I need to show you the CPU usage _per core_!

What about threading?

> Use psutil

In [None]:
def cpu():
    """Print the per-core CPU utilisation reported by psutil as one tab-separated line."""
    per_core = psutil.cpu_percent(percpu=True)
    line = "\t".join(str(pct) for pct in per_core)
    print(line)

In [None]:
import math
class MovieTable:
    """Namespace of helpers that write movie records as NDJSON and report progress.

    Both helpers are static methods and are meant to be called on the class,
    e.g. ``MovieTable.records("/tmp/movies.ndjson", 500)``.
    """

    @staticmethod
    def records(fpath, n_records=10, make_record=None, report=None):
        """Write ``n_records`` JSON lines to ``fpath``, reporting as it goes.

        Args:
            fpath: Path of the output file; it is opened in "w" mode, so an
                existing file is overwritten.
            n_records: Number of lines to write.
            make_record: Optional zero-argument callable returning one
                JSON-serialisable record; defaults to ``gen_movie``.
            report: Optional zero-argument callable invoked about every 1% of
                the records (every record when ``n_records`` < 100); defaults
                to ``cpu``.
        """
        if report is None:
            report = cpu
        # Integer step: the previous float modulo ``(i+1) % (n_records/100)``
        # never compared equal to 0 for n_records < 100, so nothing was reported.
        step = max(1, n_records // 100)
        print(f"Writing {n_records} records to {fpath}")
        with open(fpath, "w", encoding="utf-8") as ostream:
            for written, record in enumerate(MovieTable.iter(n_records, make_record), start=1):
                if written % step == 0:
                    report()
                # json.dumps keeps each line valid JSON; printing the dict
                # itself would write its Python repr instead.
                print(json.dumps(record), file=ostream)

    @staticmethod
    def iter(n_records=10, make_record=None):
        """Lazily yield ``n_records`` records produced by ``make_record``.

        ``make_record`` defaults to ``gen_movie``.
        """
        if make_record is None:
            make_record = gen_movie
        for _ in range(n_records):
            yield make_record()

In [None]:
from itertools import count
import json
import asyncio
from dataclasses import dataclass
import functools

import psutil

@dataclass
class Timer:
    """Periodically call ``f`` on the asyncio event loop until stopped.

    The period used to be hard-coded to one second; it is now the ``interval``
    field, whose default keeps the previous behaviour.
    """

    # Zero-argument callable invoked once per tick.
    f: object
    # Set to True (see stop()) to make run() exit at its next check.
    sentinel: bool = False
    # Seconds to sleep between two calls of ``f``.
    interval: float = 1.0

    async def run(self):
        """Call ``f`` then sleep ``interval`` seconds, until ``sentinel`` is set."""
        while not self.sentinel:
            self.f()
            await asyncio.sleep(self.interval)

    def task(self):
        """Schedule run() as a task on the running event loop and return it."""
        return asyncio.create_task(self.run())

    async def stop(self):
        """Ask run() to exit; it stops the next time it checks ``sentinel``."""
        self.sentinel = True

async def run_with_timer(f: functools.partial, t: "Timer"):
    """Await ``f()`` while the timer's task runs, then cancel that task.

    Args:
        f: Zero-argument callable returning an awaitable (the real work).
        t: Object whose ``task()`` method starts and returns an asyncio task.

    Raises:
        Whatever ``f()`` raises; the timer task is cancelled first.
    """
    tsk = t.task()
    try:
        await f()
    finally:
        # Cancel in ``finally`` so the timer task is not left running when
        # ``f()`` raises or this coroutine is itself cancelled.
        tsk.cancel()
        try:
            await tsk
        except asyncio.CancelledError:
            print("finished")

In [None]:
class AMovieTable:
    """Coroutine counterpart of the NDJSON writer.

    ``records`` is a static method and is meant to be called on the class,
    e.g. ``await AMovieTable.records("/tmp/movies.ndjson", 500)``.
    """

    @staticmethod
    async def records(fpath, n_records=10, make_record=None):
        """Write ``n_records`` JSON lines to ``fpath``, yielding after each one.

        Args:
            fpath: Path of the output file; it is opened in "w" mode, so an
                existing file is overwritten.
            n_records: Number of lines to write.
            make_record: Optional zero-argument callable returning one
                JSON-serialisable record; defaults to ``gen_movie``.
        """
        if make_record is None:
            make_record = gen_movie
        print(f"Writing {n_records} records to {fpath}")
        with open(fpath, "w", encoding="utf-8") as ostream:
            for _ in range(n_records):
                print(json.dumps(make_record()), file=ostream)
                # sleep(0) hands control back to the event loop after every
                # record so other scheduled tasks get a turn.
                await asyncio.sleep(0)

In [None]:
import asyncio

def run_async(f, args):
    """Run coroutine function ``f(*args)`` under a CPU-reporting timer.

    When an event loop is already running in this thread, the work is scheduled
    on it as a task and that task is returned. Otherwise the work is run to
    completion with ``asyncio.run``.

    Args:
        f: Coroutine function to execute.
        args: Iterable of positional arguments for ``f``.

    Returns:
        The scheduled ``asyncio.Task`` when a loop is already running, so the
        caller can keep a reference to it or await it; ``None`` otherwise.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:  # no event loop is running in this thread
        loop = None

    ff = run_with_timer(functools.partial(f, *args), Timer(cpu))

    if loop is None:
        asyncio.run(ff)
        return None

    # get_running_loop() only ever returns a running loop, so no extra
    # is_running() check is needed before scheduling on it.
    task = loop.create_task(ff)
    task.add_done_callback(lambda _: print('fin!'))
    return task

In [None]:
# Shell out to count the lines of the generated file and list its size.
!wc -l "/tmp/movies.ndjson" && ls -alh "/tmp/movies.ndjson"

In [19]:
from multiprocessing import Pool, Process

# Write two files in parallel, one worker process per output path.
procs = []
for fpath in ["/tmp/movies.ndjson", "/tmp/movies2.ndjson"]:
    p = Process(target=MovieTable.records, args=(fpath, 500))
    p.start()
    procs.append(p)

# join() blocks until each worker has exited. The previous loop tested
# all(p.is_alive()), which reported "fin!" while the workers were still
# running and otherwise spun at full CPU.
for p in procs:
    p.join()
print("fin!")

In [31]:
def run_async_pool(args):
    """Pool worker entry point: run ``AMovieTable.records(*args)`` through ``run_async``."""
    writer = AMovieTable.records
    run_async(writer, args)

def _():
    """Map ``run_async_pool`` over two (path, record count) jobs in a pool of two processes."""
    jobs = [
        ("/tmp/movies.ndjson", 500),
        ("/tmp/movies2.ndjson", 500),
    ]
    with Pool(2) as workers:
        workers.map(run_async_pool, jobs)

In [32]:
# Run the two-process pool demo.
_()

Writing 500 records to /tmp/movies2.ndjsonWriting 500 records to /tmp/movies.ndjson

1.9	1.9	1.6	1.5	2.0	1.61.9	1.9	1.6	1.5	2.0	1.6

1.9	99.0	1.0	2.9	4.9	99.0
1.9	99.0	1.0	2.9	4.9	99.0
10.8	63.1	45.6	10.9	9.9	100.0
10.7	61.5	46.7	12.5	10.8	100.0
2.0	97.1	99.0	4.8	2.0	3.9
1.9	99.1	100.0	3.8	2.0	1.0
1.0	100.0	100.0	4.8	1.0	1.9
1.9	100.0	100.0	4.8	1.0	1.9
1.0	100.0	100.0	1.0	1.9	0.0
1.0	100.0	99.0	0.0	1.0	0.0
1.9	99.0	99.0	3.9	2.0	1.9
1.9	99.0	100.0	4.7	2.0	1.9
finished
finished
