# Lab: threading

Write a program to fetch photos from the site <https://jsonplaceholder.typicode.com/>. 

Try fetching with different levels of parallelism (number of threads).

Use the following to get started:

In [3]:
import requests

def get_photos(timeout=30):
    """Return the URL of every photo listed by the JSONPlaceholder ``/photos`` endpoint.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot hang the notebook indefinitely. Pass
            ``None`` to wait without limit.

    Returns:
        A list with the ``'url'`` field of each object in the JSON listing.

    Raises:
        requests.HTTPError: If the listing request returns a 4xx/5xx status.
    """
    resp = requests.get('https://jsonplaceholder.typicode.com/photos', timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page
    # as the photo listing (same check fetch_photo applies to each photo).
    resp.raise_for_status()
    return [obj['url'] for obj in resp.json()]

def fetch_photo(url, timeout=30):
    """Download a single photo and return the HTTP response.

    Args:
        url: Address of the photo to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the calling
            thread forever. Pass ``None`` to wait without limit (the
            previous behaviour).

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp

In [4]:
%%time
# Serial baseline (1 thread): fetch the first 12 photos one after another.

photos = get_photos()
for i, url in enumerate(photos[:12]):
    r = fetch_photo(url)


CPU times: user 225 ms, sys: 16.5 ms, total: 242 ms
Wall time: 1.52 s


Write a version to fetch with a specified number of threads

In [5]:
import threading
import queue

def worker(qjob, qresult):
    """Thread body: fetch URLs taken from ``qjob`` until a ``None`` sentinel arrives.

    For every URL exactly one ``(response, error)`` pair is put on
    ``qresult``: ``(response, None)`` when the fetch succeeds and
    ``(None, exception)`` when it raises.
    """
    url = qjob.get()
    while url is not None:
        try:
            outcome = (fetch_photo(url), None)
        except Exception as exc:
            # Report the failure instead of letting it kill the thread, so
            # the consumer still receives one result per job.
            outcome = (None, exc)
        qresult.put(outcome)
        url = qjob.get()

In [9]:
def thread_version(photos, nthread):
    """Fetch every URL in ``photos`` with ``nthread`` worker threads and print a summary.

    Each thread runs ``worker``, which takes URLs from a shared job queue and
    reports one ``(response, error)`` pair per URL on a shared result queue.

    Args:
        photos: Sized collection of photo URLs (``len()`` is taken of it).
        nthread: Number of worker threads to start; must be at least 1.

    Raises:
        ValueError: If ``nthread`` is less than 1. With no workers nothing
            would ever reach the result queue and collecting the results
            would block forever.
    """
    if nthread < 1:
        raise ValueError(f'nthread must be at least 1, got {nthread!r}')
    qjob = queue.Queue()
    qresult = queue.Queue()
    threads = [
        threading.Thread(target=worker, args=(qjob, qresult))
        for _ in range(nthread)
    ]
    num_err = 0
    try:
        for t in threads:
            t.start()
        for url in photos:
            qjob.put(url)
        # Exactly one result is expected per URL, whether it succeeded or not.
        for _ in range(len(photos)):
            _resp, err = qresult.get()
            if err is not None:
                num_err += 1
    finally:
        # Always queue one stop sentinel per thread, even if starting threads
        # or collecting results was interrupted; otherwise the workers would
        # stay blocked on qjob.get() for the life of the process.
        for _ in threads:
            qjob.put(None)
    for t in threads:
        t.join()
    print(f'Fetched {len(photos)} with {num_err} errors')

Use the `%%time` Jupyter magic to see how long it takes to fetch 5000 photos with 10, 100, and 1000 threads. (The runs below use the first 500 photos with 10, 100, and 500 threads.)

In [10]:
%%time
# Fetch the first 500 photos with 10 worker threads.
thread_version(photos=photos[:500], nthread=10)

Fetched 500 with 0 errors
CPU times: user 7.54 s, sys: 681 ms, total: 8.22 s
Wall time: 7.04 s


In [11]:
%%time
# Fetch the first 500 photos with 100 worker threads.
thread_version(photos=photos[:500], nthread=100)

Fetched 500 with 1 errors
CPU times: user 9.97 s, sys: 2.53 s, total: 12.5 s
Wall time: 5.64 s


In [12]:
%%time
# Fetch the first 500 photos with 500 worker threads (one thread per photo).
thread_version(photos=photos[:500], nthread=500)

Fetched 500 with 2 errors
CPU times: user 12.8 s, sys: 32.4 s, total: 45.2 s
Wall time: 13.6 s


Alternatively, use `multiprocessing.pool.ThreadPool`

In [13]:
import multiprocessing.pool

In [15]:
%%time
# Fetch the first 500 photos on a pool of 10 threads. The loop only drains the
# result iterator so the cell waits for every fetch; nothing is done with the
# responses.
with multiprocessing.pool.ThreadPool(10) as pool:
    for r in pool.imap_unordered(func=fetch_photo, iterable=photos[:500]):
        continue

CPU times: user 7.75 s, sys: 684 ms, total: 8.44 s
Wall time: 8.5 s


In [16]:
%%time
# Fetch the first 500 photos on a pool of 100 threads. The loop only drains
# the result iterator so the cell waits for every fetch; nothing is done with
# the responses.
with multiprocessing.pool.ThreadPool(100) as pool:
    for r in pool.imap_unordered(func=fetch_photo, iterable=photos[:500]):
        continue

CPU times: user 9.05 s, sys: 1.63 s, total: 10.7 s
Wall time: 4.85 s


In [18]:
%%time
# Fetch the first 500 photos on a pool of 500 threads (one thread per photo).
# NOTE: an exception raised inside fetch_photo surfaces when its result is
# consumed by this loop, so a single failed fetch aborts the whole cell, as
# the ConnectionError in the output below shows.
with multiprocessing.pool.ThreadPool(500) as pool:
    for r in pool.imap_unordered(func=fetch_photo, iterable=photos[:500]):
        continue

ConnectionError: HTTPSConnectionPool(host='via.placeholder.com', port=443): Max retries exceeded with url: /600/7dd663 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x11300bcd0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

# Lab: multiprocessing

Using a `multiprocessing.Pool`, repeat the exercise above and see if it became faster or slower. Any hypotheses why?

In [19]:
%%time
# Same 500 fetches, spread over 10 worker processes instead of threads.
with multiprocessing.Pool(10) as pool:
    pool.map(func=fetch_photo, iterable=photos[:500])

CPU times: user 106 ms, sys: 81.2 ms, total: 187 ms
Wall time: 7.22 s


In [20]:
%%time
# Same 500 fetches, spread over 20 worker processes instead of threads.
with multiprocessing.Pool(20) as pool:
    pool.map(func=fetch_photo, iterable=photos[:500])

CPU times: user 72.3 ms, sys: 77.3 ms, total: 150 ms
Wall time: 4.92 s


In [21]:
%%time
# Same 500 fetches, spread over 50 worker processes instead of threads.
with multiprocessing.Pool(50) as pool:
    pool.map(func=fetch_photo, iterable=photos[:500])

CPU times: user 134 ms, sys: 171 ms, total: 305 ms
Wall time: 4.41 s


In [22]:
%%time
# Same 500 fetches, spread over 500 worker processes (one process per photo).
with multiprocessing.Pool(500) as pool:
    pool.map(func=fetch_photo, iterable=photos[:500])

CPU times: user 1.27 s, sys: 2.52 s, total: 3.79 s
Wall time: 16.7 s
