# Quick Dask example

This notebook shows how to use `dask.distributed` to set up a local cluster backed either by a pool of processes or by a mix of processes and threads.

See https://dask.pydata.org to learn more.

In [1]:
import os
import random
from distributed import Client, LocalCluster

from helpers import fake_io, n_fibonacci, digits_fibonacci

In [2]:
# CPU count reported by the OS; used later as the number of worker processes.
n_cpu = os.cpu_count()
n_cpu

4

In [3]:
# Mixed pool: one worker process per CPU, each worker running 5 threads.
cluster45 = LocalCluster(
    n_workers=n_cpu,
    threads_per_worker=5,
    diagnostics_port=37879,
)

# Second cluster: only the diagnostics port is set, so the number of workers
# and threads per worker is left to LocalCluster's defaults.
cluster4 = LocalCluster(
    diagnostics_port=42167,
)

cluster45, cluster4

(LocalCluster('tcp://127.0.0.1:42748', workers=4, ncores=20),
 LocalCluster('tcp://127.0.0.1:45539', workers=4, ncores=4))

In [4]:
# Client bound to the mixed process/thread cluster (`cluster45`).
client45 = Client(cluster45)
client45

0,1
Client  Scheduler: tcp://127.0.0.1:42748  Dashboard: http://127.0.0.1:37879,Cluster  Workers: 4  Cores: 20  Memory: 19.93 GB


In [5]:
# Client bound to the default-configured cluster (`cluster4`).
client4 = Client(cluster4)
client4

0,1
Client  Scheduler: tcp://127.0.0.1:45539  Dashboard: http://127.0.0.1:42167,Cluster  Workers: 4  Cores: 4  Memory: 4.98 GB


In [6]:
# Draw 32 pseudo-random workload sizes, each between 150000 and 250000.
# `random.randint` requires integer bounds: float bounds such as 1E5 are
# deprecated since Python 3.10 and raise TypeError from Python 3.12 on,
# so the bounds are converted with int() first.
# Note that `//` binds tighter than `+`: only the random offset is halved.
random_input = [
    int(2E5) + random.randint(-int(1E5), int(1E5)) // 2 for __ in range(32)
]
print(random_input)

[227236, 186463, 150851, 249458, 170926, 241506, 205392, 194597, 186421, 170379, 178221, 194118, 163396, 162156, 199797, 162676, 197052, 195082, 229131, 184671, 155695, 245647, 210217, 220284, 166361, 199615, 160328, 222357, 188427, 232397, 231070, 197400]


In [7]:
def process_datum(n):
    """Run the full read -> compute -> write pipeline for one input.

    The input is passed through `fake_io`, the result is handed to
    `digits_fibonacci`, and that result goes through `fake_io` once more.
    Both helpers are imported from the local `helpers` module.
    """
    loaded = fake_io(n)  # e.g. read data from disk/API
    digit_count = digits_fibonacci(loaded)
    written = fake_io(digit_count)  # e.g. write data to disk/API
    return written

In [8]:
%%time
list(map(process_datum, random_input[:8]))

CPU times: user 5.16 s, sys: 16 ms, total: 5.18 s
Wall time: 8.59 s


[47490, 38969, 31526, 52134, 35722, 50472, 42925, 40669]

In [9]:
%%time
# Same pipeline as `process_datum`, split into three chained map stages on
# `client4`: each stage receives the futures of the previous stage as input.
f_read = client4.map(fake_io, random_input)
f_proc = client4.map(digits_fibonacci, f_read)
f_out  = client4.map(fake_io, f_proc)
# Collect the results of the final stage.
client4.gather(f_out)

# Alternative kept for reference: fetch the results one future at a time.
#     print(list(f.result() for f in f_out))

CPU times: user 2.22 s, sys: 384 ms, total: 2.6 s
Wall time: 8.97 s


In [17]:
%%time
# Same three chained map stages as in the `client4` cell, this time submitted
# through `client45` (the cluster with 5 threads per worker).
f_read = client45.map(fake_io, random_input)
f_proc = client45.map(digits_fibonacci, f_read)
f_out  = client45.map(fake_io, f_proc)
# Collect the results of the final stage.
client45.gather(f_out)

# Alternative kept for reference: fetch the results one future at a time.
#     print(list(f.result() for f in f_out))

CPU times: user 1.11 s, sys: 288 ms, total: 1.4 s
Wall time: 6.03 s


In [13]:
%%time
# Compute stage only: apply `digits_fibonacci` directly to the raw inputs,
# without the two `fake_io` stages, and collect the results.
futures = client45.map(digits_fibonacci, random_input)
client45.gather(futures)

CPU times: user 28 ms, sys: 8 ms, total: 36 ms
Wall time: 32.5 ms


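Dask caches results: when a computation it has already run is submitted again, it reuses the stored results instead of recomputing them, which is why a repeated run can finish in milliseconds.
If you want to time the computation from scratch again, you'll have to restart the client first.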

In [16]:
# Restart `client45` so that re-running the cells above recomputes the
# results instead of reusing cached ones.
client45.restart()

0,1
Client  Scheduler: tcp://127.0.0.1:42748  Dashboard: http://127.0.0.1:37879,Cluster  Workers: 4  Cores: 20  Memory: 19.93 GB
