In [2]:
import time
import json
from pprint import pprint

import pandas as pd

import dask
from dask.diagnostics import ProgressBar, Profiler
from dask.callbacks import Callback
import dask.dataframe as dd
from dask.distributed import Client, progress, LocalCluster

In [7]:
def dirprint(obj, private=False):
    """Print a table of an object's attributes: name, type and value.

    Args:
        obj: Any object; the attribute names come from ``dir(obj)``.
        private (bool): When False (the default), names that start with a
            double underscore are left out. Names with a single leading
            underscore are always listed.

    Each row holds the attribute name, the class name of its value cut to
    20 characters, and ``str(value)`` cut to 50 characters. When reading or
    rendering an attribute raises, the row shows the exception class and
    message instead, so one bad attribute does not abort the listing.
    """
    names = dir(obj)
    if not private:
        names = [name for name in names if not name.startswith('__')]

    pattern = '{key:<20}    {atype:<20}   {val}'
    print(pattern.format(key='NAME', atype='TYPE', val='VALUE'))
    for name in sorted(names):
        try:
            attr = getattr(obj, name)
            atype = attr.__class__.__name__[:20]
            val = str(attr)[:50]
        except Exception as error:
            # dir() may list names that fail on access (e.g. a property
            # that raises); report the failure in the row and carry on.
            atype = error.__class__.__name__[:20]
            val = '<unreadable: {}>'.format(error)[:50]
        print(pattern.format(key=name, atype=atype, val=val))

def last_response_to_progress(response):
    """Summarise a progress response as plain, JSON-serialisable figures.

    Args:
        response (dict): Must contain two mappings that share keys:
            ``'all'`` (key -> total count) and ``'remaining'``
            (key -> count still outstanding). Presumably the keys are the
            task groups reported by the distributed progress bar -- confirm
            against the object this is read from.

    Returns:
        dict: with the keys
            ``total`` -- sum of all counts in ``response['all']``;
            ``complete`` -- ``total`` minus the sum of the remaining counts;
            ``progress`` -- ``complete / total`` as a float, or 0.0 when
            ``total`` is zero;
            ``percent`` -- ``progress`` formatted as a percentage string
            with two decimals;
            ``tasks_completed`` -- sorted keys whose remaining count is 0;
            ``tasks_remaining`` -- sorted keys whose remaining count is not 0.
    """
    remaining = response['remaining']
    total = sum(response['all'].values())

    # Finished work is whatever is no longer outstanding; summing the
    # remaining counts directly would report the fraction still to do.
    complete = total - sum(remaining.values())

    # Nothing reported yet: show 0% instead of dividing by zero.
    progress = float(complete) / total if total else 0.0
    percent = f'{progress * 100:.2f}%'

    tasks_completed = sorted(key for key, left in remaining.items() if left == 0)
    tasks_remaining = sorted(key for key, left in remaining.items() if left != 0)

    return dict(
        total=total,
        complete=complete,
        progress=progress,
        percent=percent,
        tasks_completed=tasks_completed,
        tasks_remaining=tasks_remaining,
    )

def func(x, delay=0.1):
    """Return ``x`` unchanged after sleeping, to stand in for slow work.

    Args:
        x: Any value; it is handed back untouched.
        delay (float): Seconds to sleep before returning. Defaults to 0.1,
            the pause this function always used.

    Returns:
        The same object that was passed in as ``x``.
    """
    time.sleep(delay)
    return x

def get_data(n=100):
    """Build a small two-column pandas frame with ``2 * n`` rows.

    Args:
        n (int): Half the number of rows. Defaults to 100.

    Returns:
        pandas.DataFrame: column ``'a'`` holds ``n`` zeros followed by ``n``
        ones; column ``'b'`` holds the integers ``0 .. 2 * n - 1`` in order.
    """
    zeros = [0] * n
    ones = [1] * n
    counter = range(2 * n)

    frame = pd.DataFrame()
    frame['a'] = zeros + ones
    frame['b'] = list(counter)
    return frame

In [8]:
# Re-running this cell used to start another LocalCluster while the previous
# one was still alive (a likely source of the "Port ... is already in use"
# warning). Close whatever an earlier run left in the notebook namespace
# first: the client, then the cluster it was attached to.
for _stale_name in ('client', 'cluster'):
    _stale = globals().get(_stale_name)
    if _stale is not None:
        _stale.close()

workers = 2
cluster = LocalCluster(n_workers=workers)
client = Client(cluster)

# Build the computation step by step. Each stage gets its own name so a
# single variable is not rebound to five different kinds of object, and each
# stage is created inside a dask.annotate block carrying a 'step' label.
frame = get_data(n=200)
with dask.annotate(step='read'):
    ddf = dd.from_pandas(frame, npartitions=workers)
with dask.annotate(step='wait'):
    ddf_slow = ddf.applymap(func)  # func sleeps on every call
with dask.annotate(step='group'):
    grouped = ddf_slow.groupby('a')
with dask.annotate(step='sum'):
    summed = grouped.sum()
with dask.annotate(step='wait'):
    summed_slow = summed.apply(func, axis=1)

# `d` stays the name of the final persisted collection, and `prog` the name
# of the progress object that the polling cell reads.
d = summed_slow.persist()
prog = progress(d, notebook=True, multi=True)
prog

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36233 instead
  f"Port {expected} is already in use.\n"


VBox()

In [12]:
# Sample the progress object at a fixed interval and keep every distinct
# snapshot it reported.
MAX_POLLS = 100      # upper bound on the number of samples taken
POLL_INTERVAL = 1    # seconds to wait before each sample

results = set()
for i in range(MAX_POLLS):
    time.sleep(POLL_INTERVAL)
    # `_last_response` is a private attribute that may not exist yet, so
    # skip this sample when it is missing.
    if not hasattr(prog, '_last_response'):
        continue
    msg = last_response_to_progress(prog._last_response)
    # Serialised snapshots are hashable, so repeats collapse inside the set.
    results.add(json.dumps(msg))
    # Once at least one task was reported and none is left, further samples
    # would only repeat this snapshot; stop here rather than blocking the
    # kernel for the full MAX_POLLS * POLL_INTERVAL seconds.
    if msg['total'] and not msg['tasks_remaining']:
        break
results

In [5]:
# !cat /home/ubuntu/.local/lib/python3.7/site-packages/distributed/diagnostics/progressbar.py

In [6]:
# !cat /home/ubuntu/.local/lib/python3.7/site-packages/dask/diagnostics/progress.py