### Running a single-machine cluster
When run on a single machine, this is equivalent to the dask 'processes' scheduler, with the added addition of a handy status dashboard. It's accessible by web browser via the link below. (The local cluster server can be set up and run outside the notebook for persistence.)

In [1]:
from dask import distributed

CLUSTER_SETTINGS = dict(
    # limiting this can help to avoid system hangs caused by google drivefs
    n_workers=8,
    threads_per_worker=1,
    # host=f"tcp://127.0.0.1:8786",
    dashboard_address=f"127.0.0.1:8787",
)

# cluster = distributed.LocalCluster(**CLUSTER_SETTINGS)
if 'scheduler' not in dir():
    cluster = distributed.LocalCluster(**CLUSTER_SETTINGS)
    scheduler = distributed.Client(cluster.scheduler_address)

cluster



0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 8,Total memory: 31.60 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:10247,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 31.60 GiB

0,1
Comm: tcp://127.0.0.1:10294,Total threads: 1
Dashboard: http://127.0.0.1:10304/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10251,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-w9igii5p,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-w9igii5p

0,1
Comm: tcp://127.0.0.1:10289,Total threads: 1
Dashboard: http://127.0.0.1:10295/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10252,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-wkm6xd4j,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-wkm6xd4j

0,1
Comm: tcp://127.0.0.1:10301,Total threads: 1
Dashboard: http://127.0.0.1:10306/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10253,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-kspn40bh,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-kspn40bh

0,1
Comm: tcp://127.0.0.1:10291,Total threads: 1
Dashboard: http://127.0.0.1:10299/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10254,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-1cs0bpja,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-1cs0bpja

0,1
Comm: tcp://127.0.0.1:10288,Total threads: 1
Dashboard: http://127.0.0.1:10296/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10255,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-ex4jawel,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-ex4jawel

0,1
Comm: tcp://127.0.0.1:10287,Total threads: 1
Dashboard: http://127.0.0.1:10290/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10256,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-lk7k722t,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-lk7k722t

0,1
Comm: tcp://127.0.0.1:10284,Total threads: 1
Dashboard: http://127.0.0.1:10285/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10257,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-hd8sr9eb,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-hd8sr9eb

0,1
Comm: tcp://127.0.0.1:10293,Total threads: 1
Dashboard: http://127.0.0.1:10302/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:10258,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-d18expt4,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-worker-space\worker-d18expt4


### Creating dask dataframe objects
The following assumes that you've downloaded the specified zip archive at the specified path.

In [7]:
from dask_ops import zipfile_dask_dfs, log_to_json, trace
import eliot
from pathlib import Path
from labbench import stopwatch
import pandas as pd

def capture_summary(partition_data: dict):
    """ this could be expanded into a function that does more, but for now, just make a capture summary """

    # partition_data is the result of running zipfile.read_seamf_zipfile on a subset of files in the zip archive.
    # it takes the form of a dictionary of pandas.DataFrame objects

    partition_data['capture_summary'] = pd.DataFrame.from_dict({
        'median_rms_pfp': trace(partition_data, 'pfp', capture_statistic='mean', detector='rms').median(axis=1),
        'max_max_pfp': trace(partition_data, 'pfp', capture_statistic='max', detector='peak').max(axis=1),
        'median_mean_power': trace(partition_data, 'psd', capture_statistic='mean').median(axis=1),
        'max_max_power': trace(partition_data, 'psd', capture_statistic='mean').max(axis=1),
        'noise_figure': trace(partition_data, 'channel_metadata')['cal_noise_figure_dB'],
        'gain': trace(partition_data, 'channel_metadata')['cal_gain_dB'],
        # TODO: add 'temperature' from sweep_metadata, which isn't easy yet b/c the 
    })

    return partition_data

data_path = Path('data')/'NIT-2022-12-13.zip'

# overwrite
log_to_json('data.log')

with stopwatch('setup'):
    # scan the zipfiles to map out the file contents. the resulting is a dictionary of dask dataframes.
    # this is fast, because it hasn't loaded much data yet; other operations trigger "compute" that can
    # take a while to scrape the data out of the zip archive
    ddfs = zipfile_dask_dfs(
        data_path,
        partition_func=capture_summary,
        partition_size=100
    )

  if isinstance(o, (numpy.bool, numpy.bool_)):
[1;30m INFO  [0m [32m2023-03-10 15:15:39.101[0m • [34mlabbench:[0m setup 2.564 s elapsed


In [11]:
with stopwatch('compute'):
    # head forces a compute operation
    df = ddfs['capture_summary'].head(10)
df

[1;30m INFO  [0m [32m2023-03-10 15:17:33.716[0m • [34mlabbench:[0m compute 1.643 s elapsed


Unnamed: 0_level_0,frequency,median_rms_pfp,max_max_pfp,median_mean_power,max_max_power,noise_figure,gain
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-10-26 20:47:37.094,3555000000.0,-86.1875,-64.8125,-155.375,-152.0,4.330333,31.440102
2022-10-26 20:47:41.699,3565000000.0,-88.5,-66.0625,-157.25,-154.125,4.265367,31.498794
2022-10-26 20:47:46.374,3575000000.0,-86.25,-64.0,-155.75,-152.875,4.256935,31.315659
2022-10-26 20:47:51.554,3585000000.0,-88.0625,-65.3125,-156.0,-153.875,4.031151,31.589694
2022-10-26 20:47:56.944,3595000000.0,-82.75,-60.125,-150.125,-146.375,4.00401,31.71745
2022-10-26 20:48:01.623,3605000000.0,-83.5625,-60.875,-150.75,-147.875,3.989708,31.672517
2022-10-26 20:48:07.579,3615000000.0,-83.875,-64.0625,-153.25,-150.0,4.088225,31.528059
2022-10-26 20:48:12.256,3625000000.0,-86.875,-64.125,-155.5,-152.625,4.056286,31.530697
2022-10-26 20:48:16.953,3635000000.0,-79.75,-59.28125,-151.375,-145.625,3.997318,31.689596
2022-10-26 20:48:22.166,3645000000.0,-81.1875,-57.3125,-148.75,-146.25,4.034233,31.668922


#### on-demand access to a subset of the data
The dask .loc accessor allows faster access to a specified subset of the data

In [None]:
with stopwatch('compute'):
    df = (
        ddfs['capture_summary']
        .loc['2022-11-21 20:50:00':'2022-11-21 20:51:00']
        .compute(scheduler=scheduler)
    )

#### saving data in files split by time period
Dask dataframes support data save operations split output files by partition. This means that the time span of each output file can be adjusted by simply calling 'repartition'.

In [44]:
import dask

with stopwatch('compute'):
    # repartition by month divided at the start of each month
    ddf = ddfs['capture_summary'].repartition(freq='2W')
    name_func = lambda i: ddf.divisions[i].strftime('%Y-%m-%d')
    capture_op = (
        ddf
        .to_csv('data/capture_summary/*.parquet', name_function=name_func, compute=False)
    )

    ddf = ddfs['pfp'].repartition(freq='1W')
    name_func = lambda i: ddf.divisions[i].strftime('%Y-%m-%d')
    pfp_op = (
        ddf
        .to_csv('data/pfp/*.parquet', name_function=name_func, compute=False)
    )

    files_saved = dask.compute(dict(pfp=pfp_op, capture_summary=capture_op), scheduler=scheduler)

    print('wrote the following files:')
    print(files_saved)

