### Running a single-machine cluster
When run on a single machine, this is equivalent to the dask 'processes' scheduler, with the added addition of a handy status dashboard. It's accessible by web browser via the link below. (The local cluster server can be set up and run outside the notebook for persistence.)

In [1]:
from dask import distributed

CLUSTER_SETTINGS = dict(
    # limiting this can help to avoid system hangs caused by google drivefs
    n_workers=8,
    threads_per_worker=1,
    # host=f"tcp://127.0.0.1:8786",
    dashboard_address=f"127.0.0.1:8787",
)

# cluster = distributed.LocalCluster(**CLUSTER_SETTINGS)
if 'scheduler' not in dir():
    cluster = distributed.LocalCluster(**CLUSTER_SETTINGS)
    scheduler = distributed.Client(cluster.scheduler_address)

cluster



0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 8,Total memory: 31.60 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:1465,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 31.60 GiB

0,1
Comm: tcp://127.0.0.1:1507,Total threads: 1
Dashboard: http://127.0.0.1:1519/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1468,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-ur9g6vjm,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-ur9g6vjm

0,1
Comm: tcp://127.0.0.1:1505,Total threads: 1
Dashboard: http://127.0.0.1:1513/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1469,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-3j16qnns,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-3j16qnns

0,1
Comm: tcp://127.0.0.1:1502,Total threads: 1
Dashboard: http://127.0.0.1:1511/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1470,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-v678s7oh,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-v678s7oh

0,1
Comm: tcp://127.0.0.1:1501,Total threads: 1
Dashboard: http://127.0.0.1:1509/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1471,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-_gbseapz,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-_gbseapz

0,1
Comm: tcp://127.0.0.1:1500,Total threads: 1
Dashboard: http://127.0.0.1:1503/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1472,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-bc0ln5y0,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-bc0ln5y0

0,1
Comm: tcp://127.0.0.1:1521,Total threads: 1
Dashboard: http://127.0.0.1:1522/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1473,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-nndctl8k,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-nndctl8k

0,1
Comm: tcp://127.0.0.1:1508,Total threads: 1
Dashboard: http://127.0.0.1:1517/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1474,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-6xw_fifz,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-6xw_fifz

0,1
Comm: tcp://127.0.0.1:1506,Total threads: 1
Dashboard: http://127.0.0.1:1515/status,Memory: 3.95 GiB
Nanny: tcp://127.0.0.1:1475,
Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-8lyakfz7,Local directory: C:\Users\dkuester\AppData\Local\Temp\1\dask-scratch-space\worker-8lyakfz7


### Creating dask dataframe objects
The following assumes that you've downloaded the specified zip archive at the specified path.

In [6]:
from dask_ops import log_to_json
# this is a hack for working inside the repo source tree;
# normally, for pip install, just `import sea_ingest`
import __init__ as sea_ingest
from labbench import stopwatch
import pandas as pd
import typing

def capture_summary(partition_data: typing.Dict[str, pd.DataFrame]):
    """ this could be expanded into a function that does more, but for now, just make a capture summary """

    # partition_data is the result of running zipfile.read_seamf_zipfile on a subset of files in the zip archive.
    # it takes the form of a dictionary of pandas.DataFrame objects

    partition_data['capture_summary'] = pd.DataFrame.from_dict({
        'median_rms_pfp': 
            sea_ingest.trace(partition_data, 'pfp', capture_statistic='mean', detector='rms').median(axis=1),
        'max_max_pfp': 
            sea_ingest.trace(partition_data, 'pfp', capture_statistic='max', detector='peak').max(axis=1),
        'median_mean_power': 
            sea_ingest.trace(partition_data, 'psd', capture_statistic='mean').median(axis=1),
        'max_max_power': 
            sea_ingest.trace(partition_data, 'psd', capture_statistic='mean').max(axis=1),
        'noise_figure': 
            sea_ingest.trace(partition_data, 'channel_metadata')['cal_noise_figure_dB'].astype('float16'),
        'gain': 
            sea_ingest.trace(partition_data, 'channel_metadata')['cal_gain_dB'].astype('float16'),
        # TODO: add 'temperature' from sweep_metadata, will probably need a restructure of that table
    })

    return partition_data

data_path = 'data/NIT-2022-12-13.zip'

# overwrite
log_to_json('data.log')

with stopwatch('setup'):
    # scan the zipfiles to map out the file contents. the resulting is a dictionary of dask dataframes.
    # this is fast, because it hasn't loaded much data yet; other operations trigger "compute" that can
    # take a while to scrape the data out of the zip archive
    ddfs = sea_ingest.read_seamf_zipfile_as_ddf(
        data_path,
        partition_func=capture_summary,
        partition_size=200,
        tz="America/New_York",
        localize=False
    )

[1;30m INFO  [0m [32m2023-06-12 13:11:11.593[0m • [34mlabbench:[0m setup 1.799 s elapsed


In [9]:
with stopwatch('compute'):
    # head forces a compute operation
    df = ddfs['capture_summary'].head(10)
df

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
[1;30m INFO  [0m [32m2023-06-12 13:11:44.539[0m • [34mlabbench:[0m compute 7.543 s elapsed


Unnamed: 0_level_0,frequency,median_rms_pfp,max_max_pfp,median_mean_power,max_max_power,noise_figure,gain
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-10-27 00:47:37.094000+00:00,3555000000.0,-86.1875,-64.8125,-155.375,-152.0,4.332031,31.4375
2022-10-27 00:47:41.699000+00:00,3565000000.0,-88.5,-66.0625,-157.25,-154.125,4.265625,31.5
2022-10-27 00:47:46.374000+00:00,3575000000.0,-86.25,-64.0,-155.75,-152.875,4.257812,31.3125
2022-10-27 00:47:51.554000+00:00,3585000000.0,-88.0625,-65.3125,-156.0,-153.875,4.03125,31.59375
2022-10-27 00:47:56.944000+00:00,3595000000.0,-82.75,-60.125,-150.125,-146.375,4.003906,31.71875
2022-10-27 00:48:01.623000+00:00,3605000000.0,-83.5625,-60.875,-150.75,-147.875,3.990234,31.671875
2022-10-27 00:48:07.579000+00:00,3615000000.0,-83.875,-64.0625,-153.25,-150.0,4.089844,31.53125
2022-10-27 00:48:12.256000+00:00,3625000000.0,-86.875,-64.125,-155.5,-152.625,4.054688,31.53125
2022-10-27 00:48:16.953000+00:00,3635000000.0,-79.75,-59.28125,-151.375,-145.625,3.998047,31.6875
2022-10-27 00:48:22.166000+00:00,3645000000.0,-81.1875,-57.3125,-148.75,-146.25,4.035156,31.671875


#### on-demand access to a subset of the data
The dask .loc accessor allows faster access to a specified subset of the data

In [10]:
with stopwatch('compute'):
    df = (
        ddfs['capture_summary']
        .loc['2022-11-21 20:50:00':'2022-11-21 20:51:00']
        .compute(scheduler=scheduler)
    )

df

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
[1;30m INFO  [0m [32m2023-06-12 13:11:57.405[0m • [34mlabbench:[0m compute 8.794 s elapsed


Unnamed: 0_level_0,frequency,median_rms_pfp,max_max_pfp,median_mean_power,max_max_power,noise_figure,gain
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-11-21 20:50:15.076000+00:00,3555000000.0,-82.625,-62.25,-152.0,-148.625,4.175781,31.578125
2022-11-21 20:50:19.703000+00:00,3565000000.0,-83.0,-60.9375,-151.375,-148.125,4.105469,31.640625
2022-11-21 20:50:24.428000+00:00,3575000000.0,-80.6875,-63.9375,-151.625,-146.75,4.109375,31.4375
2022-11-21 20:50:29.652000+00:00,3585000000.0,-80.3125,-62.375,-151.0,-147.875,3.871094,31.734375
2022-11-21 20:50:35.117000+00:00,3595000000.0,-76.625,-56.21875,-146.75,-143.625,3.849609,31.859375
2022-11-21 20:50:39.813000+00:00,3605000000.0,-78.5,-57.46875,-147.5,-144.125,3.826172,31.8125
2022-11-21 20:50:45.831000+00:00,3615000000.0,-81.5,-61.375,-150.75,-148.125,3.925781,31.671875
2022-11-21 20:50:50.528000+00:00,3625000000.0,-81.9375,-61.34375,-151.0,-148.75,3.904297,31.65625
2022-11-21 20:50:55.257000+00:00,3635000000.0,-78.5625,-58.90625,-148.5,-145.0,3.828125,31.828125
2022-11-21 20:51:00.487000+00:00,3645000000.0,-79.375,-57.875,-147.75,-146.0,3.876953,31.796875


#### Bulk data
Dask dataframes support data save operations split output files by partition. This means that the time span of each output file can be adjusted with 'repartition'. In this example, a file save function is applied to each partition.

In [11]:
import dask
from pathlib import Path

def write_feather(df: pd.DataFrame, dirpath):
    """ example for feather, but could substitute code for e.g. csv, database, etc. instead """
    path = Path(dirpath)/df.index[0].strftime('%Y-%m-%d.feather')

    # feather like most formats requires string column names
    df.columns = df.columns.astype('str')
    df.reset_index().to_feather(path, compression='zstd')
    return str(path)

# collecting the write operations into a single options
# allows dask to optimize the execution so that the zip archives
# only need to be read once.
#
# the choice of dictionary means that the returned list of
# files written will be returned as a dictionary with the same keys.
ddf = ddfs['capture_summary']

save_ops = dict(
    capture_summary=(
        ddfs['capture_summary']
        .repartition(freq='1MS')
        .map_partitions(write_feather, 'data/capture_summary')
    ),

    pfp=(
        ddfs['pfp']
        .repartition(freq='1W')
        .map_partitions(write_feather, 'data/pfp')
    ),
)

with stopwatch('compute'):
    files_saved = dask.compute(save_ops, scheduler=scheduler)

    print('wrote the following files:')
    print(files_saved)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
[1;30m INFO  [0m [32m2023-06-12 13:16:32.968[0m • [34mlabbench:[0m compute 275.375 s elapsed


wrote the following files:
({'capture_summary': 0    data\capture_summary\2022-10-27.feather
1    data\capture_summary\2022-11-01.feather
2    data\capture_summary\2022-12-01.feather
dtype: object, 'pfp': 0    data\pfp\2022-10-27.feather
1    data\pfp\2022-10-30.feather
2    data\pfp\2022-11-06.feather
3    data\pfp\2022-11-13.feather
4    data\pfp\2022-11-20.feather
5    data\pfp\2022-11-27.feather
6    data\pfp\2022-12-04.feather
dtype: object},)
