Record the time:
- convert each file to Sv, then combine them with `open_mfdataset`
- convert each file to Sv and append it to a single zarr store

Record the time:
- combine individual daily Sv files into time windows
- make a big dataset and slice out time windows

Check whether `to_zarr` can write in parallel, as described here:
- https://xarray.pydata.org/en/stable/user-guide/io.html#appending-to-existing-zarr-stores

Keep the NaNs in the organized dataset.

In [1]:
from pathlib import Path
import datetime as dt

import xarray as xr

import echopype as ep
import gen_mvbs_utils

In [2]:
# Directory that receives every zarr store written by this notebook.
output_path = Path("./tmp_outputs/")
# exist_ok makes the cell safe to re-run; parents covers a missing parent dir.
output_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Base URL passed as `file_url` when listing the raw files to convert.
ooi_CE04OSPS = "".join(
    (
        "https://rawdata.oceanobservatories.org/files/",
        "CE04OSPS/PC01B/ZPLSCB102_10.33.10.143/",
    )
)

In [4]:
# Both bounds are set to the same date (2017-08-30).
start = dt.datetime(2017, 8, 30)
end = dt.datetime(2017, 8, 30)

In [5]:
# List the raw files under the base URL for the [start, end] range,
# then show how many were found.
raw_file_list = gen_mvbs_utils.get_raw_file_url(file_url=ooi_CE04OSPS, start=start, end=end)
len(raw_file_list)

15

## Convert individually and then combine

In [6]:
%time
for raw_file in raw_file_list:
    ed = ep.open_raw(raw_file=raw_file, sonar_model="EK60")
    ed.to_zarr(save_path=output_path, overwrite=True)
    ds_Sv = ep.calibrate.compute_Sv(ed)
    Sv_fname = Path(raw_file).with_name(Path(raw_file).stem + "_Sv")
    ds_Sv = ds_Sv.chunk({dim: ds_Sv[dim].size for dim in ds_Sv.dims})
    ds_Sv.to_zarr(output_path / Sv_fname.with_suffix(".zarr").name, mode="w")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
17:56:04  parsing file OOI-D20170830-T000000.raw, time of first ping: 2017-Aug-30 00:00:00
17:56:09  saving tmp_outputs/OOI-D20170830-T000000.zarr
17:56:12  parsing file OOI-D20170830-T013905.raw, time of first ping: 2017-Aug-30 01:39:05
17:56:16  saving tmp_outputs/OOI-D20170830-T013905.zarr
17:56:20  parsing file OOI-D20170830-T031813.raw, time of first ping: 2017-Aug-30 03:18:13
17:56:24  saving tmp_outputs/OOI-D20170830-T031813.zarr
17:56:28  parsing file OOI-D20170830-T045714.raw, time of first ping: 2017-Aug-30 04:57:14
17:56:33  saving tmp_outputs/OOI-D20170830-T045714.zarr
17:56:36  parsing file OOI-D20170830-T063625.raw, time of first ping: 2017-Aug-30 06:36:25
17:56:40  saving tmp_outputs/OOI-D20170830-T063625.zarr
17:56:44  parsing file OOI-D20170830-T081527.raw, time of first ping: 2017-Aug-30 08:15:27
17:56:48  saving tmp_outputs/OOI-D20170830-T081527.zarr
17:56:52  parsing file OOI-D20170830-T095428.raw, time

In [7]:
# Collect the per-file Sv stores. glob() yields paths in arbitrary order,
# so sort them to make indexing into the list reproducible across runs.
Sv_zarr_list = sorted(output_path.glob("*_Sv.zarr"))

In [8]:
# Open the first Sv store in the list on its own to inspect it.
xr.open_dataset(
    Sv_zarr_list[0],
    engine="zarr",
)

In [9]:
# Open all per-file Sv stores as one multi-file dataset.
# compat="override" can be removed if filenames is a coordinate.
ds_Sv_zarr_all = xr.open_mfdataset(
    Sv_zarr_list, engine="zarr", data_vars="minimal", coords="minimal", compat="override"
)

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the

In [10]:
# Display the combined multi-file Sv dataset.
ds_Sv_zarr_all

Unnamed: 0,Array,Chunk
Bytes,2.06 GiB,2.06 GiB
Shape,"(3, 85795, 1072)","(3, 85795, 1072)"
Count,13 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.06 GiB 2.06 GiB Shape (3, 85795, 1072) (3, 85795, 1072) Count 13 Tasks 1 Chunks Type float64 numpy.ndarray",1072  85795  3,

Unnamed: 0,Array,Chunk
Bytes,2.06 GiB,2.06 GiB
Shape,"(3, 85795, 1072)","(3, 85795, 1072)"
Count,13 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.06 GiB,2.06 GiB
Shape,"(3, 85795, 1072)","(3, 85795, 1072)"
Count,13 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.06 GiB 2.06 GiB Shape (3, 85795, 1072) (3, 85795, 1072) Count 13 Tasks 1 Chunks Type float64 numpy.ndarray",1072  85795  3,

Unnamed: 0,Array,Chunk
Bytes,2.06 GiB,2.06 GiB
Shape,"(3, 85795, 1072)","(3, 85795, 1072)"
Count,13 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(3, 85795)","(3, 85795)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.96 MiB 1.96 MiB Shape (3, 85795) (3, 85795) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",85795  3,

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(3, 85795)","(3, 85795)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24 B 24 B Shape (3,) (3,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",3  1,

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(85795, 3)","(85795, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.96 MiB 1.96 MiB Shape (85795, 3) (85795, 3) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",3  85795,

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(85795, 3)","(85795, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(85795, 3)","(85795, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.96 MiB 1.96 MiB Shape (85795, 3) (85795, 3) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",3  85795,

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(85795, 3)","(85795, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(3, 85795)","(3, 85795)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.96 MiB 1.96 MiB Shape (3, 85795) (3, 85795) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",85795  3,

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(3, 85795)","(3, 85795)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(3, 85795)","(3, 85795)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.96 MiB 1.96 MiB Shape (3, 85795) (3, 85795) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",85795  3,

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,1.96 MiB
Shape,"(3, 85795)","(3, 85795)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480 B,480 B
Shape,"(1,)","(1,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 480 B 480 B Shape (1,) (1,) Count 2 Tasks 1 Chunks Type numpy.ndarray",1  1,

Unnamed: 0,Array,Chunk
Bytes,480 B,480 B
Shape,"(1,)","(1,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,138.84 kiB
Shape,"(3, 85795)","(3, 5924)"
Count,45 Tasks,15 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.96 MiB 138.84 kiB Shape (3, 85795) (3, 5924) Count 45 Tasks 15 Chunks Type float64 numpy.ndarray",85795  3,

Unnamed: 0,Array,Chunk
Bytes,1.96 MiB,138.84 kiB
Shape,"(3, 85795)","(3, 5924)"
Count,45 Tasks,15 Chunks
Type,float64,numpy.ndarray


## Try to append zarr file

In [11]:
# Re-index water_level from the time3 dimension onto ping_time.
# NOTE: this cell relies on `ds_Sv` left over from the last iteration of the
# conversion loop, and it can only be run once per `ds_Sv` (time3 is removed).
ds_Sv["water_level"] = (
    ds_Sv["water_level"]
    .assign_coords({"ping_time": ("time3", ds_Sv["ping_time"].values)})
    .swap_dims({"time3": "ping_time"})
    # drop_vars replaces the deprecated DataArray.drop
    .drop_vars("time3")
)

In [12]:
%time
for seq, raw_file in enumerate(raw_file_list):
    ed = ep.open_raw(raw_file=raw_file, sonar_model="EK60")
    ed.to_zarr(save_path=output_path, overwrite=True)
    ds_Sv = ep.calibrate.compute_Sv(ed)
    # Change time3 to ping_time (the same for EK60)
    ds_Sv["water_level"] = (
        ds_Sv["water_level"]
        .assign_coords({"ping_time": ("time3", ds_Sv["ping_time"].values)})
        .swap_dims({"time3": "ping_time"}).drop("time3")
    )
    ds_Sv = ds_Sv.drop_dims("time3")
    # Assign chunk size
    ds_Sv = ds_Sv.chunk({dim: ds_Sv[dim].size for dim in ds_Sv.dims})
    if seq == 0:
        ds_Sv.to_zarr(output_path / "append_test.zarr", mode="w")
    else:
        ds_Sv.to_zarr(output_path / "append_test.zarr", mode="a", append_dim="ping_time")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 3.81 µs
17:58:08  parsing file OOI-D20170830-T000000.raw, time of first ping: 2017-Aug-30 00:00:00
17:58:12  overwriting tmp_outputs/OOI-D20170830-T000000.zarr
17:58:16  parsing file OOI-D20170830-T013905.raw, time of first ping: 2017-Aug-30 01:39:05
17:58:20  overwriting tmp_outputs/OOI-D20170830-T013905.zarr
17:58:24  parsing file OOI-D20170830-T031813.raw, time of first ping: 2017-Aug-30 03:18:13
17:58:28  overwriting tmp_outputs/OOI-D20170830-T031813.zarr
17:58:32  parsing file OOI-D20170830-T045714.raw, time of first ping: 2017-Aug-30 04:57:14
17:58:36  overwriting tmp_outputs/OOI-D20170830-T045714.zarr
17:58:40  parsing file OOI-D20170830-T063625.raw, time of first ping: 2017-Aug-30 06:36:25
17:58:44  overwriting tmp_outputs/OOI-D20170830-T063625.zarr
17:58:48  parsing file OOI-D20170830-T081527.raw, time of first ping: 2017-Aug-30 08:15:27
17:58:53  overwriting tmp_outputs/OOI-D20170830-T081527.zarr
17:58:56  parsing file O

In [13]:
# Read the appended store back for inspection.
ds_append = xr.open_dataset(
    output_path / "append_test.zarr",
    engine="zarr",
)

In [14]:
# Display the dataset read back from append_test.zarr.
ds_append