This notebook allows to download datasets and save them locally as Zarr stores.

In [1]:
import os

import clouddrift as cd
import copernicusmarine as cm
import numpy as np
import pandas as pd
from tqdm import tqdm
import xarray as xr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Requested time window, as ISO date strings. The GDP cell keeps observations with
# start_datetime_str <= time < end_datetime_str, and both strings are used to build
# the output file names of the downloads below.
start_datetime_str = "1994-06-01"
end_datetime_str = "2025-08-01"
# Directory in which every Zarr store is written (and looked up before downloading).
output_directory = "/summer/meom/workdir/bertrava/data"

**GDP1h** (https://doi.org/10.1002/2016JC011716)

- Position: latitude, longitude, time,
- Velocity

In [3]:
dataset_id = "gdp-v2.01.1"
output_filename = f"{dataset_id}_{start_datetime_str}_{end_datetime_str}.zarr"
gdp_store = os.path.join(output_directory, output_filename)

if os.path.exists(gdp_store):
    # A previous run already wrote the filtered store: reuse it instead of rebuilding it.
    ds = xr.open_zarr(gdp_store)
else:
    url_path = f"https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/latest/{dataset_id}.zarr/"
    kept_variables = [
        "rowsize", "typebuoy", "drogue_status", "err_ve", "err_vn", "time", "lat", "lon", "ve", "vn"
    ]
    ds = xr.open_zarr(url_path)[kept_variables]

    # Each variable is loaded just before it is used as a subsetting criterion.

    # 1. Keep entries whose buoy type contains "SVP"; the variable is not needed afterwards.
    ds.typebuoy.load()
    ds = cd.ragged.subset(
        ds,
        {"typebuoy": lambda buoy_type: np.char.find(buoy_type.astype(str), "SVP") != -1},
        row_dim_name="traj",
    ).drop_vars("typebuoy")

    # 2. Keep entries whose drogue_status equals True; the variable is not needed afterwards.
    ds.drogue_status.load()
    ds = cd.ragged.subset(
        ds,
        {"drogue_status": lambda drogued: drogued == True},
        row_dim_name="traj",
    ).drop_vars("drogue_status")

    # 3. Keep the requested time window (start inclusive, end exclusive).
    ds.time.load()
    window_start = np.datetime64(start_datetime_str)
    window_end = np.datetime64(end_datetime_str)
    ds = cd.ragged.subset(
        ds,
        {"time": lambda t: (t >= window_start) & (t < window_end)},
        row_dim_name="traj",
    )

    # 4. Load the remaining variables used by the last two filters and by the export.
    for loaded_name in ("ve", "vn", "err_ve", "err_vn", "lat", "lon"):
        ds[loaded_name].load()

    def remove_nan(ve, vn, err_ve, err_vn, time, lat, lon):
        """Return a mask that is True where all inputs are finite and `time` is not NaT."""
        velocities_ok = np.isfinite(ve) & np.isfinite(vn)
        errors_ok = np.isfinite(err_ve) & np.isfinite(err_vn)
        position_ok = np.isfinite(lat) & np.isfinite(lon)
        return velocities_ok & errors_ok & ~np.isnat(time) & position_ok

    ds = cd.ragged.subset(
        ds,
        {("ve", "vn", "err_ve", "err_vn", "time", "lat", "lon"): remove_nan},
        row_dim_name="traj",
    ).drop_vars(["err_ve", "err_vn"])

    # 5. Discard samples whose velocity components exceed 3 in absolute value.
    ds = cd.ragged.subset(
        ds,
        {("ve", "vn"): lambda ve, vn: (np.abs(ve) <= 3) & (np.abs(vn) <= 3)},
        row_dim_name="traj",
    )

    # Flatten to a single "points" dimension: the drifter id is repeated rowsize times
    # and the float variables are stored as float32.
    flat_vars = {
        "id": ("points", np.repeat(ds.id.values, ds.rowsize.values), {"long_name": "Drifter ID"}),
        "time": ("points", ds.time.values, ds.time.attrs),
    }
    for float_name in ("lat", "lon", "ve", "vn"):
        flat_vars[float_name] = (
            "points", ds[float_name].values.astype(np.float32), ds[float_name].attrs
        )
    ds = xr.Dataset(data_vars=flat_vars)

    ds.to_zarr(gdp_store, compute=True, consolidated=True)

# Time span actually covered by the drifter data, padded by one day on each side;
# the following cells use it as the temporal extent of their requests.
one_day = np.timedelta64(1, "D")
start_datetime = ds.time.values.min() - one_day
end_datetime = ds.time.values.max() + one_day



**DUACS** (https://doi.org/10.48670/moi-00148)

- SSH
- geostrophy

In [None]:
dataset_id = "cmems_obs-sl_glo_phy-ssh_my_allsat-l4-duacs-0.125deg_P1D"
output_filename = f"{dataset_id}_{start_datetime_str}_{end_datetime_str}.zarr"

# Download only when the target store is not already present in the output directory.
duacs_missing = not os.path.exists(os.path.join(output_directory, output_filename))
if duacs_missing:
    duacs_request = {
        "variables": ["sla", "adt", "ugos", "vgos"],
        # Temporal extent derived from the drifter data (computed in the GDP cell).
        "start_datetime": str(start_datetime),
        "end_datetime": str(end_datetime),
        "output_filename": output_filename,
        "output_directory": output_directory,
    }
    cm.subset(dataset_id, **duacs_request)

**ERA5** (https://doi.org/10.48670/moi-00185)

- wind stress
- wind velocity

In [None]:
# Two wind products are used: the 0.25deg one up to 2008-01-01 and the 0.125deg one after.
resolution_switch = np.datetime64("2008-01-01")
resolution_switch_str = "2008-01-01"
coarse_id = "cmems_obs-wind_glo_phy_my_l4_0.25deg_PT1H"
fine_id = "cmems_obs-wind_glo_phy_my_l4_0.125deg_PT1H"

ends_after_switch = end_datetime > resolution_switch
starts_after_switch = start_datetime > resolution_switch

# One tuple per download: (dataset id, start, end, start label, end label).
if ends_after_switch and starts_after_switch:
    # Whole period lies after the switch date: fine product only.
    requests = [(fine_id, start_datetime, end_datetime, start_datetime_str, end_datetime_str)]
elif ends_after_switch:
    # Period straddles the switch date: split it into two requests.
    requests = [
        (coarse_id, start_datetime, resolution_switch, start_datetime_str, resolution_switch_str),
        (fine_id, resolution_switch, end_datetime, resolution_switch_str, end_datetime_str),
    ]
else:
    # Period ends on or before the switch date: coarse product only.
    requests = [(coarse_id, start_datetime, end_datetime, start_datetime_str, end_datetime_str)]

# Column-wise view of the requests, as parallel lists.
dataset_ids, start_datetimes, end_datetimes, start_datetimes_str, end_datetimes_str = (
    list(column) for column in zip(*requests)
)

wind_variables = ["eastward_wind", "northward_wind", "eastward_stress", "northward_stress"]

for dataset_id, _start_datetime, _end_datetime, _start_datetime_str, _end_datetime_str in requests:

    output_filename = f"{dataset_id}_{_start_datetime_str}_{_end_datetime_str}.zarr"

    # Skip products that are already present in the output directory.
    if os.path.exists(os.path.join(output_directory, output_filename)):
        continue

    cm.subset(
        dataset_id,
        variables=wind_variables,
        start_datetime=str(_start_datetime),
        end_datetime=str(_end_datetime),
        output_filename=output_filename,
        output_directory=output_directory,
    )

**WAVERYS/MFWAM** (https://doi.org/10.48670/moi-00022)

- Stokes drift at the surface
- Waves (wind, primary swell, secondary swell) parameters (significant wave height, period, direction)

In [None]:
dataset_id = "cmems_mod_glo_wav_my_0.2deg_PT3H-i"
output_filename = f"{dataset_id}_{start_datetime_str}_{end_datetime_str}.zarr"

store = os.path.join(output_directory, output_filename)

# Batch edges. Month ends alone would leave the spans between start_datetime and the
# first month end, and between the last month end and end_datetime, undownloaded, so both
# endpoints are added as edges (union returns the edges sorted and without duplicates).
month_ends = pd.date_range(start=start_datetime, end=end_datetime, freq="1ME")
time_batches = month_ends.union(pd.DatetimeIndex([start_datetime, end_datetime]))
n_batches = len(time_batches) - 1

# NOTE: the first write uses mode="w", so re-running this cell replaces an existing store.
store_created = False

for i in tqdm(range(n_batches)):
    t0 = time_batches[i]
    t1 = time_batches[i + 1]

    ds_batch = cm.open_dataset(
        dataset_id,
        variables=["VSDX", "VSDY"],
        start_datetime=str(t0),
        end_datetime=str(t1),
    )

    if i < n_batches - 1:
        # t1 is also the start of the next batch: drop any sample at t1 here so that it
        # is not appended twice. Only the final batch keeps its right edge.
        ds_batch = ds_batch.isel(time=ds_batch.indexes["time"] < t1)

    if ds_batch.sizes["time"] == 0:
        # Nothing to write for this interval (e.g. a span shorter than the data time step).
        del ds_batch
        continue

    ds_batch = ds_batch.drop_encoding().chunk({"time": 72, "latitude": 90, "longitude": 180})

    # Create the store on the first non-empty batch, then append along time.
    mode = "a" if store_created else "w"
    append_dim = "time" if store_created else None

    ds_batch.to_zarr(store, mode=mode, append_dim=append_dim, align_chunks=True, consolidated=False)
    store_created = True

    del ds_batch