Example link
# s3://nasa-waterinsight/test/CASI/LSM/HISTORICAL2/PERCENTILES/TotalPrecip_percentiles_201405.nc

In [1]:
import os
import json

# Load the config file and set it as JSON string in the environment
with open('../lithops_config.json', 'r') as f:
    config_data = json.load(f)

os.environ['LITHOPS_CONFIG'] = json.dumps(config_data)

from lithops import FunctionExecutor

def check_python(_):
    import sys
    return sys.version

with FunctionExecutor() as fexec:
    future = fexec.call_async(check_python, None)
    result = fexec.get_result([future])
    print(result)

2025-08-18 15:52:59,319 [INFO] config.py:139 -- Lithops v3.6.1 - Python3.12
2025-08-18 15:52:59,326 [INFO] localhost.py:39 -- Localhost storage client created
2025-08-18 15:52:59,327 [INFO] localhost.py:78 -- Localhost compute v2 client created
2025-08-18 15:52:59,529 [INFO] invokers.py:119 -- ExecutorID 5c6225-0 | JobID A000 - Selected Runtime: /home/jovyan/rasi-icechunk/.venv/bin/python 
2025-08-18 15:52:59,534 [INFO] invokers.py:186 -- ExecutorID 5c6225-0 | JobID A000 - Starting function invocation: check_python() - Total: 1 activations
2025-08-18 15:52:59,536 [INFO] invokers.py:225 -- ExecutorID 5c6225-0 | JobID A000 - View execution logs at /tmp/lithops-root/logs/5c6225-0-A000.log
2025-08-18 15:52:59,537 [INFO] executors.py:494 -- ExecutorID 5c6225-0 - Getting results from 1 function activations
2025-08-18 15:52:59,538 [INFO] wait.py:101 -- ExecutorID 5c6225-0 - Waiting for 1 function activations to complete
  from .autonotebook import tqdm as notebook_tqdm
2025-08-18 15:53:00,070

3.12.10 (main, Apr  9 2025, 04:03:51) [Clang 20.1.0 ]


## First pass with fsspec.

In [12]:
import fsspec

fs = fsspec.filesystem('s3', anon=True)
files = fs.glob("s3://nasa-waterinsight/test/CASI/**/*.nc")
files

In [14]:
# check netcdf format
import netCDF4 as nc
# Method 1: Using s3fs to create a file-like object
s3_path = 'nasa-waterinsight/test/CASI/LSM/HISTORICAL2/PERCENTILES/Snowf_percentiles_195001.nc'

with fs.open(s3_path, 'rb') as f:
    with nc.Dataset('dummy', mode='r', memory=f.read()) as ds:
        print(f"File format: {ds.file_format}")
        print(f"Data model: {ds.data_model}")

File format: NETCDF3_64BIT_OFFSET
Data model: NETCDF3_64BIT_OFFSET


In [11]:
from virtualizarr import open_virtual_mfdataset, open_virtual_dataset
from virtualizarr.registry import ObjectStoreRegistry
from virtualizarr.parsers import NetCDF3Parser
import obstore
import xarray as xr
import pandas as pd

## Produce a virtual dataset from the list of files
bucket = "s3://nasa-waterinsight"
store = obstore.store.from_url(bucket, region="us-west-2", skip_signature=True)

registry = ObjectStoreRegistry({bucket: store})

parser = NetCDF3Parser()

urls = ["s3://" + file for file in files]

def preprocess(ds:xr.Dataset)-> xr.Dataset:
    # parse the int date into datetime and add as coordinate
    time = xr.DataArray(pd.to_datetime(ds.date.values.astype(str), format='%Y%m'), dims=['time'])
    # drop the old coord
    ds = ds.drop('date')
    #expand the data vars and then assign the time coordinate
    ds = ds.expand_dims('time')
    ds = ds.assign_coords({'time':time})
    return ds

vds = open_virtual_mfdataset(
    urls,
    parser=parser,
    registry=registry,
    parallel="lithops",
    preprocess=preprocess,
    loadable_variables=['date', 'lon', 'lat', 'percentile']
)

print(f"Virtual Dataset: {vds}")

2025-08-18 15:55:39,942 [INFO] config.py:139 -- Lithops v3.6.1 - Python3.12
2025-08-18 15:55:39,943 [INFO] localhost.py:39 -- Localhost storage client created
2025-08-18 15:55:39,943 [INFO] localhost.py:78 -- Localhost compute v2 client created
2025-08-18 15:55:40,104 [INFO] config.py:139 -- Lithops v3.6.1 - Python3.12
2025-08-18 15:55:40,105 [INFO] localhost.py:39 -- Localhost storage client created
2025-08-18 15:55:40,105 [INFO] localhost.py:78 -- Localhost compute v2 client created
2025-08-18 15:55:40,258 [INFO] invokers.py:119 -- ExecutorID 5c6225-6 | JobID M000 - Selected Runtime: /home/jovyan/rasi-icechunk/.venv/bin/python 
2025-08-18 15:55:40,267 [INFO] invokers.py:186 -- ExecutorID 5c6225-6 | JobID M000 - Starting function invocation: _open_and_preprocess() - Total: 24 activations
2025-08-18 15:55:40,272 [INFO] invokers.py:225 -- ExecutorID 5c6225-6 | JobID M000 - View execution logs at /tmp/lithops-root/logs/5c6225-6-M000.log
2025-08-18 15:55:40,274 [INFO] executors.py:494 -- 

Virtual Dataset: <xarray.Dataset> Size: 10MB
Dimensions:                  (time: 12, percentile: 5, lat: 152, lon: 132)
Coordinates:
  * percentile               (percentile) int32 20B 10 25 50 75 90
  * lat                      (lat) float32 608B 23.95 24.05 ... 38.95 39.05
  * lon                      (lon) float32 528B -109.5 -109.5 ... -96.55 -96.45
  * time                     (time) datetime64[ns] 96B 1950-01-01 ... 1950-12-01
Data variables:
    Snowf_percentiles        (time, percentile, lat, lon) float32 5MB Manifes...
    TotalPrecip_Percentiles  (time, percentile, lat, lon) float32 5MB Manifes...


In [15]:
import obstore

In [20]:
store = obstore.store.S3Store(
    bucket='s3://nasa-watersight',
    prefix='test/CASI/LSM/HISTORICAL2/PERCENTILES',
    skip_signature=True,
)

In [21]:
store.head('TotalPrecip_percentiles_201405.nc')

GenericError: Generic S3 error: Error performing HEAD https://s3.us-west-2.amazonaws.com/s3://nasa-watersight/test/CASI/LSM/HISTORICAL2/PERCENTILES/TotalPrecip_percentiles_201405.nc in 7.140821ms - Server returned non-2xx status code: 400 Bad Request: 

Debug source:
Generic {
    store: "S3",
    source: RetryError(
        RetryErrorImpl {
            method: HEAD,
            uri: Some(
                https://s3.us-west-2.amazonaws.com/s3://nasa-watersight/test/CASI/LSM/HISTORICAL2/PERCENTILES/TotalPrecip_percentiles_201405.nc,
            ),
            retries: 0,
            max_retries: 10,
            elapsed: 7.140821ms,
            retry_timeout: 180s,
            inner: Status {
                status: 400,
                body: Some(
                    "",
                ),
            },
        },
    ),
}

# Reloading the store

In [1]:
%time

# self contained read that works on the hub (make sure to run the pip install before)

import icechunk
import xarray as xr
import zarr

storage = icechunk.s3_storage(
    bucket='nasa-veda-scratch',
    prefix=f"jbusecke/RASI-test-0/",
    anonymous=False,
    from_env=True,
)

chunk_url = "s3://nasa-waterinsight/test/CASI/"

virtual_credentials = icechunk.containers_credentials(
    {
        chunk_url: icechunk.s3_anonymous_credentials()
    }
)
    
repo = icechunk.Repository.open(
    storage=storage,
    authorize_virtual_chunk_access=virtual_credentials,
)

session = repo.readonly_session('main')
ds = xr.open_zarr(session.store, consolidated=False, zarr_version=3)
ds

CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 6.2 μs


  ds = xr.open_zarr(session.store, consolidated=False, zarr_version=3)
