Example link
# s3://nasa-waterinsight/test/CASI/LSM/HISTORICAL2/PERCENTILES/TotalPrecip_percentiles_201405.nc

In [17]:
import os
import json

# Load the config file and set it as JSON string in the environment
with open('../lithops_config.json', 'r') as f:
    config_data = json.load(f)

os.environ['LITHOPS_CONFIG'] = json.dumps(config_data)

from lithops import FunctionExecutor

def check_python(_):
    import sys
    return sys.version

with FunctionExecutor() as fexec:
    future = fexec.call_async(check_python, None)
    result = fexec.get_result([future])
    print(result)

2025-08-26 20:44:01,709 [INFO] config.py:139 -- Lithops v3.6.1 - Python3.12
2025-08-26 20:44:01,710 [INFO] localhost.py:39 -- Localhost storage client created
2025-08-26 20:44:01,710 [INFO] localhost.py:78 -- Localhost compute v2 client created
2025-08-26 20:44:01,912 [INFO] invokers.py:119 -- ExecutorID 97e168-5 | JobID A000 - Selected Runtime: /home/jovyan/rasi-icechunk/.venv/bin/python 
2025-08-26 20:44:01,914 [INFO] invokers.py:186 -- ExecutorID 97e168-5 | JobID A000 - Starting function invocation: check_python() - Total: 1 activations
2025-08-26 20:44:01,917 [INFO] invokers.py:225 -- ExecutorID 97e168-5 | JobID A000 - View execution logs at /tmp/lithops-root/logs/97e168-5-A000.log
2025-08-26 20:44:01,918 [INFO] executors.py:494 -- ExecutorID 97e168-5 - Getting results from 1 function activations
2025-08-26 20:44:01,918 [INFO] wait.py:101 -- ExecutorID 97e168-5 - Waiting for 1 function activations to complete
2025-08-26 20:44:02,522 [INFO] executors.py:618 -- ExecutorID 97e168-5 - 

3.12.10 (main, Apr  9 2025, 04:03:51) [Clang 20.1.0 ]


## First pass with fsspec.

In [3]:
import fsspec

fs = fsspec.filesystem('s3', anon=True)
experiment_id = "HISTORICAL"
data_dir = f"s3://nasa-waterinsight/RASI/ROUTING/{experiment_id}/"
files = fs.glob(f"{data_dir}**/*1951*.nc")
files

['nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195101.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195102.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195103.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195104.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195105.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195106.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195107.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195108.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195109.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195110.nc',
 'nasa-waterinsight/RASI/ROUTING/HISTORICAL/PERCENTILES/Streamflow_percentiles_195111.nc',

In [3]:
# check netcdf format
import netCDF4 as nc
# Method 1: Using s3fs to create a file-like object
s3_path = 'nasa-waterinsight/test/CASI/LSM/HISTORICAL2/PERCENTILES/Snowf_percentiles_195001.nc'

with fs.open(s3_path, 'rb') as f:
    with nc.Dataset('dummy', mode='r', memory=f.read()) as ds:
        print(f"File format: {ds.file_format}")
        print(f"Data model: {ds.data_model}")

File format: NETCDF3_64BIT_OFFSET
Data model: NETCDF3_64BIT_OFFSET


In [4]:
from virtualizarr import open_virtual_mfdataset, open_virtual_dataset
from virtualizarr.registry import ObjectStoreRegistry
from virtualizarr.parsers import NetCDF3Parser
import obstore
import xarray as xr
import pandas as pd

## Produce a virtual dataset from the list of files
bucket = "s3://nasa-waterinsight"
store = obstore.store.from_url(bucket, region="us-west-2", skip_signature=True)

registry = ObjectStoreRegistry({bucket: store})

parser = NetCDF3Parser()

urls = ["s3://" + file for file in files]

def combine_attrs(dicts, context):
    combined_attrs = {}

    # Get keys from first dict as reference
    all_keys = set(dicts[0].keys())

    # Check that every key exists in all dicts
    for i, d in enumerate(dicts[1:], 1):
        if set(d.keys()) != all_keys:
            missing = all_keys - set(d.keys())
            extra = set(d.keys()) - all_keys
            raise KeyError(f"Dict {i} key mismatch. Missing: {missing}, Extra: {extra}")

    for key in all_keys:
        values = [d[key] for d in dicts]
        unique_values = set(values)

        if len(unique_values) == 1:
            # All values are the same
            combined_attrs[key] = values[0]
        else:
            if key == "vmin":
                combined_attrs[key] = min(values)
            elif key == "vmax":
                combined_attrs[key] = max(values)
            elif key == "begin_date":
                combined_attrs[key] = min(values)
            elif key == "end_date":
                combined_attrs[key] = max(values)
            elif key == "history":
                combined_attrs[key] = "||".join(values)
            else:
                raise ValueError(f"No instructions provided how to handle {key=} with {values=}")
    return combined_attrs

def preprocess(ds:xr.Dataset)-> xr.Dataset:
    # parse the int date into datetime and add as coordinate
    time = xr.DataArray(pd.to_datetime(ds.date.values.astype(str), format='%Y%m'), dims=['time'])
    # drop the old coord
    ds = ds.drop('date')
    #expand the data vars and then assign the time coordinate
    ds = ds.expand_dims('time')
    ds = ds.assign_coords({'time':time})
    return ds

vds = open_virtual_mfdataset(
    urls,
    parser=parser,
    registry=registry,
    parallel="lithops",
    preprocess=preprocess,
    combine_attrs=combine_attrs,
    loadable_variables=['date', 'lon', 'lat', 'percentile']
)
vds

2025-08-26 20:00:49,010 [INFO] config.py:139 -- Lithops v3.6.1 - Python3.12
2025-08-26 20:00:49,011 [INFO] localhost.py:39 -- Localhost storage client created
2025-08-26 20:00:49,012 [INFO] localhost.py:78 -- Localhost compute v2 client created
2025-08-26 20:00:49,212 [INFO] config.py:139 -- Lithops v3.6.1 - Python3.12
2025-08-26 20:00:49,213 [INFO] localhost.py:39 -- Localhost storage client created
2025-08-26 20:00:49,214 [INFO] localhost.py:78 -- Localhost compute v2 client created
2025-08-26 20:00:49,375 [INFO] invokers.py:119 -- ExecutorID 97e168-2 | JobID M000 - Selected Runtime: /home/jovyan/rasi-icechunk/.venv/bin/python 
2025-08-26 20:00:49,403 [INFO] invokers.py:186 -- ExecutorID 97e168-2 | JobID M000 - Starting function invocation: _open_and_preprocess() - Total: 108 activations
2025-08-26 20:00:49,439 [INFO] invokers.py:225 -- ExecutorID 97e168-2 | JobID M000 - View execution logs at /tmp/lithops-root/logs/97e168-2-M000.log
2025-08-26 20:00:49,444 [INFO] executors.py:494 --

Virtual Dataset: <xarray.Dataset> Size: 39MB
Dimensions:                  (time: 12, percentile: 5, lat: 152, lon: 132)
Coordinates:
  * percentile               (percentile) int32 20B 10 25 50 75 90
  * lat                      (lat) float32 608B 23.95 24.05 ... 38.95 39.05
  * lon                      (lon) float32 528B -109.5 -109.5 ... -96.55 -96.45
  * time                     (time) datetime64[ns] 96B 1951-01-01 ... 1951-12-01
Data variables:
    AvgSurfT_Percentiles     (time, percentile, lat, lon) float32 5MB Manifes...
    Qs_Percentiles           (time, percentile, lat, lon) float32 5MB Manifes...
    Qsb_Percentiles          (time, percentile, lat, lon) float32 5MB Manifes...
    RZSM_Percentiles         (time, percentile, lat, lon) float32 5MB Manifes...
    SWE_Percentiles          (time, percentile, lat, lon) float32 5MB Manifes...
    SnowDepth_Percentiles    (time, percentile, lat, lon) float32 5MB Manifes...
    Snowf_percentiles        (time, percentile, lat, lon) flo

In [5]:
vds

In [28]:
icechunk.__version__

'1.1.2'

In [30]:
!pip install icechunk

Collecting icechunk
  Downloading icechunk-1.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading icechunk-1.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: icechunk
Successfully installed icechunk-1.1.4


In [2]:
import icechunk
import fsspec


# change these as needed.
store_bucket = "nasa-veda-scratch"


## Find Files of interest

experiment_ids = ['HISTORICAL', 'SSP245', 'SSP585']
experiment_id = experiment_ids[0]

data_dir = f"s3://nasa-waterinsight/RASI/ROUTING/{experiment_id}/"
print(f"Processing {data_dir}")
store_prefix = f"jbusecke/RASI/test-temp2/{experiment_id}/"

## Write (commit) the virtual dataset into icechunk
storage = icechunk.s3_storage(
    bucket=store_bucket,
    prefix=store_prefix,
    anonymous=False,
    from_env=True,
)

config = icechunk.RepositoryConfig.default()

print(f"Open Repo at {store_prefix}")
repo = icechunk.Repository.open_or_create(
    storage=storage,
    config=config,
)

Processing s3://nasa-waterinsight/RASI/ROUTING/HISTORICAL/
Open Repo at jbusecke/RASI/test-temp2/HISTORICAL/


IcechunkError:   x error getting object from object store service error
  | 
  | context:
  |    0: icechunk::storage::s3::get_ref
  |            with ref_key="branch.main/ref.json"
  |              at icechunk/src/storage/s3.rs:754
  |    1: icechunk::refs::fetch_branch
  |            with name="main"
  |              at icechunk/src/refs.rs:381
  |    2: icechunk::refs::fetch_branch_tip
  |            with name="main"
  |              at icechunk/src/refs.rs:400
  |    3: icechunk::repository::exists
  |              at icechunk/src/repository.rs:341
  | 
  |-> error getting object from object store service error
  |-> service error
  |-> unhandled error (InvalidAccessKeyId)
  `-> Error { code: "InvalidAccessKeyId", message: "The AWS Access Key Id you provided does not exist in our records.", aws_request_id: "D085DT8HY6K8DKR5", s3_extended_request_id:
      "PxH8UMw4eE6CQK2xsp/K/h0By8ajrN/UaJALjXCN9b7PQRbiRSzkDy6IM20wlkrnCvV6sASCzjIyVHBfSoEN2w==" }


In [18]:
store_bucket = "nasa-veda-scratch"
store_prefix = "jbusecke/dummy/"

In [27]:
print(f"Virtual Dataset: {vds}")
## Write (commit) the virtual dataset into icechunk
storage = icechunk.s3_storage(
    bucket=store_bucket,
    prefix=''
    # prefix=store_prefix,
    region='us-west-2',
    from_env=True,
)

config = icechunk.RepositoryConfig.default()

print(f"Open Repo at {store_prefix}")
repo = icechunk.Repository.create(
    storage=storage,
    config=config,
)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2303070523.py, line 5)

In [24]:
import fsspec

In [25]:
fs = fsspec.filesystem('s3')
fs.ls('s3://nasa-veda-scratch')

['nasa-veda-scratch/jbusecke']

# I really would like to use obstore instead of fsspec for the glob, but low prio

In [15]:
import obstore

In [20]:
store = obstore.store.S3Store(
    bucket='s3://nasa-watersight',
    prefix='test/CASI/LSM/HISTORICAL2/PERCENTILES',
    skip_signature=True,
)

In [21]:
store.head('TotalPrecip_percentiles_201405.nc')

GenericError: Generic S3 error: Error performing HEAD https://s3.us-west-2.amazonaws.com/s3://nasa-watersight/test/CASI/LSM/HISTORICAL2/PERCENTILES/TotalPrecip_percentiles_201405.nc in 7.140821ms - Server returned non-2xx status code: 400 Bad Request: 

Debug source:
Generic {
    store: "S3",
    source: RetryError(
        RetryErrorImpl {
            method: HEAD,
            uri: Some(
                https://s3.us-west-2.amazonaws.com/s3://nasa-watersight/test/CASI/LSM/HISTORICAL2/PERCENTILES/TotalPrecip_percentiles_201405.nc,
            ),
            retries: 0,
            max_retries: 10,
            elapsed: 7.140821ms,
            retry_timeout: 180s,
            inner: Status {
                status: 400,
                body: Some(
                    "",
                ),
            },
        },
    ),
}

# Reloading the store

In [4]:
import icechunk
import xarray as xr
import zarr

storage = icechunk.s3_storage(
    bucket='nasa-veda-scratch',
    prefix=f"jbusecke/RASI-test/",
    anonymous=False,
    from_env=True,
)

chunk_url = "s3://nasa-waterinsight/RASI/"

virtual_credentials = icechunk.containers_credentials(
    {
        chunk_url: icechunk.s3_anonymous_credentials()
    }
)
    
repo = icechunk.Repository.open(
    storage=storage,
    authorize_virtual_chunk_access=virtual_credentials,
)

session = repo.readonly_session('main')
ds = xr.open_zarr(session.store, consolidated=False, zarr_version=3, chunks={})
ds

  ds = xr.open_zarr(session.store, consolidated=False, zarr_version=3, chunks={})


In [11]:
ds.nbytes/1e6

626.004196

In [5]:
import fsspec

fs = fsspec.filesystem('s3', anon=True)
files = fs.ls("s3://nasa-waterinsight/RASI/ROUTING/")
files

['nasa-waterinsight/RASI/ROUTING/HISTORICAL',
 'nasa-waterinsight/RASI/ROUTING/SSP245',
 'nasa-waterinsight/RASI/ROUTING/SSP585']

In [12]:
600*64*3*4/1000

460.8

In [14]:
import fsspec

In [15]:
fs = fsspec.filesystem('s3')

In [16]:
fs.ls('nasa-veda-scratch')

['nasa-veda-scratch/jbusecke']