In [22]:
import pystac
import xarray as xr
import pandas as pd
from datetime import datetime, date
import pystac_client
from pystac_client import Client
import numpy as np
import copernicusmarine
from copernicusmarine.core_functions import custom_open_zarr

import warnings
warnings.filterwarnings("ignore")


### Open the root catalog

Use pystac-client to connect to a STAC API endpoint (https://catalog.dive.edito.eu/). We can also connect to a static STAC JSON catalog that conforms to the STAC specification, for example to view modelled data from Bio-Oracle. The data are also available in Zarr format.

In [47]:
# STAC API endpoint opened by the client
URL = 'https://api.dive.edito.eu/data/collections'
# Alternative: static STAC JSON catalog (Bio-Oracle)
# URL = 'https://s3.waw3-1.cloudferro.com/emodnet/bio_oracle/stac/catalog.json'

# Custom HTTP headers sent with every request. pystac-client expects a mapping
# of header name -> value, so "no extra headers" is an empty dict, not a list.
headers = {}

cat = Client.open(URL, headers=headers)
cat

In [50]:
# Ask the catalog for its collections and print one identifier per line.
colls = cat.get_collections()

for coll_id in (coll.id for coll in colls):
    print(coll_id)

climate_forecast-age_of_sea_ice
emodnet-aggregate_extraction
climate_forecast-air_density
climate_forecast-air_pressure
climate_forecast-air_pressure_at_mean_sea_level
climate_forecast-air_temperature
emodnet-algae_production
emodnet-aquaculture
climate_forecast-atmosphere_upward_relative_vorticity
climate_forecast-barotropic_eastward_sea_water_velocity
climate_forecast-barotropic_northward_sea_water_velocity
emodnet-bathymetry_data_quality
emodnet-bathymetry_source_metadata
emodnet-beach_litter
climate_forecast-beaufort_wind_force
emodnet-cables
emodnet-calanus_helgolandicus_and_finmarchicus_temporal_change
climate_forecast-charnock_coefficient_for_surface_roughness_length_for_momentum_in_air
emodnet-coastal_behaviour
emodnet-coastlines
emodnet-composition_of_litter_according_to_material_categories_in_percent_normalized_per_beach_per_year_metal_percent
emodnet-concentration_of_cadmium
climate_forecast-concentration_of_colored_dissolved_organic_matter_in_sea_water_expressed_as_equivale

### Browse the Catalog
Navigate through the root catalog to find sub-catalogs and collections of interest.


In [58]:
collections = cat.get_collections()
for collection in collections:
    # Only collections whose id mentions chlorophyll are of interest here.
    if 'chlorophyll' not in collection.id:
        continue
    print(collection.id)

    # Peek at the first item only, as a sample of what the collection holds;
    # a collection without items prints nothing beyond its id.
    first_item = next(iter(collection.get_all_items()), None)
    if first_item is not None:
        print(first_item.id)
        print(first_item.assets)

emodnet-deepest_values_of_water_body_chlorophyll_a
e9d70b7e-80da-5db2-9053-5e3540ad0c60
{'xml': <Asset href=https://emodnet.ec.europa.eu/geonetwork/srv/api/records/a3461fb1-d209-440e-a49f-7acff7731395/formatters/xml>, 'csw': <Asset href=https://emodnet.ec.europa.eu/geonetwork/emodnet/eng/csw?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=a3461fb1-d209-440e-a49f-7acff7731395>, 'html': <Asset href=http://opendap.oceanbrowser.net/thredds/dodsC/data/emodnet-domains/By_sea_regions/Northeast_Atlantic_Ocean/Water_body_chlorophyll-a.4Danl.nc.html>, 'netcdf': <Asset href=https://s3.waw3-1.cloudferro.com/emodnet/emodnet_native/emodnet_chemistry/water_body_chlorophyll_a/water_body_chlorophyll_a_masked_using_relative_error_threshold_0.3_northeast_atlantic_ocean/Water_body_chlorophyll-a.4Danl.nc>, 'zarr': <Asset href=https://s3.waw3-1.cloudferro.com/emodnet/emodnet_arco/emodnet_chemistry/water_body_chlorophyll_a/deepest_values_of_water_body_chlorophyll_a_northeast_atlantic_o

### Search for a Collection
Identify a collection based on your variables (e.g., temperature, salinity). You can filter by collection metadata like keywords or spatial/temporal bounds.

In [62]:
# Keywords that select the collections of interest: a collection is kept when
# its id contains at least one of them. This list is the single source of
# truth for the filter below; add further keywords (e.g. 'elevation') here to
# widen the search.
collection_selection = ['oxygen', 'habitat', 'temperature']

all_items = []
for collection in cat.get_collections():
    if not any(keyword in collection.id for keyword in collection_selection):
        continue

    collection_items = collection.get_all_items()
    try:
        for item in collection_items:
            # One record per item: where it came from, its footprint, its
            # temporal extent and the assets it exposes.
            all_items.append({
                'Collection ID': collection.id,
                'Item ID': item.id,
                'Item bounds': item.geometry,
                'item_starttime': item.properties['start_datetime'],
                'item_endtime': item.properties['end_datetime'],
                'Assets': item.assets,
            })
    except Exception as e:
        # Best effort: report the failing collection and move on to the next
        # one. Items appended before the failure are kept.
        print(e)
        print(f'Error with {collection.id}')

# Build the frame once from the accumulated records.
oxygen_habitat_temperature_items_df = pd.DataFrame(all_items)
oxygen_habitat_temperature_items_df.head()

Unnamed: 0,Collection ID,Item ID,Item bounds,item_starttime,item_endtime,Assets
0,climate_forecast-air_temperature,8ebf3d6d-1a6f-58f8-beb3-e6ff6eae3c90,"{'type': 'Polygon', 'coordinates': [[[-52.9000...",2024-12-11T00:00:00.000000Z,2025-01-10T10:10:00.000000Z,{'arco-time-series': <Asset href=https://s3.wa...
1,climate_forecast-air_temperature,33857744-4f8c-5eb2-acc7-3cf15e6026b7,"{'type': 'Polygon', 'coordinates': [[[-52.9000...",2024-12-11T00:00:00.000000Z,2025-01-10T10:10:00.000000Z,{'arco-geo-series': <Asset href=https://s3.waw...
2,climate_forecast-air_temperature,139f5e61-86d9-562a-b24d-d940bb0b25c7,"{'type': 'Polygon', 'coordinates': [[[-14.4, -...",2024-12-11T00:00:00.000000Z,2025-01-10T09:19:00.000000Z,{'arco-time-series': <Asset href=https://s3.wa...
3,climate_forecast-air_temperature,962e103b-9fec-5b95-ab3a-9f8633c03e38,"{'type': 'Polygon', 'coordinates': [[[-14.4, -...",2024-12-11T00:00:00.000000Z,2025-01-10T09:19:00.000000Z,{'arco-geo-series': <Asset href=https://s3.waw...
4,climate_forecast-air_temperature,99b5c8e6-696f-5512-9f59-2f6e53caf374,"{'type': 'Polygon', 'coordinates': [[[-171, -3...",2024-12-11T00:00:00.000000Z,2025-01-10T10:13:00.000000Z,{'arco-time-series': <Asset href=https://s3.wa...


### Select a Collection and Fetch Items
Choose a collection and list the available items (datasets), filtered by date range and geographic region.

In [63]:
def filter_items_by_time(items_df, start_date, end_date):
    """
    Return the items whose temporal extent lies strictly inside a time window.

    Parameters
    ----------
    items_df : pandas.DataFrame
        Must contain 'item_starttime' and 'item_endtime' columns holding
        values that ``pd.to_datetime`` can parse. The frame is not modified.
    start_date, end_date : str or datetime-like
        Exclusive bounds of the window. Values without a timezone are
        interpreted as UTC; timezone-aware values are converted to UTC.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by 'item_starttime', containing only the rows with
        ``item_starttime > start_date`` and ``item_endtime < end_date``. Its
        two time columns are timezone-aware (UTC) datetimes and the original
        index labels are preserved.
    """
    def _to_utc(bound):
        # Comparing a tz-aware column with a tz-naive Timestamp raises, so
        # every bound is normalised to UTC first.
        stamp = pd.Timestamp(bound)
        if stamp.tzinfo is None:
            return stamp.tz_localize('UTC')
        return stamp.tz_convert('UTC')

    window_start = _to_utc(start_date)
    window_end = _to_utc(end_date)

    # `assign` works on a copy, leaving the caller's frame untouched.
    parsed_df = items_df.assign(
        item_starttime=pd.to_datetime(items_df['item_starttime'], utc=True),
        item_endtime=pd.to_datetime(items_df['item_endtime'], utc=True),
    ).sort_values(by='item_starttime')

    inside_window = (
        (parsed_df['item_starttime'] > window_start)
        & (parsed_df['item_endtime'] < window_end)
    )
    return parsed_df[inside_window]

# Time window of interest; both bounds are passed to the filter as strings.
start_date = '2000-01-01'
end_date = '2030-12-31'

# Keep only the items selected by the filter for this window.
time_df = filter_items_by_time(
    items_df=oxygen_habitat_temperature_items_df,
    start_date=start_date,
    end_date=end_date,
)
time_df.head()

Unnamed: 0,Collection ID,Item ID,Item bounds,item_starttime,item_endtime,Assets
738,climate_forecast-sea_water_temperature,20c3b69e-e60d-540f-9319-33e5f0c1b5b9,"{'type': 'Polygon', 'coordinates': [[[-179.999...",2001-01-04 16:47:52+00:00,2023-12-31 23:59:00+00:00,{'arco-time-series': <Asset href=https://s3.wa...
739,climate_forecast-sea_water_temperature,a45cc962-398e-52fa-8d57-e36df1882cb3,"{'type': 'Polygon', 'coordinates': [[[-179.999...",2001-01-04 16:47:52+00:00,2023-12-31 23:59:00+00:00,{'arco-geo-series': <Asset href=https://s3.waw...
294,climate_forecast-sea_surface_foundation_temper...,85c55083-3d82-5abe-899c-2be915fd8c99,"{'type': 'Polygon', 'coordinates': [[[-179.975...",2007-01-01 00:00:00+00:00,2025-01-09 00:00:00+00:00,{'arco-geo-series': <Asset href=https://s3.waw...
148,climate_forecast-mole_concentration_of_dissolv...,e3e48210-8e39-5a33-ac19-9ee69d4f91fd,"{'type': 'Polygon', 'coordinates': [[[-180, 43...",2007-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,{'arco-geo-series': <Asset href=https://s3.waw...
147,climate_forecast-mole_concentration_of_dissolv...,7bec1fdc-3863-59e2-aec1-98d74001619d,"{'type': 'Polygon', 'coordinates': [[[-180, 43...",2007-01-01 00:00:00+00:00,2022-12-31 00:00:00+00:00,{'arco-time-series': <Asset href=https://s3.wa...


### Look for Cloud-Optimized Assets
From each item, find and extract cloud-optimized assets (like Zarr or Parquet) that can be processed further.

In [64]:
# An asset counts as cloud-optimized when its href ends with one of these.
cloud_optimized_suffixes = ('.zarr', '.zarr/', '.parquet')

# Column order of the resulting table; passing it explicitly keeps the schema
# stable even when no asset matches.
asset_columns = ['Collection ID', 'Item ID', 'Data Start', 'Data End',
                 'Bounds', 'Asset Key', 'Asset Href']

all_items_assets = []
for _, row in time_df.iterrows():
    for asset_key, asset in row['Assets'].items():
        # str.endswith accepts a tuple and matches any of its entries.
        if asset.href.endswith(cloud_optimized_suffixes):
            all_items_assets.append({
                'Collection ID': row['Collection ID'],
                'Item ID': row['Item ID'],
                'Data Start': row['item_starttime'],
                'Data End': row['item_endtime'],
                'Bounds': row['Item bounds'],
                'Asset Key': asset_key,
                'Asset Href': asset.href,
            })

# Create a DataFrame for assets
assets_df = pd.DataFrame(all_items_assets, columns=asset_columns)

# Persist the table, then end on the preview so it is displayed as the
# cell's last expression.
assets_df.to_csv('temperature_oxygen_habitat_arco_assets.csv')
assets_df.head()