In [1]:
import fsspec
import s3fs
import json
import re
import xarray as xr
import pandas as pd

In [2]:
# Directory layout of the MIR_SMUDP2 products below the storage root.
# The {year}/{month}/{day} placeholders are filled in by get_times().
SM_PATH_PATTERN = "SMOS/L2SM/MIR_SMUDP2/{year}/{month}/{day}"
# Product directory name. The named groups capture the start date/time
# (sd, st) and the end date/time (ed, et) encoded in the name.
SM_NAME_PATTERN = r"SM_OPER_MIR_SMUDP2_(?P<sd>\d{8})T(?P<st>\d{6})_(?P<ed>\d{8})T(?P<et>\d{6})_\d{3}_\d{3}_\d{1}"

# Same layout and naming scheme for the MIR_OSUDP2 products.
OS_PATH_PATTERN = "SMOS/L2OS/MIR_OSUDP2/{year}/{month}/{day}"
OS_NAME_PATTERN = r"SM_OPER_MIR_OSUDP2_(?P<sd>\d{8})T(?P<st>\d{6})_(?P<ed>\d{8})T(?P<et>\d{6})_\d{3}_\d{3}_\d{1}"

In [3]:
def get_sm_times(
        fs: fsspec.AbstractFileSystem,
        path_prefix: str,
        year: int,
        month: int,
        day: int):
    """List the MIR_SMUDP2 product directories of one calendar day.

    Thin wrapper around ``get_times`` that plugs in the SM path and
    name patterns.

    Args:
        fs: File system used for the directory listing.
        path_prefix: Root path the product path is appended to.
        year: Year of the day to list.
        month: Month of the day to list.
        day: Day of month to list.

    Returns:
        The result of ``get_times`` for the SM patterns: a list of
        ``(path, start, end)`` tuples sorted by start time.
    """
    return get_times(
        fs,
        path_prefix,
        path_pattern=SM_PATH_PATTERN,
        name_pattern=SM_NAME_PATTERN,
        year=year,
        month=month,
        day=day,
    )

def get_os_times(
        fs: fsspec.AbstractFileSystem,
        path_prefix: str,
        year: int,
        month: int,
        day: int):
    """List the MIR_OSUDP2 product directories of one calendar day.

    Thin wrapper around ``get_times`` that plugs in the OS path and
    name patterns.

    Args:
        fs: File system used for the directory listing.
        path_prefix: Root path the product path is appended to.
        year: Year of the day to list.
        month: Month of the day to list.
        day: Day of month to list.

    Returns:
        The result of ``get_times`` for the OS patterns: a list of
        ``(path, start, end)`` tuples sorted by start time.
    """
    return get_times(
        fs,
        path_prefix,
        path_pattern=OS_PATH_PATTERN,
        name_pattern=OS_NAME_PATTERN,
        year=year,
        month=month,
        day=day,
    )

def get_times(fs: fsspec.AbstractFileSystem,
              path_prefix: str,
              path_pattern: str,
              name_pattern: str,
              year: int,
              month: int,
              day: int):
    """List the product directories of one calendar day with their times.

    The day directory is ``path_prefix + "/" + path_pattern`` with the
    ``{year}``, ``{month}`` and ``{day}`` placeholders filled in (month and
    day zero-padded to two digits). Every sub-directory whose name matches
    ``name_pattern`` contributes one entry.

    Args:
        fs: File system used for the directory listing.
        path_prefix: Root path the formatted ``path_pattern`` is appended to.
        path_pattern: Format string with ``{year}``, ``{month}``, ``{day}``.
        name_pattern: Regular expression with the named groups ``sd``,
            ``st``, ``ed`` and ``et`` (start/end date and time).
        year: Year of the day to list.
        month: Month of the day to list.
        day: Day of month to list.

    Returns:
        A list of ``(path, start, end)`` tuples sorted by ``start``, where
        ``start`` and ``end`` are the concatenated date and time digit
        strings taken from the directory name.
    """
    path = path_prefix + "/" + path_pattern.format(
        year=year,
        month=f"{month:02d}",
        day=f"{day:02d}",
    )
    result = []
    for item in fs.listdir(path):
        if item["type"] != "directory":
            continue
        # Take the last path component rather than slicing by len(path):
        # the names reported by the listing need not start with exactly
        # ``path`` (protocol-qualified prefix, trailing slash, ...), and a
        # wrong offset would make every name silently fail to match.
        name = item["name"].rstrip("/").rsplit("/", maxsplit=1)[-1]
        m = re.match(name_pattern, name)
        if m is None:
            continue
        start = m.group("sd") + m.group("st")
        end = m.group("ed") + m.group("et")
        result.append((path + "/" + name, start, end))
    return sorted(result, key=lambda item: item[1])

In [33]:
# Read the access credentials from a local JSON file so that no secret
# is stored in the notebook itself.
with open("creodias-credentials.json") as f:
    credentials = json.load(f)

# The options are kept in their own variable because they are reused
# later as remote options when opening data through reference files.
s3_storage_options = dict(
    anon=False,
    client_kwargs=dict(endpoint_url="https://s3.cloudferro.com"),
    **credentials
)

s3 = s3fs.S3FileSystem(**s3_storage_options)

In [5]:
# List the MIR_OSUDP2 product directories for 2023-05-02 under the "EODATA"
# root as (path, start, end) tuples sorted by start time.
get_os_times(s3, "EODATA", 2023, 5, 2)

[('EODATA/SMOS/L2OS/MIR_OSUDP2/2023/05/02/SM_OPER_MIR_OSUDP2_20230502T022917_20230502T032231_700_001_1',
  '20230502022917',
  '20230502032231'),
 ('EODATA/SMOS/L2OS/MIR_OSUDP2/2023/05/02/SM_OPER_MIR_OSUDP2_20230502T040922_20230502T050236_700_001_1',
  '20230502040922',
  '20230502050236'),
 ('EODATA/SMOS/L2OS/MIR_OSUDP2/2023/05/02/SM_OPER_MIR_OSUDP2_20230502T045919_20230502T055238_700_001_1',
  '20230502045919',
  '20230502055238'),
 ('EODATA/SMOS/L2OS/MIR_OSUDP2/2023/05/02/SM_OPER_MIR_OSUDP2_20230502T072931_20230502T082244_700_001_1',
  '20230502072931',
  '20230502082244'),
 ('EODATA/SMOS/L2OS/MIR_OSUDP2/2023/05/02/SM_OPER_MIR_OSUDP2_20230502T090934_20230502T100249_700_001_1',
  '20230502090934',
  '20230502100249'),
 ('EODATA/SMOS/L2OS/MIR_OSUDP2/2023/05/02/SM_OPER_MIR_OSUDP2_20230502T104941_20230502T114253_700_001_1',
  '20230502104941',
  '20230502114253'),
 ('EODATA/SMOS/L2OS/MIR_OSUDP2/2023/05/02/SM_OPER_MIR_OSUDP2_20230502T131941_20230502T141300_700_001_1',
  '20230502131941',

In [6]:
# List the MIR_SMUDP2 product directories for 2023-05-02 under the "EODATA"
# root as (path, start, end) tuples sorted by start time.
get_sm_times(s3, "EODATA", 2023, 5, 2)

[('EODATA/SMOS/L2SM/MIR_SMUDP2/2023/05/02/SM_OPER_MIR_SMUDP2_20230502T004912_20230502T014226_700_001_1',
  '20230502004912',
  '20230502014226'),
 ('EODATA/SMOS/L2SM/MIR_SMUDP2/2023/05/02/SM_OPER_MIR_SMUDP2_20230502T022917_20230502T032231_700_001_1',
  '20230502022917',
  '20230502032231'),
 ('EODATA/SMOS/L2SM/MIR_SMUDP2/2023/05/02/SM_OPER_MIR_SMUDP2_20230502T040922_20230502T050236_700_001_1',
  '20230502040922',
  '20230502050236'),
 ('EODATA/SMOS/L2SM/MIR_SMUDP2/2023/05/02/SM_OPER_MIR_SMUDP2_20230502T045919_20230502T055238_700_001_1',
  '20230502045919',
  '20230502055238'),
 ('EODATA/SMOS/L2SM/MIR_SMUDP2/2023/05/02/SM_OPER_MIR_SMUDP2_20230502T072931_20230502T082244_700_001_1',
  '20230502072931',
  '20230502082244'),
 ('EODATA/SMOS/L2SM/MIR_SMUDP2/2023/05/02/SM_OPER_MIR_SMUDP2_20230502T090934_20230502T100249_700_001_1',
  '20230502090934',
  '20230502100249'),
 ('EODATA/SMOS/L2SM/MIR_SMUDP2/2023/05/02/SM_OPER_MIR_SMUDP2_20230502T104941_20230502T114253_700_001_1',
  '20230502104941',

In [7]:
# strftime/strptime format of the 14-digit date+time strings that
# get_times() builds from the product directory names.
FILENAME_DATETIME_FORMAT = "%Y%m%d%H%M%S"

In [8]:
# Check that a time string taken from a product name parses with the format.
pd.to_datetime('20230502131941', format=FILENAME_DATETIME_FORMAT)

Timestamp('2023-05-02 13:19:41')

In [9]:
# Example time range as a (start, end) pair of date-time strings.
time_range = "2025-05-01 12:00:00", "2025-05-31 12:00:00"

In [10]:
# Convert both ends of the range to pandas timestamps.
start, end = pd.to_datetime(time_range)

In [11]:
# Render the range start in the same digit-string form as the product times,
# so the two can be compared as plain strings.
start.strftime(FILENAME_DATETIME_FORMAT)
    

'20250501120000'

In [22]:
def get_sm_files(fs: fsspec.AbstractFileSystem,
                 path_prefix: str,
                 time_range):
    """Return the SM product directories that overlap a time range.

    Every daily directory from the start day to the end day (both
    inclusive) is listed with ``get_sm_times``. A product is selected when
    its end time is not before the range start and its start time is before
    the range end. Selecting by one overlap test for all days fixes three
    problems of the earlier start-day / in-between / end-day bookkeeping:

    * the single middle day of a range spanning exactly three calendar
      days was skipped,
    * the whole end day was dropped when none of its products started at
      or after the range end,
    * a range within one calendar day returned products twice and
      included products outside the range.

    Only the daily directories between the start day and the end day are
    searched; a product stored under an earlier day is not found even if
    its end time reaches into the range.

    Args:
        fs: File system used for the directory listings.
        path_prefix: Root path passed on to ``get_sm_times``.
        time_range: Pair ``(start, end)`` accepted by ``pd.to_datetime``.

    Returns:
        List of product directory paths, ordered by day and, within a
        day, by product start time. Empty if ``end`` lies before ``start``
        by at least one calendar day or nothing overlaps.
    """
    ONE_DAY = pd.Timedelta(1, unit="days")

    start, end = pd.to_datetime(time_range)

    # Fixed-width digit strings order exactly like the instants they
    # encode, so they can be compared with the strings from get_sm_times().
    start_str = start.strftime(FILENAME_DATETIME_FORMAT)
    end_str = end.strftime(FILENAME_DATETIME_FORMAT)

    first_day = pd.Timestamp(year=start.year, month=start.month, day=start.day)
    last_day = pd.Timestamp(year=end.year, month=end.month, day=end.day)

    names = []
    day = first_day
    while day <= last_day:
        day_times = get_sm_times(fs, path_prefix, day.year, day.month, day.day)
        for name, file_start_str, file_end_str in day_times:
            # Keep the product if it ends at/after the range start and
            # begins before the range end.
            if file_end_str >= start_str and file_start_str < end_str:
                names.append(name)
        day += ONE_DAY

    return names

In [23]:
# Collect the SM product directories for a three-day window and display them.
sm_files = get_sm_files(s3, "EODATA", ("2022-05-01 12:00:00", "2022-05-04 12:00:00"))
sm_files

['EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T113632_20220501T122945_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T122629_20220501T131948_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T131637_20220501T140950_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T140633_20220501T145953_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T145642_20220501T154954_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T154637_20220501T163956_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T163644_20220501T172959_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T172642_20220501T182001_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T181650_20220501T191002_700_001_1',
 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_2022

In [14]:
# Two sample product files. The second assignment overrides the first, so
# only the 2022-05-31 file is used by the cells below.
path = 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/30/SM_OPER_MIR_SMUDP2_20220530T001711_20220530T011030_700_001_1/SM_OPER_MIR_SMUDP2_20220530T001711_20220530T011030_700_001_1.nc'
path = 'EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/31/SM_OPER_MIR_SMUDP2_20220531T093840_20220531T103200_700_001_1/SM_OPER_MIR_SMUDP2_20220531T093840_20220531T103200_700_001_1.nc'


In [15]:
%%timeit -n 1 -r 1
# Time (one run, one loop) opening the file directly from S3 through a
# file object with the h5netcdf engine.
with s3.open(path, 'rb') as f:
    ds = xr.open_dataset(f, engine="h5netcdf")
    # display(ds)

22.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [16]:
%%timeit -n 1 -r 1
# Time (one run, one loop) downloading the file to "test.nc" first and
# opening the local copy afterwards.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept
# in the notebook namespace, which would be why the next cell opens it again.
s3.get(path, "test.nc")
ds = xr.open_dataset("test.nc")

23.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [17]:
# Open the local copy written by the s3.get() call above and display it.
ds = xr.open_dataset("test.nc")
ds

In [18]:
# Compute one variable to check that its values can actually be read.
ds.Surface_Temperature_DQX.compute()

In [20]:
import kerchunk.hdf

In [29]:
# Build one kerchunk reference set per product file (first four products only).
# NOTE: the loop variable rebinds the notebook-level name `path` set earlier.
index = []
for path in sm_files[0:4]:
    # The data file inside a product directory is named after the directory.
    name = path.rsplit('/', maxsplit=1)[1]
    url = f'{path}/{name}.nc'
    print(url)
    with s3.open(url) as f:
        # NOTE(review): inline_threshold=100 presumably embeds chunks smaller
        # than 100 bytes in the references - confirm against the kerchunk docs.
        h5chunks = kerchunk.hdf.SingleHdf5ToZarr(f, url, inline_threshold=100)
        index.append(h5chunks.translate())

EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T113632_20220501T122945_700_001_1/SM_OPER_MIR_SMUDP2_20220501T113632_20220501T122945_700_001_1.nc
EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T122629_20220501T131948_700_001_1/SM_OPER_MIR_SMUDP2_20220501T122629_20220501T131948_700_001_1.nc
EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T131637_20220501T140950_700_001_1/SM_OPER_MIR_SMUDP2_20220501T131637_20220501T140950_700_001_1.nc
EODATA/SMOS/L2SM/MIR_SMUDP2/2022/05/01/SM_OPER_MIR_SMUDP2_20220501T140633_20220501T145953_700_001_1/SM_OPER_MIR_SMUDP2_20220501T140633_20220501T145953_700_001_1.nc


In [30]:
# Persist the list of reference sets as JSON.
with open("index.json", "w") as f:
    json.dump(index, f)

In [31]:
# Read the reference sets back from disk, replacing the in-memory list.
with open("index.json", "r") as f:
    index = json.load(f)

In [35]:

# Open the first reference set with the zarr engine through the
# "reference://" file system. The referenced data is addressed via the "s3"
# protocol with the same storage options that were used to create `s3`.
ds = xr.open_dataset(
    "reference://", 
    engine="zarr",
    backend_kwargs={
        "storage_options": {
            "fo": index[0],
            "remote_protocol": "s3",
            "remote_options": s3_storage_options
        },
        "consolidated": False
    }
)
# do analysis...
ds

In [36]:
# Load one variable to check that data can be read through the references.
ds.M_AVA0.load()