Merge pull request #77 from tethys-ts/dev
fix to using hdf5 results chunks
mullenkamp committed Oct 20, 2022
2 parents a1dc526 + e1ee02c commit 50dd6d5
Showing 4 changed files with 17 additions and 21 deletions.
4 changes: 2 additions & 2 deletions conda/meta.yaml
@@ -1,5 +1,5 @@
{% set name = "tethysts" %}
{% set version = "4.5.3" %}
{% set version = "4.5.4" %}
# {% set sha256 = "ae2cc83fb5a75e8dc3e1b2c2137deea412c8a4c7c9acca52bf4ec59de52a80c9" %}

# sha256 is the prefered checksum -- you can get it for a file with:
@@ -45,7 +45,7 @@ requirements:
- shapely
- tethys-data-models >=0.4.11
- hdf5tools >=0.0.7
- s3tethys >=0.0.2
- s3tethys >=0.0.4

test:
imports:
4 changes: 2 additions & 2 deletions setup.py
@@ -10,7 +10,7 @@
name = 'tethysts'
main_package = 'tethysts'
datasets = 'datasets/time_series'
version = '4.5.3'
version = '4.5.4'
descrip = 'tethys time series S3 extraction'

# The below code is for readthedocs. To have sphinx/readthedocs interact with
@@ -19,7 +19,7 @@
if os.environ.get('READTHEDOCS', False) == 'True':
    INSTALL_REQUIRES = []
else:
    INSTALL_REQUIRES = ['zstandard', 'pandas', 'xarray', 'scipy', 'orjson', 'requests', 'shapely', 'tethys-data-models>=0.4.11', 'hdf5tools>=0.0.7', 's3tethys>=0.0.2']
    INSTALL_REQUIRES = ['zstandard', 'pandas', 'xarray', 'scipy', 'orjson', 'requests', 'shapely', 'tethys-data-models>=0.4.11', 'hdf5tools>=0.0.7', 's3tethys>=0.0.4']

# Get the long description from the README file
with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
2 changes: 2 additions & 0 deletions tethysts/main.py
@@ -544,6 +544,8 @@ def get_results(self,

        ## Clear xarray cache...because it loves caching everything...
        ## This is to ensure that xarray will open the file rather than opening a cache
        ## The next xarray version should have this issue fixed:
        ## https://github.com/pydata/xarray/pull/4879
        xr.backends.file_manager.FILE_CACHE.clear()

        ## combine results
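For illustration, a minimal sketch of the cache-clearing pattern described in the comments above (the helper name open_results_fresh and the in-memory buffer are illustrative, not part of tethysts):

import io
import xarray as xr

def open_results_fresh(buffer: io.BytesIO) -> xr.Dataset:
    # Drop xarray's global file-handle cache so the buffer is actually
    # reopened rather than served from a previously cached handle.
    xr.backends.file_manager.FILE_CACHE.clear()
    return xr.load_dataset(buffer, engine='h5netcdf')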
28 changes: 11 additions & 17 deletions tethysts/utils.py
@@ -726,29 +726,15 @@ def result_filters(data, from_date=None, to_date=None, from_mod_date=None, to_mo
# return file_path2


def process_dataset_obj(results, from_date=None, to_date=None):
def process_dataset(data, from_date=None, to_date=None):
    """
    Stupid xarray being inefficient at parsing file objects...
    """
    if isinstance(results, io.BytesIO):
        try:
            data = xr.load_dataset(results, engine='h5netcdf', cache=False)
        except:
            data = xr.load_dataset(results)
    elif isinstance(results, xr.Dataset):
        data = results
    else:
        raise TypeError('Not the right data type.')

    data = result_filters(data, from_date, to_date)

    data_obj = io.BytesIO()
    hdf5tools.xr_to_hdf5(data, data_obj)

    data.close()
    del data
    del results

    return data_obj
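
For context, the reworked process_dataset expects an already-decoded xarray.Dataset, filters it with result_filters, and re-serialises it to an in-memory HDF5 buffer via hdf5tools.xr_to_hdf5. The sketch below imitates that pattern with plain xarray calls; the sample dataset, the time slice standing in for result_filters, and to_netcdf() standing in for hdf5tools.xr_to_hdf5 are illustrative assumptions, not the library code.

import io
import numpy as np
import pandas as pd
import xarray as xr

# Illustrative stand-in for a results chunk: a small time-series Dataset.
ds = xr.Dataset(
    {'temperature': (('time',), np.random.rand(10))},
    coords={'time': pd.date_range('2022-01-01', periods=10, freq='D')},
)

# Filter (approximating result_filters), then serialise to an in-memory buffer
# (approximating hdf5tools.xr_to_hdf5); to_netcdf() with no path returns netCDF3 bytes.
filtered = ds.sel(time=slice('2022-01-03', '2022-01-08'))
data_obj = io.BytesIO(filtered.to_netcdf())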


@@ -781,8 +767,16 @@ def download_results(chunk: dict, bucket: str, s3: botocore.client.BaseClient =

    if chunk['key'].endswith('.zst'):
        file_obj = s3tethys.decompress_stream_to_object(file_obj, 'zstd')
        data = xr.load_dataset(file_obj.read(), engine='scipy')
    else:
        data = xr.load_dataset(io.BytesIO(file_obj.read()), engine='h5netcdf')

    data_obj = process_dataset(data, from_date=from_date, to_date=to_date)

    data.close()
    del data

    data_obj = process_dataset_obj(file_obj, from_date=from_date, to_date=to_date)
    del file_obj

    return data_obj

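A minimal sketch of the new loading branch in download_results, assuming the object body is already in memory as bytes; raw_bytes, key, and the use of zstandard in place of s3tethys.decompress_stream_to_object are illustrative assumptions.

import io
import xarray as xr
import zstandard as zstd

def load_chunk(raw_bytes: bytes, key: str) -> xr.Dataset:
    # .zst chunks are zstd-compressed and read with the scipy (netCDF3) engine;
    # uncompressed chunks are HDF5/netCDF4 and read with h5netcdf.
    if key.endswith('.zst'):
        decompressed = zstd.ZstdDecompressor().stream_reader(io.BytesIO(raw_bytes)).read()
        return xr.load_dataset(io.BytesIO(decompressed), engine='scipy')
    return xr.load_dataset(io.BytesIO(raw_bytes), engine='h5netcdf')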
