
Merge pull request #84 from tethys-ts/dev
updated docs
mullenkamp committed May 13, 2023
2 parents b2568aa + 7438dcd commit c4afd2e
Showing 10 changed files with 50 additions and 40 deletions.
6 changes: 3 additions & 3 deletions conda/meta.yaml
@@ -1,5 +1,5 @@
{% set name = "tethysts" %}
- {% set version = "4.5.10" %}
+ {% set version = "4.5.11" %}
# {% set sha256 = "ae2cc83fb5a75e8dc3e1b2c2137deea412c8a4c7c9acca52bf4ec59de52a80c9" %}

# sha256 is the prefered checksum -- you can get it for a file with:
@@ -42,9 +42,9 @@ requirements:
- scipy
- orjson
- requests
-   - shapely
+   - shapely >=2
- tethys-data-models >=0.4.11
-   - hdf5tools >=0.1.10
+   - hdf5tools >=0.1.12
- s3tethys >=0.0.4

test:
4 changes: 2 additions & 2 deletions setup.py
@@ -10,7 +10,7 @@
name = 'tethysts'
main_package = 'tethysts'
datasets = 'datasets/time_series'
- version = '4.5.10'
+ version = '4.5.11'
descrip = 'tethys time series S3 extraction'

# The below code is for readthedocs. To have sphinx/readthedocs interact with
@@ -19,7 +19,7 @@
if os.environ.get('READTHEDOCS', False) == 'True':
INSTALL_REQUIRES = []
else:
- INSTALL_REQUIRES = ['zstandard', 'pandas', 'xarray', 'scipy', 'orjson', 'requests', 'shapely', 'tethys-data-models>=0.4.11', 'hdf5tools>=0.1.10', 's3tethys>=0.0.4']
+ INSTALL_REQUIRES = ['zstandard', 'pandas', 'xarray', 'scipy', 'orjson', 'requests', 'shapely>=2.0.1', 'tethys-data-models>=0.4.11', 'hdf5tools>=0.1.12', 's3tethys>=0.0.4']

# Get the long description from the README file
with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
7 changes: 4 additions & 3 deletions sphinx/requirements.txt
@@ -1,6 +1,7 @@
- pandas==1.4.1
- tethysts==0.4.23
- tethys-data-models==0.4.9
+ pandas==1.5.2
+ tethysts==4.5.11
+ tethys-data-models==0.4.11
+ hdf5tools==0.1.12
numpydoc
ipython
matplotlib
Binary file removed sphinx/source/docs/Hilltop_Server_Manual.doc
2 changes: 1 addition & 1 deletion sphinx/source/index.rst
@@ -4,7 +4,7 @@ This Python package contains a class for reading geospatial time series data sto

This documentation and the tethysts package are currently under active development. Most of the fundamental aspects of the API are well established, but minor tweaks and new functionality are constantly being made.

- The package dependencies are: zstandard, pandas, xarray, scipy, boto3, orjson, requests, and shapely.
+ The package dependencies are: zstandard, pandas, xarray, scipy, boto3, orjson, requests, shapely, tethys-data-models, s3tethys, and hdf5tools.

The GitHub repository is found `here <https://github.com/tethys-ts/tethysts>`_.
Go to the `Tethys Dataset Discovery <https://wai.tethys-ts.xyz/>`_ dashboard to easily see what datasets are available.
10 changes: 9 additions & 1 deletion sphinx/source/installation.rst
@@ -10,4 +10,12 @@ Or conda::

Requirements
------------
- The package dependencies are: tethys-data-models, zstandard, pandas, xarray, scipy, boto3, orjson, requests, shapely, and hdf5tools.
+ The package dependencies are: tethys-data-models, zstandard, pandas, xarray, scipy, boto3, orjson, requests, shapely, s3tethys, and hdf5tools.

Important note
---------------
This package, as well as its dependencies, is continuously being updated and improved, and I maintain it in my spare time. This means I may occasionally introduce breaking changes across older versions of the dependencies. If you run into a strange error, first try updating the following packages to their most recent versions:

- tethysts
- hdf5tools
- shapely
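When chasing such an error down, a quick way to bring all three packages up to date (assuming a pip-based install; conda users would use ``conda update`` instead) is::

```shell
# Upgrade the packages most likely to be involved in cross-version breakage
pip install --upgrade tethysts hdf5tools shapely
```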
10 changes: 5 additions & 5 deletions sphinx/source/package_references.rst
@@ -6,22 +6,22 @@ Base class

.. autoclass:: tethysts.Tethys

- Get stations
+ Get versions
-------------

- .. automethod:: tethysts.Tethys.get_stations
+ .. automethod:: tethysts.Tethys.get_versions

- Get versions
+ Get stations
-------------

- .. automethod:: tethysts.Tethys.get_versions
+ .. automethod:: tethysts.Tethys.get_stations

Get results
-------------

.. automethod:: tethysts.Tethys.get_results

get_run_dates

API Pages
---------

21 changes: 11 additions & 10 deletions sphinx/source/usage_a.rst
@@ -5,11 +5,13 @@ Background
-----------
This section describes how to use the tethysts package. The functions depend heavily on the `xarray package <http://xarray.pydata.org/>`_. Nearly all outputs are either as xarray Datasets or python lists of dictionaries.

- The datasets are organised in three layers:
+ The datasets are organised in three main layers:
- Dataset metadata
- Stations
- Results

+ There is also versioning of the Stations and Results. Dataset metadata is not currently versioned.

Dataset metadata
----------------
The first step is to figure out what datasets are available.
@@ -36,7 +38,7 @@ Import the Tethys class:
Public datasets
~~~~~~~~~~~~~~~
- Initialising the Tethys class without any parameters will pull down all public remotes and parse the associated dataset metadata. The datasets object is a list of dictionaries with a lot of metadata about each dataset. It should tell you practically all you need to know about data contained in the results (e.g. parameter, units, data licence, owner, etc). Use normal python list comprehension to select the dataset(s) of interest:
+ Initialising the Tethys class without any parameters will pull down all public remotes and parse the associated dataset metadata. The datasets object is a list of dictionaries with a lot of metadata about each dataset. It should tell you practically all you need to know about the data contained in the results (e.g. parameter, units, data licence, owner, etc). Use normal python list comprehension to select the dataset(s) of interest:
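As a minimal sketch of that list comprehension, here is the filtering pattern applied to a hand-made stand-in for the datasets object (the dictionary values below are invented for illustration; a real ``ts.datasets`` list comes from ``Tethys()`` and carries many more keys):

```python
# Stand-in for ts.datasets; only a few illustrative keys are shown.
datasets = [
    {'dataset_id': 'b5d84aa773de2a747079c127', 'parameter': 'temperature', 'owner': 'example owner'},
    {'dataset_id': '0b2bd62cc42f3096136f11e9', 'parameter': 'precipitation', 'owner': 'example owner'},
]

# Normal python list comprehension to select the dataset(s) of interest
temp_datasets = [d for d in datasets if d['parameter'] == 'temperature']
dataset_id = temp_datasets[0]['dataset_id']
```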


.. ipython:: python
@@ -52,7 +54,7 @@ Initialising the Tethys class without any parameters will pull down all public r
Private datasets
~~~~~~~~~~~~~~~~
- Some datasets are not available through the public repository. Accessing private datasets stored in S3 buckets requires remote connection configuration data. A remote configuration requires a list of dictionaries of bucket name, connection_config/public_url, and version as shown in the following example:
+ Some datasets are not available through the public repository. Accessing private datasets stored in S3 buckets requires remote connection configuration data. A remote configuration requires a list of dictionaries of bucket name, connection_config/public_url, and system version as shown in the following example:
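A hedged sketch of what such a configuration might look like follows; the key names and values are illustrative assumptions built from the description above, not verified against the tethysts API:

```python
# Illustrative remote configuration: a list of dicts, one per S3 remote.
# All key names and values here are assumptions for this sketch.
remotes = [
    {
        'bucket': 'my-private-bucket',           # S3 bucket name (assumed)
        'public_url': 'https://s3.example.com',  # or a connection_config dict with credentials
        'version': 4,                            # system version of the remote
    },
]

# ts = Tethys(remotes)  # would initialise Tethys against these remotes (requires network access)
```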


.. code:: python
@@ -132,7 +134,7 @@ To get a bunch of stations within a specified area, you can pass a polygon GeoJS
Results
-------
- But what you'll need next is to pick a station and write down the station_id just like you did with the dataset_id.
+ What you'll need next is to pick a station and write down the station_id just like you did with the dataset_id.

To get the results (the 4D data), you'll need a dataset_id and station_id. Internally, the results are broken up by dataset and station.
The get_results method has many input options. Take a look at the reference page for a description of all the options.
@@ -144,7 +146,7 @@ The get_results method has many input options. Take a look at the reference page
results = ts.get_results(dataset_id, station_id)
results
- Unlike the previously returned objects, the results object (in this case) is an xarray Dataset. This xarray Dataset contains both the results (temperature) and all of the dataset metadata. If the results represent geospatially sparse data, then the results are indexed by geometry, height, and time. If the results represent gridded data, then the results are indexed by lat, lon, height, and time. The geometry dimension is a hexadecimal encoded Well-Known Binary (WKB) representation of the geometry. This was used to be flexible on the geometry type (i.e. points, lines, or polygons) and the WKB ensures that the geometry is stored accurately. This is a standard format by the Open Geospatial Consortium (OGC) and can be parsed by many programs including shapely, PostGIS, etc. Using WKB in a geometry dimension does not follow CF conventions. This was a trade off between flexibility, simplicity, and following standards. I picked flexibility and simplicity.
+ Unlike the previously returned objects, the results object (in this case) is an xarray Dataset. This xarray Dataset contains both the results (temperature) and all of the dataset metadata. If the results represent geospatially sparse data, then the results are indexed by geometry, height, and time. If the results represent gridded data, then the results are indexed by lat, lon, height, and time. The geometry dimension is a hexadecimal-encoded Well-Known Binary (WKB) representation of the geometry. WKB was chosen to stay flexible about the geometry type (i.e. points, lines, or polygons) while ensuring that the geometry is stored accurately. It is a standard format defined by the Open Geospatial Consortium (OGC) and can be parsed by many programs, including shapely and PostGIS. Using WKB in a geometry dimension does not follow CF conventions, however. This was a trade-off between flexibility, simplicity, and following standards. I leaned towards flexibility and simplicity on this one.
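To make the WKB round trip concrete, here is a small sketch (assuming shapely is installed) that encodes a point the same way the geometry dimension does and decodes it back:

```python
from shapely import wkb
from shapely.geometry import Point

# Encode a point as hexadecimal WKB, as used by the geometry dimension
hex_geom = Point(172.5, -43.5).wkb_hex

# Decode the hex WKB back into a shapely geometry
geom = wkb.loads(hex_geom, hex=True)
print(geom.x, geom.y)  # 172.5 -43.5
```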

In addition to the get_stations spatial queries, the get_results method has a built-in nearest neighbour query if you omit the station_id and pass either geometry dict or a combination of latitude and longitude. This is especially useful for gridded results when each station represents a large area rather than a single point.

@@ -156,7 +158,7 @@ In addition to the get_stations spatial queries, the get_results method has a bu
results = ts.get_results(dataset_id, geometry=geometry, squeeze_dims=True)
results
- If you want to get more than one station per dataset, then you can still use the get_results. The output will concatenate the xarray Datasets together and return a single xarray Dataset.
+ If you want to get more than one station per dataset, then you can still use the get_results. The output will concatenate the xarray Datasets together and return a single xarray Dataset. Since the get_results method is multithreaded when downloading results, passing multiple station ids to it will be much faster than using a "for loop" over each station id.

.. ipython:: python
@@ -168,19 +170,18 @@ If you want to get more than one station per dataset, then you can still use the
Saving to hdf5 files
~~~~~~~~~~~~~~~~~~~~
- Starting in version 4.5, Tethys can now save results directly to hdf5 files that can be opened by xarray. You must specify an output_path and optionally a compression for the hdf5 file (gzip is the default compression). There's no consern for excessive data volume in this process. You can download results from one station or all stations in a dataset to a single file without much trouble. It's recommended to save the file with the .h5 extension rather than the .nc extension to make it clear that it's a normal hdf5 file rather than a fully netcdf4-compliant file. Future versions might be formatted to be fully netcdf4-compliant...if I can figure out all of the nuances...any help is appreciated!
+ Starting in version 4.5, Tethys can now save results directly to hdf5 files that can be opened by xarray. You must specify an output_path and optionally a compression for the hdf5 file (lzf is the default compression). There's no concern for excessive data volume in this process. You can download results from one station or all stations in a dataset to a single file without much trouble. It's recommended to save the file with the .h5 extension rather than the .nc extension to make it clear that it's a normal hdf5 file rather than a fully netcdf4-compliant file. Future versions might be formatted to be fully netcdf4-compliant...if I can figure out all of the nuances...any help is appreciated! *Update using hdf5tools>=0.1.12*...I've managed to make the hdf5 file compatible with the python netcdf4 package. This means that files created by the tethysts package should be compatible with any python packages that read netcdf4 data (which of course includes xarray).

.. code:: python
results = ts.get_results(dataset_id, station_ids, output_path='/my/local/path/results.h5', compression='lzf')
- And if you'd like to reopen the hdf5 file with xarray later, then you need to set engine='h5netcdf' in the xr.open_dataset function.
+ And if you'd like to reopen the hdf5 file with xarray later, then you can use the xr.open_dataset function as normal (even with advanced compression...somehow...).

.. code:: python
- results = xr.open_dataset('/my/local/path/results.h5', engine='h5netcdf')
+ results = xr.open_dataset('/my/local/path/results.h5')
Selective filters
10 changes: 4 additions & 6 deletions tethysts/tests/utest_tethysts.py
@@ -13,6 +13,7 @@
import tethys_utils as tu
import numpy as np


pd.options.display.max_columns = 10

##############################################
@@ -563,14 +564,11 @@



# db = booklet.open('/media/nvme1/data/OLW/web_app/output/catch_lc.blt')

# keys = list(db.keys())







# db[9259625]



20 changes: 11 additions & 9 deletions tethysts/utils.py
@@ -153,12 +153,10 @@ def get_nearest_station(stns, geom_query):

geom1 = [shape(s['geometry']) for i, s in stns.items()]
strtree = STRtree(geom1)
- res = strtree.nearest(geom_query)
- res_id = res.wkb_hex
+ res_index = strtree.nearest(geom_query)

- stn_id_dict = {shape(s['geometry']).wkb_hex: i for i, s in stns.items()}
- stn_id = stn_id_dict[res_id]
+ stn_ids_list = list(stns.keys())
+ stn_id = stn_ids_list[res_index]

return stn_id

@@ -170,14 +168,18 @@ def get_intersected_stations(stns, geom_query):
if isinstance(geom_query, dict):
geom_query = shape(geom_query)

+ stn_ids_list = list(stns.keys())
geom1 = [shape(s['geometry']) for i, s in stns.items()]
strtree = STRtree(geom1)
- res = strtree.query(geom_query)
- res_ids = [r.wkb_hex for r in res]
+ res_index = strtree.query(geom_query)
+ stn_ids = [stn_ids_list[r] for r in res_index]

+ # res_ids = [r.wkb_hex for r in res]

- stn_id_dict = {shape(s['geometry']).wkb_hex: i for i, s in stns.items()}
+ # stn_id_dict = {shape(s['geometry']).wkb_hex: i for i, s in stns.items()}

- stn_ids = [stn_id_dict[r] for r in res_ids]
+ # stn_ids = [stn_id_dict[r] for r in res_ids]

return stn_ids
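The shapely >= 2 STRtree behaviour these changes rely on, where `nearest` and `query` return integer indices into the input list rather than geometries, can be sketched as follows (assuming shapely >= 2 is installed; the sample points are illustrative):

```python
from shapely.geometry import Point, box
from shapely.strtree import STRtree

pts = [Point(0, 0), Point(5, 5), Point(10, 10)]
tree = STRtree(pts)

# shapely >= 2: nearest() returns an index into the input list, not a geometry
i = tree.nearest(Point(6, 6))
nearest_pt = pts[int(i)]

# query() likewise returns an array of indices whose extents intersect the query
idx = tree.query(box(-1, -1, 6, 6))
hit_pts = [pts[int(j)] for j in idx]
```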

