In [1]:
import datetime
import geopandas as gpd
import pandas as pd
import xarray as xr
import numpy as np
import xvec
import dask
from shapely.geometry import Polygon

In [2]:
# max_line_length = 88
# file_path = './accessor.py'

# with open(file_path, 'r') as file:
#     for line_number, line in enumerate(file, start=1):
#         if len(line) > max_line_length:
#             print(f"Line {line_number}: {line.strip()}")

### Create a sample dataset and a set of geometries

In [3]:

# Synthetic weather dataset: 2 variables on a 20 x 20 grid with 3 daily time steps.
np.random.seed(0)

# Draw order is kept (temperature, then precipitation) so the seeded values
# are reproducible.
temperature = 15 + 8 * np.random.randn(20, 20, 3)
precipitation = 15 + 10 * np.random.randn(20, 20, 3)

# Coordinate axes: "x" carries longitudes, "y" carries latitudes.
lat = np.linspace(30, 40, 20)
lon = np.linspace(10, 20, 20)
time = pd.date_range("2014-09-06", periods=3)
reference_time = pd.Timestamp("2014-09-05")

ds = xr.Dataset(
    data_vars={
        "temperature": (["x", "y", "time"], temperature),
        "precipitation": (["x", "y", "time"], precipitation),
    },
    coords={
        "x": lon,
        "y": lat,
        "time": time,
        "reference_time": reference_time,
    },
    attrs={"description": "Weather related data."},
)
ds

In [4]:
# Create random four-vertex polygons inside the bounding box of `ds`.
# `Polygon` comes from the import cell at the top of the notebook.
num_polygons = 2  # Adjust the number of polygons as needed

# Read the grid extent once; in `ds` the "x" axis holds longitudes and "y" latitudes.
x_min, x_max = float(ds.x.min()), float(ds.x.max())
y_min, y_max = float(ds.y.min()), float(ds.y.max())

polygons = []
for _ in range(num_polygons):
    # Draw x before y so the seeded random stream produces the same vertices.
    # Local names are used so the `lon`/`lat` coordinate arrays are not overwritten.
    vertex_x = np.random.uniform(x_min, x_max, 4)
    vertex_y = np.random.uniform(y_min, y_max, 4)
    # NOTE(review): four random vertices can form a self-intersecting ring;
    # confirm that is acceptable for this demo.
    polygons.append(Polygon(zip(vertex_x, vertex_y)))

# Attach the CRS at construction instead of assigning to `.crs` afterwards.
geoseries = gpd.GeoSeries(polygons, crs="EPSG:4326")
gdf = gpd.GeoDataFrame(geometry=geoseries)

polys = gdf.geometry.values

In [5]:
polys

<GeometryArray>
[<POLYGON ((10.554 38.441, 16.809 37.938, 19.233 36.647, 14.93 39.782, 10.554...>, <POLYGON ((18.022 33.995, 12.748 30.768, 17.89 32.144, 16.927 37.972, 18.022...>]
Length: 2, dtype: geometry

In [49]:


# Scratch version of the accessor's dimension renaming, run against the
# notebook-level dataset `ds`. Inside the accessor this logic reads `self._obj`,
# but `self` does not exist at notebook top level.
possible_dim_names = {"lat": "y", "lon": "x"}

# Keep only the renames that are needed and possible, so data that already
# uses lat/lon is left alone.
renames = {
    source: target
    for target, source in possible_dim_names.items()
    if target not in ds.sizes and source in ds.sizes
}

# Bind the result to a new name so `ds` keeps its x/y dimensions for later cells.
ds_lonlat = ds.rename(renames) if renames else ds
    
    

In [None]:
    def rename_dims(self):
        """Rename the spatial dimensions to ``lon``/``lat`` as the package requires.

        Only ``(x, y)`` -> ``(lon, lat)`` is supported for now. A target
        dimension that is already present is left untouched.

        Returns
        -------
        dataset
            ``self._obj`` with dimensions named ``lon`` and ``lat``. The
            renamed object is also stored back on ``self._obj``.

        Raises
        ------
        ValueError
            If a target dimension is absent and its alternative name
            (``x`` for ``lon``, ``y`` for ``lat``) is absent as well.

        """
        possible_dim_names = {"lat": "y", "lon": "x"}
        dims = self._obj.sizes

        # Resolve every rename before touching the object, so a failure
        # cannot leave it half-renamed.
        renames = {}
        for target, source in possible_dim_names.items():
            if target in dims:
                continue
            if source not in dims:
                raise ValueError(
                    f"Cannot find a '{target}' dimension: expected '{target}' "
                    f"or '{source}', got {list(dims)}."
                )
            renames[source] = target

        # Skip the call entirely when the names are already correct.
        if renames:
            self._obj = self._obj.rename(renames)

        return self._obj

In [5]:
ds

In [18]:
# Candidate names for the latitude-like dimension; each one maps to the renames
# that turn that naming scheme into lat/lon.
var_cipher = {
    "latitude": {"latitude": "lat", "longitude": "lon"},
    "Latitude": {"Latitude": "lat", "Longitude": "lon"},
    "Lat": {"Lat": "lat", "Lon": "lon"},
    "latitude_1": {"latitude_1": "lat", "longitude_1": "lon"},
    "nav_lat": {"nav_lat": "lat", "nav_lon": "lon"},
    "Y": {"Y": "lat", "X": "lon"},
    "y": {"y": "lat", "x": "lon"},
}

# Which of the candidate names actually occur among the dataset's dimensions?
test_dims = [candidate for candidate in var_cipher if candidate in ds.sizes]
test_dims

['y']

In [None]:
   def zonal_stats(
        self,
        polygons,
        stat,
   ):
        """Aggregate the wrapped object over ``polygons`` using ``stat``.

        The spatial dimensions are first normalised with the accessor's own
        ``rename_dims`` method, then the work is delegated to
        ``self._obj.tra.sagg``.

        Parameters
        ----------
        polygons
            Geometries to aggregate over; passed through to ``sagg`` unchanged.
        stat
            Statistic to compute; passed to ``sagg`` as ``stat=``.

        Returns
        -------
        Whatever ``self._obj.tra.sagg(polygons, stat=stat)`` returns.

        """
        # Call the accessor's rename_dims, not ``self._obj.rename_dims()``: on a
        # Dataset that resolves to xarray's own ``rename_dims``, which renames
        # nothing when called without arguments.
        self._obj = self.rename_dims()

        # NOTE(review): the ``tra`` accessor is not defined in this notebook;
        # confirm it is registered before this method is used.
        return self._obj.tra.sagg(polygons, stat=stat)

### Extract values from a dataset indexed by a set of geometries

In [6]:
# Small in-memory input: aggregate without dask (mean per polygon).
extracted = ds.xvec.zonal_stats(
    polys,
    stat="mean",
    dask=False,
    n_jobs=-1,
)
extracted

y
x


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]


In [53]:
# Small in-memory input: aggregate without dask (sum per polygon).
extracted = ds.xvec.zonal_stats(
    polys,
    stat="sum",
    dask=False,
    n_jobs=-1,
)
extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.65it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.64it/s]


In [6]:
# Chunk the grid into 4 x 4 blocks, then aggregate with dask=True (mean per polygon).
ds1 = ds.chunk({"x": 4, "y": 4})

extracted = ds1.xvec.zonal_stats(
    polys,
    stat="mean",
    dask=True,
    n_jobs=-1,
)
extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.56it/s]


In [7]:
# Chunk the grid into 4 x 4 blocks, then aggregate with dask=True (sum per polygon).
ds1 = ds.chunk({"x": 4, "y": 4})

extracted = ds1.xvec.zonal_stats(
    polys,
    stat="sum",
    dask=True,
    n_jobs=-1,
)
extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.46it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.52it/s]


### Testing

In [1]:
from geodatasets import get_path
import datetime
import geopandas as gpd
import pandas as pd
import xarray as xr
import numpy as np
import xvec
import dask
from shapely.geometry import Polygon

In [2]:
# Read the "geoda.natregimes" sample file (located via geodatasets.get_path).
counties = gpd.read_file(get_path("geoda.natregimes"))


In [3]:
# Geometry values of the counties; sliced and passed to zonal_stats in later cells.
polys = counties.geometry.values

In [5]:
# "eraint_uvz" tutorial data: the spatial coordinates are passed by name as
# "longitude" / "latitude".
# NOTE(review): the grid may use 0-360 longitudes while the county polygons use
# -180..180; confirm the first two polygons actually overlap the grid.
ds = xr.tutorial.open_dataset("eraint_uvz")
extracted = ds.xvec.zonal_stats(
    polys[:2],
    x_coords="longitude",
    y_coords="latitude",
    stat="mean",
    n_jobs=-1,
)

extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.16it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.83it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.71it/s]


In [6]:
# "air_temperature" tutorial data: the spatial coordinates are passed by name as
# "lon" / "lat".
# NOTE(review): the grid may use 0-360 longitudes while the county polygons use
# -180..180; confirm the first two polygons actually overlap the grid.
ds = xr.tutorial.open_dataset("air_temperature")
extracted = ds.xvec.zonal_stats(
    polys[:2],
    x_coords="lon",
    y_coords="lat",
    stat="mean",
    n_jobs=-1,
)

extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.67it/s]


### Pytest

In [9]:
import geopandas as gpd
import numpy as np
import pandas as pd
import pytest
import shapely
import xarray as xr
from geopandas.testing import assert_geodataframe_equal
from pandas.testing import assert_frame_equal

import xvec  # noqa
from xvec import GeometryIndex

In [10]:
from shapely.geometry import Polygon

In [11]:
def test_aggregate_raster_cubes():
    """Check ``zonal_stats`` with ``stat="sum"`` on an all-zero raster cube.

    The result must be identical to a dataset of zeros indexed by the two
    polygons and the five time steps.
    """
    times = pd.date_range("2023-01-01", periods=5)

    # Input: a 10 x 10 x 5 cube of zeros wrapped in a one-variable dataset.
    cube = xr.DataArray(
        np.zeros((10, 10, 5)),
        coords={"x": range(10), "y": range(20, 30), "time": times},
    ).to_dataset(name="test")

    # Two rectangles given by their corner rings.
    rings = [
        [(1, 22), (4, 22), (4, 26), (1, 26)],
        [(6, 22), (9, 22), (9, 26), (6, 26)],
    ]
    polygons = gpd.GeoSeries(
        [shapely.geometry.Polygon(ring) for ring in rings],
        crs="EPSG:4326",
    )

    # Expected: zeros per (geometry, time), with a geometry index on "geometry".
    expected = (
        xr.DataArray(
            np.zeros((2, 5)),
            coords={"geometry": polygons, "time": times},
        )
        .xvec.set_geom_indexes("geometry", crs="EPSG:4326")
        .to_dataset(name="test")
        .set_coords("geometry")
    )

    actual = cube.xvec.zonal_stats(polygons, "x", "y", stat="sum", dask=False)

    xr.testing.assert_identical(actual, expected)

In [12]:
test_aggregate_raster_cubes()

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.68it/s]
