In [1]:
import datetime
import geopandas as gpd
import pandas as pd
import xarray as xr
import numpy as np
import xvec
import dask
from shapely.geometry import Polygon

In [2]:
# max_line_length = 88
# file_path = './accessor.py'

# with open(file_path, 'r') as file:
#     for line_number, line in enumerate(file, start=1):
#         if len(line) > max_line_length:
#             print(f"Line {line_number}: {line.strip()}")

### Create Sample Dask Dataset & set of geometries 

In [3]:

# Create a dataset with 2 variables and 3 time steps 
np.random.seed(0)

temperature = 15 + 8 * np.random.randn(20, 20, 3)
precipitation = 15 + 10 * np.random.randn(20, 20,3)
lat = np.linspace(30,40,20)
lon = np.linspace(10,20,20)



time = pd.date_range("2014-09-06", periods=3)
reference_time = pd.Timestamp("2014-09-05")


ds = xr.Dataset(
    data_vars=dict(
        temperature=(["x", "y", "time"], temperature),
        precipitation=(["x", "y", "time"], precipitation),
    ),
    coords=dict(
        x=lon,
        y=lat,
        time=time,
        reference_time=reference_time,
    ),
    attrs=dict(description="Weather related data."),
)
ds

In [4]:
# Create geometries over the dataset

from shapely.geometry import Polygon
num_polygons = 2  # Adjust the number of polygons as needed
polygons = []

for _ in range(num_polygons):
    # Generate random polygon coordinates within the bounding box of the downsampled dataset
    lon = np.random.uniform(ds.x.min(), ds.x.max(), 4)
    lat = np.random.uniform(ds.y.min(), ds.y.max(), 4)
    polygons.append(Polygon(zip(lon, lat)))


geoseries = gpd.GeoSeries(polygons)
gdf = gpd.GeoDataFrame(geometry=geoseries)

gdf = gdf.set_geometry('geometry')
gdf.crs = 'EPSG:4326'

polys = gdf.geometry.values

In [5]:
polys

<GeometryArray>
[<POLYGON ((10.554 38.441, 16.809 37.938, 19.233 36.647, 14.93 39.782, 10.554...>, <POLYGON ((18.022 33.995, 12.748 30.768, 17.89 32.144, 16.927 37.972, 18.022...>]
Length: 2, dtype: geometry

In [49]:


possible_dim_names = {'lat':'y',
                      'lon':'x'}

if 'lat' not in self._obj.sizes.keys():
    dim_name = possible_dim_names['lat']
    self._obj = self._obj.rename({dim_name: 'lat'})

    
if 'lon' not in self._obj.sizes.keys():
    dim_name = possible_dim_names['lon']
    self._obj = self._obj.rename({dim_name: 'lon'})
    
    

In [None]:
    def rename_dims(self):
        """Rename the dimension to lon, lat to be consistent ith the package requirment.
        Now it support rename (x, y) to (lon, lat)

        Returns
        -------
        dataset
            Dataset with renamed dimension (lon, lat).

        """
        
        
        possible_dim_names = {'lat':'y',
                              'lon':'x'}

        if 'lat' not in self._obj.sizes.keys():
            dim_name = possible_dim_names['lat']
            self._obj = self._obj.rename({dim_name: 'lat'})


        if 'lon' not in self._obj.sizes.keys():
            dim_name = possible_dim_names['lon']
            self._obj = self._obj.rename({dim_name: 'lon'})
        
        
        return self._obj

In [5]:
ds

In [18]:
var_cipher = {'latitude':{'latitude':'lat','longitude':'lon'},
              'Latitude':{'Latitude':'lat','Longitude':'lon'},
              'Lat':{'Lat':'lat','Lon':'lon'},
              'latitude_1':{'latitude_1':'lat','longitude_1':'lon'},
              'nav_lat':{'nav_lat':'lat','nav_lon':'lon'},
              'Y':{'Y':'lat','X':'lon'},
              'y':{'y':'lat','x':'lon'}}

test_dims = [k for k in var_cipher.keys() if k in ds.sizes.keys()]
test_dims

['y']

In [None]:
   def zonal_stats(
        self,
        polygons,
        stat,

    ):
   
        self._obj = self._obj.rename_dims()
        ss = self._obj.tra.sagg(
            polygons, stat=stat
        )

        return ss

### Extract values from a dataset indexed by a set of geometries

In [6]:
# In case the input dataset is small and does not need dask
extracted = ds.xvec.zonal_stats(polys, stat="mean", dask = False, n_jobs = -1)
extracted

y
x


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]


In [53]:
# In case the input dataset is small and does not need dask
extracted = ds.xvec.zonal_stats(polys, stat="sum", dask = False, n_jobs = -1)
extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.65it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.64it/s]


In [6]:
ds1 = ds.chunk(dict(x=4,y=4))

extracted = ds1.xvec.zonal_stats(polys, stat="mean", dask = True, n_jobs = -1)
extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.56it/s]


In [7]:
ds1 = ds.chunk(dict(x=4,y=4))

extracted = ds1.xvec.zonal_stats(polys, stat="sum", dask = True, n_jobs = -1)
extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.46it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.52it/s]


### Testing

In [1]:
from geodatasets import get_path
import datetime
import geopandas as gpd
import pandas as pd
import xarray as xr
import numpy as np
import xvec
import dask
from shapely.geometry import Polygon

In [2]:
counties = gpd.read_file(get_path("geoda.natregimes"))


In [3]:
polys = counties.geometry.values

In [108]:
ds = xr.tutorial.open_dataset("eraint_uvz")
#ds = xr.tutorial.open_dataset("air_temperature")

ds

In [5]:
#extracted = ds.xvec.zonal_stats(polys[:2], x_coords='longitude',y_coords='latitude', stat="mean", n_jobs = -1)
extracted = ds.xvec.zonal_stats(polys[:2], x_coords='lon',y_coords='lat', stat="mean", n_jobs = -1)

extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.25it/s]


['time']
{'geometry': 0    POLYGON ((-95.34258 48.54670, -95.34081 48.715...
1    POLYGON ((-118.85050 47.94969, -118.84732 48.4...
Name: geometry, dtype: geometry, 'time': [numpy.datetime64('2013-01-01T00:00:00.000000000'), numpy.datetime64('2013-01-01T06:00:00.000000000'), numpy.datetime64('2013-01-01T12:00:00.000000000'), numpy.datetime64('2013-01-01T18:00:00.000000000'), numpy.datetime64('2013-01-02T00:00:00.000000000'), numpy.datetime64('2013-01-02T06:00:00.000000000'), numpy.datetime64('2013-01-02T12:00:00.000000000'), numpy.datetime64('2013-01-02T18:00:00.000000000'), numpy.datetime64('2013-01-03T00:00:00.000000000'), numpy.datetime64('2013-01-03T06:00:00.000000000'), numpy.datetime64('2013-01-03T12:00:00.000000000'), numpy.datetime64('2013-01-03T18:00:00.000000000'), numpy.datetime64('2013-01-04T00:00:00.000000000'), numpy.datetime64('2013-01-04T06:00:00.000000000'), numpy.datetime64('2013-01-04T12:00:00.000000000'), numpy.datetime64('2013-01-04T18:00:00.000000000'), numpy.date

TypeError: Variable 'air': Could not convert tuple of form (dims, data[, attrs, encoding]): (None,         air1        air2        air3        air4        air5        air6  \
0  274.16626  273.520203  273.233521  273.637146  273.751831  272.696838   
1  274.16626  273.520203  273.233521  273.637146  273.751831  272.696838   

         air7        air8        air9       air10  ...     air2911  \
0  272.672607  273.060486  273.737061  272.893738  ...  274.046661   
1  272.672607  273.060486  273.737061  272.893738  ...  274.046661   

      air2912     air2913     air2914     air2915     air2916     air2917  \
0  274.583679  274.690338  273.858917  273.800659  273.826477  273.917023   
1  274.583679  274.690338  273.858917  273.800659  273.826477  273.917023   

      air2918     air2919     air2920  
0  272.999939  273.044495  273.414978  
1  272.999939  273.044495  273.414978  

[2 rows x 2920 columns]) to Variable.

In [109]:
geometries = polys[:2]
chunk_size = 2
n_jobs = -1
x_coords='lon'
y_coords='lat'
stat="mean"

In [17]:
try:
    import geopandas as gpd
except ImportError as err:
    raise ImportError(
        "The geopandas package is required for `xvec._spatial_agg()`. "
        "You can install it using 'conda install -c conda-forge geopandas' or "
        "'pip install geopandas'."
    ) from err

try:
    import rioxarray  # noqa
except ImportError as err:
    raise ImportError(
        "The rioxarray package is required for `xvec._spatial_agg()`. "
        "You can install it using 'conda install -c conda-forge rioxarray' or "
        "'pip install rioxarray'."
    ) from err

try:
    from joblib import Parallel, delayed
except ImportError as err:
    raise ImportError(
        "The joblib package is required for `xvec._spatial_agg()`. "
        "You can install it using 'conda install -c conda-forge joblib' or "
        "'pip install joblib'."
    ) from err

try:
    from tqdm import tqdm
except ImportError as err:
    raise ImportError(
        "The tqdm package is required for `xvec._spatial_agg()`. "
        "You can install it using 'conda install -c conda-forge tqdm' or "
        "'pip install tqdm'."
    ) from err

import gc

transform = ds.rio.transform()
geometry_chunks = [
    geometries[i : i + chunk_size]
    for i in range(0, len(geometries), chunk_size)
]

stats_dic = {}
for var in ds.data_vars:
    stats_dic[var] = []

    computed_results = []
    for chunk in tqdm(geometry_chunks):
        # Create a list of delayed objects for the current chunk
        chunk_results = Parallel(n_jobs=n_jobs)(
            delayed(_agg_geom)(
                geom,
                transform,
                var,
                x_coords,
                y_coords,
                stat=stat,
            )
            for geom in chunk
        )
        computed_results.extend(chunk_results)
    stats_dic[var] = computed_results

    # Clean the space
    gc.collect()

  0%|                                                     | 0/1 [00:00<?, ?it/s]


ValueError: None is not in list

In [16]:
def _agg_geom(
    self,
    geom,
    trans,
    var: str,
    x_coords: str = None,
    y_coords: str = None,
    stat: str = "mean",
):
    """Aggregate the values from a dataset over a polygon geometry.

    The CRS of the raster and that of points need to be in wgs84.
    Xvec does not verify their equality.

    Parameters
    ----------
    geom : Polygon[shapely.Geometry]
        An arrray-like (1-D) of shapely geometry, like a numpy array or GeoPandas
        GeoSeries.
    trans : affine.Affine
        Affine transformer.
        Representing the geometric transformation applied to the data.
    x_coords : Hashable
        Name of the axis containing ``x`` coordinates.
    y_coords : Hashable
        Name of the axis containing ``y`` coordinates.
    var : Hashable
        Name of the variable in the dataset to aggregate its values.
    stat : Hashable
        Spatial aggregation statistic method, by default "mean". It supports the
        following statistcs: ['mean', 'median', 'min', 'max', 'sum']

    Returns
    -------
    Array
        Aggregated values over the geometry.

    """
    try:
        import rasterio
    except ImportError as err:
        raise ImportError(
            "The rasterio package is required for `xvec._agg_geom()`. "
            "You can install it using 'conda install -c conda-forge rasterio' or "
            "'pip install rasterio'."
        ) from err

    import gc

    xar_chunk = ds[var]
    data_dims = list(xar_chunk.dims)

    # Index of x_coords & y_coords
    y_dim = data_dims.index(y_coords)
    x_dim = data_dims.index(x_coords)

    # Sizes of x_coords & y_coords
    y_dim_size = xar_chunk.shape[y_dim]
    x_dim_size = xar_chunk.shape[x_dim]

    if x_dim < y_dim:
        mask = rasterio.features.geometry_mask(
            [geom],
            out_shape=(x_dim_size, y_dim_size),
            transform=trans,
        )
    else:
        mask = rasterio.features.geometry_mask(
            [geom],
            out_shape=(y_dim_size, x_dim_size),
            transform=trans,
        )


    diff_axes = [idx for idx, dim in enumerate(data_dims) if dim not in [x_coords, y_coords]]
    mask = np.expand_dims(mask, axis=diff_axes)
    masked_data = xar_chunk * mask

    del mask, xar_chunk; gc.collect()

    if stat == "sum":
        stat_within_polygons = masked_data.sum(dim=[y_coords, x_coords])
    elif stat == "mean":
        stat_within_polygons = masked_data.mean(dim=[y_coords, x_coords])
    elif stat == "median":
        stat_within_polygons = masked_data.median(dim=[y_coords, x_coords])
    elif stat == "max":
        stat_within_polygons = masked_data.max(dim=[y_coords, x_coords])
    elif stat == "min":
        stat_within_polygons = masked_data.min(dim=[y_coords, x_coords])

    result = stat_within_polygons.values

    del masked_data, stat_within_polygons; gc.collect()

    return result


In [139]:
ds

In [140]:
var = 'z'
xar_chunk = ds[var]

In [141]:
data_dims = list(xar_chunk.dims)
data_dims

['month', 'level', 'latitude', 'longitude']

In [142]:
data_dims1 = list(ds.dims)
data_dims1

['longitude', 'latitude', 'level', 'month']

In [290]:
# Index of x_coords & y_coords
x_coords='longitude'
y_coords='latitude'


In [291]:
import rasterio
trans = ds.rio.transform()
stat = "mean"

In [324]:
data_vars = []
all_variables_results = []
for var in ds.data_vars:
    data_vars.append(var)
    computed_results = []
    for p in polys[:2]:
        xar_chunk = ds[var]
        data_dims = list(xar_chunk.dims)

        y_dim = data_dims.index(y_coords)
        x_dim = data_dims.index(x_coords)
        # Sizes of x_coords & y_coords
        y_dim_size = xar_chunk.shape[y_dim]
        x_dim_size = xar_chunk.shape[x_dim]


        if x_dim < y_dim:
            mask = rasterio.features.geometry_mask(
            [p],
            out_shape=(x_dim_size, y_dim_size),
            transform=trans,
        )
        else:
            mask = rasterio.features.geometry_mask(
                [p],
                out_shape=(y_dim_size, x_dim_size),
                transform=trans,
            )



        diff_axes = [idx for idx, dim in enumerate(data_dims) if dim not in [x_coords, y_coords]]
        mask = np.expand_dims(mask, axis=diff_axes)
        masked_data = xar_chunk * mask


        if stat == "sum":
            stat_within_polygons = masked_data.sum(dim=[y_coords, x_coords])
        elif stat == "mean":
            stat_within_polygons = masked_data.mean(dim=[y_coords, x_coords])
        elif stat == "median":
            stat_within_polygons = masked_data.median(dim=[y_coords, x_coords])
        elif stat == "max":
            stat_within_polygons = masked_data.max(dim=[y_coords, x_coords])
        elif stat == "min":
            stat_within_polygons = masked_data.min(dim=[y_coords, x_coords])

        result = stat_within_polygons.values

        computed_results.append(result)
        
    computed_results = np.stack(computed_results, axis=-1)
    all_variables_results.append(computed_results)
    

In [326]:
all_variables_results = np.stack(all_variables_results, axis=0)

In [327]:
all_variables_results.shape

(3, 2, 3, 2)

In [341]:
dims = list(xar_chunk.dims)
dims

['month', 'level', 'latitude', 'longitude']

In [342]:
dims = list(ds[var].dims)
dims = [dim for dim in data_dims if dim not in [x_coords, y_coords]]

In [343]:
#data_dims.append(name)
dims

['month', 'level']

In [330]:
data_vars

['z', 'u', 'v']

In [344]:
coords = {"data_variables":data_vars}
for idx, dim in enumerate(dims):
    dim_values = list(ds[dim].values)
    coords[dim] = dim_values
        
coords[name] = polys[:2]

In [345]:
coords

{'data_variables': ['z', 'u', 'v'],
 'month': [1, 7],
 'level': [200, 500, 850],
 'geometry': <GeometryArray>
 [<POLYGON ((-95.343 48.547, -95.341 48.715, -95.094 48.717, -95.095 48.912, -...>, <POLYGON ((-118.851 47.95, -118.847 48.478, -118.869 48.479, -118.87 48.647,...>]
 Length: 2, dtype: geometry}

In [333]:
all_variables_results.shape

(3, 2, 3, 2)

In [346]:
polys[:2].crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [334]:
vec_cube = xr.DataArray(
    data=all_variables_results, coords=coords,
    #dims = ['month', 'level'],
).xvec.set_geom_indexes(name, crs=df.crs)

In [335]:
vec_cube

In [338]:
vec_cube.to_dataset(dim="data_variables")

In [173]:
data_dims = list(ds[var].dims)
data_dims = [dim for dim in data_dims if dim not in [x_coords, y_coords]]

In [174]:
data_dims

['month', 'level']

In [196]:
data_dims_dic = {}
for idx, dim in enumerate(data_dims):
    data_dims_dic[dim] = len(ds[dim])

In [197]:
data_dims_dic

{'month': 2, 'level': 3}

In [213]:
data_dims = ['month', 'level']
data_dims = {'month': 2, 'level': 3}


for k in stats_dic.keys():
    s = stats_dic[k]
    col = f"{k}"
    columns1 = []
    columns2 = []
    for idx, (dim, dim_len) in enumerate(data_dims.items()):
        for d in range(dim_len):
            if idx == 0:
                columns1.append(f"{k}_{dim}_{d}")
            else:
                for col in columns1:
                    col = f"{col}_{dim}_{d}"
                    columns2.append(col)

In [214]:
columns1

['z_month_0', 'z_month_1']

In [215]:
columns2

['z_month_0_level_0',
 'z_month_1_level_0',
 'z_month_0_level_1',
 'z_month_1_level_1',
 'z_month_0_level_2',
 'z_month_1_level_2']

In [216]:
columns = ['z_month_0_level_0',
 'z_month_0_level_1',
 'z_month_0_level_2',
 'z_month_1_level_0',
 'z_month_1_level_1',
 'z_month_1_level_2']

In [191]:
data_dims = ['month', 'level']
for k in stats_dic.keys():
    s = stats_dic[k]
    columns = []
    col = f"{k}"
    for idx, dim in enumerate(data_dims):
        col = col + f"_{dim}"
        for d in range(len(ds[dim])):
            col = col + f"_{d}"
            print(col)
            
        

z_month_0
z_month_0_1
z_month_0_1_level_0
z_month_0_1_level_0_1
z_month_0_1_level_0_1_2


In [175]:
columns = []
for idx, dim in enumerate(data_dims):
    for d in range(len(ds[dim])):
        columns.append(f"{k}_{dim}{d+1}")

In [None]:
        for k in stats_dic.keys():
            s = stats_dic[k]
            columns = []
            for idx, dim in enumerate(data_dims):
                for d in range(len(self._obj[dim])):
                    columns.append(f"{k}_{dim}{d+1}")

In [181]:
len(data_dims)

2

In [178]:
keys_items = {}
for k in stats_dic.keys():
    s = stats_dic[k]
    #columns = []
    column = [f"k_{m}_{level}" for month in len(ds[dim]) for level in levels]

In [None]:
column_list = [f"{month}_{level}" for month in months for level in levels]


In [176]:
columns

['air_month1', 'air_month2', 'air_level1', 'air_level2', 'air_level3']

In [76]:
keys_items = {}

In [77]:
keys_items[k] = columns

In [78]:
s.shape

(2920,)

In [79]:
len(columns)

2920

In [218]:
df = pd.DataFrame()

In [220]:
df_k = pd.DataFrame([s], columns=columns)

In [221]:
df = pd.concat([df, df_k], axis=1)

In [222]:
df

Unnamed: 0,z_month_0_level_0,z_month_0_level_1,z_month_0_level_2,z_month_1_level_0,z_month_1_level_1,z_month_1_level_2
0,115063.722708,53882.101985,13675.31061,116059.911367,54557.304249,13837.991869


In [223]:
df = gpd.GeoDataFrame(df, geometry=geometries[:1])

In [224]:
df

Unnamed: 0,z_month_0_level_0,z_month_0_level_1,z_month_0_level_2,z_month_1_level_0,z_month_1_level_1,z_month_1_level_2,geometry
0,115063.722708,53882.101985,13675.31061,116059.911367,54557.304249,13837.991869,"POLYGON ((-95.34258 48.54670, -95.34081 48.715..."


In [225]:
name = "geometry"

In [226]:
coords = {name: df.geometry}

In [227]:
for idx, dim in enumerate(data_dims):
    dim_values = list(ds[dim].values)
    coords[dim] = dim_values

In [229]:
data_dims = ['month', 'level']

In [234]:
keys_items[k] = columns

In [235]:
keys_items

{'z': ['z_month_0_level_0',
  'z_month_0_level_1',
  'z_month_0_level_2',
  'z_month_1_level_0',
  'z_month_1_level_1',
  'z_month_1_level_2']}

In [236]:
data_vars = {}
dims = [name]
dims.extend(data_dims)
for key in keys_items.keys():
    data_vars[key] = (dims, df[keys_items[key]])

In [238]:
## Create VectorCube
vec_cube = xr.Dataset(
    data_vars=data_vars, coords=coords
).xvec.set_geom_indexes(name, crs=df.crs)

ValueError: Variable 'z': Could not convert tuple of form (dims, data[, attrs, encoding]): (['geometry', 'month', 'level'],    z_month_0_level_0  z_month_0_level_1  z_month_0_level_2  z_month_1_level_0  \
0      115063.722708       53882.101985        13675.31061      116059.911367   

   z_month_1_level_1  z_month_1_level_2  
0       54557.304249       13837.991869  ) to Variable.

In [107]:
vec_cube

In [72]:
data_dims = list(ds.dims)
data_dims = [dim for dim in data_dims if dim not in [x_coords, y_coords]]
print(data_dims)
df = pd.DataFrame()
keys_items = {}
for k in stats_dic.keys():
    s = stats_dic[k]
    columns = []
    for idx, dim in enumerate(data_dims):
        for d in range(len(ds[dim])):
            columns.append(f"{k}_{dim}{d+1}")
        keys_items[k] = columns
        # Create a new DataFrame with the current data and columns
        df_k = pd.DataFrame([s], columns=columns)
        # Concatenate the new DataFrame with the existing DataFrame
        df = pd.concat([df, df_k], axis=1)

df = gpd.GeoDataFrame(df, geometry=geometries)

['time']


ValueError: Length of values (2) does not match length of index (1)

In [25]:
xar_chunk

In [6]:
??xr.tutorial.open_dataset

[0;31mSignature:[0m
[0mxr[0m[0;34m.[0m[0mtutorial[0m[0;34m.[0m[0mopen_dataset[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0;34m'str'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_dir[0m[0;34m:[0m [0;34m'None | str | os.PathLike'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mengine[0m[0;34m:[0m [0;34m'T_Engine'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkws[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'Dataset'[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mopen_dataset[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;3

In [7]:
data_dims = list(ds.dims)

In [5]:
#extracted = ds.xvec.zonal_stats(polys[:2], x_coords='longitude',y_coords='latitude', stat="mean", n_jobs = -1)
extracted = ds.xvec.zonal_stats(polys[:2], x_coords='lon',y_coords='lat', stat="mean", n_jobs = -1)

extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.19it/s]


2
[274.16626 273.5202  273.23352 ... 272.99994 273.0445  273.41498]
['air1', 'air2', 'air3', 'air4', 'air5', 'air6', 'air7', 'air8', 'air9', 'air10', 'air11', 'air12', 'air13', 'air14', 'air15', 'air16', 'air17', 'air18', 'air19', 'air20', 'air21', 'air22', 'air23', 'air24', 'air25', 'air26', 'air27', 'air28', 'air29', 'air30', 'air31', 'air32', 'air33', 'air34', 'air35', 'air36', 'air37', 'air38', 'air39', 'air40', 'air41', 'air42', 'air43', 'air44', 'air45', 'air46', 'air47', 'air48', 'air49', 'air50', 'air51', 'air52', 'air53', 'air54', 'air55', 'air56', 'air57', 'air58', 'air59', 'air60', 'air61', 'air62', 'air63', 'air64', 'air65', 'air66', 'air67', 'air68', 'air69', 'air70', 'air71', 'air72', 'air73', 'air74', 'air75', 'air76', 'air77', 'air78', 'air79', 'air80', 'air81', 'air82', 'air83', 'air84', 'air85', 'air86', 'air87', 'air88', 'air89', 'air90', 'air91', 'air92', 'air93', 'air94', 'air95', 'air96', 'air97', 'air98', 'air99', 'air100', 'air101', 'air102', 'air103', 'air104',

TypeError: unhashable type: 'dict'

In [13]:
ds2 = ds.chunk(dict(lon=4,lat=4))
extracted = ds2.xvec.zonal_stats(polys[:2], x_axis=2, y_axis=1, stat="sum", dask = True, n_jobs = -1)
extracted

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]


In [7]:
ds

Unnamed: 0,Array,Chunk
Bytes,14.76 MiB,182.50 kiB
Shape,"(2920, 25, 53)","(2920, 4, 4)"
Dask graph,98 chunks in 2 graph layers,98 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 14.76 MiB 182.50 kiB Shape (2920, 25, 53) (2920, 4, 4) Dask graph 98 chunks in 2 graph layers Data type float32 numpy.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,14.76 MiB,182.50 kiB
Shape,"(2920, 25, 53)","(2920, 4, 4)"
Dask graph,98 chunks in 2 graph layers,98 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### Pytest

In [9]:
import geopandas as gpd
import numpy as np
import pandas as pd
import pytest
import shapely
import xarray as xr
from geopandas.testing import assert_geodataframe_equal
from pandas.testing import assert_frame_equal

import xvec  # noqa
from xvec import GeometryIndex

In [10]:
from shapely.geometry import Polygon

In [11]:
def test_aggregate_raster_cubes():
    #### Test spatial aggregation using geometries - sum aggregation ####
    # Create the dataset
    da = xr.DataArray(
        np.zeros((10, 10, 5)),
        coords={
            "x": range(10),
            "y": range(20, 30),
            "time": pd.date_range("2023-01-01", periods=5),
        },
    )
    da = da.to_dataset(name="test")

    # Create the polygons
    polygon1 = shapely.geometry.Polygon([(1, 22), (4, 22), (4, 26), (1, 26)])
    polygon2 = shapely.geometry.Polygon([(6, 22), (9, 22), (9, 26), (6, 26)])
    polygons = gpd.GeoSeries([polygon1, polygon2], crs="EPSG:4326")

    # Expected results
    expected = xr.DataArray(
        np.zeros((2, 5)),
        coords={
            "geometry": polygons,
            "time": pd.date_range("2023-01-01", periods=5),
        },
    ).xvec.set_geom_indexes("geometry", crs="EPSG:4326")

    expected = expected.to_dataset(name="test")
    expected = expected.set_coords("geometry")

    # Actual results
    actual = da.xvec.zonal_stats(polygons,'x', 'y', stat="sum", dask=False)

    # Testing
    xr.testing.assert_identical(actual, expected)

In [12]:
test_aggregate_raster_cubes()

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.68it/s]


In [240]:
chicago = gpd.read_file(get_path("geoda.chicago health"))

origin = destination = chicago.geometry.array
mode = ["car", "bike", "foot"]
date = pd.date_range("2023-01-01", periods=100)
hours = range(24)
rng = np.random.default_rng(1)
data = rng.integers(1, 100, size=(3, 100, 24, len(chicago), len(chicago)))

In [244]:
data.shape

(3, 100, 24, 77, 77)

In [241]:
chicago

Unnamed: 0,ComAreaID,community,TRACTCnt,shape_area,shape_len,Pop2012,Pop2014,PopChng,PopM,PopMP,...,InfntMR,LungCancer,ProstateC,Stroke,ChlBLLS,ChlLeadP,GonorrF,GonorrM,Tuberc,geometry
0,35,DOUGLAS,10,4.600462e+07,31027.054510,18238,19430,3.1645,8294,42.6866,...,13.4,74.5,85.5,62.1,482.2,0,1063.3,727.4,4.2,"POLYGON ((-87.60914 41.84469, -87.60915 41.844..."
1,36,OAKLAND,3,1.691396e+07,19565.506153,5918,6473,4.4791,2499,38.6065,...,8.2,54.5,54.2,43.7,435.4,0,1655.4,1629.3,6.7,"POLYGON ((-87.59215 41.81693, -87.59231 41.816..."
2,37,FULLER PARK,2,1.991670e+07,25339.089750,2876,2543,-6.1450,1218,47.8962,...,22.6,89.6,70.5,82.4,489.9,2,1061.9,1556.4,0.0,"POLYGON ((-87.62880 41.80189, -87.62879 41.801..."
3,38,GRAND BOULEVARD,14,4.849250e+07,28196.837157,21929,22531,1.3540,9681,42.9675,...,12.1,63.8,39.0,46.7,590.4,1,1454.6,1680.0,13.2,"POLYGON ((-87.60671 41.81681, -87.60670 41.816..."
4,39,KENWOOD,7,2.907174e+07,23325.167906,17841,18217,1.0428,8543,46.8958,...,8.9,49.1,46.2,31.5,397.9,0,610.2,549.1,0.0,"POLYGON ((-87.59215 41.81693, -87.59215 41.816..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,74,MOUNT GREENWOOD,4,7.558429e+07,48665.130539,19093,18357,-1.9653,9078,49.4525,...,3.3,55.0,16.9,26.7,133.6,0,0.0,0.0,0.0,"POLYGON ((-87.69646 41.70714, -87.69644 41.706..."
73,75,MORGAN PARK,7,9.187734e+07,46396.419362,22530,29300,13.0619,13625,46.5017,...,13.1,50.0,39.8,47.9,298.8,1,800.5,741.1,2.6,"POLYGON ((-87.64215 41.68508, -87.64249 41.685..."
74,76,OHARE,6,3.718356e+08,173625.984660,12559,22239,27.8177,11358,51.0724,...,2.0,37.4,2.8,40.4,182.9,0,0.0,0.0,6.3,"MULTIPOLYGON (((-87.83658 41.98640, -87.83658 ..."
75,77,EDGEWATER,17,4.844999e+07,31004.830946,54891,55276,0.3495,27729,50.1646,...,6.9,40.1,23.7,31.5,308.6,0,120.1,427.5,10.5,"POLYGON ((-87.65456 41.99817, -87.65456 41.998..."


In [246]:
origin.shape

(77,)