In [1]:
import sparrow as sp

In [2]:
import os
import tempfile
import uuid

from datasets import sdata_resolve

# The example store is written below the system's temporary directory.
OUTPUT_DIR = tempfile.gettempdir()

# A fresh UUID in the file name gives every run its own `.zarr` path.
zarr_path = os.path.join( OUTPUT_DIR, f"sdata_{uuid.uuid4()}.zarr" )

sdata = sdata_resolve( path=None, output=zarr_path )

  warn(f"ignoring keyword argument {k!r}")


[34mINFO    [0m The SpatialData object is not self-contained [1m([0mi.e. it contains some elements that are Dask-backed from    
         locations outside                                                                                         
         [35m/var/folders/q5/7yhs0l6d0x771g7qdbhvkvmr0000gp/T/[0m[95msdata_85b099b4-159b-473f-bb79-057e54338c32.zarr[0m[1m)[0m. Please 
         see the documentation of `[1;35mis_self_contained[0m[1m([0m[1m)[0m` to understand the implications of working with SpatialData 
         objects that are not self-contained.                                                                      
[34mINFO    [0m The Zarr backing store has been changed from [3;35mNone[0m the new file path:                                      
         [35m/var/folders/q5/7yhs0l6d0x771g7qdbhvkvmr0000gp/T/[0m[95msdata_85b099b4-159b-473f-bb79-057e54338c32.zarr[0m          


  warn(f"ignoring keyword argument {k!r}")


In [3]:
sdata

SpatialData object, with associated Zarr store: /private/var/folders/q5/7yhs0l6d0x771g7qdbhvkvmr0000gp/T/sdata_85b099b4-159b-473f-bb79-057e54338c32.zarr
├── Images
│     ├── 'clahe': DataArray[cyx] (1, 6432, 6432)
│     ├── 'dummy_image': DataArray[cyx] (1, 12864, 10720)
│     ├── 'min_max_filtered': DataArray[cyx] (1, 6432, 6432)
│     ├── 'raw_image': DataTree[cyx] (1, 12864, 10720), (1, 6432, 5360), (1, 3216, 2680), (1, 1608, 1340), (1, 804, 670)
│     ├── 'tiling_correction': DataArray[cyx] (1, 6432, 6432)
│     └── 'transcript_density': DataArray[cyx] (1, 6432, 6432)
├── Labels
│     ├── 'segmentation_mask': DataArray[yx] (6432, 6432)
│     └── 'segmentation_mask_expanded': DataArray[yx] (6432, 6432)
├── Points
│     └── 'transcripts': DataFrame with shape: (<Delayed>, 3) (2D points)
├── Shapes
│     ├── 'segmentation_mask_boundaries': GeoDataFrame shape: (3377, 1) (2D shapes)
│     └── 'segmentation_mask_expanded_boundaries': GeoDataFrame shape: (3377, 1) (2D shapes)
└── Tables
 

In [None]:
# Show whether `sdata` is backed by a store on disk, and where that store is located.
print( sdata.is_backed() )
print( sdata.path )

In [None]:
# List the top-level entries of the store and the entries of its `images` group.
# `os.listdir` is used instead of `! ls` so the cell does not depend on a Unix shell
# and keeps working when the path contains spaces. Unlike plain `ls`, `os.listdir`
# also returns hidden (dot-prefixed) entries.
print( f"Content of {sdata.path}:" )
print( sorted( os.listdir( sdata.path ) ) )
print( "\n" )

print( f"Content of {sdata.path}/images:" )
print( sorted( os.listdir( os.path.join( sdata.path, "images" ) ) ) )

Note: you can remove an element from the zarr store (e.g. on the command line with `rm -r dummy_image`), without 'breaking' the `SpatialData` object. After reloading it from the `.zarr` store, the element that was removed will no longer be an element of the `SpatialData` object.

If the `SpatialData` object is not backed by a `.zarr` store, elements can be removed in the Python shell via `del ...`.

Exercise:

Try removing `dummy_image` from the `.zarr` store.
Next reload the `SpatialData` object.

### Images

DAPI, PolyT, multiplex,...

In [None]:
# Each line goes one level deeper into the image element.
# Only the value of the last expression is displayed by the notebook.
sdata[ "clahe" ] # -> xarray.DataArray (or datatree.DataTree for multiscale )
sdata[ "clahe" ].data # -> Dask array
# `.compute()` pulls the whole array into memory.
sdata[ "clahe" ].data.compute() # -> numpy array

In [None]:
sdata[ "clahe" ]

Image layers have the dimensions c, (z), y, x. The z dimension is optional.

In [None]:
sdata[ "clahe" ].data.dtype

The data type of an image layer can be integer or float.

In [None]:
# NOTE(review): `_get_spatial_element` is imported from an underscore-prefixed (private) module,
# so its location may change between versions — confirm against the installed version.
from sparrow.image._image import _get_spatial_element
# Only the value of the last expression in this cell is displayed.
sdata[ "raw_image" ] # -> datatree.DataTree
se=_get_spatial_element( sdata, layer="raw_image" )  # gets scale0 in case it is multiscale
se # ->xarray.DataArray
se.data # -> Dask array

Images, Labels and Points are lazy if the `SpatialData` object is backed by a `.zarr` store. Lazy means they will not be 'pulled' into RAM, unless you ask for it (e.g. calling `.compute()`, `.persist()` on the Dask objects).

[Dask](https://www.dask.org/) enables out-of-core computation, allowing you to process datasets that exceed the available RAM, and also facilitates parallelized computations.

Note that currently Tables and Shapes are not lazy, and will be loaded into memory when you load a `SpatialData` object. In the future shapes will probably also be lazy, https://github.com/scverse/spatialdata/issues/359.

Support for lazy Tables should also be coming soon in `SpatialData`, but note that there is limited `Dask` support in e.g. `Scanpy` (https://scanpy-tutorials.readthedocs.io/en/latest/dask.html), which means Tables would need to be pulled into memory when `Scanpy` functions are applied to them.

We can visualize the images:

Using SPArrOW:

In [None]:
sp.pl.plot_image( sdata, img_layer="clahe", figsize=( 5,5 ), colorbar=True )

Via `spatialdata-plot` (https://github.com/scverse/spatialdata-plot):

In [None]:
# `spatialdata_plot` is not referenced by name: importing it registers the `.pl` accessor used below.
import spatialdata_plot

sdata.pl.render_images( "clahe" ).pl.show()

Exercise: use matplotlib to visualize the image layer with name `min_max_filtered`.

In [None]:
# solution

import matplotlib.pyplot as plt

# Take the first (and only) channel and materialise it as a numpy array for matplotlib.
min_max_channel = sdata[ "min_max_filtered" ].data[0].compute()
plt.imshow( min_max_channel )

Interactive exploration of `SpatialData` object:

In [None]:
from napari_spatialdata import Interactive

#Interactive( sdata )

Images can have multiple channels:

In [None]:
sdata_macsima=sp.datasets.macsima_example()
sdata_macsima.images[ "HumanLiverH35" ]

In [None]:
#Interactive( sdata_macsima )

In [None]:
# Plot a single channel ("R0 DAPI") of the multi-channel image layer.
# The bare `sdata_macsima[ "HumanLiverH35" ]` expression that used to precede this call was
# dropped: it was not the last expression of the cell, so its value was never displayed.
sp.pl.plot_image( sdata_macsima, img_layer="HumanLiverH35", channel="R0 DAPI", figsize = (5,5,) )

### Labels

Typically representing a segmentation mask

Labels and images are sometimes referred to as `raster` data.

In [None]:
sdata[ "segmentation_mask" ]

In [None]:
sdata[ "segmentation_mask" ].data.compute()

In [None]:
sdata[ "segmentation_mask" ].data.compute().dtype

Data type of labels is always int.

In [None]:
sdata.pl.render_images( "clahe" ).pl.render_labels( "segmentation_mask" ).pl.show()

Exercise:

Calculate the total number of cells (based on the segmentation mask provided).

Bonus: try not to load the segmentation mask in memory.

In [None]:
# Solution:

import dask.array as da

# `da.unique` works on the Dask array directly, so only the (small) array of
# distinct label values is materialised by `.compute()`.
labels = da.unique( sdata[ "segmentation_mask" ].data ).compute()

# Label 0 marks the background of a segmentation mask, not a cell, so it is excluded from the count.
n_cells = int( ( labels != 0 ).sum() )
n_cells

### Shapes

Shapes either represent the boundaries of a segmentation mask, or an annotation (e.g. tumor region).

In [None]:
sdata.shapes[ "segmentation_mask_boundaries" ]

In [None]:
print(type(sdata.shapes[ "segmentation_mask_boundaries" ]))

Shapes are of type GeoDataFrame.

They can be manipulated using the [geopandas](https://geopandas.org/en/stable/) library

In [None]:
sdata.shapes[ "segmentation_mask_boundaries" ].geometry.head()

When using `SPArrOW` to generate shapes (via e.g. `sp.im.segment` or `sp.sh.vectorize`), the index of the shapes layer holds the cell id (its name is 'cell_ID') and corresponds to the labels in the corresponding labels layer.

In [None]:
sdata.shapes[ "segmentation_mask_boundaries" ].index.name

Plot the shapes using `SPArrOW`

In [None]:
# Draw the cell boundaries on top of the "clahe" image, restricted to the window given by `crd`.
crop_window = [ 2000, 4000, 2000, 4000 ]

sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    shapes_layer="segmentation_mask_boundaries",
    crd=crop_window,
    figsize=( 5, 5 ),
)

Or using `spatialdata-plot`.

In [None]:
# Restrict `sdata` to the box x in [0, 3000], y in [1000, 3000] of the "global" coordinate system,
# then plot the cropped image with the cell boundaries drawn half-transparent on top.
sdata_small = sdata.query.bounding_box(
    min_coordinate=[0, 1000], max_coordinate=[3000, 3000], axes=("x", "y"), target_coordinate_system="global"
)

sdata_small.pl.render_images( "clahe" ).pl.render_shapes( "segmentation_mask_boundaries", fill_alpha=0.5  ).pl.show()

## Points

Points are DaskDataFrame objects.

In [None]:
sdata[ "transcripts" ]

In [None]:
sdata[ "transcripts" ].compute().head() # -> pandas.DataFrame 

Points represent the spatial location of a feature. In our case this will almost always be a gene.

Exercise: Use the points layer `transcripts` to estimate how many unique genes were measured.

In [None]:
# solution:

sdata[ "transcripts" ][ "gene" ].nunique().compute()

## Tables

Tables are [AnnData](https://anndata.readthedocs.io/en/latest/) objects.

In [None]:
sdata[ "table_transcriptomics" ]

An `AnnData` object (say `adata`) contains the following attributes:

- `adata.X`:  The main data matrix (cells x genes).
- `adata.obs`: Metadata for each cell (e.g., cell type, cluster ID, ...)
- `adata.var`: Metadata for each variable (gene/feature).
- `adata.uns`:  Unstructured information, like color schemes or settings.
- `adata.obsm`: Embeddings or reduced dimensions (e.g., PCA or UMAP coordinates).

### `adata.X`

- This is the core data matrix of the AnnData object, typically an m×n matrix, where m is the number of observations (cells), and n is the number of variables (genes or features).
- Often a sparse or dense NumPy array or SciPy sparse matrix.
- Stores the primary quantitative data for each cell/gene pair, such as raw counts, normalized expression values, or any transformed measurements.

In [None]:
sdata[ "table_transcriptomics" ].X

In [None]:
sdata[ "table_transcriptomics" ].X.toarray().shape

In [None]:
sdata[ "table_transcriptomics" ].to_df().head()

### `adata.obs`

- This is a DataFrame-like structure where each row corresponds to an observation in adata.X, typically cells or samples.
- Data type is a `pandas.DataFrame`.
- Stores metadata about each cell, such as cell type labels, sample IDs, batch information, cluster IDs, cell annotations, ...

In [None]:
sdata[ "table_transcriptomics" ].obs.head()

In [None]:
sdata[ "table_transcriptomics_clustered" ].obs.head()

### `adata.var`

- This is a DataFrame-like structure where each row corresponds to a variable in adata.X, typically representing individual genes or features.
- Data type is a `pandas.DataFrame`.
- Stores metadata about each variable (e.g. gene), such as quality metrics, counts,...

In [None]:
sdata[ "table_transcriptomics" ].var.head()

In [None]:
sdata[ "table_transcriptomics_clustered" ].var.head()

### `adata.uns`

- `.uns` (unstructured data) is a dictionary for storing additional, often unstructured, information relevant to the dataset.
- Data type is a dictionary where you can store various data types, such as strings, arrays, or even nested dictionaries.
- Typically used for storing global dataset information, annotations, and visualization settings, like color palettes for clusters or parameter settings for computational methods

In [None]:
sdata[ "table_transcriptomics" ].uns

### `adata.obsm`

- `.obsm` is a mapping of additional multi-dimensional arrays associated with each observation (cell).
- Data type is a dictionary-like structure where each entry is typically a matrix or array of coordinates.
- Stores embeddings, dimensional reductions, or other coordinate-based data associated with cells

In [None]:
sdata[ "table_transcriptomics" ].obsm

In [None]:
sdata[ "table_transcriptomics" ].obsm[ "spatial" ][:5] # -> cell center

In [None]:
sdata[ "table_transcriptomics_clustered" ].obsm

Region key and Instance key.

An AnnData object can be annotated by a spatial element (`labels`, `shapes`, `points`).

AnnData objects generated using `SPArrOW` will be annotated by a `labels` layer.

In [None]:
from spatialdata.models import TableModel

sdata[ "table_transcriptomics" ].uns[ TableModel.ATTRS_KEY ]

In [None]:
# NOTE(review): these constants come from an underscore-prefixed (private) module,
# so the import path may change between versions — confirm against the installed version.
from sparrow.utils._keys import _INSTANCE_KEY, _REGION_KEY

print( _INSTANCE_KEY ) # -> column in .obs that will be used for cell_ID
print( _REGION_KEY ) # -> column in .obs that will be used for linking the AnnData object to a spatial element (a labels layer, e.g. a segmentation mask).

In [None]:
sdata[ "table_transcriptomics_clustered" ].obs.head()

In [None]:
da.unique(sdata[ "segmentation_mask" ].data).compute()[:5]

Via correct use of this instance and region key, we can visualize a column in `adata.obs` using `spatialdata-plot`:

In [None]:
# Create the figure and its single axes in one call; the plot below is drawn into `ax`.
fig, ax = plt.subplots( figsize=( 5, 5 ) )

column = "shapeSize"

# Image first, then the labels layer coloured by the chosen `.obs` column.
(
    sdata.pl.render_images( "clahe" )
    .pl.render_labels( "segmentation_mask", color=column, method="datashader", fill_alpha=1 )
    .pl.show( coordinate_systems="global", ax=ax, colorbar=False )
)

Or via `SPArrOW`:

In [None]:
# Colour the cell boundaries by the value of a column of the linked table.
plot_column = "shapeSize"

sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    # segmentation_mask_boundaries is linked to table layer 'table_transcriptomics_clustered' through cell_ID.
    shapes_layer="segmentation_mask_boundaries",
    table_layer="table_transcriptomics_clustered",
    # region can be set to None in this example, as AnnData is only annotated by one labels layer (single sample).
    region="segmentation_mask",
    column=plot_column,
    crd=[ 2000, 4000, 2000, 4000 ],
    figsize=( 5, 5 ),
    fig_kwargs={ "dpi": 100 },
)

Interactive visualization

In [None]:
from napari_spatialdata import Interactive

#Interactive( sdata )

# note this issue in napari-spatialdata when plotting categorical data: https://github.com/scverse/napari-spatialdata/issues/328

Exercise: visualize the gene expression of the gene "Acta2". Hint: the cell below shows the same kind of plot for the gene "Vwf"; adapt it.

In [None]:
# Colour the cell boundaries by the column "Vwf" taken from the table layer 'table_transcriptomics'.
sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    shapes_layer="segmentation_mask_boundaries", # segmentation_mask_boundaries is linked to table layer 'table_transcriptomics' through cell_ID.
    table_layer="table_transcriptomics",
    region = "segmentation_mask", # can be set to None in this example, as AnnData is only annotated by one labels layer (single sample).
    column="Vwf",
    crd = [ 2000,4000,2000,4000 ],
    figsize=(5,5,),
    fig_kwargs={ "dpi": 100 },
     )

## Vectorization and Rasterization

We can go back and forth between labels and shapes using `SPArrOW`. 

Note: `spatialdata` also implements vectorization (`spatialdata.to_polygons`) and rasterization ( `spatialdata.rasterize` ), but these implementations are slower, and require much more RAM.

In [None]:
# conversion between labels and shapes using sparrow:
# labels -> shapes, written to a new shapes layer (replaced if it already exists).
sdata = sp.sh.vectorize(
    sdata,
    labels_layer="segmentation_mask",
    output_layer="segmentation_mask_boundaries_redo",
    overwrite=True,
)

In [None]:
sdata[ "segmentation_mask_boundaries_redo" ]

In [None]:
# shapes -> labels: turn the shapes layer back into a labels layer (replaced if it already exists).
sdata = sp.im.rasterize(
    sdata,
    shapes_layer="segmentation_mask_boundaries_redo",
    output_layer="segmentation_mask_redo",
    chunks=5000,
    overwrite=True,
)

Exercise:

Are the labels layers `segmentation_mask` and `segmentation_mask_redo` equal? Do you expect them to be equal?

In [None]:
# solution:

from napari_spatialdata import Interactive

import numpy as np

# Compare the two labels layers lazily: the element-wise comparison and the sum are
# evaluated by Dask, so only the final count is materialised instead of two full arrays.
mask_redo = sdata[ "segmentation_mask_redo" ].data
mask_orig = sdata[ "segmentation_mask" ].data
pixels_not_equal = int( ( mask_redo != mask_orig ).sum().compute() )

print( f"After roundtrip vectorization and rasterization, {pixels_not_equal} pixels are not equal." )

#Interactive( sdata )

In [None]:
# Print the shape of the array of distinct label values for the roundtrip mask and for the original mask.
for labels_layer in ( "segmentation_mask_redo", "segmentation_mask" ):
    distinct_labels = da.unique( sdata[ labels_layer ].data ).compute()
    print( distinct_labels.shape )

### Coordinate systems

All elements in a `SpatialData` object are assigned to one or more coordinate systems, which allows for storing multiple samples in the same `SpatialData` object.

We refer to [this notebook](https://harpy.readthedocs.io/en/latest/tutorials/advanced/coordinate_systems.html) for more information

In [None]:
from spatialdata.transformations import get_transformation

get_transformation( sdata[ "clahe" ], get_all=True )

In [None]:
get_transformation( sdata[ "transcripts" ], get_all=True )