# SPArrOW pipeline

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sparrow as sp

## 1. Read in the data

The example dataset for this notebook will be downloaded and cached using `pooch` via `sparrow.datasets.registry`.

The image is then read in to a `SpatialData` object (see https://spatialdata.scverse.org/en/latest/ for more information).

We also use the `bioio` package to read in the image, see https://bioio-devs.github.io/bioio/.

In [None]:
import tempfile

from sparrow.datasets.registry import get_registry

# When True, the segmentation cell further down only processes a small crop (see its `crd` argument).
unit_testing = True

# change this path. It is the directory where the spatialdata .zarr will be saved.
OUTPUT_DIR =  tempfile.gettempdir()

# Fetch the two example files through the registry; the returned values are used below as file paths
# (DAPI image for BioImage, results file for the transcript reader).
registry=get_registry()
path_image = registry.fetch( "transcriptomics/resolve/mouse/20272_slide1_A1-1_DAPI.tiff" )
path_coordinates = registry.fetch("transcriptomics/resolve/mouse/20272_slide1_A1-1_results.txt")

In [None]:
from bioio import BioImage

img=BioImage( path_image )
print(img.dims)
img.dask_data

In [None]:
array = img.dask_data.squeeze( ( 0, 2 ) ) # squeeze T and Z dimension
array

In [None]:
import os
import uuid

from spatialdata import SpatialData, read_zarr

# Start from an empty SpatialData object; layers are added to it in the following cells.
sdata = SpatialData()

# A random UUID in the file name gives every run its own store inside OUTPUT_DIR.
zarr_path = os.path.join( OUTPUT_DIR, f"sdata_{uuid.uuid4()}.zarr")

# Write the empty object to disk, then reload it from the path it was written to.
sdata.write( zarr_path )
sdata = read_zarr( sdata.path )

# Last expression of the cell: shows whether the reloaded object is backed by the store.
sdata.is_backed()

In [None]:
sdata=sp.im.add_image_layer(
    sdata,
    arr = array,
    dims=( "c", "y", "x" ),
    output_layer="raw_image",
    overwrite=True,
    )

In [None]:
sdata[ "raw_image" ]

In [None]:
sp.pl.plot_image( sdata, img_layer = "raw_image" , crd = [0, 6432, 0, 6432], figsize = (5,5) )

In [None]:
# or alternatively via spatialdata plot
import spatialdata_plot

sdata.pl.render_images( "raw_image" ).pl.show()

In [None]:
sdata.path

In [None]:
#from napari_spatialdata import Interactive

#Interactive( sdata )

Exercise: add the image as a multiscale image.

In [None]:
sdata=sp.im.add_image_layer(
    sdata,
    arr = array,
    dims=( "c", "y", "x" ),
    output_layer="raw_image",
    scale_factors=[ 2,2,2,2 ],
    overwrite=True,
    )

In [None]:
type(sdata[ "raw_image" ])  # Now it is a DataTree

In [None]:
# get associated dask array
# NOTE(review): `_get_spatial_element` is imported from a private module (`sparrow.image._image`),
# so it may not be part of the stable public API — confirm before relying on it elsewhere.
from sparrow.image._image import _get_spatial_element

# Look up the element for the (now multiscale) "raw_image" layer and display its `.data` attribute.
se=_get_spatial_element( sdata, layer="raw_image" )
se.data

## 2. Image preprocessing

### 2.1 tiling correction and inpainting

When working with RESOLVE data, the data is acquired in tiles, and the illumination within a tile isn't always constant. Sometimes one side of a tile is more illuminated than the other, influencing the downstream analysis greatly. RESOLVE assured us this isn't linked to the counts of the transcripts, but this can be checked further on.
In general this step is not necessary for other datatypes (you can check this by plotting the complete image). 

Basic is a tool that can correct for this, and it is used in this function. The size of the tile needs to be known in order to run the function. The default value for this function is the tile size of RESOLVE (2144).

This step also corrects for black lines in between the tiles, by using inpainting. 

This step is very specific for RESOLVE data and should not be run when working with Merscope, Xenium,... data.

In [None]:
# Run the tiling correction on "raw_image" and store the result as image layer "tiling_correction".
# The call returns the updated sdata together with `flatfields`; the latter is not used again in this notebook.
# `crd` is the same region that was plotted for the raw image above.
sdata, flatfields = sp.im.tiling_correction(
    sdata=sdata,
    img_layer="raw_image",
    output_layer="tiling_correction",
    crd =  [0, 6432, 0, 6432],
    overwrite=True
)

In [None]:
sp.pl.plot_image( sdata, img_layer=[ "raw_image", "tiling_correction" ], crd =  [2000, 6000, 2000, 6000], figsize=(10,10) )

### 2.2 min-max filtering and contrast enhancing
The second stage of the preprocessing consists of a couple of steps:


- A min max filter can be added. The goal of this function is to subtract background noise and make the borders of the nuclei/cells cleaner; it will also delete the occasional debris. If you take the size too small, smaller than the size of your nuclei, the function will create donuts, with black spots in the center of your cells. If the size of the min max filter is chosen too big, not enough background is subtracted, so a tradeoff should be made. This might need some finetuning. For nuclei in RESOLVE data, 45-55 is a great starting point. Bigger for whole cells. Adapt this parameter to make sure you delete debris and halos.

- We recommend to perform contrast enhancing on your image. SPArrOW does this by using histogram equalization (CLAHE function). The amount of correction needed can be decided by adapting the contrast_clip value. If the image is already quite bright, 3.5 might be a good starting value. For dark images, you can go up to 10 or even more. Make sure at the end the whole image is evenly illuminated and no cells are dark in the background.
 
If you think you need more image processing, you can perform other steps using our map_image function. These images can then be added to the SpatialData object.

In [None]:
# Step 1: min-max filter on the tiling-corrected image; 45 is at the lower end of the
# 45-55 range suggested above for nuclei in RESOLVE data.
sdata = sp.im.min_max_filtering(
    sdata=sdata,
    img_layer="tiling_correction",
    output_layer="min_max_filtered",
    size_min_max_filter=45,
    overwrite=True,
)

# Visual check of the filtered image on a crop.
sp.pl.plot_image(
    sdata,
    img_layer="min_max_filtered",
    crd=[ 2000,6000,2000,6000 ],
    figsize=(5, 5),
    )

# Step 2: contrast enhancement (CLAHE) on the filtered image; 3.5 is the starting value
# suggested above for images that are already quite bright.
sdata = sp.im.enhance_contrast(
    sdata=sdata,
    img_layer="min_max_filtered",
    output_layer="clahe",
    contrast_clip=3.5,
    chunks=20000,
    overwrite=True
)

# Visual check of the contrast-enhanced image on the same crop.
sp.pl.plot_image(
    sdata,
    img_layer="clahe",
    crd=[2000,6000,2000,6000],
    figsize=(5, 5),
    )

In [None]:
#from napari_spatialdata import Interactive

#Interactive( sdata )

In [None]:
#sdata=sp.im.enhance_contrast( sdata, img_layer="tiling_correction", output_layer="test", overwrite=True )

### 2.3 Custom distributed preprocessing of images using `sp.im.map_image` and `Dask`

See https://docs.dask.org/en/stable/generated/dask.array.map_blocks.html and https://docs.dask.org/en/latest/generated/dask.array.map_overlap.html

Set `blockwise=True` if you want to do distributed processing using `dask.array.map_blocks` or `dask.array.map_overlap`; set `blockwise=False` if your function is already distributed (e.g. when using `dask_image` filters, see https://image.dask.org/en/latest/dask_image.ndfilters.html).

In [None]:
import numpy as np
from numpy.typing import NDArray


def _my_dummy_function(image: NDArray, parameter: int | float )->NDArray:
    # input (1,1,y,x)
    # output (1,1,y,x)
    print(  f"Type of the image is: {type(image)}" )
    print( image.shape )
    return image*parameter

# Keyword arguments forwarded to `_my_dummy_function` (here: multiply by 2).
fn_kwargs = { "parameter": 2 }

# Apply the function to "raw_image" and store the result as image layer "dummy_image".
# `dtype` and `meta` both declare uint16 for the output.
sdata=sp.im.map_image(
    sdata,
    func = _my_dummy_function,
    fn_kwargs=fn_kwargs,
    img_layer = "raw_image",
    output_layer="dummy_image",
    chunks = 5000,
    blockwise=True, # if blockwise == True --> input to _my_dummy_function is a numpy array of size chunks, else it is a Dask array (with chunksize chunks)
    depth = None, # if blockwise == True, and depth specified, will use map_overlap instead of map_blocks for distributed processing
    overwrite=True,
    dtype=np.uint16,
    meta=np.array((), dtype=np.uint16),
        )

In [None]:
from sparrow.image._image import _get_spatial_element

_get_spatial_element( sdata, layer="raw_image").data.compute()[ :, :10, :10 ]

In [None]:
_get_spatial_element( sdata, layer="dummy_image").data.compute()[ :, :10,:10 ]

## 3. Segmenting the image

For the segmentation, we here show an example on how to use cellpose, a deep learning network based on a UNET architecture.

Multiple parameters need to be given as an input to the cellpose algorithm. We recommend tuning for the optimal segmentation quality.
 
- diameter: An estimate of the diameter of a nucleus. If set to None, cellpose will do the estimation by itself, but this estimation might take longer than the actual segmentation, and is often far off. Estimate around 7 micrometer (in this case 50 pixels at 0.138 micrometer per pixel) for a standard nucleus, and more for whole cells. Input is in pixels. This of course is tissue and method dependent. You can run the algorithm on a small piece (I[0:1000,0:1000] for example), to get an estimate of the size. However, this estimate isn't always accurate. So check the quality at the end. If you see all nuclei/cells are estimated too small, enlarge this parameter.
- device: Defines the device you want to work on, if you only have cpu, you can skip this input parameter. If only having CPU, please tune the parameters on a small subset, and then make it to the big one. This might take a while for large images, but it should work. 
- flow_threshold: Indicates something about the shape of the masks, if you increase it, more masks with less round shapes will be accepted. Up to one:  I take it between 0.6 and 0.95, depending on the cell shapes. Higher is less round. Lower it if you start segmenting artefacts, up it if you miss non-round shaped cells.
- cellprob_threshold: Indicates how many of the possible masks are kept. Making it smaller (down to -6) will give you more masks; bigger gives fewer masks. I take it between 0 and -6. Be careful, you can oversegment: always check the quality.
- min_size: Indicates the minimal size of a nucleus. 
- If segmenting whole cells instead of nuclei, set the parameter model_type to 'cyto'.
- If using nuclei together with whole cells, put model_type to 'cyto', make sure your image is 3D and that the first channel is your complete cell staining and your second one is the nucleus channel, and put the parameter channel to np.array([1,0]).

In [None]:
#from dask.distributed import Client, LocalCluster

#cluster = LocalCluster(
#    n_workers=1,
#    threads_per_worker=10,
#    memory_limit="32GB",
#)

#client = Client(cluster)

#print(client.dashboard_link)

In [None]:
import torch
from cellpose import models

from sparrow.image import cellpose_callable

# CPU-only configuration; `gpu` and `device` are both handed to the Cellpose model below.
gpu = False
device = "cpu"
# Load the pretrained 'nuclei' model once, so the loaded object can be passed to the segmentation call.
model=models.CellposeModel( gpu=gpu, pretrained_model='nuclei', device = torch.device(device ) )

#model = client.scatter(model) # pass a loaded model to _cellpose, but we scatter the model to avoid large task graph

# Segment the contrast-enhanced image. The result is written to a labels layer and a shapes layer
# (see `output_labels_layer` / `output_shapes_layer`).
sdata = sp.im.segment(
    sdata,
    img_layer="clahe",
    chunks=2048,
    depth=200,
    model=cellpose_callable,
    # parameters that will be passed to the callable _cellpose
    pretrained_model = model,
    diameter=50,
    flow_threshold=0.9,
    cellprob_threshold=-4,
    output_labels_layer="segmentation_mask",
    output_shapes_layer="segmentation_mask_boundaries",
    crd= [2000, 4000, 2000, 4000] if unit_testing else None,  # region to segment [x_min, xmax, y_min, y_max],
    overwrite=True,
)

#client.close()

In [None]:
sp.pl.plot_shapes( sdata, img_layer="clahe", shapes_layer="segmentation_mask_boundaries", figsize=( 5,5 ), crd = [  2000, 4000, 2000, 4000  ] )

In [None]:
# or via spatialdata plot
sdata.pl.render_images( "clahe" ).pl.render_labels( "segmentation_mask" ).pl.show()

This is in general not recommended, but it is possible to expand cells beyond the segmented bodies.

In [None]:
sdata = sp.im.expand_labels_layer(
    sdata,
    labels_layer="segmentation_mask",
    distance=10,
    output_labels_layer="segmentation_mask_expanded",
    output_shapes_layer="segmentation_mask_expanded_boundaries",
    overwrite=True,
)

In [None]:
sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    shapes_layer=["segmentation_mask_boundaries", "segmentation_mask_expanded_boundaries" ],
    figsize=( 10,10 ),
    crd = [  2000, 4000, 2000, 4000  ],
      )

## 4. Allocating  the transcripts

###  4.1 Creating the count matrix
In this step we
- load in the transcripts: in the case of RESOLVE this is done with a specific loader. If no specific loader exists for your datatype, you can use the general `sp.io.read_transcripts` function.
- allocate the transcripts to the correct cell. This allocation step creates the count matrix, saved in an anndata object.
- Visual checks

In [None]:
# Load the RESOLVE results file fetched at the top of the notebook into points layer "transcripts".
# Note that the file path is passed through the `path_count_matrix` keyword.
sdata = sp.io.read_resolve_transcripts(sdata, output_layer="transcripts", path_count_matrix=path_coordinates, overwrite=True)

# Allocate the transcripts to the cells of "segmentation_mask"; the resulting count matrix
# is stored as table layer "table_transcriptomics".
sdata = sp.tb.allocate(
    sdata=sdata,
    labels_layer="segmentation_mask",
    points_layer="transcripts",
    output_layer="table_transcriptomics",
    update_shapes_layers=False,
    overwrite=True,
)

In [None]:
print( type( sdata[ "transcripts" ] ) )
sdata[ "transcripts" ].head()

Exercise.

- Run .compute() on the points layer. What is the data type of the resulting object?
- Have a look at https://docs.dask.org/en/stable/dataframe.html.
- Extract transformation from the points layer "transcripts" using `spatialdata.transformations.get_transformation`. See https://spatialdata.scverse.org/en/stable/generated/spatialdata.transformations.get_transformation.html
- Now extract the transformation from the labels layer "segmentation_mask" and for the image layer "clahe".
- Visualize the points layer and the labels layer using napari-spatialdata. Convince yourself they are registered.

In [None]:
sdata[ "table_transcriptomics" ].X

In [None]:
sdata[ "table_transcriptomics" ].to_df().head()

In [None]:
sdata[ "table_transcriptomics" ].obs

In [None]:
from spatialdata.models import TableModel

sdata[ "table_transcriptomics" ].uns[TableModel.ATTRS_KEY ]
#->table is annotated by labels layer "segmentation_mask"
#->instance_key cell_ID matches labels in "segmentation_mask"

In [None]:
import dask.array as da

da.unique( sdata[ "segmentation_mask" ].data ).compute()

In [None]:
da.unique( sdata[ "segmentation_mask" ].data ).compute().shape
# -> note that not all cells are in table layer "table_transcriptomics".
# this is because not all cells could be assigned transcripts

In [None]:
sdata[ "segmentation_mask_boundaries" ].head()

In [None]:
sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    shapes_layer="segmentation_mask_boundaries",
    figsize=( 5,5 ),
    crd = [  2000, 4000, 2000, 4000  ],
    table_layer="table_transcriptomics",
    column = "Axl",
      )

In [None]:
import matplotlib.pyplot as plt

# or via spatialdataplot
# Create the figure/axes up front so the size is fixed, then hand the axes to `.pl.show`.
plt.figure(figsize=(5, 5))
ax = plt.gca()

# Same gene as in the `sp.pl.plot_shapes` cell above, now used to color the labels layer.
gene_name =  "Axl"
sdata.pl.render_labels("segmentation_mask", color=gene_name, method="datashader", fill_alpha=0.5).pl.show(
    coordinate_systems="global", ax=ax
)

In [None]:
#from napari_spatialdata import Interactive

#Interactive( sdata )

###  4.2 Transcript quality plot
After we have created the anndata object, we control the transcript quality. 

First we create a plot to check if the transcript density is similar across the whole tissue. If this isn't the case, it can have multiple reasons. Most likely, there will be regions in which the transcript pick-up was less successful. Also gene panel choices can influence this plot.

In [None]:
sdata = sp.im.transcript_density(
    sdata,
    img_layer="clahe",
    points_layer="transcripts",
    output_layer="transcript_density",
    overwrite=True,
)

In [None]:
sp.pl.plot_image( sdata, img_layer = [ "clahe", "transcript_density" ], figsize=( 10,10 ) )

As not all of the image surface is segmented, there will be most likely transcripts that weren't assigned to a cell. 
For sure in the case of nucleus segmentation (like this example), this will be the case.

In general, we hope to not lose any genes. So we hope there aren't genes with low abundances and a low proportion kept. In general we see a downward trend. The more a gene is measured, the less it is located in cells (in ratio).

We also provide a table with the genes that are the least located in cells. If a lot of these genes are markers for the same celltype, the staining might be missing this celltype and you should for sure check this. However, it might also be the case that this celltype just has a lot of cytoplasm and you are only segmenting the nucleus.

In [None]:
df = sp.pl.analyse_genes_left_out(
    sdata,
    labels_layer="segmentation_mask",
    table_layer="table_transcriptomics",
    points_layer="transcripts",
)

In [None]:
df.head()

## 5. Preprocess the table (AnnData)

### 5.1 Filtering and Normalization

The AnnData object is now processed:

- calculate QC metrics
- filter cells with fewer than 10 gene counts and genes present in fewer than 5 cells (adaptations possible by adapting the function). These filtered cells are also filtered out of the shapes layer and the anndata object, and saved in an extra shapes layer.
- Normalization: For small gene panel (<500), we recommend to normalize the data based on the size of the segmented object (size_norm=True). For transcriptome-wide methods, we recommend standard library size normalization (size_norm=False). 


The last plot shows the size of the nucleus related to the counts. When working with whole cells, if there are some really big cells with really low counts, they are probably not real cells and you should filter based on max size.

In [None]:
# Perform preprocessing.
sdata = sp.tb.preprocess_transcriptomics(
    sdata,
    labels_layer="segmentation_mask",
    table_layer="table_transcriptomics",
    output_layer="table_transcriptomics_preprocessed",  # write results to a new slot, we could also write to the same slot (when passing overwrite==True).
    min_counts=10,
    min_cells=5,
    size_norm=True,
    n_comps=50,
    overwrite=True,
    update_shapes_layers=False,
)

In [None]:
sdata[ "table_transcriptomics_preprocessed" ]

In [None]:
sdata[ "table_transcriptomics_preprocessed" ].to_df().mean( axis=0 ).head() # mean ~ 0

In [None]:
sdata[ "table_transcriptomics_preprocessed" ].to_df().std( axis=0 ).head() # std ~1

In [None]:
sdata[ "table_transcriptomics_preprocessed" ].to_df().head()

In [None]:
sdata[ "table_transcriptomics_preprocessed" ].obs.head()

pct_counts_in_top_2_genes: This column shows the percentage of the total gene expression (count data) in each cell that comes from the top 2 most highly expressed genes in that cell. For example, if 40% of a cell's total gene expression comes from just the top 2 genes, this value would be 40 for that cell.

In [None]:
(sdata[ "table_transcriptomics" ].to_df()).sum(axis=1 ).head()

In [None]:
(sdata[ "table_transcriptomics" ].to_df() >0 ).sum(axis=1 ).head()

In [None]:
sp.pl.preprocess_transcriptomics(
    sdata,
    table_layer="table_transcriptomics_preprocessed",
)

In [None]:
#from napari_spatialdata import Interactive

#Interactive( sdata )

In [None]:
sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    table_layer="table_transcriptomics_preprocessed",
    column="total_counts",
    shapes_layer="segmentation_mask_boundaries",
    figsize=(8,8)
)

In this step you can filter cells based on their size: are you sure cells need to be bigger, or sure your cells can not be larger than X? 

You can delete them with this function by defining min_size and max_size. 

In [None]:
# Filter on the size bounds given by min_size and max_size. The input is the preprocessed table,
# and the result goes to a new table layer "table_transcriptomics_filter".
sdata = sp.tb.filter_on_size(
    sdata,
    labels_layer="segmentation_mask",
    table_layer="table_transcriptomics_preprocessed",
    output_layer="table_transcriptomics_filter",
    min_size=500,
    max_size=100000,
    update_shapes_layers=False,
    overwrite=True,
)

### 5.2 Clustering

This function performs the neighborhood analysis and the leiden clustering and the UMAP calculations using standard scanpy functions.

You need to define 2 parameters:
- the amount of PC's used: I normally choose something between 15-20 based on the plot of PC's.
- The amount of neighbors used: Normally I go for 35. Less neighbors means more spread, more means everything tighter, in general.

It returns the UMAP and marker gene list per cluster, that can be looked at for finding celltypes. 

In [None]:
import scanpy as sc

# Neighborhood graph, Leiden clustering and UMAP on the size-filtered table (see the text above).
# The cluster labels are stored under the key "leiden" (`key_added`).
sdata = sp.tb.leiden(
    sdata,
    labels_layer="segmentation_mask",
    table_layer="table_transcriptomics_filter",
    output_layer="table_transcriptomics_clustered",
    calculate_umap=True,
    calculate_neighbors=True,
    n_pcs=17,
    n_neighbors=35,
    resolution=0.8,
    rank_genes=True,
    key_added="leiden",
    overwrite=True,
)

# Inspect the result with scanpy: UMAP colored by cluster, then the top 8 ranked genes per cluster.
sc.pl.umap(sdata.tables["table_transcriptomics_clustered"], color=["leiden"], show=True)
sc.pl.rank_genes_groups(sdata.tables["table_transcriptomics_clustered"], n_genes=8, sharey=False, show=True)

In [None]:
sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    table_layer="table_transcriptomics_clustered",
    column="leiden",
    shapes_layer="segmentation_mask_boundaries",
    alpha=1.0,
    linewidth=0,
)

In [None]:
sp.pl.plot_shapes(
    sdata,
    img_layer="clahe",
    table_layer="table_transcriptomics_clustered",
    column="leiden",
    shapes_layer="segmentation_mask_boundaries",
    alpha=1.0,
    linewidth=0,
    crd = [ 0, 3000, 1000, 3000 ]
)

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# for fun, also plot via spatialdataplot
plt.figure(figsize=(5, 5))
ax = plt.gca()

column =  "leiden"

adata = sdata.tables[ "table_transcriptomics_clustered" ]

# Build a colormap with one entry per color stored in `adata.uns["leiden_colors"]`.
# NOTE(review): presumably these are the per-cluster colors written by the scanpy UMAP plot above — confirm.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
                    "new_map",
                    adata.uns[column + "_colors"],
                    N=len(adata.uns[column + "_colors"]),
                )

# Crop to x in [0, 3000], y in [1000, 3000] — the same region as crd = [0, 3000, 1000, 3000] in the previous cell.
sdata_small = sdata.query.bounding_box(
    min_coordinate=[0, 1000], max_coordinate=[3000, 3000], axes=("x", "y"), target_coordinate_system="global"
)

sdata_small.pl.render_labels("segmentation_mask", color=column,cmap =cmap, method="datashader", fill_alpha=1).pl.show(
    coordinate_systems="global", ax=ax
)

In [None]:
# Fetch the marker gene list through the same registry as the image and transcript files.
path_mg = registry.fetch( "transcriptomics/resolve/mouse/markerGeneListMartinNoLow.csv" )

# Score the clustered table against the marker genes; the result is stored as table layer
# "table_transcriptomics_score_genes". The call returns two extra values, which are
# not used again in this notebook.
sdata, celltypes_scored, celltypes_all = sp.tb.score_genes(
    sdata,
    labels_layer="segmentation_mask",
    table_layer="table_transcriptomics_clustered",
    output_layer="table_transcriptomics_score_genes",
    path_marker_genes=path_mg,
    overwrite=True,
    )

In [None]:
sdata[ "table_transcriptomics_score_genes" ]

In [None]:
import scanpy as sc

from sparrow.utils._keys import _ANNOTATION_KEY

sc.pl.umap(sdata.tables[ "table_transcriptomics_score_genes" ], color=_ANNOTATION_KEY)

In [None]:
# Color the cell boundaries by the "annotation" column of the scored table.
# NOTE(review): the UMAP cell above selects this column through `_ANNOTATION_KEY`; presumably
# that constant equals "annotation" — confirm, and consider using the constant here as well.
sp.pl.plot_shapes(
    sdata,
    column="annotation",
    img_layer="clahe",
    table_layer= "table_transcriptomics_score_genes",
    shapes_layer="segmentation_mask_boundaries",
    linewidth=0,
    alpha=0.7,
)

In [None]:
# Optional interactive viewer. Both lines stay commented out, like the other viewer cells in
# this notebook, so that "Run All" does not require `napari_spatialdata` to be installed.
# Uncomment both lines to open the viewer.
#from napari_spatialdata import Interactive

#Interactive( sdata )

### 5.3 Custom processing on a table layer.

In [None]:
sdata[ "table_transcriptomics_score_genes" ].to_df().head()

In [None]:
sdata["table_transcriptomics_score_genes"]

In [None]:
import squidpy as sq

sq.gr.spatial_neighbors(  sdata["table_transcriptomics_score_genes"] ,coord_type="generic" )

In [None]:
sdata["table_transcriptomics_score_genes"]

In [None]:
# but not yet backed to the zarr store:
from spatialdata import read_zarr

sdata=read_zarr( sdata.path )
sdata["table_transcriptomics_score_genes"]

# observe how .uns[ "spatial_neighbors" ], .obsp[ "spatial_connectivities" ] and .obsp[ "spatial_distances" ] are no longer in table

Let's write the results back to the zarr store.

In [None]:
from sparrow.utils._keys import _REGION_KEY

sdata["table_transcriptomics_score_genes"].obs[ _REGION_KEY ].cat.categories.to_list()

In [None]:
# Recompute the spatial neighbors, since the reload above dropped the earlier in-memory result.
sq.gr.spatial_neighbors(  sdata["table_transcriptomics_score_genes"] ,coord_type="generic" )

# Add the updated table as a new table layer so the squidpy results are kept in sdata.
# `region` reuses the region categories already stored in the table's obs.
sdata = sp.tb.add_table_layer(
    sdata,
    adata=sdata["table_transcriptomics_score_genes"],
    output_layer="table_transcriptomics_squidpy",
    region=sdata["table_transcriptomics_score_genes"].obs[ _REGION_KEY ].cat.categories.to_list(),
    overwrite=True,
)

In [None]:
sdata["table_transcriptomics_squidpy"]

In [None]:
#sdata=sp.tb.nhood_enrichment( sdata, labels_layer="segmentation_mask", table_layer="table_transcriptomics_score_genes", output_layer="table_transcriptomics_squidpy", overwrite=True )

In [None]:
#sp.pl.nhood_enrichment( sdata, table_layer="table_transcriptomics_squidpy" )

In [None]:
#sdata[ "table_transcriptomics_squidpy"]

In [None]:
#from datasets import sdata_resolve

#sdata=sdata_resolve( output="/Users/arnedf/VIB/DATA/test_data/test.zarr" )