In [None]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad
import scanpy as sc
import squidpy as sq
import spatialdata as sd
import spatialdata_io as sdio
import spatialdata_plot

import torch
import scvi
import tangram as tg
import matplotlib.patches as patches

#from scsampler import scsampler
from joblib import Parallel, delayed

from spatialdata.transformations import (
        Affine,
        Identity,
        MapAxis,
        Scale,
        Sequence,
        Translation,
        get_transformation,
        get_transformation_between_coordinate_systems,
        set_transformation,
    )

# Project root = two directories above the current working directory.
# os.path.abspath resolves the relative path against os.getcwd().
prjdir = os.path.abspath('../..')
already_importable = prjdir in sys.path
if not already_importable:
    # Make the project-local `src` package importable from this notebook.
    sys.path.append(prjdir)

# Worker count, also registered as scanpy's global n_jobs setting.
n_jobs = 32
sc.settings.n_jobs = n_jobs

# Default figure styling applied through scanpy.
sc.set_figure_params(
    dpi=100,
    fontsize=10,
    frameon=True,
    vector_friendly=True,
)

# Two-colour gradient (light grey -> medium blue) registered under the name 'grey_to_blue'.
# NOTE(review): `cmap` is not referenced by any other cell visible in this notebook.
from matplotlib.colors import LinearSegmentedColormap
cmap = LinearSegmentedColormap.from_list('grey_to_blue', ['lightgrey', 'mediumblue'])

# Silence pandas PerformanceWarning messages for the rest of the session.
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project-local helper modules (importable because the project root was appended to sys.path).
import src.spatial_helpers.spatial as spp
import src.spatial_helpers.spatialplot as spl
import src.spatial_helpers.sc as scp
# Project helper; presumably seeds the random number generators for reproducibility — TODO confirm.
scp.set_all_seeds()

In [None]:
# Input and output locations, relative to the notebook's working directory.
datadir = '../../data/xenium_merged'
resultsdir = '../../data/xenium_results'

# Create the results folder on first use and let scanpy save its figures there.
os.makedirs(resultsdir, exist_ok=True)
sc.settings.figdir = resultsdir

In [None]:
# Locations of the public CRC atlas and of its downsampled copy (written and read back further below).
crca_path = '../../data/public/crca.h5ad' # path to CRC atlas
refdata_crca_downsampled_path = '../../data/public/crca_refdata.h5ad' # path to save downsampled version

In [None]:
# Load the integrated dataset that all annotation steps below operate on.
integrated_path = os.path.join(datadir, 'crc_ffpe_integrated.h5ad')
adata = sc.read_h5ad(integrated_path)

## Tangram

In [None]:
# Load the CRC atlas used as the single-cell reference for Tangram.
crca = sc.read_h5ad(crca_path)

In [None]:
# Name passed as `adata_layer` to scp.run_tangram in the cells below.
layer_key='scvi130'

In [None]:
### MUI reference dataset

In [None]:
# Reference subset: cells of the 'MUI_Innsbruck' dataset, excluding blood samples.
is_mui = crca.obs['dataset'] == 'MUI_Innsbruck'
not_blood = crca.obs['sample_type'] != 'blood'
refdata = crca[is_mui & not_blood].copy()

In [None]:
# run tangram
# One mapping per annotation level; each map is saved so it can be reloaded later.
for ref_label, map_file in (
    ('cell_type_middle', 'tgmap_cell_type_middle_scvi_mui.h5ad'),
    ('cell_type_fine', 'tgmap_cell_type_fine_scvi_mui.h5ad'),
):
    tgmap, adata = scp.run_tangram(
        adata,
        refdata,
        ref_label=ref_label,
        adata_layer=layer_key,
        refdata_layer='denoised',
    )
    tg.plot_training_scores(tgmap, bins=20, alpha=0.5)
    tgmap.write_h5ad(os.path.join(datadir, map_file))

In [None]:
# Reload each saved MUI map and hand it to run_tangram with refdata=None.
for ref_label, map_file in (
    ('cell_type_middle', 'tgmap_cell_type_middle_scvi_mui.h5ad'),
    ('cell_type_fine', 'tgmap_cell_type_fine_scvi_mui.h5ad'),
):
    tgmap = sc.read_h5ad(os.path.join(datadir, map_file))
    tgmap, adata = scp.run_tangram(adata, refdata=None, tgmap=tgmap, ref_label=ref_label)

In [None]:
# cell type middle
# Confusion matrix and relabelling dict from the project helper, drawn as an annotated heatmap.
confmat, relabel_dict = scp.get_tangram_annotation(
    adata,
    predictions_key='tangram_cell_type_middle',
    cluster_key='scvi130_nb_leiden_2.5',
    n_thres=25000,
)
with mpl.rc_context({'font.size': 5, 'xtick.labelsize': 10, 'ytick.labelsize': 10}):
    fig, ax = plt.subplots(figsize=(15, 7))
    ax.grid(False)
    sns.heatmap(confmat, annot=True, cmap='Reds', ax=ax)
    fig.savefig(os.path.join(resultsdir, 'leiden_2.5_tangram_cell_type_middle.jpg'), dpi=300, bbox_inches='tight')

In [None]:
# Number of distinct labels assigned across clusters. It is printed explicitly
# because only the last expression of a cell is displayed; a bare len(...) on
# the first line would be silently discarded.
print(len(set(relabel_dict.values())))
# How many clusters received each label.
pd.DataFrame(list(relabel_dict.items()), columns=['Key', 'Value'])['Value'].value_counts()

In [None]:
# Store the cluster-level labels (plain and suffixed variants) on the cells.
relabel_dict_sub = scp.add_suffixes(relabel_dict)
leiden_clusters = adata.obs['scvi130_nb_leiden_2.5']
adata.obs['mui_celltype_trangram_middle'] = leiden_clusters.map(relabel_dict)
adata.obs['mui_celltype_trangram_middle_sub'] = leiden_clusters.map(relabel_dict_sub)
adata.obs['mui_celltype_trangram_middle'].value_counts()

In [None]:
# cell type fine
# Confusion matrix and relabelling dict from the project helper, drawn as an annotated heatmap.
confmat, relabel_dict = scp.get_tangram_annotation(
    adata,
    predictions_key='tangram_cell_type_fine',
    cluster_key='scvi130_nb_leiden_3.8',
    n_thres=25000,
)
with mpl.rc_context({'font.size': 4, 'xtick.labelsize': 10, 'ytick.labelsize': 10}):
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.grid(False)
    sns.heatmap(confmat, annot=True, cmap='Reds', ax=ax)
    fig.savefig(os.path.join(resultsdir, 'leiden_3.8_tangram_cell_type_fine.jpg'), dpi=300, bbox_inches='tight')

In [None]:
# Number of distinct labels assigned across clusters. It is printed explicitly
# because only the last expression of a cell is displayed; a bare len(...) on
# the first line would be silently discarded.
print(len(set(relabel_dict.values())))
# How many clusters received each label.
pd.DataFrame(list(relabel_dict.items()), columns=['Key', 'Value'])['Value'].value_counts()

In [None]:
# Store the cluster-level labels (plain and suffixed variants) on the cells.
relabel_dict_sub = scp.add_suffixes(relabel_dict)
leiden_clusters = adata.obs['scvi130_nb_leiden_3.8']
adata.obs['mui_celltype_trangram_fine'] = leiden_clusters.map(relabel_dict)
adata.obs['mui_celltype_trangram_fine_sub'] = leiden_clusters.map(relabel_dict_sub)
adata.obs['mui_celltype_trangram_fine'].value_counts()

In [None]:
### CRCA complete reference dataset

In [None]:
# downsample refdata
# The downsampled atlas is cached on disk and read back in the following cell,
# so the expensive sampling step is only run when the cache file is missing.
# Delete the file to force a fresh downsample (e.g. after the atlas changes).
if not os.path.exists(refdata_crca_downsampled_path):
    # Imported locally: the top-level scsampler import is commented out in this
    # notebook, so calling scsampler without it raises a NameError.
    from scsampler import scsampler

    refdata = scsampler(crca, obsm='X_scANVI', n_obs=100000, random_state=0, copy=True, random_split=10)
    refdata.write_h5ad(refdata_crca_downsampled_path, compression='gzip')

In [None]:
# Load the downsampled atlas from disk; from here on `refdata` refers to this object.
refdata = sc.read_h5ad(refdata_crca_downsampled_path)
# Index the gene table by its 'var_names' column, keeping the column as well (drop=False).
refdata.var.set_index('var_names', drop=False, inplace=True)

In [None]:
# Duplicate the atlas label columns under 'crca_'-prefixed names, which are
# the ref_label values used for the CRCA Tangram runs below.
for target_col, source_col in (
    ('crca_cell_type_middle', 'cell_type_middle'),
    ('crca_cell_type_fine', 'cell_type_fine'),
):
    refdata.obs[target_col] = refdata.obs[source_col]

In [None]:
# run tangram
# One mapping per annotation level; each map is saved so it can be reloaded later.
for ref_label, map_file in (
    ('crca_cell_type_middle', 'tgmap_crca_cell_type_middle_scvi.h5ad'),
    ('crca_cell_type_fine', 'tgmap_crca_cell_type_fine_scvi.h5ad'),
):
    tgmap, adata = scp.run_tangram(adata, refdata, ref_label=ref_label, adata_layer=layer_key)
    tg.plot_training_scores(tgmap, bins=20, alpha=0.5)
    tgmap.write_h5ad(os.path.join(datadir, map_file))

In [None]:
# Reload each saved CRCA map and hand it to run_tangram with refdata=None.
for ref_label, map_file in (
    ('crca_cell_type_middle', 'tgmap_crca_cell_type_middle_scvi.h5ad'),
    ('crca_cell_type_fine', 'tgmap_crca_cell_type_fine_scvi.h5ad'),
):
    tgmap = sc.read_h5ad(os.path.join(datadir, map_file))
    tgmap, adata = scp.run_tangram(adata, refdata=None, tgmap=tgmap, ref_label=ref_label)

In [None]:
# cell type middle
# Confusion matrix and relabelling dict from the project helper, drawn as an annotated heatmap.
confmat, relabel_dict = scp.get_tangram_annotation(
    adata,
    predictions_key='tangram_crca_cell_type_middle',
    cluster_key='scvi130_nb_leiden_2.5',
    n_thres=25000,
)
with mpl.rc_context({'font.size': 5, 'xtick.labelsize': 10, 'ytick.labelsize': 10}):
    fig, ax = plt.subplots(figsize=(15, 7))
    ax.grid(False)
    sns.heatmap(confmat, annot=True, cmap='Reds', ax=ax)
    fig.savefig(os.path.join(resultsdir, 'leiden_2.5_tangram_crca_cell_type_middle.jpg'), dpi=300, bbox_inches='tight')

In [None]:
# Number of distinct labels, then the count of clusters per label.
n_labels = len(set(relabel_dict.values()))
print(n_labels)
label_table = pd.DataFrame(list(relabel_dict.items()), columns=['Key', 'Value'])
label_table['Value'].value_counts()

In [None]:
# Store the cluster-level labels (plain and suffixed variants) on the cells.
relabel_dict_sub = scp.add_suffixes(relabel_dict)
leiden_clusters = adata.obs['scvi130_nb_leiden_2.5']
adata.obs['crca_celltype_trangram_middle'] = leiden_clusters.map(relabel_dict)
adata.obs['crca_celltype_trangram_middle_sub'] = leiden_clusters.map(relabel_dict_sub)

In [None]:
# cell type fine
# Confusion matrix and relabelling dict from the project helper, drawn as an annotated heatmap.
confmat, relabel_dict = scp.get_tangram_annotation(
    adata,
    predictions_key='tangram_crca_cell_type_fine',
    cluster_key='scvi130_nb_leiden_3.8',
    n_thres=25000,
)
with mpl.rc_context({'font.size': 4, 'xtick.labelsize': 10, 'ytick.labelsize': 10}):
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.grid(False)
    sns.heatmap(confmat, annot=True, cmap='Reds', ax=ax)
    fig.savefig(os.path.join(resultsdir, 'leiden_3.8_tangram_crca_cell_type_fine.jpg'), dpi=300, bbox_inches='tight')

In [None]:
# Number of distinct labels, then the count of clusters per label.
n_labels = len(set(relabel_dict.values()))
print(n_labels)
label_table = pd.DataFrame(list(relabel_dict.items()), columns=['Key', 'Value'])
label_table['Value'].value_counts()

In [None]:
# Store the cluster-level labels (plain and suffixed variants) on the cells.
relabel_dict_sub = scp.add_suffixes(relabel_dict)
leiden_clusters = adata.obs['scvi130_nb_leiden_3.8']
adata.obs['crca_celltype_trangram_fine'] = leiden_clusters.map(relabel_dict)
adata.obs['crca_celltype_trangram_fine_sub'] = leiden_clusters.map(relabel_dict_sub)

In [None]:
# annotation comparison
# Cross-tabulate CRCA-based (rows) against MUI-based (columns) labels and
# scale every column to sum to 1.
confmat = pd.crosstab(adata.obs['crca_celltype_trangram_middle'], adata.obs['mui_celltype_trangram_middle'])
column_totals = confmat.sum(axis=0)
confmat = confmat.div(column_totals, axis=1)
with mpl.rc_context({'font.size': 5, 'xtick.labelsize': 10, 'ytick.labelsize': 10}):
    fig, ax = plt.subplots(figsize=(15, 7))
    ax.grid(False)
    sns.heatmap(confmat, annot=True, cmap='Blues', ax=ax)
    fig.savefig(os.path.join(resultsdir, 'tangram_ref_middle_comparison.jpg'), dpi=300, bbox_inches='tight')

In [None]:
# annotation comparison
# Cross-tabulate CRCA-based (rows) against MUI-based (columns) labels and
# scale every column to sum to 1.
confmat = pd.crosstab(adata.obs['crca_celltype_trangram_fine'], adata.obs['mui_celltype_trangram_fine'])
column_totals = confmat.sum(axis=0)
confmat = confmat.div(column_totals, axis=1)
with mpl.rc_context({'font.size': 5, 'xtick.labelsize': 10, 'ytick.labelsize': 10}):
    fig, ax = plt.subplots(figsize=(15, 7))
    ax.grid(False)
    sns.heatmap(confmat, annot=True, cmap='Blues', ax=ax)
    fig.savefig(os.path.join(resultsdir, 'tangram_ref_fine_comparison.jpg'), dpi=300, bbox_inches='tight')

In [None]:
# Checkpoint: save the Tangram-annotated object (the same file is overwritten by later checkpoints).
adata.write_h5ad(os.path.join(datadir, 'crc_ffpe_anno.h5ad'))

## Markers

In [None]:
# Wilcoxon marker ranking per cluster on the 'norm' layer, followed by a dotplot of the top 6 genes.
rank_key = 'scvi130_nb_leiden_2.5_rank'
sc.tl.rank_genes_groups(
    adata,
    groupby='scvi130_nb_leiden_2.5',
    method='wilcoxon',
    layer='norm',
    key_added=rank_key,
)
sc.pl.rank_genes_groups_dotplot(
    adata,
    key=rank_key,
    n_genes=6,
    layer='norm',
    min_logfoldchange=0.5,
    dendrogram=False,
    save='scvi130_nb_leiden_2.5_markers.png',
)

In [None]:
# Wilcoxon marker ranking per cluster on the 'norm' layer, followed by a dotplot of the top 6 genes.
# NOTE(review): the annotation cells visible in this notebook use resolutions 2.5 and 3.8 only;
# confirm the 3.5 ranking is still needed.
rank_key = 'scvi130_nb_leiden_3.5_rank'
sc.tl.rank_genes_groups(
    adata,
    groupby='scvi130_nb_leiden_3.5',
    method='wilcoxon',
    layer='norm',
    key_added=rank_key,
)
sc.pl.rank_genes_groups_dotplot(
    adata,
    key=rank_key,
    n_genes=6,
    layer='norm',
    min_logfoldchange=0.5,
    dendrogram=False,
    save='scvi130_nb_leiden_3.5_markers.png',
)

In [None]:
# Wilcoxon marker ranking per cluster on the 'norm' layer, followed by a dotplot of the top 6 genes.
# The ranking stored under this key is reused below to name clusters by their top markers.
rank_key = 'scvi130_nb_leiden_3.8_rank'
sc.tl.rank_genes_groups(
    adata,
    groupby='scvi130_nb_leiden_3.8',
    method='wilcoxon',
    layer='norm',
    key_added=rank_key,
)
sc.pl.rank_genes_groups_dotplot(
    adata,
    key=rank_key,
    n_genes=6,
    layer='norm',
    min_logfoldchange=0.5,
    dendrogram=False,
    save='scvi130_nb_leiden_3.8_markers.png',
)

In [None]:
# Checkpoint: overwrite the annotated file so it also contains the marker rankings.
adata.write_h5ad(os.path.join(datadir, 'crc_ffpe_anno.h5ad'))

## Final annotation

In [None]:
# Columns holding the four Tangram-derived labels (MUI and CRCA reference, middle and fine level).
k1, k2 = 'mui_celltype_trangram_middle', 'mui_celltype_trangram_fine'
k3, k4 = 'crca_celltype_trangram_middle', 'crca_celltype_trangram_fine'

# Combined string per cell: '<MUI middle> - <MUI fine> | <CRCA middle> - <CRCA fine>'.
mui_part = adata.obs[k1].astype('str') + ' - ' + adata.obs[k2].astype('str')
crca_part = adata.obs[k3].astype('str') + ' - ' + adata.obs[k4].astype('str')
adata.obs['tangram_combined'] = mui_part + ' | ' + crca_part

In [None]:
# Cluster -> marker-based name from the project helper (n=7), keyed on the string form of the cluster id.
topmarkers_dict = scp.rename_clusters_by_markers(adata, 'scvi130_nb_leiden_3.8_rank', n=7)
cluster_ids = adata.obs['scvi130_nb_leiden_3.8'].astype('str')
adata.obs['topmarkers'] = cluster_ids.map(topmarkers_dict)

# Annotation notes = combined Tangram labels followed by the marker-based name.
adata.obs['scvi130_nb_leiden_3.8_anno_notes'] = (
    adata.obs['tangram_combined'] + ' | ' + adata.obs['topmarkers']
)

In [None]:
# Cluster id -> annotation notes (the last cell seen per cluster wins), ordered by cluster id.
notes_by_cluster = dict(zip(adata.obs['scvi130_nb_leiden_3.8'], adata.obs['scvi130_nb_leiden_3.8_anno_notes']))
tangram_dict = {cluster: notes_by_cluster[cluster] for cluster in sorted(notes_by_cluster)}

In [None]:
# Share of each cluster's cells per tissue region (every column sums to 1).
confmat = pd.crosstab(adata.obs['tissue_region'], adata.obs['scvi130_nb_leiden_3.8'])
column_totals = confmat.sum(axis=0)
confmat = confmat.div(column_totals, axis=1)
with mpl.rc_context({'font.size': 5, 'xtick.labelsize': 8, 'ytick.labelsize': 8}):
    fig, ax = plt.subplots(figsize=(17, 2))
    ax.grid(False)
    sns.heatmap(confmat, annot=True, cmap='Blues', ax=ax)

In [None]:
# Per-cell annotation notes looked up via the cluster id, stored as a categorical column.
cluster_notes = adata.obs['scvi130_nb_leiden_3.8'].map(tangram_dict)
adata.obs['anno_notes'] = cluster_notes.astype('category')

In [None]:
# Manual cell-type label for each 'scvi130_nb_leiden_3.8' cluster id.
# The trailing comment on each entry has the layout
#   '<MUI middle> - <MUI fine> | <CRCA middle> - <CRCA fine> | <top markers>',
# which matches the 'scvi130_nb_leiden_3.8_anno_notes' strings built earlier in this notebook.
# NOTE(review): presumably pasted from `tangram_dict`; re-check the entries if the clustering is recomputed.
# 'Cancer or epithelial cell' is a placeholder that a later cell resolves using 'tissue_region'.
anno_dict = {
 0: 'Cancer cell', # 'Cancer stem-like - Cancer TA-like | Cancer cell - Cancer TA-like | AREG-CD47-MET-CCND1-ERBB3-MKI67-STMN1',
 1: 'Plasma cell', # 'Plasma IgG - Plasma IgG | Plasma cell - Plasma IgG | IGHG1-IGHGP-IGHG4-IGHG2-IGHG3-IGKC-JCHAIN',
 2: 'Smooth muscle cell', # 'Endothelial cell - Pericyte | Pericyte - Pericyte | ACTA2-SPARCL1-ACTB-C1S-IGFBP7-RGS5-C1R',
 3: 'Epithelial cell', # 'Epithelial cell - Goblet | Epithelial cell - Colonocyte | CCL28-REG4-EPCAM-CDX1-JCHAIN-TFF3-ERBB2',
 4: 'Endothelial cell', # 'Endothelial cell - Endothelial arterial | Endothelial cell - Endothelial arterial | IGFBP7-RGS5-SPARC-PLVAP-A2M-FLT1-SPARCL1',
 5: 'Cancer or epithelial cell', # 'Cancer non-stem-like - TA progenitor | Epithelial cell - Colonocyte | EPCAM-CEACAM6-REG4-TFF3-MUC5AC-CEACAM8-IFNL1',
 6: 'T cell', # 'T cell CD4 - CD4 | CD4 - CD8 stem-like | CD2-CD3E-IL7R-CORO1A-IL2RB-PTPRC-CD3D',
 7: 'Cancer cell', # 'Cancer stem-like - TA progenitor | Epithelial progenitor - TA progenitor | TFF3-XBP1-C1QBP-CEACAM8-CD44-NOTCH1-CTNNB1',
 8: 'Fibroblast', # 'Stromal cell - Fibroblast S3 | Fibroblast - Fibroblast S3 | LUM-FN1-SPARC-IGFBP7-VCAN-DCN-C1S',
 9: 'Plasma cell', # 'Plasma IgA - Plasma IgA | Plasma cell - Plasma IgA | JCHAIN-IGKC-SSR4-MZB1-IRF4-FKBP11-DERL3',
 10: 'Macrophage', # 'Macrophage - Macrophage | Macrophage - Macrophage | CD74-MPEG1-CTSD-CD163-CD4-APOE-PLA2G7',
 11: 'Fibroblast', # 'Plasma IgA - Fibroblast S1 | Fibroblast - Fibroblast S1 | JCHAIN-IGKC-CXCL14-A2M-C1S-CXCL12-PDGFRA',
 12: 'Cancer cell', # 'Cancer non-stem-like - Cancer Goblet-like | Goblet - Goblet | TFF3-REG4-XBP1-MUC5AC-CEACAM6-SOX9-CD44',
 13: 'Cancer cell', # 'Cancer stem-like - TA progenitor | Cancer cell - Cancer Crypt-like | C1QBP-SOX9-CCL20-TUBA1B-IFITM3-TP53-MKI67',
 14: 'Cancer cell', # 'Cancer stem-like - Cancer Crypt-like | Cancer cell - Cancer Crypt-like | LAMP2-PRDX4-ERBB3-CCL20-CDX2-VEGFA-IRF8',
 15: 'B cell', # 'B cell - B cell activated | B cell - B cell activated | MS4A1-FCMR-IGHM-TNFRSF13C-CD74-CD79A-BANK1',
 16: 'Neutrophil', # 'Neutrophil - TAN1 | Neutrophil - Neutrophil | S100A9-G0S2-ITGAX-IL1B-SOCS3-PLEK-FCGR2A',
 17: 'Cancer cell', # 'Cancer stem-like - Cancer TA-like | Cancer cell - Cancer Colonocyte-like | CXCL3-CXCL1-IFITM3-HLA-B-CD74-AREG-ERBB3',
 18: 'Cancer cell', # 'Cancer stem-like - Cancer Crypt-like | Cancer cell - Cancer Crypt-like | RNF43-TFF3-CEACAM6-VEGFA-CD44-CEACAM1-CEACAM8',
 19: 'Cancer cell', # 'Cancer stem-like - Cancer Crypt-like | Cancer cell - Cancer Crypt-like | CEACAM6-CDX2-VEGFA-CEACAM8-ERBB3-FN1-CEACAM1',
 20: 'Macrophage', # 'Macrophage - Macrophage | Macrophage - Macrophage | CD74-CD163-MPEG1-CSF1R-CD4-FCGR2A-IL10RA',
 21: 'Cancer cell', # 'Cancer stem-like - Cancer TA-like | Cancer cell - Cancer TA-like | CEACAM6-CDX2-CEACAM8-UBE2C-HLA-B-VEGFA-CXCL3',
 22: 'Cancer cell', # 'Cancer non-stem-like - Cancer Colonocyte-like | Epithelial cell - Cancer Colonocyte-like | CEACAM1-CEACAM6-SOX9-CEACAM8-VEGFA-TMPRSS2-TFF3',
 23: 'Cancer cell', # 'Cancer stem-like - Cancer Crypt-like | CRLM - Cancer TA-like | IFITM3-AREG-CEACAM1-TFF3-CEACAM8-CEACAM6-RNF43',
 24: 'Fibroblast', # 'Stromal cell - Fibroblast S3 | Fibroblast - Fibroblast S2 | IGFBP7-CXCL14-LUM-SPARC-VCAN-C1R-C1S',
 25: 'Fibroblast', # 'Stromal cell - Fibroblast S3 | Fibroblast - Fibroblast S2 | IGFBP7-LUM-SPARC-CXCL14-C1S-ACTA2-DCN',
 26: 'Epithelial cell', # 'Epithelial cell - TA progenitor | Epithelial cell - TA progenitor | EPCAM-CDX2-CDX1-XBP1-ERBB2-C1QBP-ERBB3',
 27: 'Plasma cell', # 'Plasma IgG - Plasma IgG | Plasma cell - Plasma IgG | IGKC-IGHG1-IGHGP-LUM-IGHG4-IGFBP7-IGHG2',
 28: 'Fibroblast', # 'T cell CD4 - Treg | CD4 - Treg | LUM-DCN-C1S-SPARC-IGFBP7-C1R-VCAN',
 29: 'Cancer cell', # 'Cancer stem-like - Cancer Crypt-like | Cancer cell - Cancer Crypt-like | NOTCH1-SDC1-RNF43-SOX9-ID1-CD47-CDX2',
 30: 'Fibroblast', # 'Stromal cell - Fibroblast S3 | Fibroblast - Fibroblast S3 | DCN-MGP-LUM-C1S-C1R-CXCL12-VCAN',
 31: 'Cancer cell', # 'Cancer stem-like - Cancer Crypt-like | CRLM - Cancer Crypt-like | FN1-RNF43-CTNNB1-VEGFA-IFITM3-CDKN1B-STMN1',
 32: 'Cancer cell', # 'Cancer stem-like - Cancer Colonocyte-like | Cancer cell - Cancer Crypt-like | CCND1-CTNNB1-RNF43-LAMP1-EPCAM-ERBB2-SDC1',
 33: 'Cancer cell', # 'Cancer stem-like - Cancer Colonocyte-like | CRLM - Cancer Colonocyte-like | VEGFA-EPCAM-CEACAM6-CEACAM1-CEACAM8-TFF3-CDX2',
 34: 'Endothelial cell', # 'Endothelial cell - Endothelial arterial | Endothelial cell - Endothelial arterial | PLVAP-SPARCL1-IGFBP7-A2M-FLT1-RGS5-IFITM3',
 35: 'Epithelial cell', # 'Epithelial cell - Colonocyte | Epithelial cell - Colonocyte | TMPRSS2-CEACAM1-CEACAM8-CEACAM6-CCL28-CDKN2B-ANPEP',
 36: 'Cancer cell', # 'Cancer non-stem-like - Cancer Goblet-like | Cancer cell - Cancer TA-like | REG4-SOX9-CXCL14-EPCAM-LAMP1-PTEN-CTSD',
 37: 'Cancer or epithelial cell', # 'Cancer non-stem-like - Goblet | Goblet - Goblet | XBP1-EPCAM-TFF3-REG4-CEACAM6-MUC5AC-CDX2',
 38: 'Cancer cell', # 'Cancer non-stem-like - Cancer Colonocyte-like | Epithelial cell - Colonocyte BEST4 | DMBT1-TMPRSS2-TFF3-CCL28-CEACAM1-CCL20-ERBB2',
 39: 'Cancer cell', # 'Cancer non-stem-like - Cancer Colonocyte-like | Epithelial cell - Cancer Colonocyte-like | SOX9-CEACAM6-CEACAM8-CEACAM1-MET-TMPRSS2-STAT1',
 40: 'Macrophage', # 'Macrophage - Macrophage | Macrophage - Monocyte non-classical | CTSD-APOE-MMP12-CD74-ITGAX-CD68-FCGR2B',
 41: 'Cancer cell', # 'Cancer stem-like - Cancer Colonocyte-like | Cancer cell - Cancer Colonocyte-like | CEACAM6-CEACAM8-MX1-IFITM3-ACE2-RNF43-CEACAM1',
 42: 'Cancer cell', # 'Cancer stem-like - TA progenitor | Cancer cell - TA progenitor | C1QBP-CTNNB1-CCND1-CDX2-TP53-ACTB-SOX9',
 43: 'Cancer cell', # 'Cancer stem-like - Cancer Colonocyte-like | Cancer cell - Colonocyte | VEGFA-EPCAM-CDX2-TMPRSS2-ID1-CEACAM1-ID2',
 44: 'T cell', # 'Cancer stem-like - gamma-delta | Cancer cell - CD8 | CD2-CD3E-IL2RB-ITGAE-GZMA-CD8A-CCL5',
 45: 'Macrophage', # 'Stromal cell - Macrophage | Fibroblast - Macrophage | CTSD-SPARC-CD74-FN1-APOE-IGFBP7-LUM',
 46: 'Dendritic cell', # 'Endothelial cell - cDC1 | Endothelial cell - cDC2 | CD74-MPEG1-ITGAX-IL10RA-CD4-AIF1-CSF1R',
 47: 'Fibroblast', # 'Cancer non-stem-like - CD8 | Epithelial cell - Fibroblast S3 | SPARC-LUM-IGFBP7-FN1-VCAN-ACTA2-C1S',
 48: 'Mast cell', # 'Mast cell - Mast cell | Mast cell - Tuft | CPA3-KIT-MS4A2-GATA2-HPGDS-IGKC-P2RX1',
 49: 'Cancer cell', # 'Cancer stem-like - Cancer Crypt-like | Cancer cell - Cancer Crypt-like | NOTCH1-RNF43-ERBB2-CD44-CTNNB1-EPCAM-SYK',
 50: 'Schwann cell', # 'Stromal cell - Schwann cell | Fibroblast - Schwann cell | S100B-SPARC-SPARCL1-IGFBP7-NCAM1-C1R-C1S',
 51: 'Endothelial cell', # 'Macrophage - Endothelial venous | Macrophage - Endothelial venous | CCL21-IGFBP7-IFITM3-TGFB1-A2M-NOTCH1-MGP',
 52: 'Cancer cell', # 'Cancer stem-like - Cancer Colonocyte-like | Cancer cell - Cancer Colonocyte-like | ACTB-CEACAM6-LIF-CCND1-SMAD3-CEACAM8-CDKN2B',
 53: 'Epithelial cell', # 'Epithelial cell - Colonocyte BEST4 | Epithelial cell - Colonocyte BEST4 | CCL20-SPIB-IL2RG-TMPRSS2-CCL15-VSIR-JAK2',
 54: 'Macrophage', # 'Macrophage - Cancer Crypt-like | Macrophage - Macrophage cycling | ITGAX-CCND1-RNF43-TUBA1B-CTNNB1-PLA2G7-S100A9'
}

In [None]:
# Variant of anno_dict produced by the project helper; presumably appends a per-cluster
# suffix so clusters sharing a label stay distinguishable — TODO confirm against scp.add_suffixes.
anno_dict_sub = scp.add_suffixes(anno_dict)

In [None]:
# Translate cluster ids into the manual cell-type labels (plain and suffixed).
# A cluster missing from the mapping (e.g. after re-clustering, or when the
# key dtype differs from the cluster dtype) would otherwise become NaN and
# later be turned into the literal label 'nan', so fail loudly instead.
leiden_clusters = adata.obs['scvi130_nb_leiden_3.8']
for obs_key, cluster_to_label in (('celltype', anno_dict), ('celltype_sub', anno_dict_sub)):
    mapped_labels = leiden_clusters.map(cluster_to_label)
    # Cells without a cluster id are left alone; only real clusters must be mapped.
    unmapped_mask = mapped_labels.isna() & leiden_clusters.notna()
    if unmapped_mask.any():
        unmapped_clusters = sorted(leiden_clusters[unmapped_mask].astype(str).unique())
        raise ValueError(
            f"No '{obs_key}' label for clusters {unmapped_clusters} of 'scvi130_nb_leiden_3.8'; "
            "check that the mapping keys match the cluster ids (including int vs. str)."
        )
    adata.obs[obs_key] = mapped_labels.astype('category')

In [None]:
# reassign mixed clusters
adata.obs['celltype'] = adata.obs['celltype'].astype(str)
ix = (adata.obs['celltype'] == 'Cancer or epithelial cell') & (adata.obs['tissue_region'] == 'normal')
adata.obs.loc[ix, 'celltype'] = 'Epithelial cell'
ix = (adata.obs['celltype'] == 'Cancer or epithelial cell') & (adata.obs['tissue_region'] != 'normal')
adata.obs.loc[ix, 'celltype'] = 'Cancer cell'
adata.obs['celltype'] = adata.obs['celltype'].astype('category')

In [None]:
# Number of cells per final cell-type label.
adata.obs['celltype'].value_counts()

In [None]:
# Dendrogram of the cell types computed on the 'scvi130' representation, saved as a wide figure.
sc.tl.dendrogram(adata, groupby='celltype', use_rep='scvi130')
fig, ax = plt.subplots(figsize=(15, 3))
sc.pl.dendrogram(adata, groupby='celltype', ax=ax)
fig.savefig(
    os.path.join(resultsdir, 'dendrogram_celltype.jpg'),
    dpi=300,
    bbox_inches='tight',
)

In [None]:
# Dendrogram of the suffixed cell types computed on the 'scvi130' representation, saved as a wide figure.
sc.tl.dendrogram(adata, groupby='celltype_sub', use_rep='scvi130')
fig, ax = plt.subplots(figsize=(15, 3))
sc.pl.dendrogram(adata, groupby='celltype_sub', ax=ax)
fig.savefig(
    os.path.join(resultsdir, 'dendrogram_celltype_sub.jpg'),
    dpi=300,
    bbox_inches='tight',
)

In [None]:
# Composition plots from the project helper: cell types by 'segm_meth' and by
# tissue region, then the cell-type composition of every patient.
cell_table = adata.obs
scp.plot_composition(
    cell_table,
    group_key='celltype',
    composition_key='segm_meth',
    width=3,
    height=6,
    save=os.path.join(resultsdir, 'composition_segm_meth.jpeg'),
)
scp.plot_composition(
    cell_table,
    group_key='celltype',
    composition_key='tissue_region',
    width=3,
    height=6,
    save=os.path.join(resultsdir, 'composition_tissue_region.jpeg'),
)
scp.plot_composition(
    cell_table,
    group_key='patient_id',
    composition_key='celltype',
    width=10,
    height=6,
    save=os.path.join(resultsdir, 'composition_celltype.jpeg'),
)

In [None]:
# Wilcoxon marker ranking per final cell type on the 'norm' layer, followed by a dotplot of the top 6 genes.
rank_key = 'celltype_rank'
sc.tl.rank_genes_groups(
    adata,
    groupby='celltype',
    method='wilcoxon',
    layer='norm',
    key_added=rank_key,
)
sc.pl.rank_genes_groups_dotplot(
    adata,
    key=rank_key,
    n_genes=6,
    layer='norm',
    min_logfoldchange=0.5,
    dendrogram=False,
    save='celltype_markers.png',
)

In [None]:
# Hex colour for each cell-type label.
# NOTE(review): 'Pericyte' and 'None' have no matching label in anno_dict above;
# presumably kept for other plots — confirm.
colors = {
    'Cancer cell': '#f54733',
    'Epithelial cell': '#b83d4a',
    'T cell': '#ff9d00',
    'Macrophage': '#f0dd0c',
    'Fibroblast': '#00bdad',
    'Plasma cell': '#19449c',
    'B cell': '#607bcc',
    'Schwann cell': '#8eaba6',
    'Dendritic cell': '#ff7fbf',
    'Neutrophil': '#FF3CFE',
    'Endothelial cell': '#911a62',
    'Smooth muscle cell': '#db959a',
    'Mast cell': '#c0d490',
    'Pericyte': '#4b9b83',
    'None': '#ECECEC'
}
# Stored as a nested dict under uns['colors'], keyed by the obs column the palette belongs to.
adata.uns['colors'] = {'celltype': colors}

In [None]:
# Preview the palette: one unit-length horizontal bar per cell type, with its label to the right.
fig, ax = plt.subplots(figsize=(8, 6))
for row, (cell_type, color) in enumerate(colors.items()):
    ax.barh(row, 1, color=color)
    ax.text(1.05, row, cell_type, va='center', ha='left', fontsize=12)
ax.axis('off')
plt.show()

In [None]:
# Final checkpoint: overwrite the annotated file with cell types, marker rankings and colours included.
adata.write_h5ad(os.path.join(datadir, 'crc_ffpe_anno.h5ad'))

In [None]:
# Attach the annotated table to the SpatialData object and persist it in the zarr store.
sdata = sd.read_zarr(os.path.join(datadir, 'crca_xenium.zarr'))
sdata['anno'] = adata
# Project helper; presumably aligns the table's cell ids with the 'cell_boundaries' shapes — TODO confirm.
sdata = spp.match_ids(sdata, ['cell_boundaries'], table_key='anno')
# Declare that every row of the table annotates the 'cell_boundaries' element.
sdata.tables['anno'].obs['region'] = 'cell_boundaries'
sdata.set_table_annotates_spatialelement('anno', region_key='region', region='cell_boundaries')

# Only remove a previous copy when one is actually stored on disk: deleting an
# element that was never written fails, which would break the first run
# against a fresh store.
anno_on_disk = any(
    element_path.split('/')[-1] == 'anno'
    for element_path in sdata.elements_paths_on_disk()
)
if anno_on_disk:
    sdata.delete_element_from_disk('anno')
sdata.write_element('anno', overwrite=True)