In [53]:
import mudata as mu
from deconvatac.tl import tangram
import pandas as pd
import numpy as np

class ExperimentWrapper:
    """
    A simple wrapper around a sacred experiment, making use of sacred's captured functions with prefixes.
    This allows a modular design of the configuration, where certain sub-dictionaries (e.g., "data") are parsed by
    specific method. This avoids having one large "main" function which takes all parameters as input.
    """

    def __init__(self, init_all=True):
        if init_all:
            self.init_all()

    def init_dataset(self, mdata_spatial_path, mdata_reference_path, var_HVF_column, labels_key, modality):

        self.spatial_path = mdata_spatial_path
        self.adata_spatial = mu.read_h5mu(mdata_spatial_path).mod[modality]
        self.adata_reference = mu.read_h5mu(mdata_reference_path).mod[modality]
        # subset on HVFs
        self.adata_spatial = self.adata_spatial[:, self.adata_reference.var[var_HVF_column]]
        self.adata_reference = self.adata_reference[:, self.adata_reference.var[var_HVF_column]]

        self.modality = modality
        self.labels_key = labels_key
        self.var_HVF_column = var_HVF_column

    def init_method(self, method_id):
        self.method_id = method_id

    def init_all(self):
        self.init_dataset()
        self.init_method()

    def run(self, output_path):

        dataset = self.spatial_path.split("/")[-1].split(".")[0]
        dataset_var_column = dataset + "_" + self.var_HVF_column
        output_path = output_path + self.modality + "/" + dataset_var_column

        tangram(
            adata_spatial=self.adata_spatial,
            adata_ref=self.adata_reference,
            labels_key=self.labels_key,
            run_rank_genes=False,
            result_path=output_path,
            device="cuda:0",
            num_epochs=1000,
        )

        results = {
            "result_path": output_path + "/tangram_ct_pred.csv",
            "dataset": dataset,
            "modality": self.modality,
            "var_HVF_column": self.var_HVF_column,
        }
        return results

In [24]:
! pwd

/vol/storage/martensl/deconvATAC/notebooks/analysis


In [54]:
tangram

<function deconvatac.tl.tangram.tangram(adata_spatial, adata_ref, labels_key, run_rank_genes=False, layer_rank_genes=None, num_epochs=1000, device='cpu', return_adatas=False, result_path='./tangram_results', **kwargs)>

In [55]:
ex = ExperimentWrapper(init_all=False)

## Create new reference


In [42]:

sample_cells = pd.read_parquet("/vol/storage/data/simulations/test/Heart_1.pq")


In [61]:
ref = mu.read_h5mu("/vol/storage/data/cellxgene/human_cardiac_niches/human_cardiac_niches.h5mu")

In [62]:
cell_ids = np.concatenate(sample_cells['cell_id'].values)

In [66]:
cell_ids.shape

(4311,)

In [63]:
ref = ref[np.unique(cell_ids)]

  if not is_categorical_dtype(df_full[k]):


In [67]:
ref

In [68]:
ref.write("/vol/storage/data/simulations/test/Heart1_ref.h5mu")

In [71]:
method_id =  "Tangram"
output_path =  "/vol/storage/data/deconvolution_results/test/"


# mdata_reference_path = "/vol/storage/data/cellxgene/human_cardiac_niches/human_cardiac_niches.h5mu"
mdata_reference_path = "/vol/storage/data/simulations/test/Heart1_ref.h5mu"
labels_key = "cell_type"

mdata_spatial_path = "/vol/storage/data/simulations/test/Heart_1.h5mu"

modality = "atac"
var_HVF_column = "highly_variable"

In [70]:
ex.init_dataset(mdata_spatial_path, mdata_reference_path, var_HVF_column, labels_key, modality)
ex.init_method(method_id)

  if not is_categorical_dtype(df_full[k]):


In [73]:
ex.run(output_path)

INFO:root:19988 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:19988 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.


1000


INFO:root:Begin training with 19988 genes and rna_count_based density_prior in cells mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.404, KL reg: 0.110
Score: 0.875, KL reg: 0.000
Score: 0.878, KL reg: 0.000
Score: 0.879, KL reg: 0.000
Score: 0.879, KL reg: 0.000
Score: 0.880, KL reg: 0.000
Score: 0.880, KL reg: 0.000
Score: 0.880, KL reg: 0.000
Score: 0.880, KL reg: 0.000
Score: 0.880, KL reg: 0.000


INFO:root:Saving results..
INFO:root:spatial prediction dataframe is saved in `obsm` `tangram_ct_pred` of the spatial AnnData.


{'result_path': '/vol/storage/data/deconvolution_results/test/atac/Heart_1_highly_variable/tangram_ct_pred.csv',
 'dataset': 'Heart_1',
 'modality': 'atac',
 'var_HVF_column': 'highly_variable'}

In [8]:
5

5

In [7]:
5


5

In [25]:
new = pd.read_csv("/vol/storage/data/deconvolution_results/tangram/atac/russell_250_highly_accessible/tangram_ct_pred.csv", index_col=0)

## Evaluate the results

In [74]:
import seml
import pandas as pd
import glob
import mudata as mu
import deconvatac as de
import seaborn as sns
import tqdm
import os

In [75]:
def get_proportions(adata):
    df = pd.DataFrame(adata.obsm["proportions"], columns=adata.uns["proportion_names"], index=adata.obs_names)
    return df

In [76]:
data_path = "/vol/storage/data/deconvolution_results"

In [77]:
methods = ["tangram"]  # cell2location, moscot
modalities = ["atac"]

In [124]:
# go over all methods and modalities
df = [pd.DataFrame({'path': glob.glob(os.path.join(data_path, method, modality, "*", "*"))}) for method in methods for modality in modalities]

In [125]:
df = pd.concat(df)

In [126]:
df

Unnamed: 0,path
0,/vol/storage/data/deconvolution_results/tangra...
1,/vol/storage/data/deconvolution_results/tangra...
2,/vol/storage/data/deconvolution_results/tangra...
3,/vol/storage/data/deconvolution_results/tangra...
4,/vol/storage/data/deconvolution_results/tangra...
5,/vol/storage/data/deconvolution_results/tangra...
6,/vol/storage/data/deconvolution_results/tangra...
7,/vol/storage/data/deconvolution_results/tangra...
8,/vol/storage/data/deconvolution_results/tangra...
9,/vol/storage/data/deconvolution_results/tangra...


In [127]:
df[['method', 'modality', 'dataset_features']] = df['path'].str.split('/', expand=True).iloc[:, 5:-1]

In [128]:
df['dataset'] = df['dataset_features'].str.rsplit("_", n=2).str[0]
df["features"] = df["dataset_features"].str.split("_", n=2).str[-1]

In [129]:
df

Unnamed: 0,path,method,modality,dataset_features,dataset,features
0,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Brain_1_highly_accessible,Brain_1,highly_accessible
1,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Heart_3_highly_accessible,Heart_3,highly_accessible
2,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_variable,russell_250,highly_variable
3,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Brain_2_highly_variable,Brain_2,highly_variable
4,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_accessible,russell_250,highly_accessible
5,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_accessible,russell_250,highly_accessible
6,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Brain_4_highly_accessible,Brain_4,highly_accessible
7,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Brain_3_highly_accessible,Brain_3,highly_accessible
8,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Brain_1_highly_variable,Brain_1,highly_variable
9,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Heart_1_highly_accessible,Heart_1,highly_accessible


In [146]:
mapping_dict = {
    "russell_250": "/vol/storage/data/simulations/russell_250.h5mu",
    "Heart_1": "/vol/storage/data/simulations/Heart_1.h5mu",
    "Heart_2": "/vol/storage/data/simulations/Heart_2.h5mu",
    "Heart_3": "/vol/storage/data/simulations/Heart_3.h5mu",
    "Heart_4": "/vol/storage/data/simulations/Heart_4.h5mu",
    "Brain_1": "/vol/storage/data/simulations/Brain_1.h5mu",
    "Brain_2": "/vol/storage/data/simulations/Brain_2.h5mu",
    "Brain_3": "/vol/storage/data/simulations/Brain_3.h5mu",
    "Brain_4": "/vol/storage/data/simulations/Brain_4.h5mu",
}

In [147]:
df["mdata_spatial_path"] = df['dataset'].map(mapping_dict)

In [148]:
df = df.query("(dataset == 'russell_250' | dataset == 'Heart_1') ")

In [149]:
df

Unnamed: 0,path,method,modality,dataset_features,dataset,features,mdata_spatial_path
2,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_variable,russell_250,highly_variable,/vol/storage/data/simulations/russell_250.h5mu
4,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_accessible,russell_250,highly_accessible,/vol/storage/data/simulations/russell_250.h5mu
5,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_accessible,russell_250,highly_accessible,/vol/storage/data/simulations/russell_250.h5mu
9,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Heart_1_highly_accessible,Heart_1,highly_accessible,/vol/storage/data/simulations/Heart_1.h5mu
16,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Heart_1_highly_variable,Heart_1,highly_variable,/vol/storage/data/simulations/Heart_1.h5mu
16,/vol/storage/data/deconvolution_results/test/a...,tangram,atac,Heart_1_highly_variable,Heart_1,highly_variable,/vol/storage/data/simulations/Heart_1.h5mu


In [150]:
def load_table(path, index_col):
    res = pd.read_csv(path, index_col=index_col)
    if "q05cell_abundance_w_sf_" in res.columns[0]:
        res.columns = res.columns.to_series().str.split("q05cell_abundance_w_sf_", expand=True).loc[:, 1].values
    elif "meanscell_abundance_w_sf_" in res.columns[0]:
        res.columns = res.columns.to_series().str.split("meanscell_abundance_w_sf_", expand=True).loc[:, 1].values
    if res.index[0] != 0:
        res.index = res.index.astype(int) - 1
    res.index = res.index.astype(str)
    if "cell_ID" in res.columns:
        res.drop("cell_ID", axis=1, inplace=True)
    res = res.div(res.sum(axis=1), axis=0)
    return res

In [151]:
df = pd.concat([df, df.iloc[[-1]]])

In [152]:
df["path"].iloc[-1] = "/vol/storage/data/deconvolution_results/test/atac/Heart_1_highly_variable/tangram_ct_pred.csv"

In [153]:
df

Unnamed: 0,path,method,modality,dataset_features,dataset,features,mdata_spatial_path
2,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_variable,russell_250,highly_variable,/vol/storage/data/simulations/russell_250.h5mu
4,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_accessible,russell_250,highly_accessible,/vol/storage/data/simulations/russell_250.h5mu
5,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,russell_250_highly_accessible,russell_250,highly_accessible,/vol/storage/data/simulations/russell_250.h5mu
9,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Heart_1_highly_accessible,Heart_1,highly_accessible,/vol/storage/data/simulations/Heart_1.h5mu
16,/vol/storage/data/deconvolution_results/tangra...,tangram,atac,Heart_1_highly_variable,Heart_1,highly_variable,/vol/storage/data/simulations/Heart_1.h5mu
16,/vol/storage/data/deconvolution_results/test/a...,tangram,atac,Heart_1_highly_variable,Heart_1,highly_variable,/vol/storage/data/simulations/Heart_1.h5mu
16,/vol/storage/data/deconvolution_results/test/a...,tangram,atac,Heart_1_highly_variable,Heart_1,highly_variable,/vol/storage/data/simulations/Heart_1.h5mu


In [154]:
jsd = []
rmse = []
for _, row in tqdm.tqdm(df.iterrows()):
    if i == 
    # load ground truth
    target_adata = mu.read(row["mdata_spatial_path"])
    targets = get_proportions(target_adata[row["modality"]])

    # load table
    predictions = load_table(row["path"], index_col=(None if row["method"] == "moscot" else 0))
    predictions = predictions.loc[targets.index, targets.columns]
    jsd.append(de.tl.jsd(predictions, targets))
    rmse.append(de.tl.rmse(predictions, targets))
df["jsd"] = jsd
df["rmse"] = rmse

5it [00:23,  4.74s/it]


KeyError: "['Adipocyte', 'Atrial Cardiomyocyte', 'Endothelial cell', 'Fibroblast', 'Lymphatic Endothelial cell', 'Lymphoid', 'Myeloid', 'Neural cell', 'Ventricular Cardiomyocyte'] not in index"

In [144]:
load_table(row["path"], index_col=(None if row["method"] == "moscot" else 0))

Unnamed: 0,Mural cell,Mast cell,Mesothelial cell
0,0.941892,0.010925,0.047183
1,0.585719,0.414263,0.000018
2,0.920166,0.016500,0.063334
3,0.842040,0.080708,0.077253
4,0.761817,0.153722,0.084461
...,...,...,...
956,0.871320,0.073932,0.054749
957,0.828523,0.143127,0.028349
958,0.611134,0.355417,0.033449
959,0.957805,0.001252,0.040943
