In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
from anndata import AnnData
import pathlib
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import tangram as tg
import os

In [None]:
# Common path prefix of the two input files; the suffixes "_sc" / "_st" select
# the dataset (presumably single-cell and spatial — confirm against the data folder).
data_root = "./data/embyo_E8.5"

sc_h5ad_path = f"{data_root}_sc.h5ad"
st_h5ad_path = f"{data_root}_st.h5ad"

seq_data = sc.read_h5ad(sc_h5ad_path)
spatial_data = sc.read_h5ad(st_h5ad_path)

In [None]:
# QC thresholds used below (dataset-specific; tune here rather than inline).
MAX_SC_TOTAL_COUNTS = 33000   # drop scRNA-seq cells with more total counts than this
MAX_SP_TOTAL_COUNTS = 600     # drop spatial observations with more total counts than this
MIN_CELL_FRACTION = 0.01      # keep scRNA-seq genes detected in at least this fraction of cells

# Lower-case gene names in both datasets so they can be matched by name,
# then de-duplicate any names that collide.
spatial_data.var_names = [x.lower() for x in spatial_data.var_names]
seq_data.var_names = [x.lower() for x in seq_data.var_names]

spatial_data.var_names_make_unique()
seq_data.var_names_make_unique()

# scRNA-seq QC: remove high-count cells and genes whose name starts with "mt-"
# (presumably mitochondrial genes), then drop rarely detected genes.
sc.pp.calculate_qc_metrics(seq_data, percent_top=None, log1p=False, inplace=True)
mt_genes = seq_data.var_names.str.startswith("mt-")
seq_data = seq_data[seq_data.obs["total_counts"] <= MAX_SC_TOTAL_COUNTS, ~mt_genes].copy()
sc.pp.filter_genes(seq_data, min_cells=int(MIN_CELL_FRACTION * seq_data.shape[0]))
# Densify only when X is sparse: a dense ndarray has no `.toarray()`, so an
# unconditional call fails on dense input and when this cell is re-run.
if hasattr(seq_data.X, "toarray"):
    seq_data.X = seq_data.X.toarray()

# Spatial QC: remove high-count observations.
sc.pp.calculate_qc_metrics(spatial_data, percent_top=None, log1p=False, inplace=True)
spatial_data = spatial_data[spatial_data.obs["total_counts"] <= MAX_SP_TOTAL_COUNTS, :].copy()


# Work on copies so the QC-filtered inputs stay available unchanged.
adata = spatial_data.copy()
RNAseq_adata = seq_data.copy()

# preprocess RNAseq data
sc.pp.normalize_total(RNAseq_adata)
sc.pp.log1p(RNAseq_adata)

# genes present in both datasets (sorted, unique); nothing is subset here —
# the list is passed to `tg.pp_adatas` as the training genes.
gene_names = np.intersect1d(adata.var_names, RNAseq_adata.var_names)

In [None]:
# Show the AnnData summary of the QC-filtered spatial dataset.
adata

In [None]:
# Show the AnnData summary of the filtered, normalized and log1p-transformed scRNA-seq dataset.
RNAseq_adata

We prepare the data using `pp_adatas`, which does the following:
- Takes a list of genes from user via the `genes` argument. These genes are used as training genes.
- Annotates training genes under the `training_genes` field, in `uns` dictionary, of each AnnData. 
- Ensures consistent gene order in the datasets (_Tangram_ requires that the $j$-th column in each matrix corresponds to the same gene).
- If the counts for a gene are all zeros in one of the datasets, the gene is removed from the training genes.
- If a gene is not present in both datasets, the gene is removed from the training genes.
- Converts gene names to lower case to remove inconsistent capitalization. If this is not wanted, set the parameter `gene_to_lowercase=False`.

In [None]:
# Prepare both AnnData objects for Tangram, using the shared genes as training genes.
tg.pp_adatas(RNAseq_adata, adata, genes=gene_names)

In [None]:
def cv_data_gen(adata_sc, adata_sp, cv_mode="loo"):
    """Generate (train_genes, test_genes) pairs for gene-level cross validation.

    The genes are taken from ``uns["training_genes"]``, which must be present
    and identical in both AnnData objects.

    Because this is a generator, all validation errors are raised when
    iteration starts, not when the function is called.

    Args:
        adata_sc (AnnData): single cell data
        adata_sp (AnnData): gene spatial data
        cv_mode (str): Optional. ``'loo'`` (leave-one-out) or ``'<k>fold'``
            with an integer ``k``, e.g. ``'5fold'`` or ``'10fold'``.
            Default is 'loo'.

    Yields:
        tuple: list of train_genes, list of test_genes

    Raises:
        ValueError: if ``training_genes`` is missing from either ``uns``, if
            the two ``training_genes`` lists differ, or if ``cv_mode`` is not
            supported.
    """

    # Check that the training_genes key exists and matches in both adatas.uns
    if "training_genes" not in adata_sc.uns.keys():
        raise ValueError("Missing tangram parameters. Run `pp_adatas()`.")

    if "training_genes" not in adata_sp.uns.keys():
        raise ValueError("Missing tangram parameters. Run `pp_adatas()`.")

    if list(adata_sp.uns["training_genes"]) != list(adata_sc.uns["training_genes"]):
        raise ValueError(
            "Unmatched training_genes field in two Anndatas. Run `pp_adatas()`."
        )

    genes_array = np.array(adata_sp.uns["training_genes"])

    # Pick the splitter. An unknown mode used to leave `cv` unbound and fail
    # later with an UnboundLocalError; reject it explicitly instead.
    fold_count = cv_mode[:-4] if isinstance(cv_mode, str) and cv_mode.endswith("fold") else ""
    if cv_mode == "loo":
        cv = LeaveOneOut()
    elif fold_count.isdecimal():
        cv = KFold(n_splits=int(fold_count))
    else:
        raise ValueError(
            "Unsupported cv_mode {!r}. Use 'loo' or '<k>fold' (e.g. '5fold').".format(cv_mode)
        )

    for train_idx, test_idx in cv.split(genes_array):
        train_genes = list(genes_array[train_idx])
        test_genes = list(genes_array[test_idx])
        yield train_genes, test_genes

In [None]:
from tqdm import tqdm
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from tangram import compare_spatial_geneexp

# ---- Cross-validation configuration ----
cv_mode = "5fold"       # "loo" or "<k>fold"; must be a mode `cv_data_gen` accepts
mode = "cells"          # Tangram mapping mode; alternative: "clusters"
map_device = "cuda:1"   # device passed to Tangram; alternative: "cpu"
verbose = False         # print the mean test score after every CV split

if mode == "clusters":
    # Cluster the scRNA-seq cells (Leiden on highly variable genes) and replace
    # RNAseq_adata by its per-cluster aggregated expression.
    # NOTE(review): `map_cells_to_space` below is called without `cluster_label`
    # (it is commented out); Tangram may require it when mode == "clusters" —
    # confirm before switching modes.
    RNAseq_adata_label = RNAseq_adata.copy()
    sc.pp.highly_variable_genes(RNAseq_adata_label)
    RNAseq_adata_label = RNAseq_adata[:, RNAseq_adata_label.var.highly_variable].copy()
    sc.pp.scale(RNAseq_adata_label, max_value=10)
    sc.tl.pca(RNAseq_adata_label)
    sc.pp.neighbors(RNAseq_adata_label)
    sc.tl.leiden(RNAseq_adata_label, resolution = 0.5)
    RNAseq_adata.obs['leiden'] = RNAseq_adata_label.obs.leiden
    del RNAseq_adata_label
    RNAseq_adata = tg.adata_to_cluster_expression(RNAseq_adata, "leiden", scale=True, add_density=False)

# Per-split results, appended in split order.
test_genes_list = []
test_pred_list = []
test_score_list = []
train_score_list = []   # kept for compatibility; train scores are not collected here
test_df_list = []
curr_cv_set = 1

# Number of CV splits, used only for the progress bar total.
if cv_mode == "loo":
    length = len(list(RNAseq_adata.uns["training_genes"]))
elif cv_mode.endswith("fold") and cv_mode[:-4].isdecimal():
    length = int(cv_mode[:-4])
else:
    raise ValueError("Unsupported cv_mode {!r}".format(cv_mode))

for train_genes, test_genes in tqdm(
    cv_data_gen(RNAseq_adata, adata, cv_mode), total=length
):
    # train the mapping on the training genes of this split only
    adata_map = tg.map_cells_to_space(
        RNAseq_adata,
        adata,
        cv_train_genes=train_genes,
        mode=mode,
        #cluster_label="leiden",
        num_epochs=2000,
        learning_rate=0.1,
        verbose=False,
        device=map_device,
    )

    cv_genes = train_genes + test_genes

    # project on space
    adata_ge = tg.project_genes(
        adata_map, RNAseq_adata[:, cv_genes], scale=False,
    )

    # predicted expression of the held-out genes, transposed so that rows are
    # test genes (in `test_genes` order)
    adata_ge_test = adata_ge[:, test_genes].X.T
    test_pred_list.append(adata_ge_test)

    # per-gene comparison table; keep the rows of the held-out genes
    df_g = compare_spatial_geneexp(adata_ge, adata, RNAseq_adata, cv_genes)

    test_df = df_g[df_g.index.isin(test_genes)]
    # Series of scores indexed by gene, in `test_genes` order
    test_score = df_g.loc[test_genes]["score"]

    test_genes_list.append(test_genes)
    test_score_list.append(test_score)
    test_df_list.append(test_df)

    if verbose:
        # `test_score` is a Series, so it must be reduced to a scalar before
        # it can be formatted with "{:.3f}".
        msg = "cv set: {}----test score: {:.3f}".format(
            curr_cv_set, np.nanmean(test_score)
        )
        print(msg)

    curr_cv_set += 1

# One score per held-out gene, concatenated in split order. This handles
# leave-one-out (one gene per split) and k-fold (many genes per split) alike.
test_score_df = pd.concat(test_score_list, axis=0)

# use nanmean to ignore nan in score list
avg_test_score = np.nanmean(test_score_df)

cv_dict = {
    "avg_test_score": avg_test_score
}

print("cv avg test score {:.3f}".format(avg_test_score))

# output df_test_genes dataframe
test_gene_df = pd.concat(test_df_list, axis=0)

# output AnnData for generated spatial data by cross validation:
# stacking the per-split (genes x observations) predictions and transposing
# gives observations x genes, with columns in the same order as `test_score_df`.
adata_ge_cv = sc.AnnData(
    X=np.vstack(test_pred_list).T,
    obs=adata.obs.copy(),
    var=pd.DataFrame(
        test_score_df.values,
        columns=["test_score"],
        index=test_score_df.index,
    ),
)


In [None]:
# Show the AnnData summary of the cross-validated (held-out) gene predictions.
adata_ge_cv

In [None]:
# Per-gene cross-validation test scores as a NumPy array.
adata_ge_cv.var['test_score'].values

In [None]:
# Build a per-gene table in `adata_ge_cv.uns["train_genes_df"]` holding the CV
# test score (stored under the column name "train_score") together with the
# gene sparsity in each dataset and their difference.
cv_genes = adata_ge_cv.var["test_score"].index.values

sparsity_sc = RNAseq_adata[:, cv_genes].var.sparsity
sparsity_sp = adata[:, cv_genes].var.sparsity

train_genes_df = pd.DataFrame(
    adata_ge_cv.var['test_score'].values, cv_genes, columns=["train_score"]
)
# Series are aligned to the table by gene name on assignment.
train_genes_df["sparsity_sc"] = sparsity_sc
train_genes_df["sparsity_sp"] = sparsity_sp
train_genes_df["sparsity_diff"] = sparsity_sp - sparsity_sc

adata_ge_cv.uns["train_genes_df"] = train_genes_df

In [None]:
from sklearn.metrics import mean_squared_error
from scipy import stats

def _as_dense_float(matrix):
    """Return `matrix` as a dense float ndarray (accepts sparse matrices or array-likes)."""
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    return np.asarray(matrix, dtype=float)


# Evaluate every cross-validated gene, in the column order of `adata_ge_cv`.
# Using one gene order for both matrices keeps each measured column paired
# with the prediction for the same gene, and keeps the resulting lists
# aligned with `adata_ge_cv.var`.
eval_genes = list(adata_ge_cv.var_names)
raw_expr = _as_dense_float(adata[:, eval_genes].X)    # measured spatial expression
imputed_expr = _as_dense_float(adata_ge_cv.X)         # cross-validated prediction

pearson = []
spearman = []
kendalltau = []
RMSE = []

for v1, v2 in zip(raw_expr.T, imputed_expr.T):
    pearson.append(stats.pearsonr(v1, v2)[0])
    spearman.append(stats.spearmanr(v1, v2)[0])
    kendalltau.append(stats.kendalltau(v1, v2)[0])
    # Root-mean-squared error computed with NumPy, so the result does not
    # depend on the scikit-learn version's `mean_squared_error` signature.
    RMSE.append(float(np.sqrt(np.mean((v1 - v2) ** 2))))

# RMSE after z-scoring each gene across observations; a constant column
# z-scores to NaN and therefore yields a NaN entry.
norm_raw = stats.zscore(raw_expr, axis=0)
norm_imputed = stats.zscore(imputed_expr, axis=0)
norm_rmse = np.sqrt(np.mean((norm_raw - norm_imputed) ** 2, axis=0)).tolist()

df_sc = pd.DataFrame(
    {"Pearson": pearson, "Spearman": spearman, "Kendalltou": kendalltau, "norm_RMSE": norm_rmse, "RMSE": RMSE},
    index=eval_genes,
)
df_sc.mean()

In [None]:
# Store the per-gene metrics as columns of `adata_ge_cv.var`.
# The values are plain lists, so they are assigned positionally: entry i goes
# to the i-th gene of `adata_ge_cv.var`.
per_gene_metrics = {
    "Pearson": pearson,
    "Spearman": spearman,
    "Kendall_tau": kendalltau,
    "norm_RMSE": norm_rmse,
    "RMSE": RMSE,
}
for metric_name, metric_values in per_gene_metrics.items():
    adata_ge_cv.var[metric_name] = metric_values