In [None]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy
import scvi
from scipy.stats import spearmanr
from scvi.data import cortex, smfish
from scvi.external import GIMVI
import os
import time
import random
import scanpy as sc

import torch
import anndata as ad

import numpy as np
import pandas as pd
import loompy
import matplotlib.pyplot as plt
import scipy.stats as st
from SpaGE.main import SpaGE
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the paired datasets: the scRNA-seq reference (`*_sc.h5ad`) and the
# spatial measurements (`*_st.h5ad`) share the same path prefix.
data_root = "./data/embyo_E8.5"

# QC thresholds, named here so they can be tuned in one place.
MAX_SC_TOTAL_COUNTS = 33000  # scRNA-seq cells above this total count are dropped
MAX_ST_TOTAL_COUNTS = 600    # spatial observations above this total count are dropped
MIN_CELL_FRACTION = 0.01     # fraction of cells used to derive `min_cells` for gene filtering

seq_data = sc.read_h5ad(f"{data_root}_sc.h5ad")
spatial_data = sc.read_h5ad(f"{data_root}_st.h5ad")

# Lower-case gene names so the two modalities can be matched by name, then
# de-duplicate any names that collide after lower-casing.
spatial_data.var_names = [x.lower() for x in spatial_data.var_names]
seq_data.var_names = [x.lower() for x in seq_data.var_names]

spatial_data.var_names_make_unique()
seq_data.var_names_make_unique()

# --- scRNA-seq QC ---
sc.pp.calculate_qc_metrics(seq_data, percent_top=None, log1p=False, inplace=True)
# Genes whose (lower-cased) name starts with "mt-" are removed
# (presumably mitochondrial genes -- confirm for this annotation).
mt_genes = seq_data.var_names.str.startswith("mt-")
seq_data = seq_data[seq_data.obs["total_counts"] <= MAX_SC_TOTAL_COUNTS, ~mt_genes].copy()
sc.pp.filter_genes(seq_data, min_cells=int(MIN_CELL_FRACTION * seq_data.shape[0]))
# Densify only when X is sparse; a dense ndarray has no `.toarray()` and the
# unconditional call would raise AttributeError.
if hasattr(seq_data.X, "toarray"):
    seq_data.X = seq_data.X.toarray()

# --- spatial QC ---
sc.pp.calculate_qc_metrics(spatial_data, percent_top=None, log1p=False, inplace=True)
spatial_data = spatial_data[spatial_data.obs["total_counts"] <= MAX_ST_TOTAL_COUNTS, :].copy()

# Genes present in both modalities (np.intersect1d returns a sorted, unique array).
gene_names = np.intersect1d(spatial_data.var_names, seq_data.var_names)

# Copies restricted to the shared genes, kept for the correlation metrics.
# They are taken BEFORE normalize_total/log1p below, so they hold the
# un-normalized values.
seq_data_ori = seq_data[:, gene_names].copy()
spatial_data_ori = spatial_data[:, gene_names].copy()

# Library-size normalization followed by log1p, applied in place to both
# modalities. (`sc` is used throughout; it is the same module as `scanpy`.)
sc.pp.normalize_total(spatial_data)
sc.pp.log1p(spatial_data)

sc.pp.normalize_total(seq_data)
sc.pp.log1p(seq_data)

## Cross-validation utility function

In [6]:
from tqdm import tqdm
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

def cv_data_gen(genelist, cv_mode="CV", kfold=5):
    """Generate train/test partitions of ``genelist`` for cross-validation.

    Args:
        genelist (sequence): Items to split (e.g. gene names or gene indices).
            It is converted with ``np.array`` and split in the order given;
            no ``shuffle`` argument is passed to the splitter, so any
            randomisation must be applied by the caller beforehand.
        cv_mode (str): Optional. ``"CV"`` for K-fold cross-validation or
            ``"loo"`` for leave-one-out. Default is ``"CV"``.
        kfold (int): Optional. Number of folds used when ``cv_mode == "CV"``;
            ignored for ``"loo"``. Default is 5.

    Yields:
        tuple: ``(train_genes, test_genes)``, two lists holding the elements
        of ``genelist`` assigned to the training and test side of the fold.

    Raises:
        ValueError: If ``cv_mode`` is neither ``"CV"`` nor ``"loo"``. Because
            this is a generator, the error surfaces on first iteration.
    """
    # Validate up front: previously an unknown mode left `cv` unbound and
    # failed later with an opaque UnboundLocalError.
    if cv_mode not in ("loo", "CV"):
        raise ValueError(
            f"Unsupported cv_mode {cv_mode!r}; expected 'CV' or 'loo'."
        )

    genes_array = np.array(genelist)

    if cv_mode == "loo":
        cv = LeaveOneOut()
    else:
        cv = KFold(n_splits=kfold)

    for train_idx, test_idx in cv.split(genes_array):
        train_genes = list(genes_array[train_idx])
        test_genes = list(genes_array[test_idx])
        yield train_genes, test_genes

## Prepare the data and run the model

In [7]:
start = time.time()

# only use genes in both datasets
seq_data = seq_data[:, gene_names].copy()
spatial_data = spatial_data[:, gene_names].copy()

seq_gene_names = seq_data.var_names
n_genes = seq_data.n_vars

# randomly permute all the shared genes (fixed seed for reproducibility)
np.random.seed(seed=0)
rand_gene_idx = np.random.choice(range(n_genes), n_genes, replace=False)

# Per-fold results: positions (into `gene_names`) of the held-out genes and
# the matching SpaGE predictions.
test_gene_list = []
test_df_list = []

n_fold = 5

# `kfold=n_fold` keeps the number of splits and the progress-bar total in
# sync; before, the generator silently relied on its own default of 5.
for train_genes, test_genes in tqdm(
    cv_data_gen(rand_gene_idx, cv_mode="CV", kfold=n_fold), total=n_fold
):

    # spatial_data_partial has a subset of the genes to train on
    spatial_data_partial = spatial_data[:, gene_names[train_genes]].copy()

    # SpaGE takes plain DataFrames with one column per gene.
    ST = pd.DataFrame(spatial_data_partial.X, columns=spatial_data_partial.var_names)
    SC = pd.DataFrame(seq_data.X, columns=seq_data.var_names)

    # Predict the held-out genes for the spatial observations.
    # (n_pv is passed straight to SpaGE -- see the SpaGE docs for its meaning.)
    Imp_Genes = SpaGE(ST, SC, n_pv=30,
                      genes_to_predict=gene_names[test_genes])

    test_df_list.append(Imp_Genes)

    test_gene_list.append(test_genes)

end = time.time()
print("Time eplase is:", (end - start))

# Stitch the folds together so later cells do not depend on hidden kernel
# state: `test_gene_df` has one column per held-out gene (in fold order) and
# `test_gene_ind` holds the matching positions into `gene_names`.
# NOTE(review): assumes every SpaGE result has one row per spatial
# observation in the same order, with predicted genes as columns -- confirm.
test_gene_df = pd.concat(test_df_list, axis=1)
test_gene_ind = np.concatenate(test_gene_list)

100%|██████████| 5/5 [00:30<00:00,  6.12s/it]

Time eplase is: 30.69905734062195





In [17]:
from sklearn.metrics import mean_squared_error
from scipy import stats

# Per-gene agreement between measured and imputed expression.
# The lists are filled in `gene_names` order.
# NOTE(review): `spatial_data_ori` was copied before normalize_total/log1p,
# whereas SpaGE was fed the normalized matrices -- confirm that scoring
# against the un-normalized values is intended (it matters most for
# Pearson and RMSE).
pearson = []
spearman = []
kendalltau = []
RMSE = []

for gene in gene_names:
    # Flatten the (n_obs, 1) slice once instead of once per metric.
    observed = np.asarray(spatial_data_ori[:, gene].X).reshape(-1)
    predicted = np.asarray(test_gene_df[gene])

    personR = stats.pearsonr(observed, predicted)
    spearmanR = stats.spearmanr(observed, predicted)
    kentou = stats.kendalltau(observed, predicted)
    # `squared=False` is deprecated/removed in recent scikit-learn; taking the
    # square root of the MSE gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(observed, predicted))

    # Index 0 of each result is the correlation statistic.
    pearson.append(personR[0])
    spearman.append(spearmanR[0])
    kendalltau.append(kentou[0])
    RMSE.append(rmse)

df_sc = pd.DataFrame({"Pearson": pearson, "Spearman": spearman, "Kendalltou":kendalltau, "RMSE":RMSE})
df_sc.mean()

Pearson       0.254945
Spearman      0.178627
Kendalltou    0.145114
RMSE          1.181270
dtype: float64

In [18]:
# The metric lists were filled in `gene_names` order; key them by gene name so
# they can be attached to the right variable after the reordering below.
gene_metrics = pd.DataFrame(
    {
        "Pearson": pearson,
        "Spearman": spearman,
        "Kendall_tau": kendalltau,
        "RMSE": RMSE,
    },
    index=gene_names,
)

# Reorder the variables to the column order of the imputed matrix.
# Selecting by name (not by integer position) keeps this cell idempotent:
# re-running it does not permute the genes a second time.
spatial_data_ori = spatial_data_ori[:, list(test_gene_df.columns)].copy()

# Column j of obsm["imputed"] now corresponds to var_names[j].
spatial_data_ori.obsm["imputed"] = np.array(test_gene_df)

# Align the metrics to the (reordered) variables by gene name; a plain
# positional list assignment would attach metrics to the wrong genes.
aligned_metrics = gene_metrics.loc[spatial_data_ori.var_names]
for metric_name in aligned_metrics.columns:
    spatial_data_ori.var[metric_name] = aligned_metrics[metric_name].to_numpy()

In [19]:
# Persist the annotated AnnData (imputed matrix in .obsm, metrics in .var).
# "gzip" is a documented value for `compression`; `True` is not.
# NOTE(review): `True` presumably resolved to gzip in the HDF5 backend -- confirm.
spatial_data_ori.write_h5ad("SpaGE_embryo_5fold.h5ad", compression="gzip")