In [1]:
# Integration of biopsies and organ donors from 2 studies
import anndata, numpy as np, pandas as pd, imp, lpy, scanpy as sc
sc.logging.print_versions()

scanpy==1.5.1 anndata==0.7.4 umap==0.4.6 numpy==1.19.1 scipy==1.5.2 pandas==1.0.5 scikit-learn==0.23.2 statsmodels==0.11.1 python-igraph==0.8.2 louvain==0.6.1 leidenalg==0.8.1


In [2]:
# Load the AnnData object that the rest of this notebook extends (read from N2-integrated_donors.h5ad)
adata = anndata.read_h5ad("N2-integrated_donors.h5ad")

In [3]:
# Endothelial compartment: prepare the scVI input from the cells labelled "Endothelial".
# "SampleID" is the batch field; nbgenes=0 is used to keep all genes (no gene selection).
# doinspect=True prints the source of the helper before it runs.
das = lpy.scvi_prepare(
    anndatapath=adata,
    field="SampleID",
    cellfilter=adata.obs["broad_celltypes"] == "Endothelial",
    nbgenes=0,
    doinspect=True,
)

[35;46;1mPrepare Data for Scvi/TotalVi[0m[34m
def scvi_prepare(anndatapath, field, cellfilter = None, nbgenes = 5000, genes_to_filter= None, use_ccfilter_prefix=None, citeseqkey = "protein_expression", use_raw_slot_instead =None, min_cell_threshold= 0, doinspect=False):
    if doinspect is True: print("\033[35;46;1mPrepare Data for Scvi/TotalVi\033[0m\033[34m"); print(inspect.getsource(scvi_prepare));print("\033[31;43;1mExecution:\033[0m")
    if use_ccfilter_prefix is not None :
        genes_to_filter = [use_ccfilter_prefix + x for x in ["HMGB2","CDK1","NUSAP1","UBE2C","BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF","TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11","ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1","CDC20","TTK","CDC25C","KIF2C","RANGAP1","NCAPD2","DLGAP5","CDCA2","CDCA8","ECT2","KIF23","HMMR","AURKA","PSRC1","ANLN","LBR","CKAP5","CENPE","CTCF","NEK2","G2E3","GAS2L3","CBX5","CENPA","MCM5","PCNA","TYMS","FEN1"

In [4]:
# Train scVI on the prepared endothelial dataset, using a 16-dimensional latent space
latent = lpy.runSCVI(
    dataset=das["dataset"],
    n_latent=16,
    doinspect=True,
)

[35;46;1mRun scvi[0m[34m
def runSCVI(dataset, nbstep = 500, n_latent = 64, doinspect= False):
    """Train an scVI VAE on `dataset` and return the latent coordinates of all its entries.

    Parameters
    ----------
    dataset : scVI dataset object exposing `nb_genes`, `n_batches`, `n_labels` and `len()`.
    nbstep : number of epochs passed to the trainer (`n_epochs`).
    n_latent : size of the latent space given to the VAE.
    doinspect : when True, print this function's source before executing.

    Returns
    -------
    The first element of `get_latent()` evaluated on a sequential posterior
    built over every index of `dataset`.
    """
    if doinspect is True:
        print("\033[35;46;1mRun scvi\033[0m\033[34m")
        print(inspect.getsource(runSCVI))
        print("\033[31;43;1mExecution:\033[0m")

    model = VAE(
        dataset.nb_genes,
        n_batch=dataset.n_batches,
        n_labels=dataset.n_labels,
        n_latent=n_latent,
    )
    # The trainer is built with train_size=0.9; the posterior below nevertheless covers all indices.
    fitter = UnsupervisedTrainer(model, dataset, train_size=0.9, frequency=5, use_cuda=True)
    fitter.train(n_epochs=nbstep)

    every_index = np.arange(len(dataset))
    posterior = fitter.create_posterior(fitter.model, dataset, indices=every_index)
    # sequential() is used so that rows come back in dataset order — presumably; confirm against scVI docs
    latent_outputs = posterior.sequential().get_latent()
    return latent_outputs[0]

[31;43;1mExecution:[0m
[2020-11-04 16:40:01,437] INFO - scvi.inference.inference | KL warmup for 400 epochs


HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…




In [5]:
# Store the latent variables back in the object, and produce UMAP coordinates and Leiden clusters.
# tsne_key=None: no t-SNE key is supplied for this call.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="endothelial",
    umap_key="X_umap_endothelial",
    tsne_key=None,
    leiden_key="leiden_endothelial_raw",
    cellnames=das["names"],
    doinspect=True,
)

[35;46;1mCompute Clusters and Reduces representations[0m[34m
def insertLatent(adata, latent , latent_key= "latent", umap_key= "X_umap", tsne_key = "X_tsne", leiden_key = "leiden", rename_cluster_key= None,cellfilter = None, cellnames =None, leiden_resolution=1.0,doinspect=False):
    if doinspect is True: print("\033[35;46;1mCompute Clusters and Reduces representations\033[0m\033[34m"); print(inspect.getsource(insertLatent));print("\033[31;43;1mExecution:\033[0m")


    if cellnames is None:
        #order of full must match
        assert latent.shape[0] == len(adata.obs_names),  "cell names need for be provided if size of latent mismatches adata"
        map = range(len(adata.obs_names))
        if latent_key is not None:
            adata.obsm[latent_key] = latent
    else:
        print("defining permutation")
        dalist = list(adata.obs_names)
        tmap = {}
        for i in range(len(adata.obs_names)):
            tmap.update( {adata.obs_names[i] : i})
# for i in range(

In [6]:
# Immune compartment: same three steps as for the endothelial cells above
# (prepare -> train scVI -> insert latent/UMAP/Leiden), with the same parameters.
das = lpy.scvi_prepare(
    anndatapath=adata,
    field="SampleID",
    cellfilter=adata.obs["broad_celltypes"] == "Immune",
    nbgenes=0,
)
latent = lpy.runSCVI(dataset=das["dataset"], n_latent=16)
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="immune",
    umap_key="X_umap_immune",
    tsne_key=None,
    leiden_key="leiden_immune_raw",
    cellnames=das["names"],
)

[2020-11-04 17:34:47,623] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 17:34:47,703] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 17:34:47,706] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 17:34:47,739] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 17:34:47,741] INFO - scvi.dataset.dataset | Downsampled from 18 to 18 cells
[2020-11-04 17:34:47,745] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 17:34:47,787] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 17:34:47,788] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 17:34:47,819] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 17:34:47,824] INFO - scvi.dataset.dataset | Downsampled from 82 to 82 cells
[2020-11-04 17:34:47,827] INFO - scvi.datase

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id


In [7]:
# In addition, find clusters at Leiden resolution 3 for the immune population.
# The latent/UMAP/t-SNE keys are None, so only the cluster labels are added.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres3_immune_raw",
    cellnames=das["names"],
    leiden_resolution=3,
)

defining permutation
Finding clusters
Inserting Cluster Id


In [8]:
# Stromal compartment: same method and parameters as for the immune cells.
das = lpy.scvi_prepare(
    anndatapath=adata,
    field="SampleID",
    cellfilter=adata.obs["broad_celltypes"] == "Stromal",
    nbgenes=0,
)
latent = lpy.runSCVI(dataset=das["dataset"], n_latent=16)
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="stromal",
    umap_key="X_umap_stromal",
    tsne_key=None,
    leiden_key="leiden_stromal_raw",
    cellnames=das["names"],
)

... storing 'leiden_immune_raw' as categorical
... storing 'leidenres3_immune_raw' as categorical


[2020-11-04 17:39:51,903] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 17:39:51,942] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 17:39:51,944] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 17:39:51,966] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 17:39:51,969] INFO - scvi.dataset.dataset | Downsampled from 82 to 82 cells
[2020-11-04 17:39:51,972] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 17:39:52,164] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 17:39:52,166] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 17:39:52,539] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 17:39:52,599] INFO - scvi.dataset.dataset | Downsampled from 1277 to 1277 cells
[2020-11-04 17:39:52,609] INFO - scvi.da

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id


In [9]:
# Supporting compartment: same method as before, except that scvi_prepare is
# additionally given min_cell_threshold=2 (the earlier compartments used the default).
das = lpy.scvi_prepare(
    anndatapath=adata,
    field="SampleID",
    cellfilter=adata.obs["broad_celltypes"] == "Supporting",
    nbgenes=0,
    min_cell_threshold=2,
)
latent = lpy.runSCVI(dataset=das["dataset"], n_latent=16)
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="supporting",
    umap_key="X_umap_supporting",
    tsne_key=None,
    leiden_key="leiden_supporting_raw",
    cellnames=das["names"],
)

[2020-11-04 21:25:05,196] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 21:25:05,786] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 21:25:05,788] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 21:25:06,851] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 21:25:07,026] INFO - scvi.dataset.dataset | Downsampled from 3713 to 3713 cells
[2020-11-04 21:25:07,041] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 21:25:07,085] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 21:25:07,086] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 21:25:07,126] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 21:25:07,132] INFO - scvi.dataset.dataset | Downsampled from 156 to 156 cells
[2020-11-04 21:25:07,135] INFO - scvi.

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id


In [10]:
# Epithelial compartment (all cells): same method as before, with
# min_cell_threshold=2 passed to scvi_prepare as for the supporting cells.
das = lpy.scvi_prepare(
    anndatapath=adata,
    field="SampleID",
    cellfilter=adata.obs["broad_celltypes"] == "Epithelial",
    nbgenes=0,
    min_cell_threshold=2,
)
latent = lpy.runSCVI(dataset=das["dataset"], n_latent=16)
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="epithelial",
    umap_key="X_umap_epithelial",
    tsne_key=None,
    leiden_key="leiden_epithelial_raw",
    cellnames=das["names"],
)


[2020-11-04 21:52:35,768] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 21:52:35,770] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 21:52:37,763] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 21:52:37,874] INFO - scvi.dataset.dataset | Downsampled from 15483 to 15483 cells
[2020-11-04 21:52:37,881] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 21:52:37,921] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 21:52:37,923] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 21:52:37,944] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 21:52:37,948] INFO - scvi.dataset.dataset | Downsampled from 50 to 50 cells
[2020-11-04 21:52:37,951] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 21:52:38,090] INFO - scvi.

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id


In [11]:
# Subsample the epithelial cells: keep at most 1000 of them per SampleID.
# The result is a boolean mask over all cells of adata.
subsampled_epithelial_cells = lpy.subsample(
    truefalse_vector=adata.obs["broad_celltypes"] == "Epithelial",
    subsamplesize=1000,
    partition=adata.obs["SampleID"],
    doinspect=True,
)

[35;46;1mSample a subset of a defined size[0m[34m
def subsample(truefalse_vector, subsamplesize, partition = None, doinspect = False, seed = None):
    """Flag at most `subsamplesize` of the selected entries within each partition group.

    Parameters
    ----------
    truefalse_vector : boolean array-like (list, numpy array or pandas Series)
        The True positions are the entries eligible for sampling.
    subsamplesize : int
        Maximum number of entries kept per group.
    partition : array-like with the same length as `truefalse_vector`, optional
        Group label of every entry. When None, all entries form a single group.
    doinspect : bool
        When True, print this function's source before executing.
    seed : int, optional
        When given, draws come from a private ``random.Random(seed)`` so the
        result is reproducible. When None, the module-level ``random`` state
        is used, as before.

    Returns
    -------
    numpy.ndarray of bool, same length as `truefalse_vector`; True marks the kept entries.

    Raises
    ------
    ValueError
        If `partition` is given and its length differs from `truefalse_vector`.
    """
    if doinspect is True: print("\033[35;46;1mSample a subset of a defined size\033[0m\033[34m"); print(inspect.getsource(subsample));print("\033[31;43;1mExecution:\033[0m")

    # Work positionally on plain numpy arrays: the output mask is indexed by position.
    selected = np.asarray(truefalse_vector, dtype=bool)
    fout = np.zeros(len(selected), dtype="bool")

    if partition is None:
        # One group holding everything. The previous implementation built a Python
        # list of labels here, and `list == label` evaluates to a plain False, so
        # nothing was ever selected when no partition was supplied.
        groups = [selected]
    else:
        labels = np.asarray(partition)
        if len(labels) != len(selected):
            raise ValueError("partition and truefalse_vector must have the same length")
        # dict.fromkeys keeps first-appearance order, so the sequence of random
        # draws (and therefore a seeded result) does not depend on set ordering.
        groups = [selected & (labels == value) for value in dict.fromkeys(labels.tolist())]

    rng = random if seed is None else random.Random(seed)
    for group in groups:
        wlts = np.flatnonzero(group).tolist()
        if len(wlts) > subsamplesize:
            wlts = rng.sample(wlts, subsamplesize)
        # explicit integer dtype so that an empty selection is still a valid index array
        fout[np.asarray(wlts, dtype=int)] = True
    return fout

[31;43;1mExecution:[0m


In [12]:
# Subsampled epithelial cells: same method as for the full epithelial fraction
# (including min_cell_threshold=2), restricted to the subsample mask.
das = lpy.scvi_prepare(
    anndatapath=adata,
    field="SampleID",
    cellfilter=subsampled_epithelial_cells,
    nbgenes=0,
    min_cell_threshold=2,
)
latent = lpy.runSCVI(dataset=das["dataset"], n_latent=16)
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="epithelial_subsample",
    umap_key="X_umap_epithelial_subsample",
    tsne_key=None,
    leiden_key="leiden_epithelial_subsample_raw",
    cellnames=das["names"],
)

[2020-11-04 22:41:37,529] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 22:41:37,632] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 22:41:37,634] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 22:41:37,883] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 22:41:37,931] INFO - scvi.dataset.dataset | Downsampled from 1000 to 1000 cells
[2020-11-04 22:41:37,934] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-04 22:41:37,961] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-04 22:41:37,963] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-04 22:41:37,980] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-04 22:41:37,982] INFO - scvi.dataset.dataset | Downsampled from 50 to 50 cells
[2020-11-04 22:41:37,985] INFO - scvi.da

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id


In [13]:
# In addition, insert Leiden clusters at resolution 2 for the epithelial subsample.
# Cell names are taken from das["names"], i.e. the names returned together with the
# dataset that produced `latent`, as in every other insertLatent call of this notebook.
# Indexing adata.obs_names with the subsample mask would only match the rows of `latent`
# if scvi_prepare (run with min_cell_threshold=2) kept every masked cell in the same order.
adata = lpy.insertLatent(adata,latent,None, None, None, "leidenres2_epithelial_subsample_raw", cellnames = das["names"], leiden_resolution=2)

defining permutation
Finding clusters
Inserting Cluster Id


In [14]:
# Subsample the stromal cells: keep at most 1000 of them per SampleID
subsampled_stromal_cells = lpy.subsample(
    truefalse_vector=adata.obs["broad_celltypes"] == "Stromal",
    subsamplesize=1000,
    partition=adata.obs["SampleID"],
)

In [15]:
# Subsampled stromal cells: prepare, train scVI and insert latent/UMAP/Leiden
# as for the epithelial subsample (min_cell_threshold=2).
das = lpy.scvi_prepare(
    anndatapath=adata,
    field="SampleID",
    cellfilter=subsampled_stromal_cells,
    nbgenes=0,
    min_cell_threshold=2,
)
latent = lpy.runSCVI(dataset=das["dataset"], n_latent=16)
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="stromal_subsample",
    umap_key="X_umap_stromal_subsample",
    tsne_key=None,
    leiden_key="leiden_stromal_subsample_raw",
    cellnames=das["names"],
)
# Second pass on the same latent space: only Leiden clusters, at resolution 2
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_stromal_subsample_raw",
    cellnames=das["names"],
    leiden_resolution=2,
)

[2020-11-11 13:20:54,691] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-11 13:20:54,856] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-11 13:20:54,858] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-11 13:20:55,141] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-11 13:20:55,189] INFO - scvi.dataset.dataset | Downsampled from 1000 to 1000 cells
[2020-11-11 13:20:55,195] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-11 13:20:55,349] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-11 13:20:55,351] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-11 13:20:55,635] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-11 13:20:55,685] INFO - scvi.dataset.dataset | Downsampled from 1000 to 1000 cells
[2020-11-11 13:20:55,691] INFO - scv

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id
defining permutation
Finding clusters
Inserting Cluster Id


In [16]:
# Save the object, now carrying the per-compartment latent spaces, UMAPs and Leiden clusters, as the N3 file
adata.write_h5ad("N3-integrated_donors.h5ad")

... storing 'leiden_stromal_subsample_raw' as categorical
... storing 'leidenres2_stromal_subsample_raw' as categorical
