In [1]:
# Integration of biopsies and organ donors from 2 studies
# NOTE(review): `imp` and `re` are not used by any cell visible in this notebook, and
# `imp` is deprecated (removed in Python 3.12) -- confirm they are needed before upgrading.
import anndata, numpy as np, pandas as pd, lpy, scanpy as sc, imp, re
# Print the versions of scanpy and its main dependencies so the run can be reproduced.
sc.logging.print_versions()

scanpy==1.5.1 anndata==0.7.4 umap==0.4.6 numpy==1.19.1 scipy==1.5.2 pandas==1.0.5 scikit-learn==0.23.2 statsmodels==0.11.1 python-igraph==0.8.2 louvain==0.6.1 leidenalg==0.8.1


In [2]:
# Load the two input objects:
#   adata     - the denoised nuclei data, used below as the input to lpy.scvi_prepare
#   adata_trg - the nuclei object that receives the latent spaces, UMAPs and clusters
adata = anndata.read_h5ad("N1-denoised-nuclei.h5ad")
adata_trg = anndata.read_h5ad("N1-nuclei.h5ad")

In [3]:
# Prepare data for scVI: drop the cells flagged in the "filtered_cells" column (doublets),
# exclude the cell-cycle genes listed inside lpy.scvi_prepare, and populate the scVI objects.
obskey_filteredcells = "filtered_cells"
# Copy the flag from the target object onto the denoised object. The column name comes from
# the constant above so the copy and the filter below cannot drift apart.
# pandas aligns the assigned Series on the obs index (cell names), not on row position.
adata.obs[obskey_filteredcells] = adata_trg.obs[obskey_filteredcells]
sp = lpy.scvi_prepare(
    adata,
    "sample_names",
    # `== False` is kept on purpose: it selects only cells explicitly flagged False.
    adata.obs[obskey_filteredcells] == False,
    nbgenes=0,
    # An empty prefix is not None, so the built-in cell-cycle gene list is applied
    # with the gene symbols used as-is.
    use_ccfilter_prefix="",
    doinspect=True,
)

[35;46;1mPrepare Data for Scvi/TotalVi[0m[34m
def scvi_prepare(anndatapath, field, cellfilter = None, nbgenes = 5000, genes_to_filter= None, use_ccfilter_prefix=None, citeseqkey = "protein_expression", use_raw_slot_instead =None, min_cell_threshold= 0, doinspect=False):
    if doinspect is True: print("\033[35;46;1mPrepare Data for Scvi/TotalVi\033[0m\033[34m"); print(inspect.getsource(scvi_prepare));print("\033[31;43;1mExecution:\033[0m")
    if use_ccfilter_prefix is not None :
        genes_to_filter = [use_ccfilter_prefix + x for x in ["HMGB2","CDK1","NUSAP1","UBE2C","BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF","TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11","ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1","CDC20","TTK","CDC25C","KIF2C","RANGAP1","NCAPD2","DLGAP5","CDCA2","CDCA8","ECT2","KIF23","HMMR","AURKA","PSRC1","ANLN","LBR","CKAP5","CENPE","CTCF","NEK2","G2E3","GAS2L3","CBX5","CENPA","MCM5","PCNA","TYMS","FEN1"

In [4]:
# run scvi
# Trains the model on the prepared dataset and returns the latent representation.
# With no overrides, runSCVI uses its signature defaults: nbstep=500 epochs, n_latent=64.
# NOTE(review): no random seed is set in this notebook, so the latent space (and the
# clusters derived from it) may differ between runs -- confirm whether that is acceptable.
latent = lpy.runSCVI(sp["dataset"], doinspect=True)

[35;46;1mRun scvi[0m[34m
def runSCVI(dataset, nbstep = 500, n_latent = 64, doinspect= False):
    if doinspect is True: print("\033[35;46;1mRun scvi\033[0m\033[34m"); print(inspect.getsource(runSCVI));print("\033[31;43;1mExecution:\033[0m")
    vae = VAE(dataset.nb_genes, n_batch= dataset.n_batches, n_labels= dataset.n_labels, n_latent = n_latent)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.9, frequency=5, use_cuda=True)
    trainer.train(n_epochs=nbstep)
    full = trainer.create_posterior(trainer.model, dataset, indices=np.arange(len(dataset)))
    return(full.sequential().get_latent()[0])

[31;43;1mExecution:[0m
[2020-12-19 18:54:54,082] INFO - scvi.inference.inference | KL warmup for 400 epochs


HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…




In [5]:
# Store the latent variables back in the target object, then produce UMAP coordinates
# and Leiden clusters from them. tsne_key is None, so presumably no t-SNE is stored.

adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key="scvi_sampl_cc",
    umap_key="X_umap_sampl_cc",
    tsne_key=None,
    leiden_key="leiden_sampl_cc",
    cellnames=sp["names"],
    doinspect=True,
)

[35;46;1mCompute Clusters and Reduces representations[0m[34m
def insertLatent(adata, latent , latent_key= "latent", umap_key= "X_umap", tsne_key = "X_tsne", leiden_key = "leiden", rename_cluster_key= None,cellfilter = None, cellnames =None, leiden_resolution=1.0,doinspect=False):
    if doinspect is True: print("\033[35;46;1mCompute Clusters and Reduces representations\033[0m\033[34m"); print(inspect.getsource(insertLatent));print("\033[31;43;1mExecution:\033[0m")

    if cellnames is None:
        #order of full must match
        assert latent.shape[0] == len(adata.obs_names),  "cell names need for be provided if size of latent mismatches adata"
        map = range(len(adata.obs_names))
        if latent_key is not None:
            adata.obsm[latent_key] = latent
    else:
        print("defining permutation")
        dalist = list(adata.obs_names)
        tmap = {}
        for i in range(len(adata.obs_names)):
            tmap.update( {adata.obs_names[i] : i})
# for i in range(l

In [6]:
# Additionally produce Leiden clusters at resolution 2 from the same latent space.
# The latent, UMAP and t-SNE keys are None, so only the cluster labels are written.
adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_sampl_cc",
    cellnames=sp["names"],
    leiden_resolution=2,
    doinspect=True,
)

[35;46;1mCompute Clusters and Reduces representations[0m[34m
def insertLatent(adata, latent , latent_key= "latent", umap_key= "X_umap", tsne_key = "X_tsne", leiden_key = "leiden", rename_cluster_key= None,cellfilter = None, cellnames =None, leiden_resolution=1.0,doinspect=False):
    if doinspect is True: print("\033[35;46;1mCompute Clusters and Reduces representations\033[0m\033[34m"); print(inspect.getsource(insertLatent));print("\033[31;43;1mExecution:\033[0m")

    if cellnames is None:
        #order of full must match
        assert latent.shape[0] == len(adata.obs_names),  "cell names need for be provided if size of latent mismatches adata"
        map = range(len(adata.obs_names))
        if latent_key is not None:
            adata.obsm[latent_key] = latent
    else:
        print("defining permutation")
        dalist = list(adata.obs_names)
        tmap = {}
        for i in range(len(adata.obs_names)):
            tmap.update( {adata.obs_names[i] : i})
# for i in range(l

In [7]:
# Rank marker genes with a t-test for each sample-level clustering in turn, then plot the
# top 25 genes per cluster. Both runs store their result under the same key "t-test",
# so after this cell only the ranking for the last clustering remains in adata_trg.
for groupby_key in ("leiden_sampl_cc", "leidenres2_sampl_cc"):
    sc.tl.rank_genes_groups(adata_trg, groupby_key, method="t-test", key_added="t-test")
    sc.pl.rank_genes_groups(adata_trg, n_genes=25, sharey=False, key="t-test", save="markers.pdf")

... storing 'leiden_sampl_cc' as categorical
... storing 'leidenres2_sampl_cc' as categorical




In [8]:
# Repeat the integration with "DonorID" as the scvi_prepare field in place of
# "sample_names" (presumably the batch covariate -- confirm against lpy.scvi_prepare).
das = lpy.scvi_prepare(
    adata,
    "DonorID",
    adata.obs[obskey_filteredcells] == False,
    nbgenes=0,
    use_ccfilter_prefix="",
)
latent = lpy.runSCVI(das["dataset"])

# First pass: store the latent space, UMAP and default-resolution Leiden clusters.
adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key="scvi_donor_cc",
    umap_key="X_umap_donor_cc",
    tsne_key=None,
    leiden_key="leiden_donor_cc",
    cellnames=das["names"],
)
# Second pass: only add Leiden clusters at resolution 2.
adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_donor_cc",
    cellnames=das["names"],
    leiden_resolution=2,
)

[2020-12-19 20:27:07,726] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-12-19 20:27:07,727] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-12-19 20:27:08,214] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-12-19 20:27:08,358] INFO - scvi.dataset.dataset | Downsampled from 9971 to 9971 cells
[2020-12-19 20:27:08,399] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-12-19 20:27:10,625] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-12-19 20:27:10,627] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-12-19 20:27:12,178] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-12-19 20:27:12,449] INFO - scvi.dataset.dataset | Downsampled from 6005 to 6005 cells
[2020-12-19 20:27:12,484] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-12-19 20:27:13,006] INFO - scv

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id
defining permutation
Finding clusters
Inserting Cluster Id


In [9]:
# Rank marker genes with a t-test for each donor-level clustering in turn, then plot the
# top 25 genes per cluster. Both runs store their result under the same key "t-test",
# so after this cell only the ranking for the last clustering remains in adata_trg.
for groupby_key in ("leiden_donor_cc", "leidenres2_donor_cc"):
    sc.tl.rank_genes_groups(adata_trg, groupby_key, method="t-test", key_added="t-test")
    sc.pl.rank_genes_groups(adata_trg, n_genes=25, sharey=False, key="t-test", save="markers.pdf")

... storing 'leiden_donor_cc' as categorical
... storing 'leidenres2_donor_cc' as categorical




In [4]:
# Sample-level integration again, now also requiring obs["n_genes"] < 6000 per cell.
# NOTE(review): the execution counter restarts at In [4] for this cell, which suggests the
# kernel was restarted before it ran -- confirm that `adata`, `adata_trg` and the
# "filtered_cells" column are set up the same way on a fresh Restart & Run All.
obskey_filteredcells = "filtered_cells"
das = lpy.scvi_prepare(
    adata,
    "sample_names",
    (adata.obs[obskey_filteredcells] == False) & (adata.obs["n_genes"] < 6000),
    nbgenes=0,
    use_ccfilter_prefix="",
)
latent = lpy.runSCVI(das["dataset"])

# First pass: store the latent space, UMAP and default-resolution Leiden clusters.
adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key="scvi_sampl6kg_cc",
    umap_key="X_umap_sampl6kg_cc",
    tsne_key=None,
    leiden_key="leiden_sampl6kg_cc",
    cellnames=das["names"],
)
# Second pass: only add Leiden clusters at resolution 2.
adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_sampl6kg_cc",
    cellnames=das["names"],
    leiden_resolution=2,
)

[2020-12-20 00:36:33,858] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-12-20 00:36:35,864] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-12-20 00:36:35,866] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-12-20 00:36:37,333] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-12-20 00:36:37,599] INFO - scvi.dataset.dataset | Downsampled from 5538 to 5538 cells
[2020-12-20 00:36:37,648] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-12-20 00:36:39,155] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-12-20 00:36:39,157] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-12-20 00:36:40,322] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-12-20 00:36:40,540] INFO - scvi.dataset.dataset | Downsampled from 4512 to 4512 cells
[2020-12-20 00:36:40,591] INFO - scv

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id
defining permutation
Finding clusters
Inserting Cluster Id


In [5]:
# Rank marker genes with a t-test for each clustering of the <6000-gene sample-level run,
# then plot the top 25 genes per cluster. Both runs store their result under the same key
# "t-test", so after this cell only the ranking for the last clustering remains.
for groupby_key in ("leiden_sampl6kg_cc", "leidenres2_sampl6kg_cc"):
    sc.tl.rank_genes_groups(adata_trg, groupby_key, method="t-test", key_added="t-test")
    sc.pl.rank_genes_groups(adata_trg, n_genes=25, sharey=False, key="t-test", save="markers.pdf")

... storing 'leiden_sampl6kg_cc' as categorical
... storing 'leidenres2_sampl6kg_cc' as categorical




In [6]:
# Donor-level integration with the same cell filter as the previous run:
# not flagged in "filtered_cells" and obs["n_genes"] < 6000.
das = lpy.scvi_prepare(
    adata,
    "DonorID",
    (adata.obs[obskey_filteredcells] == False) & (adata.obs["n_genes"] < 6000),
    nbgenes=0,
    use_ccfilter_prefix="",
)
latent = lpy.runSCVI(das["dataset"])

# First pass: store the latent space, UMAP and default-resolution Leiden clusters.
adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key="scvi_donor6kg_cc",
    umap_key="X_umap_donor6kg_cc",
    tsne_key=None,
    leiden_key="leiden_donor6kg_cc",
    cellnames=das["names"],
)
# Second pass: only add Leiden clusters at resolution 2.
adata_trg = lpy.insertLatent(
    adata_trg,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_donor6kg_cc",
    cellnames=das["names"],
    leiden_resolution=2,
)

[2020-12-20 01:15:25,157] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-12-20 01:15:29,013] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-12-20 01:15:29,016] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-12-20 01:15:31,423] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-12-20 01:15:31,864] INFO - scvi.dataset.dataset | Downsampled from 9279 to 9279 cells
[2020-12-20 01:15:32,182] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-12-20 01:15:32,184] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-12-20 01:15:32,563] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-12-20 01:15:32,692] INFO - scvi.dataset.dataset | Downsampled from 11610 to 11610 cells
[2020-12-20 01:15:32,731] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-12-20 01:15:34,925] INFO - s

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id
defining permutation
Finding clusters
Inserting Cluster Id


In [7]:
# Rank marker genes with a t-test for each clustering of the <6000-gene donor-level run,
# then plot the top 25 genes per cluster. Both runs store their result under the same key
# "t-test", so after this cell only the ranking for the last clustering remains.
for groupby_key in ("leiden_donor6kg_cc", "leidenres2_donor6kg_cc"):
    sc.tl.rank_genes_groups(adata_trg, groupby_key, method="t-test", key_added="t-test")
    sc.pl.rank_genes_groups(adata_trg, n_genes=25, sharey=False, key="t-test", save="markers.pdf")

... storing 'leiden_donor6kg_cc' as categorical
... storing 'leidenres2_donor6kg_cc' as categorical




In [8]:
# save object
# Writes the target object, with everything added to it in this session, to a new file
# so that the N1 input file is left untouched.
# NOTE(review): the "t-test" entry is overwritten by every ranking cell, so presumably
# only the last ranking computed is saved here -- confirm this is intended.
adata_trg.write_h5ad("N2-nuclei.h5ad")