In [1]:
# Integration of biopsies and organ donors from 2 studies
# `lpy` provides the scvi_prepare / runSCVI / insertLatent helpers called below.
# NOTE(review): `imp` is deprecated (removed in Python 3.12) and is not used in
# any visible cell -- confirm whether it is still needed (e.g. for reloading lpy).
import anndata, numpy as np, pandas as pd, imp, lpy, scanpy as sc
# Print the versions of scanpy and its main dependencies for reproducibility.
sc.logging.print_versions()

scanpy==1.5.1 anndata==0.7.4 umap==0.4.6 numpy==1.19.1 scipy==1.5.2 pandas==1.0.5 scikit-learn==0.23.2 statsmodels==0.11.1 python-igraph==0.8.2 louvain==0.6.1 leidenalg==0.8.1


In [2]:
# Load the integrated donor dataset (AnnData object) from disk
adata = anndata.read_h5ad("N4-integrated_donors.h5ad")

In [3]:
# Integrate the immune cell fraction specifically: the boolean mask is True for
# every cell whose "subcluster_immune" label is neither "filtered" nor "doublets".
# Use sample ("SampleID") for batch correction, and integrate all genes
# (nbgenes=0 presumably disables gene selection -- confirm in lpy.scvi_prepare).
das = lpy.scvi_prepare(
    adata,
    "SampleID",
    cellfilter=[
        label not in ["filtered", "doublets"]
        for label in adata.obs["subcluster_immune"]
    ],
    nbgenes=0,
    min_cell_threshold=2,
    doinspect=True,
)

[35;46;1mPrepare Data for Scvi/TotalVi[0m[34m
def scvi_prepare(anndatapath, field, cellfilter = None, nbgenes = 5000, genes_to_filter= None, use_ccfilter_prefix=None, citeseqkey = "protein_expression", use_raw_slot_instead =None, min_cell_threshold= 0, doinspect=False):
    if doinspect is True: print("\033[35;46;1mPrepare Data for Scvi/TotalVi\033[0m\033[34m"); print(inspect.getsource(scvi_prepare));print("\033[31;43;1mExecution:\033[0m")
    if use_ccfilter_prefix is not None :
        genes_to_filter = [use_ccfilter_prefix + x for x in ["HMGB2","CDK1","NUSAP1","UBE2C","BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF","TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11","ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1","CDC20","TTK","CDC25C","KIF2C","RANGAP1","NCAPD2","DLGAP5","CDCA2","CDCA8","ECT2","KIF23","HMMR","AURKA","PSRC1","ANLN","LBR","CKAP5","CENPE","CTCF","NEK2","G2E3","GAS2L3","CBX5","CENPA","MCM5","PCNA","TYMS","FEN1"

In [4]:
# Train scVI on the prepared dataset and retrieve the latent representation.
# A 16-dimensional latent space is requested (the helper's default is 64);
# the number of training epochs is left at the helper's default (nbstep=500).
latent = lpy.runSCVI(
    das["dataset"],
    n_latent=16,
    doinspect=True,
)

[35;46;1mRun scvi[0m[34m
def runSCVI(dataset, nbstep = 500, n_latent = 64, doinspect= False):
    if doinspect is True: print("\033[35;46;1mRun scvi\033[0m\033[34m"); print(inspect.getsource(runSCVI));print("\033[31;43;1mExecution:\033[0m")
    vae = VAE(dataset.nb_genes, n_batch= dataset.n_batches, n_labels= dataset.n_labels, n_latent = n_latent)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.9, frequency=5, use_cuda=True)
    trainer.train(n_epochs=nbstep)
    full = trainer.create_posterior(trainer.model, dataset, indices=np.arange(len(dataset)))
    return(full.sequential().get_latent()[0])

[31;43;1mExecution:[0m
[2020-11-16 10:13:38,788] INFO - scvi.inference.inference | KL warmup for 400 epochs


HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…




In [5]:
# Store only a UMAP representation for the immune cells, under the key
# "X_umap_immune_curated". The latent, t-SNE and Leiden keys are passed as None
# so that nothing is stored for them. `cellnames` identifies which cells of
# `adata` the rows of `latent` belong to, since only a subset was integrated.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key=None,
    umap_key="X_umap_immune_curated",
    tsne_key=None,
    leiden_key=None,
    cellnames=das["names"],
    doinspect=True,
)

[35;46;1mCompute Clusters and Reduces representations[0m[34m
def insertLatent(adata, latent , latent_key= "latent", umap_key= "X_umap", tsne_key = "X_tsne", leiden_key = "leiden", rename_cluster_key= None,cellfilter = None, cellnames =None, leiden_resolution=1.0,doinspect=False):
    if doinspect is True: print("\033[35;46;1mCompute Clusters and Reduces representations\033[0m\033[34m"); print(inspect.getsource(insertLatent));print("\033[31;43;1mExecution:\033[0m")


    if cellnames is None:
        #order of full must match
        assert latent.shape[0] == len(adata.obs_names),  "cell names need for be provided if size of latent mismatches adata"
        map = range(len(adata.obs_names))
        if latent_key is not None:
            adata.obsm[latent_key] = latent
    else:
        print("defining permutation")
        dalist = list(adata.obs_names)
        tmap = {}
        for i in range(len(adata.obs_names)):
            tmap.update( {adata.obs_names[i] : i})
# for i in range(

In [6]:
# Epithelial subset: the mask is True for cells whose
# "leidenres2_epithelial_subsample_raw" label is not one of the excluded
# clusters ("13", "18", "12") and not "filtered". Batch key is "SampleID".
das = lpy.scvi_prepare(
    adata,
    "SampleID",
    cellfilter=[
        cluster not in ["13", "18", "12", "filtered"]
        for cluster in adata.obs["leidenres2_epithelial_subsample_raw"]
    ],
    nbgenes=0,
    min_cell_threshold=2,
)
# Train scVI with a 16-dimensional latent space.
latent = lpy.runSCVI(das["dataset"], n_latent=16)
# First pass: store the latent coordinates, a UMAP and Leiden clusters at the
# default resolution (1.0); no t-SNE key is given.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="epithelial_curated",
    umap_key="X_umap_epithelial_curated",
    tsne_key=None,
    leiden_key="leiden_epithelial_curated",
    cellnames=das["names"],
)
# Second pass: only add a finer Leiden clustering (resolution 2) of the same latent.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_epithelial_curated",
    cellnames=das["names"],
    leiden_resolution=2,
)

[2020-11-16 10:16:22,990] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-16 10:16:23,027] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-16 10:16:23,029] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-16 10:16:23,045] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-16 10:16:23,047] INFO - scvi.dataset.dataset | Downsampled from 46 to 46 cells
[2020-11-16 10:16:23,050] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-16 10:16:23,123] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-16 10:16:23,125] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-16 10:16:23,226] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-16 10:16:23,243] INFO - scvi.dataset.dataset | Downsampled from 349 to 349 cells
[2020-11-16 10:16:23,247] INFO - scvi.data

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id
defining permutation
Finding clusters
Inserting Cluster Id


In [11]:
# Stromal subset: the mask is True for cells whose
# "leiden_stromal_subsample_raw" label is not one of the excluded clusters
# ("1", "7", "9", "11", "10") and not "filtered". Batch key is "SampleID".
das = lpy.scvi_prepare(
    adata,
    "SampleID",
    cellfilter=[
        cluster not in ["1", "7", "9", "11", "10", "filtered"]
        for cluster in adata.obs["leiden_stromal_subsample_raw"]
    ],
    nbgenes=0,
    min_cell_threshold=2,
)
# Train scVI with a 16-dimensional latent space.
latent = lpy.runSCVI(das["dataset"], n_latent=16)
# First pass: store the latent coordinates, a UMAP and Leiden clusters at the
# default resolution (1.0); no t-SNE key is given.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="stromal_curated",
    umap_key="X_umap_stromal_curated",
    tsne_key=None,
    leiden_key="leiden_stromal_curated",
    cellnames=das["names"],
)
# Second pass: only add a finer Leiden clustering (resolution 2) of the same latent.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_stromal_curated",
    cellnames=das["names"],
    leiden_resolution=2,
)

[2020-11-16 13:13:42,886] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-16 13:13:43,123] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-16 13:13:43,125] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-16 13:13:43,715] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-16 13:13:43,780] INFO - scvi.dataset.dataset | Downsampled from 908 to 908 cells
[2020-11-16 13:13:43,791] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-11-16 13:13:43,965] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-11-16 13:13:43,967] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-11-16 13:13:44,297] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-11-16 13:13:44,349] INFO - scvi.dataset.dataset | Downsampled from 775 to 775 cells
[2020-11-16 13:13:44,359] INFO - scvi.da

HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…


defining permutation
Inserting Latent coords
computing UMAP
Inserting Umap coords
Finding clusters
Inserting Cluster Id
defining permutation
Finding clusters
Inserting Cluster Id


In [12]:
# Save the updated object under a new file name ("N5-..."), leaving the
# "N4-integrated_donors.h5ad" input file untouched.
adata.write_h5ad("N5-integrated_donors.h5ad")

... storing 'leiden_stromal_curated' as categorical
... storing 'leidenres2_stromal_curated' as categorical
