In [1]:
# Integration of biopsies and organ donors from 2 studies
import anndata, numpy as np, pandas as pd, lpy, scanpy as sc, wget, imp, re

In [2]:
# List the samples: load the per-sample metadata table (tab-separated).
meta = pd.read_csv("./samplemeta.tsv", sep='\t')
# Location of the count matrix of every sample listed in the metadata table:
#   <study base folder> + <sample folder name> + <study-specific inner path>
folderbase = {
    "Luz": "/lustre/scratch117/cellgen/team292/lh20/revision/",
    "Wang": "/lustre/scratch117/cellgen/team205/sharedData/lh20/endometrium-sra-map/",
}
folderinner = {
    "Luz": "/",
    "Wang": "/counts/Gene/filtered/",
}
folderlist = [
    folderbase[study] + folder + folderinner[study]
    for study, folder in zip(meta["StudyName"], meta["FolderName"])
]
# The sample folder names double as the per-sample prefix passed to createAnnData.
adata = lpy.createAnnData(
    folderlist,
    meta["FolderName"],
    doqcplots=True,
    doinspect=True,
)

[35;46;1mCreate AnnData Object[0m[34m
def createAnnData(folderlist, prefix, souporcell_folderlist = None, souporcell_genodico = None, autoinclude=["percent_mito", "log2p1_count", "n_genes"], min_cell_per_gene_allowed=3, min_gene_per_cell_allowed=500, sample_obskey = "sample_names",doqcplots=False, doinspect=False, mitogeneprefix="MT-", do_log2_normalize=True):
    if doinspect is True: print("\033[35;46;1mCreate AnnData Object\033[0m\033[34m"); print(inspect.getsource(createAnnData));print("\033[31;43;1mExecution:\033[0m")
    adatas = []
    def pickname(x,y):
        if (y == "singlet"): return("_genotype_" +str(x))
        else: return("_doublet")
    def pickname2(x,y,z):
        if (y == "singlet"): return(z[int(x)])
        else: return("doublet")
    for i in range(len(folderlist)):
        print("Processing " + prefix[i])
        adatas.append(sc.read_10x_mtx(folderlist[i]))
        if souporcell_folderlist is not None:
            try:
                res = pd.read_csv(soup

... storing 'sample_names' as categorical


In [3]:
# Add the columns of the sample table to adata.obs, matching
# adata.obs["sample_names"] against the "FolderName" column of the table.
adata = lpy.addMetadata(
    adata,
    meta,
    obs_key="sample_names",
    meta_key="FolderName",
    doinspect=True,
)

[35;46;1mAdd metadata from sample table to anndata[0m[34m
def addMetadata(adata, metadata, obs_key, meta_key, doinspect=False):
    if doinspect is True: print("\033[35;46;1mAdd metadata from sample table to anndata\033[0m\033[34m"); print(inspect.getsource(addMetadata));print("\033[31;43;1mExecution:\033[0m")
    aslist = metadata[meta_key].tolist()
    rowmap = {i : aslist.index(i)  for i in aslist}
    for val in metadata.columns:
        if val != meta_key:
            aslist = metadata[val].tolist()
            adata.obs[val] = [aslist[rowmap[i]] for i in adata.obs[obs_key] ]
    return adata;

[31;43;1mExecution:[0m


In [4]:
# Add metadata from the other dataset (GEO series GSE111976): per-cell UMAP
# coordinates and cell-type labels published with that study.
# NOTE(review): wget.download fetches the files again on every run of this cell.
fname_umap = wget.download("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE111nnn/GSE111976/suppl/GSE111976_umap_endo_10x.csv.gz")
fname_meta = wget.download("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE111nnn/GSE111976/suppl/GSE111976_summary_10x_day_donor_ctype.csv.gz")

# GEO sample accession -> identifier that replaces it inside the cell names
# (presumably the donor number used in the published tables; verify).
# Named so that it does not shadow the builtin `map` for the rest of the session.
gsm_to_donor = {"GSM4577306": "14", "GSM4577307":"19", "GSM4577308" : "20", "GSM4577309" : "29", "GSM4577310": "39", "GSM4577311":"41", "GSM4577312":"57", "GSM4577313":"58","GSM4577314":"60","GSM4577315":"63"}

wang_umap = pd.read_csv(fname_umap)
wang_celltypes = pd.read_csv(fname_meta)
# The first (unnamed) column of the UMAP table holds the cell names.
wang_name = wang_umap["Unnamed: 0"].tolist()
del wang_umap["Unnamed: 0"]
wang_umap = np.array(wang_umap)
# NOTE(review): cell types are looked up by row position, which assumes both
# tables list the cells in the same order; confirm against the files.
wang_celltype_values = wang_celltypes["cell_type"].tolist()

# Translated cell name -> row position in adata, for cells of the mapped samples only.
# Iterating the two columns in parallel avoids integer lookups on the
# string-indexed obs Series (deprecated positional fallback, and slow per cell).
tmap = {}
for pos, (sample, cell) in enumerate(zip(adata.obs["sample_names"], adata.obs_names)):
    if sample in gsm_to_donor:
        # Literal replacement: the sample name is plain text, not a regex pattern.
        tmap[cell.replace(sample, gsm_to_donor[sample])] = pos

# Fill plain arrays first and attach them to adata once. Writing element by
# element through adata.obs[...][i] is chained assignment, which pandas may
# apply to a temporary copy (and always does under copy-on-write).
n_cells = len(adata.obs_names)
celltype_by_cell = np.full(n_cells, "", dtype=object)  # "" = cell absent from the published tables
umap_by_cell = np.zeros((n_cells, 2))                   # zeros = cell absent from the published tables
for row, name in enumerate(wang_name):
    pos = tmap.get(name)
    if pos is not None:
        umap_by_cell[pos, :] = wang_umap[row, :]
        celltype_by_cell[pos] = wang_celltype_values[row]

adata.obs["Wang_celltype"] = celltype_by_cell
adata.obsm["X_Wang_umap"] = umap_by_cell

In [5]:
# Find doublets and identify cells with high mitochondrial content; such cells
# are labelled as "filtered cells" under the obs key defined here.
obskey_filteredcells = "filtered_cells"
adata = lpy.scrub(
    adata,
    "sample_names",
    obskey_cellfilter=obskey_filteredcells,
    add_cell_filter={
        "max_percent_mito": 0.15,
        "scrublet_local_pred": False,
    },
    doinspect=True,
)

[35;46;1mDetect Doublets and defining cells to filter[0m[34m
def scrub(adata, batch_obsattrib, bonf_threshold = 0.01, add_qc_metrics=False,mito_prefix= "MT-", obskey_cellfilter = "filtered_cells", add_cell_filter={"max_percent_mito": 0.15, "scrublet_local_pred": False}, doinspect=False):
    if doinspect is True: print("\033[35;46;1mDetect Doublets and defining cells to filter\033[0m\033[34m"); print(inspect.getsource(scrub));print("\033[31;43;1mExecution:\033[0m")
    
    import scrublet as scr
    import scanpy as sc
    print("spliting data using attribute " + batch_obsattrib)
    adatas = splitAnnData(adata, batch_obsattrib)

    if (add_qc_metrics):
        mito_genes = [name for name in adata.var_names if name.startswith(mito_prefix)]
        adata.obs['log2p1_RNA_count'] = np.log1p(adata.X.sum(axis=1).A1) / math.log(2)
        adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

    dalist = list(adata.obs_names)
    tmap = {}
 

Calculating doublet scores...
Automatically set threshold at doublet score = 0.68
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 9.7%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 1.3%
Elapsed time: 3.2 seconds
annoying values
processing 10/21
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.29
Detected doublet rate = 10.3%
Estimated detectable doublet fraction = 60.3%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 17.1%
Elapsed time: 11.6 seconds
annoying values
processing 11/21
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.35
Detected doublet rate = 2.9%
Estimated detectable doublet fraction = 53.6%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 5.4%
Elapsed time: 4.3 seconds
annoying values
processing 12/21
Preprocessi

In [6]:
# Prepare data for scVI: keep only the cells not flagged by the filter above,
# exclude cell-cycle genes (listed inside the function) and populate the scVI objects.
sp = lpy.scvi_prepare(
    adata,
    "sample_names",
    cellfilter=adata.obs[obskey_filteredcells] == False,
    use_ccfilter_prefix="",
    doinspect=True,
)

[35;46;1mPrepare Data for Scvi/TotalVi[0m[34m
def scvi_prepare(anndatapath, field, cellfilter = None, nbgenes = 5000, genes_to_filter= None, use_ccfilter_prefix=None, citeseqkey = "protein_expression", use_raw_slot_instead =None, min_cell_threshold= 0, doinspect=False):
    if doinspect is True: print("\033[35;46;1mPrepare Data for Scvi/TotalVi\033[0m\033[34m"); print(inspect.getsource(scvi_prepare));print("\033[31;43;1mExecution:\033[0m")
    if use_ccfilter_prefix is not None :
        genes_to_filter = [use_ccfilter_prefix + x for x in ["HMGB2","CDK1","NUSAP1","UBE2C","BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF","TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11","ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1","CDC20","TTK","CDC25C","KIF2C","RANGAP1","NCAPD2","DLGAP5","CDCA2","CDCA8","ECT2","KIF23","HMMR","AURKA","PSRC1","ANLN","LBR","CKAP5","CENPE","CTCF","NEK2","G2E3","GAS2L3","CBX5","CENPA","MCM5","PCNA","TYMS","FEN1"

[2020-10-30 14:52:16,408] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-10-30 14:52:16,542] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-10-30 14:52:16,544] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-10-30 14:52:16,801] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-10-30 14:52:16,844] INFO - scvi.dataset.dataset | Downsampled from 879 to 879 cells
[2020-10-30 14:52:16,880] INFO - scvi.dataset.anndataset | Dense size under 1Gb, casting to dense format (np.ndarray).
[2020-10-30 14:52:17,081] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-10-30 14:52:17,084] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-10-30 14:52:17,510] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-10-30 14:52:17,577] INFO - scvi.dataset.dataset | Downsampled from 1408 to 1408 cells
[2020-10-30 14:52:17,644] INFO - scvi.

[2020-10-30 14:52:49,510] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-10-30 14:52:49,733] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-10-30 14:52:49,735] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-10-30 14:52:50,674] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-10-30 14:52:51,060] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-10-30 14:52:51,063] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-10-30 14:52:52,811] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-10-30 14:52:53,542] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-10-30 14:52:53,544] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-10-30 14:52:54,246] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-10-30 14:52:54,599] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020

In [7]:
# Run scVI on the prepared dataset and retrieve the latent representation.
latent = lpy.runSCVI(
    dataset=sp["dataset"],
    doinspect=True,
)

[35;46;1mRun scvi[0m[34m
def runSCVI(dataset, nbstep = 500, n_latent = 64, doinspect= False):
    if doinspect is True: print("\033[35;46;1mRun scvi\033[0m\033[34m"); print(inspect.getsource(runSCVI));print("\033[31;43;1mExecution:\033[0m")
    vae = VAE(dataset.nb_genes, n_batch= dataset.n_batches, n_labels= dataset.n_labels, n_latent = n_latent)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.9, frequency=5, use_cuda=True)
    trainer.train(n_epochs=nbstep)
    full = trainer.create_posterior(trainer.model, dataset, indices=np.arange(len(dataset)))
    return(full.sequential().get_latent()[0])

[31;43;1mExecution:[0m
[2020-10-30 14:54:07,272] INFO - scvi.inference.inference | KL warmup for 400 epochs


HBox(children=(FloatProgress(value=0.0, description='training', max=500.0, style=ProgressStyle(description_wid…




In [8]:
# Store the latent variables back in the object, and produce UMAP coordinates
# and leiden clusters (no t-SNE: tsne_key is None).
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key="scvi_sampl_cc",
    umap_key="X_umap_scvi_sampl_cc",
    tsne_key=None,
    leiden_key="leiden_scvi_sampl_cc",
    cellnames=sp["names"],
    doinspect=True,
)

[35;46;1mCompute Clusters and Reduces representations[0m[34m
def insertLatent(adata, latent , latent_key= "latent", umap_key= "X_umap", tsne_key = "X_tsne", leiden_key = "leiden", rename_cluster_key= None,cellfilter = None, cellnames =None, leiden_resolution=1.0,doinspect=False):
    if doinspect is True: print("\033[35;46;1mCompute Clusters and Reduces representations\033[0m\033[34m"); print(inspect.getsource(insertLatent));print("\033[31;43;1mExecution:\033[0m")


    if cellnames is None:
        #order of full must match
        assert latent.shape[0] == len(adata.obs_names),  "cell names need for be provided if size of latent mismatches adata"
        map = range(len(adata.obs_names))
        if latent_key is not None:
            adata.obsm[latent_key] = latent
    else:
        print("defining permutation")
        dalist = list(adata.obs_names)
        tmap = {}
        for i in range(len(adata.obs_names)):
            tmap.update( {adata.obs_names[i] : i})
# for i in range(

In [9]:
# Additionally produce leiden clusters with resolution 2; the latent, UMAP and
# t-SNE keys are passed as None, only the leiden key is set.
adata = lpy.insertLatent(
    adata,
    latent,
    latent_key=None,
    umap_key=None,
    tsne_key=None,
    leiden_key="leidenres2_scvi_sampl_cc",
    cellnames=sp["names"],
    leiden_resolution=2,
    doinspect=True,
)

[35;46;1mCompute Clusters and Reduces representations[0m[34m
def insertLatent(adata, latent , latent_key= "latent", umap_key= "X_umap", tsne_key = "X_tsne", leiden_key = "leiden", rename_cluster_key= None,cellfilter = None, cellnames =None, leiden_resolution=1.0,doinspect=False):
    if doinspect is True: print("\033[35;46;1mCompute Clusters and Reduces representations\033[0m\033[34m"); print(inspect.getsource(insertLatent));print("\033[31;43;1mExecution:\033[0m")


    if cellnames is None:
        #order of full must match
        assert latent.shape[0] == len(adata.obs_names),  "cell names need for be provided if size of latent mismatches adata"
        map = range(len(adata.obs_names))
        if latent_key is not None:
            adata.obsm[latent_key] = latent
    else:
        print("defining permutation")
        dalist = list(adata.obs_names)
        tmap = {}
        for i in range(len(adata.obs_names)):
            tmap.update( {adata.obs_names[i] : i})
# for i in range(

In [10]:
# Save the integrated object to disk in h5ad format.
adata.write_h5ad(filename="N1-integrated_donors.h5ad")

... storing 'StudyName' as categorical
... storing 'SampleID' as categorical
... storing 'DonorID' as categorical
... storing 'BiopsyType' as categorical
... storing 'Location' as categorical
... storing 'Binary Stage' as categorical
... storing 'Stage' as categorical
... storing 'Day' as categorical
... storing 'Women age' as categorical
... storing '10x kit' as categorical
... storing 'Treatment' as categorical
... storing 'Wang_celltype' as categorical
... storing 'leiden_scvi_sampl_cc' as categorical
... storing 'leidenres2_scvi_sampl_cc' as categorical
