In [40]:
# Integration of biopsies and organ donors from 2 studies
import anndata, numpy as np, pandas as pd, imp, lpy, scanpy as sc
# Print the versions of scanpy and its main dependencies for provenance.
sc.logging.print_header()
# Load the rpy2 IPython extension (rpy2.ipython) into this kernel.
%load_ext rpy2.ipython
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

scanpy==1.6.0 anndata==0.7.4 umap==0.4.6 numpy==1.19.4 scipy==1.5.3 pandas==1.1.4 scikit-learn==0.22.2 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1 leidenalg==0.8.2


In [26]:
# Constants and per-sample lookup tables used by the loading step.
sampledir = "/lustre/scratch117/cellgen/team292/lh20/organoids/"
# Tab-separated table with one row per sample: a "sampleID" column plus
# the "genotype0" .. "genotype3" columns read below.
demul = pd.read_csv("demultiplex.csv", delimiter='\t')

samplelist = demul["sampleID"]
# sampleID -> [genotype0, genotype1, genotype2, genotype3] for that row.
souporcell_genodico = {
    demul["sampleID"][row]: [demul["genotype%d" % slot][row] for slot in range(4)]
    for row in range(demul.shape[0])
}
# One input folder per sample: <sampledir><sampleID>/
folderlist = [sampledir + sample_id + "/" for sample_id in samplelist]
# The souporcell folders are the very same paths; keep them as a separate list object.
soupxfolderlist = list(folderlist)


In [27]:
# Load data: build one AnnData object from all sample folders, passing the
# souporcell folders and the per-sample genotype lookup, with QC plots and
# source inspection switched on.
adata = lpy.createAnnData(
    folderlist,
    prefix=samplelist,
    souporcell_folderlist=soupxfolderlist,
    souporcell_genodico=souporcell_genodico,
    doqcplots=True,
    doinspect=True,
)

[35;46;1mCreate AnnData Object[0m[34m
def createAnnData(folderlist, prefix, souporcell_folderlist = None, souporcell_genodico = None, autoinclude=["percent_mito", "log2p1_count", "n_genes"], min_cell_per_gene_allowed=3, min_gene_per_cell_allowed=500, sample_obskey = "sample_names",doqcplots=False, doinspect=False, mitogeneprefix="MT-", do_log2_normalize=True):
    if doinspect is True: print("\033[35;46;1mCreate AnnData Object\033[0m\033[34m"); print(inspect.getsource(createAnnData));print("\033[31;43;1mExecution:\033[0m")
    adatas = []
    def pickname(x,y):
        if (y == "singlet"): return("_genotype_" +str(x))
        else: return("_doublet")
    def pickname2(x,y,z):
        if (y == "singlet"): return(z[int(x)])
        else: return("doublet")
    for i in range(len(folderlist)):
        print("Processing " + prefix[i])
        adatas.append(sc.read_10x_mtx(folderlist[i]))
        if souporcell_folderlist is not None:
            try:
                res = pd.read_csv(soup

... storing 'souporcell' as categorical
... storing 'demultiplexed' as categorical
... storing 'sample_names' as categorical


In [32]:
# Read the tab-separated sample table and copy its columns onto the cells,
# matching rows through the "sample_names" key on both sides.
meta = pd.read_csv("sample_meta.csv", delimiter='\t')
adata = lpy.addMetadata(
    adata,
    meta,
    obs_key="sample_names",
    meta_key="sample_names",
    doinspect=True,
)
# Display the number of cells per sample.
adata.obs["sample_names"].value_counts()

[35;46;1mAdd metadata from sample table to anndata[0m[34m
def addMetadata(adata, metadata, obs_key, meta_key, doinspect=False):
    if doinspect is True: print("\033[35;46;1mAdd metadata from sample table to anndata\033[0m\033[34m"); print(inspect.getsource(addMetadata));print("\033[31;43;1mExecution:\033[0m")
    aslist = metadata[meta_key].tolist()
    rowmap = {i : aslist.index(i)  for i in aslist}
    for val in metadata.columns:
        if val != meta_key:
            aslist = metadata[val].tolist()
            adata.obs[val] = [aslist[rowmap[i]] for i in adata.obs[obs_key] ]
    return adata;

[31;43;1mExecution:[0m


WSSS_END9397532     19488
WSSS_END9397529     16256
WSSS_END9397534     15286
WSSS_END9397531     12526
WSSS_END9397525     11941
WSSS_END9397535     10602
WSSS_END9263316     10269
WSSS_END9397527     10010
WSSS_END9397526      9740
WSSS_END9397533      9556
WSSS_END9397530      8088
WSSS_END9397524      7967
WSSS_END9397528      7854
WSSS_END9263314      7581
WSSS_END9397520      6958
WSSS_END9263315      6892
organoids7090724     6220
WSSS_END9397523      6112
WSSS_END9263313      5889
WSSS_END9397522      5598
organoids7090731     4711
WSSS_END9397521      4702
organoids7090723     4187
organoids7090732     4032
organoids7090730     3657
organoids7090729     3540
organoids7090728     3408
organoids7090727     2803
organoids7090726     2652
organoids7090725     2644
WSSS_END9263317       699
Name: sample_names, dtype: int64

In [33]:
# Name of the obs key handed to scrub() as its cell-filter key.
obskey_filteredcells = "filtered_cells"
# Run doublet detection per sample, using these filter criteria:
# souporcell singlets, mitochondrial fraction cap of 0.3, scrublet local prediction False.
adata = lpy.scrub(
    adata,
    batch_obsattrib="sample_names",
    obskey_cellfilter=obskey_filteredcells,
    add_cell_filter={
        "souporcell": "singlet",
        "max_percent_mito": 0.3,
        "scrublet_local_pred": False,
    },
    doinspect=True,
)

[35;46;1mDetect Doublets and defining cells to filter[0m[34m
def scrub(adata, batch_obsattrib, bonf_threshold = 0.01, add_qc_metrics=False,mito_prefix= "MT-", obskey_cellfilter = "filtered_cells", add_cell_filter={"max_percent_mito": 0.15, "scrublet_local_pred": False}, doinspect=False):
    if doinspect is True: print("\033[35;46;1mDetect Doublets and defining cells to filter\033[0m\033[34m"); print(inspect.getsource(scrub));print("\033[31;43;1mExecution:\033[0m")
    
    import scrublet as scr
    import scanpy as sc
    print("spliting data using attribute " + batch_obsattrib)
    adatas = splitAnnData(adata, batch_obsattrib)

    if (add_qc_metrics):
        mito_genes = [name for name in adata.var_names if name.startswith(mito_prefix)]
        adata.obs['log2p1_RNA_count'] = np.log1p(adata.X.sum(axis=1).A1) / math.log(2)
        adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

    dalist = list(adata.obs_names)
    tmap = {}
 

In [35]:
# Add cell cycle annotation (default S / G2M gene lists) to the AnnData object.
adata = lpy.addCycleCycleAnnotation(
    adata,
    doinspect=True,
)

[35;46;1mAdd Cell Cycle annotation to anndata object[0m[34m
def addCycleCycleAnnotation(adata, s_genes = None, g2m_genes = None, geneprefix = "", use_raw_data= True, doinspect=False):
    if doinspect is True: print("\033[35;46;1mAdd Cell Cycle annotation to anndata object\033[0m\033[34m"); print(inspect.getsource(addCycleCycleAnnotation));print("\033[31;43;1mExecution:\033[0m")

    # uses Seurat Cell Cycles default genes by default
    if s_genes is None: # "MLF1IP"
        s_genes = ["MCM5","PCNA","TYMS","FEN1","MCM2","MCM4","RRM1","UNG","GINS2","MCM6","CDCA7","DTL","PRIM1","UHRF1","HELLS","RFC2","RPA2","NASP","RAD51AP1","GMNN","WDR76","SLBP","CCNE2","UBR7","POLD3","MSH2","ATAD2","RAD51","RRM2","CDC45","CDC6","EXO1","TIPIN","DSCC1","BLM","CASP8AP2","USP1","CLSPN","POLA1","CHAF1B","BRIP1","E2F8"]
    if g2m_genes is None: #use default list
        g2m_genes =["HMGB2","CDK1","NUSAP1","UBE2C","BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF","TACC3","FAM64A

In [47]:
# Fix the colours used for the "phase" categories, then save the object to disk.
adata.uns["phase_colors"] = [
    '#6E40AA',
    '#FF8C38',
    '#28EA8D',
]
adata.write_h5ad("N1-integrated_organoids.h5ad")