In [1]:
# Integration of biopsies and organ donors from 2 studies
import anndata, numpy as np, pandas as pd, imp, lpy, scanpy as sc
sc.logging.print_header()



scanpy==1.6.0 anndata==0.7.5 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==1.0.1 scikit-learn==0.22.2 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1 leidenalg==0.7.0


In [2]:
# Load the integrated AnnData object from "N3-integrated_donors.h5ad"
# (presumably the checkpoint written by the preceding N3 step - TODO confirm).
adata = anndata.read_h5ad("N3-integrated_donors.h5ad")

In [3]:
# Build the general cell-type annotation from the scVI Leiden clusters.
# One cluster is imported from the endothelial zoom-in: cells in endothelial
# cluster "11" are relabelled as the new general cluster "24".
general = np.array(adata.obs["leiden_scvi_sampl_cc"])
from_endothelial_zoom = adata.obs["leiden_endothelial_raw"] == "11"
general[from_endothelial_zoom] = "24"

# Cluster id(s) -> cell-type name. A tuple key maps several clusters to one name.
# The order of the values becomes the category order of the result; labels that
# are not listed are kept and appended after the named categories.
general_celltype_names = {
    "7": "SOX9",
    ("21", "2", "13"): "Lumenal",
    "1": "Glandular",
    "10": "Ciliated",
    "11": "Lymphoid",
    "18": "Myeloid",
    "4": "Endothelial ACKR1",
    "24": "Endothelial SEMA3G",
    "8": "PV MYH11",
    "9": "PV STEAP4",
    "3": "uSMC",
    "17": "Fibroblast C7",
    ("5", "6", "19"): "Fibroblast eS",
    ("0", "14", "12"): "Fibroblast dS",
    ("22", "23", "20", "15", "16"): "Other",
}
adata.obs["general_celltypes"] = lpy.applyFactorRename(general, general_celltype_names, doinspect=True)
adata.obs["general_celltypes"].value_counts()

[35;46;1mRename classes in list[0m[34m
def applyFactorRename(input, dico, doinspect = False):
    if doinspect is True: print("\033[35;46;1mRename classes in list\033[0m\033[34m"); print(inspect.getsource(applyFactorRename));
    import numpy as np
    tmp = np.array(input, dtype=object)
    out = tmp.copy()
    for k,x in dico.items():
        if isinstance(k, tuple):
            for l in k:
                out[tmp == l] = x
        else:
            out[tmp == k] = x
    leftover_categories = set(out)
    leftover_categories = leftover_categories.difference(set(dico.values()))
    categories = list(dico.values())
    categories = categories + list(leftover_categories)
    return(pd.Categorical(out, categories, ordered=True))



Fibroblast dS         24436
filtered              20503
Lumenal               14252
Glandular             13100
Fibroblast eS         12675
uSMC                   7746
Endothelial ACKR1      7188
SOX9                   3646
PV MYH11               3414
PV STEAP4              3345
Ciliated               3189
Other                  2923
Lymphoid               2311
Fibroblast C7           932
Myeloid                 761
Endothelial SEMA3G      389
Name: general_celltypes, dtype: int64

In [4]:
# Name the immune sub-clusters (Leiden clusters of the immune zoom-in).
# Tuple keys merge several clusters under one name; value order sets the
# category order of the resulting annotation.
immune_subcluster_names = {
    "6": "DCs",
    ("5", "19", "21"): "uM1",
    "8": "uM2",
    "23": "B cells",
    ("4", "25"): "Tcells CD8",
    "12": "Tcells CD4",
    ("18", "24"): "Cycling lymphocytes",
    "16": "ILC3",
    "10": "uNK1",
    ("1", "14", "27"): "uNK2",
    "2": "uNK3",
    "26": "Peripheral",
    "28": "Mast cells",
    ("0", "3", "15", "7", "22", "9", "17", "20", "11", "13"): "doublets",
}
adata.obs["subcluster_immune"] = lpy.applyFactorRename(
    adata.obs["leidenres3_immune_raw"], immune_subcluster_names
)
adata.obs["subcluster_immune"].value_counts()

filtered               117738
doublets                 1222
uNK2                      316
uM1                       298
Tcells CD8                219
uNK3                      200
DCs                       166
uM2                       133
uNK1                      122
Cycling lymphocytes       121
Tcells CD4                102
ILC3                       81
B cells                    46
Peripheral                 34
Mast cells                 12
Name: subcluster_immune, dtype: int64

In [5]:
# Name the epithelial sub-clusters (Leiden clusters of the epithelial zoom-in).
# Tuple keys merge several clusters under one name; value order sets the
# category order of the resulting annotation.
# NOTE(review): "ciliated LRG5" looks like a transposition of LGR5; the label is
# kept verbatim because it is a runtime value - confirm before renaming.
epithelial_subcluster_names = {
    ("2", "15"): "SOX9",
    ("21",): "SOX9_prolif",
    ("11", "23"): "SOX9_LGR5",
    ("4", "3", "18", "25", "7", "8"): "Lumenal 1",
    "16": "Lumenal 2",
    ("0", "5", "14", "20"): "Glandular",
    ("1", "12", "13", "19", "24"): "Glandular_secretory",
    ("6", "27"): "ciliated",
    "17": "ciliated LRG5",
    ("22", "9", "10", "26", "28"): "doublets",
}
adata.obs["subcluster_epithelial"] = lpy.applyFactorRename(
    adata.obs["leidenres2_epithelial_raw"], epithelial_subcluster_names
)
adata.obs["subcluster_epithelial"].value_counts()

filtered               86624
Lumenal 1               8283
Glandular               6157
Glandular_secretory     6095
doublets                3568
SOX9                    3223
SOX9_LGR5               1977
ciliated                1871
Lumenal 2               1100
ciliated LRG5           1078
SOX9_prolif              834
Name: subcluster_epithelial, dtype: int64

In [6]:
# Name the epithelial sub-clusters obtained on the subsampled (balanced) data.
# Tuple keys merge several clusters under one name; value order sets the
# category order of the resulting annotation.
balanced_epithelial_names = {
    ("21", "4", "8", "17", "14", "6"): "SOX9",
    ("20", "7"): "SOX9_prolif",
    ("5", "9"): "SOX9_LGR5",
    ("0", "22", "24"): "Lumenal 1",
    "19": "Lumenal 2",
    ("16", "3"): "Glandular",
    ("15", "1", "11", "10"): "Glandular_secretory",
    "25": "Pre-ciliated",
    "2": "Ciliated",
    "23": "Ciliated LRG5",
    ("18", "12"): "Low QC",
    "13": "doublets",
}
adata.obs["subcluster_epithelial_balanced"] = lpy.applyFactorRename(
    adata.obs["leidenres2_epithelial_subsample_raw"], balanced_epithelial_names
)

# One colour per category, in category order: the 12 names above plus one
# trailing entry for any label left unmapped.
adata.uns["subcluster_epithelial_balanced_colors"] = [
    '#974815', '#F08129', '#EAB896', '#F7C900', '#C69E57', '#E53215', '#9C1915',
    '#1B3E92', '#429FD9', '#675DA6', '#AAAAAA', '#8888FF', '#888888',
]
adata.obs["subcluster_epithelial_balanced"].value_counts()

filtered               108944
Glandular_secretory      2856
SOX9                     2806
Lumenal 1                1376
SOX9_LGR5                1150
Glandular                1101
SOX9_prolif               883
ciliated                  811
doublets                  428
Lumenal 2                 273
ciliated LRG5             182
Name: subcluster_epithelial_balanced, dtype: int64

In [7]:
# Name the stromal sub-clusters obtained on the subsampled (balanced) data.
# Tuple keys merge several clusters under one name; value order sets the
# category order of the resulting annotation.
balanced_stromal_names = {
    ("2", "3", "5", "6", "13", "4"): "dS",
    ("0", "8"): "eS",
    ("12",): "Cycling",  # (disabled entries: "28" : "G2M phase", "23" : "S phase")
    ("1", "7", "9", "11"): "Low Qc",
    ("10",): "doublets",
}
adata.obs["subcluster_stromal_balanced"] = lpy.applyFactorRename(
    adata.obs["leiden_stromal_subsample_raw"], balanced_stromal_names
)
adata.obs["subcluster_stromal_balanced"].value_counts()

filtered    105079
eS            6212
Low Qc        4200
dS            4199
doublets       698
Cycling        422
Name: subcluster_stromal_balanced, dtype: int64

In [8]:
# Collapse the balanced epithelial sub-clusters into five broad classes.
# The keys must match the labels stored in "subcluster_epithelial_balanced"
# exactly (case-sensitive): "Pre-ciliated", "Ciliated" and "Ciliated LRG5" are
# the ciliated labels in use. Any label not listed here survives as an extra
# category, which would leave more categories than the five colours below.
adata.obs["subcluster_epithelial_balanced_broad"] = lpy.applyFactorRename(adata.obs["subcluster_epithelial_balanced"], {
    ("SOX9", "SOX9_prolif", "SOX9_LGR5"): "SOX9",
    ("Lumenal 1", "Lumenal 2"): "Lumenal",
    ("Glandular", "Glandular_secretory"): "Glandular",
    ("Pre-ciliated", "Ciliated", "Ciliated LRG5"): "Ciliated",
    ("Low QC", "doublets", "filtered"): "filtered"})
# One colour per broad class, in the order of the mapping values above.
adata.uns["subcluster_epithelial_balanced_broad_colors"] = ['#974815', '#F7C900', '#E53215', '#429FD9', '#888888']

In [9]:
# Colour palettes for the categorical annotations. Each "<column>_colors" list is
# stored in adata.uns and holds one colour per category, in category order.
adata.uns["broad_celltypes_colors"] = ['#F08129', '#48B750', '#499ABA', '#999977', '#958054', '#7979CC', '#888888']
adata.uns["SampleID_colors"] = ['#800000', '#a70000', '#e70000', '#a76060', '#e28080', '#4a4a00', '#404030', '#848400', '#707040', '#a5a500', '#909060', '#005917', '#009f29', '#00c734', '#004a75', '#0084d2', '#12a7ff', '#410068', '#7a00c1', '#bb48ff', '#cd79ff']
# "Location_colors" used to be assigned twice in this cell; only the later
# three-colour list ever took effect, so the overwritten five-colour list is gone.
# NOTE(review): confirm that "Location" has exactly three categories.
adata.uns["Location_colors"] = ['#0000FF', '#AA0088', '#888800']
#adata.uns["Binary Stage_colors"] = ['#008800', '#AA4400']
adata.uns["BiopsyType_colors"] = ['#AA0000', '#0000FF']
adata.uns["phase_colors"] = ['#6E40AA', '#FF8C38', '#28EA8D']
adata.uns["subcluster_stromal_balanced_colors"] = ['#D6C2A7', '#A58A62', '#758A00', '#AAAAAA', '#8888FF', '#888888']

adata.uns["general_celltypes_colors"] = ['#F08129', '#F7C900', '#9C1915', '#429FD9', '#8B4F9A', '#499ABA', '#48B750', '#274E13', '#7C7C7C', '#C4C4C2', '#A7A7CE', '#776747', '#A58A62', '#D6C2A7', '#FFFFFF', '#FFFFFF']
adata.uns["subcluster_immune_colors"] = ['#C0BEB7', '#3A728C', '#3FC7F4', '#FF90D0', '#9356A0', '#E080FF', '#B31F87', '#009D00', '#60DF9C', '#3FAB5D', '#014529', '#63A585', '#888800', '#FFFFFF', '#AAAAAA']
adata.uns["subcluster_epithelial_colors"] = ['#974815', '#F08129', '#EAB896', '#F7C900', '#C69E57', '#E53215', '#9C1915', '#429FD9', '#675DA6', '#AAAAAA', '#888888']


In [10]:
# Add cell-cycle annotation to the AnnData object. With s_genes / g2m_genes left
# unset, the helper uses its default (Seurat) S and G2M gene lists.
# doinspect=True makes the helper print its own source before running.
# NOTE(review): the cell rebinds `adata` to the helper's return value, so
# re-running it applies the annotation to an already-annotated object.
adata = lpy.addCycleCycleAnnotation(adata, doinspect=True)

[35;46;1mAdd Cell Cycle annotation to anndata object[0m[34m
def addCycleCycleAnnotation(adata, s_genes = None, g2m_genes = None, geneprefix = "", use_raw_data= True, doinspect=False):
    if doinspect is True: print("\033[35;46;1mAdd Cell Cycle annotation to anndata object\033[0m\033[34m"); print(inspect.getsource(addCycleCycleAnnotation));print("\033[31;43;1mExecution:\033[0m")

    # uses Seurat Cell Cycles default genes by default
    if s_genes is None: # "MLF1IP"
        s_genes = ["MCM5","PCNA","TYMS","FEN1","MCM2","MCM4","RRM1","UNG","GINS2","MCM6","CDCA7","DTL","PRIM1","UHRF1","HELLS","RFC2","RPA2","NASP","RAD51AP1","GMNN","WDR76","SLBP","CCNE2","UBR7","POLD3","MSH2","ATAD2","RAD51","RRM2","CDC45","CDC6","EXO1","TIPIN","DSCC1","BLM","CASP8AP2","USP1","CLSPN","POLA1","CHAF1B","BRIP1","E2F8"]
    if g2m_genes is None: #use default list
        g2m_genes =["HMGB2","CDK1","NUSAP1","UBE2C","BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF","TACC3","FAM64A

In [11]:
# Build the fine cell-type annotation: start from the general cell types, blank
# out every cell of the broad "Epithelial" class, then fill in the balanced
# epithelial sub-cluster label wherever one was assigned.
general = np.array(adata.obs["general_celltypes"])
general[adata.obs["broad_celltypes"] == "Epithelial"] = "filtered"
flt = adata.obs["subcluster_epithelial_balanced"] != "filtered"
general[flt] = adata.obs["subcluster_epithelial_balanced"][flt]

# Epithelial sub-cluster labels get an "Epithelial " prefix.
epithelial_labels = [
    "SOX9", "SOX9_prolif", "SOX9_LGR5",
    "Lumenal 1", "Lumenal 2",
    "Glandular", "Glandular_secretory",
    "Pre-ciliated", "Ciliated", "Ciliated LRG5",
]
# The remaining labels keep their name; they are listed only to fix the category
# order that the colour list below relies on.
unchanged_labels = [
    "Lymphoid", "Myeloid",
    "Endothelial ACKR1", "Endothelial SEMA3G",
    "PV MYH11", "PV STEAP4",
    "uSMC",
    "Fibroblast C7", "Fibroblast eS", "Fibroblast dS",
]
fine_celltype_names = {label: "Epithelial " + label for label in epithelial_labels}
fine_celltype_names.update({label: label for label in unchanged_labels})
fine_celltype_names[("doublets", "Low QC", "Other")] = "filtered"

adata.obs["fine_celltypes"] = lpy.applyFactorRename(general, fine_celltype_names)
# One colour per category: 10 epithelial, 10 non-epithelial, then "filtered".
adata.uns["fine_celltypes_colors"] = [
    '#974815', '#F08129', '#EAB896', '#F7C900', '#C69E57', '#E53215', '#9C1915',
    '#0C31A8', '#429FD9', '#675DA6', '#8B4F9A', '#499ABA', '#48B750', '#274E13',
    '#7C7C7C', '#C4C4C2', '#A7A7CE', '#776747', '#A58A62', '#D6C2A7', '#FFFFFF',
]
adata.obs["fine_celltypes"].value_counts()

filtered                          46884
Fibroblast dS                     24436
Fibroblast eS                     12675
uSMC                               7746
Endothelial ACKR1                  7188
PV MYH11                           3414
PV STEAP4                          3345
Epithelial SOX9                    2806
Lymphoid                           2311
Epithelial Glandular_secretory     2147
Epithelial Lumenal 1               1376
Epithelial SOX9_LGR5               1150
Epithelial Glandular               1101
Fibroblast C7                       932
Epithelial SOX9_prolif              883
Epithelial Ciliated                 762
Myeloid                             761
Endothelial SEMA3G                  389
Epithelial Lumenal 2                273
Epithelial Ciliated LRG5            182
Epithelial Pre-ciliated              49
Name: fine_celltypes, dtype: int64

In [12]:
# Save the annotated object (new obs annotation columns and uns colour palettes)
# to "N4-integrated_donors.h5ad".
adata.write_h5ad("N4-integrated_donors.h5ad")

In [13]:
sc.set_figure_params(figsize=[6, 6])

# Keep only cells that received an epithelial sub-cluster label.
keep_epithelial = (adata.obs["subcluster_epithelial"] != "filtered").values

# Take an explicit copy: slicing an AnnData returns a view, and the obsm
# assignment below would otherwise write into that view (implicit copy + warning).
adata2 = adata[keep_epithelial, :].copy()

# Expose the epithelial-specific embedding under the key that sc.pl.umap plots.
adata2.obsm["umap"] = adata2.obsm["X_umap_epithelial"]
sc.pl.umap(adata2, color=["subcluster_epithelial"], save="color.pdf", color_map="viridis")



In [14]:
# Draw the epithelial UMAP once per metadata column and save each figure under
# its own file-name suffix: (obs column, value passed to `save`).
umap_covariate_panels = [
    ("Day", "Day.pdf"),
    ("DonorID", "DonorID.pdf"),
    ("Location", "Location.pdf"),
    ("Binary Stage", "BinaryStage.pdf"),
    ("BiopsyType", "BiopsyType.pdf"),
    ("phase", "Phase.pdf"),
]
for obs_column, pdf_suffix in umap_covariate_panels:
    sc.pl.umap(adata2, color=[obs_column], save=pdf_suffix, color_map="viridis")

