In [1]:
# Curating organoids cell types and projection with in vivo data
# NOTE(review): `imp` has been deprecated since Python 3.4 and was removed in 3.12;
# it is not used in any cell visible here — confirm before dropping it.
import anndata, numpy as np, pandas as pd, imp, lpy, scanpy as sc
# Print the versions of scanpy and its main dependencies (see the cell output).
sc.logging.print_header()



scanpy==1.6.0 anndata==0.7.5 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==1.0.1 scikit-learn==0.22.2 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1 leidenalg==0.7.0


In [2]:
# Load the integrated organoid object; every following cell adds label columns to its .obs.
adata = anndata.read_h5ad("N2-integrated_organoids.h5ad")
# Load the integrated object from ../primary (presumably the in vivo reference named in
# the notebook title — confirm). It is not used in any cell visible here.
adata_primary = anndata.read_h5ad("../primary/N5-integrated_donors.h5ad")

In [3]:
# Collapse the cluster ids of "leiden_scvi_genotype_bulkorg_subsmpl" into named labels.
# Tuple keys group several cluster ids under one label. The entry order is kept as-is
# because applyFactorRename builds its ordered categories from the dict values.
adata.obs["leiden_scvi_genotype_bulkorg_subsmpl_renamed"] = lpy.applyFactorRename(
    adata.obs["leiden_scvi_genotype_bulkorg_subsmpl"],
    {
        ("0", "8", "5", "3", "6"): "No Hormone",
        ("9", "2"): "No Hormone MKI67",
        "7": "Estrogen MKI67",
        "4": "Estrogen",
        "11": "Ciliated 1",
        "10": "Ciliated 2",
        "1": "Secretory",
    },
    doinspect=True,  # also prints the helper's source in the cell output
)
# Cell output: number of cells per label.
adata.obs["leiden_scvi_genotype_bulkorg_subsmpl_renamed"].value_counts()

[35;46;1mRename classes in list[0m[34m
def applyFactorRename(input, dico, doinspect = False):
    if doinspect is True: print("\033[35;46;1mRename classes in list\033[0m\033[34m"); print(inspect.getsource(applyFactorRename));
    import numpy as np
    tmp = np.array(input, dtype=object)
    out = tmp.copy()
    for k,x in dico.items():
        if isinstance(k, tuple):
            for l in k:
                out[tmp == l] = x
        else:
            out[tmp == k] = x
    leftover_categories = set(out)
    leftover_categories = leftover_categories.difference(set(dico.values()))
    categories = list(dico.values())
    categories = categories + list(leftover_categories)
    return(pd.Categorical(out, categories, ordered=True))



filtered            180624
No Hormone            1456
No Hormone MKI67       461
Secretory              412
Estrogen               268
Estrogen MKI67         174
Ciliated 2              70
Ciliated 1              64
Name: leiden_scvi_genotype_bulkorg_subsmpl_renamed, dtype: int64

In [4]:
# Collapse the cluster ids of "leiden_scvi_nobatch_bulkorg_subsmpl" into named labels.
# Tuple keys group several cluster ids under one label. The entry order is kept as-is
# because applyFactorRename builds its ordered categories from the dict values.
adata.obs["nobatch_bulkorg_subsmpl"] = lpy.applyFactorRename(
    adata.obs["leiden_scvi_nobatch_bulkorg_subsmpl"],
    {
        ("0", "5", "3", "6"): "No Hormone",
        "4": "Estrogen",
        "8": "Ciliated",
        "2": "Secretory",
        ("7", "1"): "No Hormone MKI67",
    },
    doinspect=True,  # also prints the helper's source in the cell output
)
# Cell output: number of cells per label.
adata.obs["nobatch_bulkorg_subsmpl"].value_counts()

[35;46;1mRename classes in list[0m[34m
def applyFactorRename(input, dico, doinspect = False):
    if doinspect is True: print("\033[35;46;1mRename classes in list\033[0m\033[34m"); print(inspect.getsource(applyFactorRename));
    import numpy as np
    tmp = np.array(input, dtype=object)
    out = tmp.copy()
    for k,x in dico.items():
        if isinstance(k, tuple):
            for l in k:
                out[tmp == l] = x
        else:
            out[tmp == k] = x
    leftover_categories = set(out)
    leftover_categories = leftover_categories.difference(set(dico.values()))
    categories = list(dico.values())
    categories = categories + list(leftover_categories)
    return(pd.Categorical(out, categories, ordered=True))



filtered            180624
No Hormone            1391
No Hormone MKI67       575
Secretory              422
Estrogen               401
Ciliated               116
Name: nobatch_bulkorg_subsmpl, dtype: int64

In [5]:
# Name the clusters of "leiden_scvi_scvi_genotype_E001_hormones".
# Every key is a tuple of cluster ids; single-cluster labels use a one-element tuple.
# The entry order is preserved from the original mapping.
adata.obs["E001_celltypes"] = lpy.applyFactorRename(
    adata.obs["leiden_scvi_scvi_genotype_E001_hormones"],
    {
        ("4", "3"): "No Hormone cycling",
        ("1",): "No Hormone",
        ("2",): "Estrogen",
        ("6",): "Preciliated",
        ("5",): "Ciliated",
        ("0",): "Lumenal 1",
        ("8",): "Lumenal 2",
        ("9",): "Lumenal 3",
        ("7",): "Other",
        ("10",): "Low Qc",
    },
)
# Cell output: number of cells per label.
adata.obs["E001_celltypes"].value_counts()

filtered              159814
No Hormone cycling      5961
Lumenal 1               4390
No Hormone              3658
Estrogen                3505
Ciliated                2287
Preciliated             1806
Other                   1350
Lumenal 3                379
Lumenal 2                219
Low Qc                   160
Name: E001_celltypes, dtype: int64

In [6]:
# Name the clusters of "leiden_scvi_genotype_bulkorg_hormones".
# Every key is a tuple of cluster ids; the entry order is preserved from the original mapping.
adata.obs["bulkorg_celltypes"] = lpy.applyFactorRename(
    adata.obs["leiden_scvi_genotype_bulkorg_hormones"],
    {
        ("3",): "No Hormone",
        ("5", "2"): "No Hormone cycling",
        ("1",): "Estrogen",
        ("4", "6"): "Estrogen cycling",
        ("9",): "Preciliated",
        ("8",): "Ciliated",
        ("0", "7"): "Lumenal",
    },
)
# Cell output: number of cells per label.
adata.obs["bulkorg_celltypes"].value_counts()

filtered              181824
Lumenal                  435
No Hormone cycling       379
Estrogen cycling         267
Estrogen                 258
No Hormone               239
Ciliated                  66
Preciliated               61
Name: bulkorg_celltypes, dtype: int64

In [7]:
# Name the clusters of "leiden_scvi_scvi_genotype_E001_strictlyhormones".
# Clusters "8" and "9" are not in the mapping, so they keep their raw ids
# (they show up as "8" and "9" in the counts below).
adata.obs["E001_celltypes_strictlyhormones"] = lpy.applyFactorRename(
    adata.obs["leiden_scvi_scvi_genotype_E001_strictlyhormones"],
    {
        ("4", "6"): "Preciliated",
        ("2",): "Ciliated",
        ("7",): "PreLumenal",
        ("0", "5"): "Lumenal",
        ("1", "3"): "SOX9",
    },
)
# Cell output: number of cells per label.
adata.obs["E001_celltypes_strictlyhormones"].value_counts()

filtered       171289
SOX9             3786
Lumenal          3692
Ciliated         1995
Preciliated      1761
PreLumenal        474
8                 358
9                 174
Name: E001_celltypes_strictlyhormones, dtype: int64

In [8]:
# Name the clusters of "leidenres2_scvi_scvi_genotype_E001".
# NOTE: this reassigns adata.obs["E001_celltypes"], replacing the labels stored under
# that key earlier in the notebook (derived from "leiden_scvi_scvi_genotype_E001_hormones").
# Clusters "18", "21" and "22" are not in the mapping, so they keep their raw ids
# in the counts below.
adata.obs["E001_celltypes"] = lpy.applyFactorRename(
    adata.obs["leidenres2_scvi_scvi_genotype_E001"],
    {
        ("3", "14", "16", "11"): "No hormones",
        ("9", "7", "10", "15"): "No hormones MKI67",
        ("2", "20", "1", "8", "13", "19"): "No hormones Maturing",
        ("17", "12"): "Ciliated Maturing",
        "4": "Ciliated",
        "5": "Lumenal Maturing",
        "6": "Lumenal MKI67",
        "0": "Lumenal",
    },
)
# Cell output: number of cells per label.
adata.obs["E001_celltypes"].value_counts()

filtered                146208
No hormones Maturing     10120
No hormones MKI67         7199
No hormones               5844
Lumenal                   4182
Ciliated                  2450
Lumenal Maturing          2385
Lumenal MKI67             2252
Ciliated Maturing         1977
18                         587
21                         255
22                          70
Name: E001_celltypes, dtype: int64

In [25]:
# No batch correction, no subsampling
# NOTE(review): the execution count jumps from 8 to 25, and the save log mentions a
# 'logist_proj_E001_celltypes' column that no visible cell creates. The saved object
# therefore depends on cells not shown here — confirm it reproduces on Restart & Run All.
# Write the annotated object to a new file; the N2 input file is not overwritten.
adata.write_h5ad("N3-integrated_organoids.h5ad")

... storing 'logist_proj_E001_celltypes' as categorical
