In [1]:
from typing import * 
from pathlib import Path

import pandas as pd
import numpy as np
import anndata as ad
import scanpy as sc
from tqdm import tqdm 
import matplotlib.pyplot as plt

from scrna_atlas import settings, utils

In [2]:
# Reference dataset, opened with backed='r' so it is read from disk on demand.
# Later cells read `adata.var[['feature_length']]` from this object to supply
# feature lengths to the processing calls, so `adata` must stay bound to it.
adata = ad.read_h5ad('/efs/expression_atlas/scrna/cellxgene/old/b84def55-a776-4aa4-a9a6-7aab8b973086.h5ad', backed='r')
# Alternative reference file kept for comparison:
# adata = ad.read_h5ad('/efs/expression_atlas/scrna/cellxgene/old/3310476e-ee9d-4179-9446-df5d073f38d8.h5ad', backed='r')

In [7]:
# Scan every CELLxGENE h5ad in the snapshot directory and collect the Tabula
# Muris and Tabula Sapiens files by matching on the title stored in `uns`.
#
# The per-file handle is deliberately NOT named `adata`: other cells read
# `adata.var[['feature_length']]` from the reference dataset, and rebinding
# `adata` here would silently replace it with whichever file the glob yields
# last.

tabula_muris_files = []
tabula_sapiens_files = []
for fh in Path('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/').glob('*.h5ad'):
    # Only the title is needed, so open in backed mode and read it once.
    _scan_adata = ad.read_h5ad(fh, backed='r')
    try:
        _title = _scan_adata.uns['title']
    finally:
        # Release the HDF5 handle; scanning a whole directory would otherwise
        # leave one open file per dataset.
        _scan_adata.file.close()
    print(_title)
    if 'A single-cell transcriptomic atlas characterizes ageing tissues in the mouse' in _title:
        tabula_muris_files.append(fh)
    if 'Tabula Sapiens' in _title:
        tabula_sapiens_files.append(fh)

Lung - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
Spatial transcriptomics in the healthy human kidney: Puck_200115_15
Individual Single-Cell RNA-seq PBMC Data from Lee et al.
Dissection: Thalamus (THM) - intralaminar nuclear complex (ILN) - posterior group of intralaminar nuclei (PILN) - centromedian and parafasicular nuclei - CM and Pf
Spatiotemporal analysis of human intestinal development at single-cell resolution: Fetal A6
An integrated transcriptomic and epigenomic atlas of mouse primary motor cortex cell types: 10X_nuclei_v3_Broad
UMAP visualization of fibroblast subclusters
Visium spatial - LV
Tabula Sapiens - Epithelial
Fovea - Cell Types of the Human Retina and Its Organoids at Single-Cell Resolution
Spatial transcriptomics in mouse: Puck_191223_17
white matter - oligodendroglia
DCM/ACM heart cell atlas: Macrophages
E9.5: 201112_05
Mature kidney dataset: full
Trophoblast_stemcells
Single cell RNA-seq data from normal adult kidney tissue


In [8]:
# Display the paths whose title contained 'Tabula Sapiens'.
tabula_sapiens_files

[PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/97a17473-e2b1-4f31-a544-44a60773e2dd.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/c5d88abe-f23a-45fa-a534-788985e93dad.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/18eb630b-a754-4111-8cd4-c24ec80aa5ec.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/e5c63d94-593c-4338-a489-e1048599e751.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/cee11228-9f0b-4e57-afe2-cfe15ee56312.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/ff45e623-7f5f-46e3-b47d-56be0341f66b.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/e6a11140-2545-46bc-929e-da243eed2cae.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/0d2ee4ac-05ee-40b2-afb6-ebb584caa867.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/5a11f879-d1ef-458a-910c-9b0bdfca5ebf.

In [9]:
# Display the paths whose title matched the mouse ageing-atlas title.
tabula_muris_files

[PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/0fb7916e-7a68-4a4c-a441-3ab3989f29a7.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/b8c618e5-4b3d-4566-8a3f-7e40047f5c54.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/3f4fe86f-aced-4d10-b174-ee35b9f46b9d.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/4546e757-34d0-4d17-be06-538318925fcd.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/76544818-bc5b-4a0d-87d4-40dde89545cb.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/1d29fd10-c8b3-4611-b0ac-3c578125adbf.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/98ad5247-68f8-42f8-b8e5-7938cb373a91.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/f16a8f4d-bc97-43c5-a2f6-bbda952e4c5c.h5ad'),
 PosixPath('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/821c79aa-044e-40cf-b331-0fe3edd48019.

In [3]:
# Directory of source CELLxGENE h5ad files (2023-12-15 snapshot); the
# processing loops resolve '<dataset id>.h5ad' relative to this path.
cellxgene_loc = Path('/efs/expression_atlas/scrna/cellxgene/2023-12-15/h5ads/')

In [4]:
# Create the per-platform output directories for the Tabula Muris pseudobulks.
# parents=True also creates `tabula_muris` itself when it is missing, so the
# cell does not raise FileNotFoundError on a fresh DATA_LOC; exist_ok=True
# keeps it safe to re-run.
for _platform_dir in ('10X', 'smart-seq'):
    (settings.DATA_LOC / 'tabula_muris' / _platform_dir).mkdir(parents=True, exist_ok=True)

In [25]:
# Process every Tabula Muris dataset that has an h5ad directly under
# DATA_LOC/tabula_muris, writing results to the '10X' or 'smart-seq'
# sub-directory depending on the detected platform.
#
# NOTE(review): `process_cellxgene_atlas_adatas` is neither defined nor
# imported in the visible cells — confirm where it comes from before relying
# on a fresh-kernel run.
# NOTE(review): `adata` (source of `feature_length`) is whatever earlier cells
# last bound it to — confirm it is the intended reference dataset.
# NOTE(review): in the recorded run the two 'Large intestine' datasets printed
# shapes opposite to every other dataset with the same platform in its title
# (21069 rows for the '10x' title, 17985 for the 'Smart-seq2' title); the
# titles may be mislabelled there — consider checking obs['assay'].
# for fh in tabula_muris_files:
for fh in (settings.DATA_LOC / 'tabula_muris').glob('*.h5ad'):
    print(fh.stem)
    # File stems look like '<dataset id>_pseuodobulk'; keep only the id.
    _dataset_id = fh.stem.split('_')[0]
    _adata = ad.read_h5ad(cellxgene_loc / f'{_dataset_id}.h5ad', backed='r')
    try:
        _title = _adata.uns['title']
        print(_title)

        # Platform detection: the title wins; datasets whose title names no
        # platform fall back to the `assay` column when it has a single value.
        if '10x' in _title:
            _platform_dir = '10X'
        elif 'Smart-seq2' in _title:
            _platform_dir = 'smart-seq'
        elif _adata.obs['assay'].nunique() == 1 and '10X' in _adata.obs['assay'].values[0].upper():
            _platform_dir = '10X'
        elif _adata.obs['assay'].nunique() == 1 and 'SMART' in _adata.obs['assay'].values[0].upper():
            _platform_dir = 'smart-seq'
        else:
            print(f'Skipping {fh.stem}...')
            continue

        # Only Smart-seq data requests length normalisation; 10X calls leave
        # `normalize_length` at the callee's default, as before.
        _platform_kwargs = {'normalize_length': True} if _platform_dir == 'smart-seq' else {}

        # Reuse the handle opened above instead of opening the same file again.
        process_cellxgene_atlas_adatas(
            _adata,
            _dataset_id,
            out_loc=settings.DATA_LOC / 'tabula_muris' / _platform_dir,
            use_rep='X_pca',
            umap_key='X_umap',
            feature_lengths_df=adata.var[['feature_length']].copy(),
            **_platform_kwargs,
        )
    finally:
        # Release the backed HDF5 handle whether the dataset was processed,
        # skipped, or raised.
        _adata.file.close()

0fb7916e-7a68-4a4c-a441-3ab3989f29a7_pseuodobulk
Lung - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 114/114 [00:12<00:00,  9.08it/s]
100%|██████████| 64/64 [00:58<00:00,  1.09it/s]
... storing 'tissue_cell_type' as categorical


3f4fe86f-aced-4d10-b174-ee35b9f46b9d_pseuodobulk
Trachea - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 49/49 [00:02<00:00, 19.90it/s]
100%|██████████| 34/34 [00:09<00:00,  3.58it/s]
... storing 'tissue_cell_type' as categorical


4546e757-34d0-4d17-be06-538318925fcd_pseuodobulk
Liver - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 38/38 [00:01<00:00, 19.86it/s]
100%|██████████| 25/25 [00:07<00:00,  3.54it/s]
... storing 'tissue_cell_type' as categorical


1d29fd10-c8b3-4611-b0ac-3c578125adbf_pseuodobulk
Skin of body - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 41/41 [00:03<00:00, 10.97it/s]
100%|██████████| 29/29 [00:11<00:00,  2.42it/s]
... storing 'tissue_cell_type' as categorical


4fa55ee5-8da4-4d42-9525-1c52d4ce50bf_pseuodobulk
Skin of body - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 41/41 [00:03<00:00, 10.91it/s]
100%|██████████| 32/32 [00:12<00:00,  2.48it/s]
... storing 'tissue_cell_type' as categorical


2491629a-bde0-46ad-a073-e34fcb516857_pseuodobulk
Large intestine - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(21069, 9)
(21069, 10)


100%|██████████| 65/65 [00:07<00:00,  8.47it/s]
100%|██████████| 37/37 [00:25<00:00,  1.47it/s]
... storing 'tissue_cell_type' as categorical


1a0610d8-1339-479b-b261-7fb586c3dab9_pseuodobulk
Trachea - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 91/91 [00:05<00:00, 18.13it/s]
100%|██████████| 45/45 [00:19<00:00,  2.30it/s]
... storing 'tissue_cell_type' as categorical


48b37086-25f7-4ecd-be66-f5bb378e3aea_pseuodobulk
All - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 1007/1007 [03:27<00:00,  4.84it/s]
100%|██████████| 535/535 [14:26<00:00,  1.62s/it]
... storing 'tissue_cell_type' as categorical


0bd1a1de-3aee-40e0-b2ec-86c7a30c7149_pseuodobulk
Bone marrow - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 154/154 [00:29<00:00,  5.15it/s]
100%|██████████| 74/74 [01:58<00:00,  1.61s/it]
... storing 'tissue_cell_type' as categorical


0380ddce-c31b-422a-88fe-34a1945bd949_pseuodobulk
Aorta — A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(21069, 9)
(21069, 10)


100%|██████████| 34/34 [00:00<00:00, 75.56it/s]
100%|██████████| 22/22 [00:01<00:00, 13.97it/s]
... storing 'tissue_cell_type' as categorical


1fe63353-9e75-4824-aa30-ed8d84be748c_pseuodobulk
Heart - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 51/51 [00:05<00:00,  8.72it/s]
100%|██████████| 34/34 [00:23<00:00,  1.47it/s]
... storing 'tissue_cell_type' as categorical


1efd4700-87dd-4b45-8762-11ba3fea7a65_pseuodobulk
Heart - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 74/74 [00:07<00:00,  9.57it/s]
100%|██████████| 47/47 [00:30<00:00,  1.55it/s]
... storing 'tissue_cell_type' as categorical


170ce19f-7a2f-4926-a1cc-adcad99e7474_pseuodobulk
Thymus - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 40/40 [00:02<00:00, 19.50it/s]
100%|██████████| 29/29 [00:08<00:00,  3.30it/s]
... storing 'tissue_cell_type' as categorical


05e6f6e3-0473-4b85-9f94-bcc5f1b5e04b_pseuodobulk
Lung - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 102/102 [00:03<00:00, 29.20it/s]
100%|██████████| 45/45 [00:11<00:00,  3.92it/s]
... storing 'tissue_cell_type' as categorical


b8c618e5-4b3d-4566-8a3f-7e40047f5c54_pseuodobulk
Kidney - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 136/136 [00:16<00:00,  8.43it/s]
100%|██████████| 79/79 [01:01<00:00,  1.29it/s]
... storing 'tissue_cell_type' as categorical


76544818-bc5b-4a0d-87d4-40dde89545cb_pseuodobulk
Adipose tissue — A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(17985, 9)
(17985, 10)


100%|██████████| 55/55 [00:04<00:00, 12.09it/s]
100%|██████████| 37/37 [00:17<00:00,  2.10it/s]
... storing 'tissue_cell_type' as categorical


98ad5247-68f8-42f8-b8e5-7938cb373a91_pseuodobulk
Tongue - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 39/39 [00:21<00:00,  1.83it/s]
100%|██████████| 36/36 [01:24<00:00,  2.35s/it]
... storing 'tissue_cell_type' as categorical


821c79aa-044e-40cf-b331-0fe3edd48019_pseuodobulk
Mesenteric fat pad — A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(21069, 9)
(21069, 10)


100%|██████████| 37/37 [00:01<00:00, 18.85it/s]
100%|██████████| 26/26 [00:07<00:00,  3.41it/s]
... storing 'tissue_cell_type' as categorical


c9096ac4-ea44-4cf9-82f4-af05cb83eb24_pseuodobulk
Tongue - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 22/22 [00:02<00:00,  7.41it/s]
100%|██████████| 16/16 [00:10<00:00,  1.49it/s]
... storing 'tissue_cell_type' as categorical


bc7466d7-ff13-4ff2-9c3d-7a1d208bd492_pseuodobulk
Mammary gland - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 22/22 [00:01<00:00, 12.61it/s]
100%|██████████| 17/17 [00:07<00:00,  2.28it/s]
... storing 'tissue_cell_type' as categorical


8f1bc86b-7976-4826-8602-f5266160ad86_pseuodobulk
Brown adipose tissue — A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(21069, 9)
(21069, 10)


100%|██████████| 27/27 [00:00<00:00, 35.01it/s]
100%|██████████| 18/18 [00:03<00:00,  5.63it/s]
... storing 'tissue_cell_type' as categorical


e3b8c485-7811-407e-99ed-c7d574be9d7c_pseuodobulk
Pancreas - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 44/44 [00:03<00:00, 14.08it/s]
100%|██████████| 24/24 [00:09<00:00,  2.61it/s]
... storing 'tissue_cell_type' as categorical


6202a243-b713-4e12-9ced-c387f8483dea_pseuodobulk
Liver - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 58/58 [00:03<00:00, 14.59it/s]
100%|██████████| 38/38 [00:14<00:00,  2.70it/s]
... storing 'tissue_cell_type' as categorical


db55b719-6102-493a-9251-404bc501d0de_pseuodobulk
Bone marrow - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 111/111 [00:11<00:00,  9.52it/s]
100%|██████████| 60/60 [00:44<00:00,  1.35it/s]
... storing 'tissue_cell_type' as categorical


7c6091da-4606-44c7-a2c4-ef896de09e28_pseuodobulk
Mammary gland - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 63/63 [00:08<00:00,  7.30it/s]
100%|██████████| 32/32 [00:37<00:00,  1.18s/it]
... storing 'tissue_cell_type' as categorical


524179b0-b406-4723-9c46-293ffa77ca81_pseuodobulk
Kidney - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 32/32 [00:01<00:00, 30.87it/s]
100%|██████████| 21/21 [00:03<00:00,  5.62it/s]
... storing 'tissue_cell_type' as categorical


66ff82b4-9380-469c-bc4b-cfa08eacd325_pseuodobulk
Brain non-myeloid cells - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(21069, 9)
(21069, 10)


100%|██████████| 68/68 [00:05<00:00, 11.42it/s]
100%|██████████| 39/39 [00:20<00:00,  1.88it/s]
... storing 'tissue_cell_type' as categorical


c08f8441-4a10-4748-872a-e70c0bcccdba_pseuodobulk
Brain myeloid cells - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 21/21 [00:06<00:00,  3.25it/s]
100%|██████████| 20/20 [00:29<00:00,  1.46s/it]
... storing 'tissue_cell_type' as categorical


cbd62079-bed8-4aa1-9659-670f9cb51f9d_pseuodobulk
Diaphragm — A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(21069, 9)
(21069, 10)


100%|██████████| 27/27 [00:00<00:00, 34.24it/s]
100%|██████████| 16/16 [00:02<00:00,  7.02it/s]
... storing 'tissue_cell_type' as categorical


93966790-bbfa-420f-aa85-bc5ca51d9c96_pseuodobulk
Bladder lumen - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 25/25 [00:05<00:00,  4.26it/s]
100%|██████████| 20/20 [00:24<00:00,  1.22s/it]
... storing 'tissue_cell_type' as categorical


e80d4e1c-672f-496a-8f32-37eab34f727d_pseuodobulk
Spleen - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 28/28 [00:01<00:00, 15.37it/s]
100%|██████████| 21/21 [00:08<00:00,  2.56it/s]
... storing 'tissue_cell_type' as categorical


ef47280b-3e68-4188-a49a-7b8374c8a6f2_pseuodobulk
Pancreas - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 33/33 [00:02<00:00, 16.08it/s]
100%|██████████| 22/22 [00:09<00:00,  2.30it/s]
... storing 'tissue_cell_type' as categorical


6e4f871d-fd7c-4909-8c14-e4c9957c2e8f_pseuodobulk
Thymus - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 47/47 [00:05<00:00,  9.00it/s]
100%|██████████| 28/28 [00:20<00:00,  1.34it/s]
... storing 'tissue_cell_type' as categorical


c2878000-d3f0-4d30-9a8a-2139a13c72f8_pseuodobulk
Subcutaneous adipose tissue — A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(21069, 9)
(21069, 10)


100%|██████████| 53/53 [00:02<00:00, 20.74it/s]
100%|██████████| 38/38 [00:08<00:00,  4.54it/s]
... storing 'tissue_cell_type' as categorical


a7ace090-1ba1-47f2-8def-6e11298b7816_pseuodobulk
Limb muscle - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 84/84 [00:21<00:00,  3.88it/s]
100%|██████████| 35/35 [01:29<00:00,  2.57s/it]
... storing 'tissue_cell_type' as categorical


bf12f9c6-4211-4c91-9c71-22019f29f516_pseuodobulk
Bladder lumen - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 14/14 [00:01<00:00,  7.58it/s]
100%|██████████| 12/12 [00:05<00:00,  2.03it/s]
... storing 'tissue_cell_type' as categorical


de4e7a0c-91b2-44e4-b382-87da74c9efb6_pseuodobulk
Large intestine - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(17985, 9)
(17985, 10)


100%|██████████| 32/32 [00:01<00:00, 28.27it/s]
100%|██████████| 20/20 [00:04<00:00,  4.87it/s]
... storing 'tissue_cell_type' as categorical


e2b469d4-b5c3-4a35-9d19-ee71ce61cae0_pseuodobulk
Spleen - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - 10x
(17985, 9)
(17985, 10)


100%|██████████| 103/103 [00:20<00:00,  4.98it/s]
100%|██████████| 48/48 [01:34<00:00,  1.98s/it]
... storing 'tissue_cell_type' as categorical


ec6c52b8-3368-4f72-a416-1ade0dab97bf_pseuodobulk
Limb muscle - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 46/46 [00:01<00:00, 28.96it/s]
100%|██████████| 27/27 [00:05<00:00,  4.52it/s]
... storing 'tissue_cell_type' as categorical


98e5ea9f-16d6-47ec-a529-686e76515e39_pseuodobulk
All - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
(21069, 9)
(21069, 10)


100%|██████████| 1055/1055 [02:32<00:00,  6.90it/s]
100%|██████████| 532/532 [09:42<00:00,  1.10s/it]
... storing 'tissue_cell_type' as categorical


a6626b73-a0de-4dee-99aa-2559ab05af11_pseuodobulk
Gonadal fat pad — A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
(21069, 9)
(21069, 10)


100%|██████████| 51/51 [00:01<00:00, 25.59it/s]
100%|██████████| 31/31 [00:06<00:00,  4.43it/s]
... storing 'tissue_cell_type' as categorical


f16a8f4d-bc97-43c5-a2f6-bbda952e4c5c_pseuodobulk
All - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse
Skipping f16a8f4d-bc97-43c5-a2f6-bbda952e4c5c_pseuodobulk...


In [5]:
# Create the per-platform output directories for the Tabula Sapiens
# pseudobulks. parents=True also creates `tabula_sapiens` itself when it is
# missing, so the cell does not raise FileNotFoundError on a fresh DATA_LOC;
# exist_ok=True keeps it safe to re-run.
for _platform_dir in ('10X', 'smart-seq'):
    (settings.DATA_LOC / 'tabula_sapiens' / _platform_dir).mkdir(parents=True, exist_ok=True)

In [7]:
# Process every Tabula Sapiens dataset that has an h5ad directly under
# DATA_LOC/tabula_sapiens. Each source file is loaded fully (no backed mode),
# split by assay into a 10X subset and a Smart-seq subset, and every
# non-empty subset is processed into its own output directory.
# for fh in tabula_sapiens_files:
for fh in (settings.DATA_LOC / 'tabula_sapiens').glob('*.h5ad'):
    print(fh.stem)
    _dataset_id = fh.stem.split('_')[0]
    _adata = ad.read_h5ad(cellxgene_loc / f'{_dataset_id}.h5ad')

    # Case-insensitive assay match; both subsets are taken before any
    # processing starts.
    _assay_upper = _adata.obs['assay'].str.upper()
    _adata_10X = _adata[_assay_upper.str.contains('10X')]
    _adata_smartseq = _adata[_assay_upper.str.contains('SMART')]

    # (subset, output sub-directory, normalize_length)
    _platform_jobs = (
        (_adata_10X, '10X', False),
        (_adata_smartseq, 'smart-seq', True),
    )
    for _subset, _platform_dir, _normalize_length in _platform_jobs:
        if _subset.shape[0] == 0:
            continue
        # umap_key='X_scvi_umap' was the alternative left commented out in
        # the original cell; 'X_umap' is the key actually used.
        process_cellxgene_atlas_adatas(
            _subset,
            _dataset_id,
            out_loc=settings.DATA_LOC / 'tabula_sapiens' / _platform_dir,
            use_rep='X_scvi',
            umap_key='X_umap',
            feature_lengths_df=adata.var[['feature_length']].copy(),
            normalize_length=_normalize_length,
        )


18eb630b-a754-4111-8cd4-c24ec80aa5ec_pseuodobulk


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


(58604, 12)
(58604, 13)


100%|██████████| 194/194 [00:00<00:00, 384.57it/s]
100%|██████████| 136/136 [00:44<00:00,  3.08it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 102/102 [00:00<00:00, 1169.27it/s]
100%|██████████| 54/54 [00:01<00:00, 42.73it/s]
... storing 'tissue_cell_type' as categorical


0d2ee4ac-05ee-40b2-afb6-ebb584caa867_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 224/224 [00:00<00:00, 400.29it/s]
100%|██████████| 96/96 [00:30<00:00,  3.11it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 86/86 [00:00<00:00, 1086.06it/s]
100%|██████████| 34/34 [00:00<00:00, 37.18it/s]
... storing 'tissue_cell_type' as categorical


4f1555bc-4664-46c3-a606-78d34dd10d92_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 47/47 [00:00<00:00, 411.59it/s]
100%|██████████| 40/40 [00:07<00:00,  5.10it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 37/37 [00:00<00:00, 885.04it/s]
100%|██████████| 25/25 [00:01<00:00, 20.36it/s]
... storing 'tissue_cell_type' as categorical


2423ce2c-3149-4cca-a2ff-cf682ea29b5f_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 30/30 [00:00<00:00, 287.74it/s]
100%|██████████| 22/22 [00:07<00:00,  2.79it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 15/15 [00:00<00:00, 988.24it/s]
100%|██████████| 7/7 [00:00<00:00, 46.73it/s]
... storing 'tissue_cell_type' as categorical


1c9eb291-6d31-47e1-96b2-129b5e1ae64f_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 72/72 [00:00<00:00, 255.22it/s]
100%|██████████| 45/45 [00:22<00:00,  1.96it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 82/82 [00:00<00:00, 924.78it/s]
100%|██████████| 43/43 [00:03<00:00, 13.22it/s]
... storing 'tissue_cell_type' as categorical


0041b9c3-6a49-4bf7-8514-9bc7190067a7_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 70/70 [00:00<00:00, 630.35it/s]
100%|██████████| 49/49 [00:06<00:00,  7.64it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 41/41 [00:00<00:00, 1113.17it/s]
100%|██████████| 21/21 [00:00<00:00, 52.67it/s]
... storing 'tissue_cell_type' as categorical


0ced5e76-6040-47ff-8a72-93847965afc0_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 148/148 [00:00<00:00, 412.76it/s]
100%|██████████| 88/88 [00:28<00:00,  3.04it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 31/31 [00:00<00:00, 1123.34it/s]
100%|██████████| 15/15 [00:00<00:00, 54.28it/s]
... storing 'tissue_cell_type' as categorical


2ba40233-8576-4dec-a5f1-2adfa115e2dc_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 42/42 [00:00<00:00, 345.18it/s]
100%|██████████| 24/24 [00:09<00:00,  2.49it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 34/34 [00:00<00:00, 1120.91it/s]
100%|██████████| 17/17 [00:00<00:00, 62.75it/s]
... storing 'tissue_cell_type' as categorical


97a17473-e2b1-4f31-a544-44a60773e2dd_pseuodobulk
(58604, 11)
(58604, 12)


100%|██████████| 555/555 [00:01<00:00, 369.85it/s]
100%|██████████| 250/250 [01:30<00:00,  2.76it/s]
... storing 'tissue_cell_type' as categorical


(58604, 11)
(58604, 12)


100%|██████████| 224/224 [00:00<00:00, 1152.93it/s]
100%|██████████| 85/85 [00:03<00:00, 27.88it/s]
... storing 'tissue_cell_type' as categorical


c5d88abe-f23a-45fa-a534-788985e93dad_pseuodobulk
(58604, 11)
(58604, 12)


100%|██████████| 1846/1846 [00:03<00:00, 604.03it/s]
100%|██████████| 945/945 [03:42<00:00,  4.25it/s]
... storing 'tissue_cell_type' as categorical


(58604, 11)
(58604, 12)


100%|██████████| 931/931 [00:00<00:00, 1276.04it/s]
100%|██████████| 347/347 [00:07<00:00, 44.04it/s]
... storing 'tissue_cell_type' as categorical


e5c63d94-593c-4338-a489-e1048599e751_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 116/116 [00:00<00:00, 408.33it/s]
100%|██████████| 59/59 [00:20<00:00,  2.91it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 46/46 [00:00<00:00, 1043.86it/s]
100%|██████████| 20/20 [00:00<00:00, 36.40it/s]
... storing 'tissue_cell_type' as categorical


cee11228-9f0b-4e57-afe2-cfe15ee56312_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 185/185 [00:00<00:00, 490.08it/s]
100%|██████████| 97/97 [00:26<00:00,  3.71it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 71/71 [00:00<00:00, 1122.41it/s]
100%|██████████| 31/31 [00:00<00:00, 31.48it/s]
... storing 'tissue_cell_type' as categorical


ff45e623-7f5f-46e3-b47d-56be0341f66b_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 58/58 [00:00<00:00, 346.08it/s]
100%|██████████| 29/29 [00:10<00:00,  2.90it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 35/35 [00:00<00:00, 998.70it/s]
100%|██████████| 20/20 [00:00<00:00, 43.17it/s]
... storing 'tissue_cell_type' as categorical


e6a11140-2545-46bc-929e-da243eed2cae_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 62/62 [00:00<00:00, 339.85it/s]
100%|██████████| 46/46 [00:08<00:00,  5.62it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 27/27 [00:00<00:00, 1166.42it/s]
100%|██████████| 15/15 [00:00<00:00, 116.90it/s]
... storing 'tissue_cell_type' as categorical


5a11f879-d1ef-458a-910c-9b0bdfca5ebf_pseuodobulk
(58604, 11)
(58604, 12)


100%|██████████| 619/619 [00:00<00:00, 884.35it/s]
100%|██████████| 286/286 [00:21<00:00, 13.27it/s]
... storing 'tissue_cell_type' as categorical


(58604, 11)
(58604, 12)


100%|██████████| 221/221 [00:00<00:00, 1281.10it/s]
100%|██████████| 79/79 [00:01<00:00, 53.94it/s]
... storing 'tissue_cell_type' as categorical


d8732da6-8d1d-42d9-b625-f2416c30054b_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 70/70 [00:00<00:00, 520.22it/s]
100%|██████████| 44/44 [00:07<00:00,  6.25it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 33/33 [00:00<00:00, 1064.81it/s]
100%|██████████| 19/19 [00:00<00:00, 80.04it/s]
... storing 'tissue_cell_type' as categorical


53d208b0-2cfd-4366-9866-c3c6114081bc_pseuodobulk
(58604, 11)
(58604, 12)


100%|██████████| 2858/2858 [00:05<00:00, 491.11it/s]
100%|██████████| 1366/1366 [06:44<00:00,  3.38it/s]
... storing 'tissue_cell_type' as categorical


(58604, 11)
(58604, 12)


100%|██████████| 1309/1309 [00:01<00:00, 1199.20it/s]
100%|██████████| 501/501 [00:16<00:00, 31.28it/s]
... storing 'tissue_cell_type' as categorical


55cf0ea3-9d2b-4294-871e-bb4b49a79fc7_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 37/37 [00:00<00:00, 199.41it/s]
100%|██████████| 26/26 [00:13<00:00,  1.92it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 38/38 [00:00<00:00, 1005.38it/s]
100%|██████████| 18/18 [00:00<00:00, 28.13it/s]
... storing 'tissue_cell_type' as categorical


a68b64d8-aee3-4947-81b7-36b8fe5a44d2_pseuodobulk
(58604, 11)
(58604, 12)


100%|██████████| 703/703 [00:01<00:00, 523.99it/s]
100%|██████████| 350/350 [01:08<00:00,  5.11it/s]
... storing 'tissue_cell_type' as categorical


(58604, 11)
(58604, 12)


100%|██████████| 309/309 [00:00<00:00, 1028.56it/s]
100%|██████████| 119/119 [00:02<00:00, 42.23it/s]
... storing 'tissue_cell_type' as categorical


6d41668c-168c-4500-b06a-4674ccf3e19d_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 46/46 [00:00<00:00, 627.21it/s]
100%|██████████| 30/30 [00:04<00:00,  7.33it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 20/20 [00:00<00:00, 955.48it/s]
100%|██████████| 10/10 [00:00<00:00, 97.48it/s]
... storing 'tissue_cell_type' as categorical


6ec405bb-4727-4c6d-ab4e-01fe489af7ea_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 46/46 [00:00<00:00, 447.59it/s]
100%|██████████| 32/32 [00:05<00:00,  5.39it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 18/18 [00:00<00:00, 836.32it/s]
100%|██████████| 9/9 [00:00<00:00, 70.50it/s]
... storing 'tissue_cell_type' as categorical


f01bdd17-4902-40f5-86e3-240d66dd2587_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 175/175 [00:00<00:00, 521.26it/s]
100%|██████████| 77/77 [00:24<00:00,  3.18it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 27/27 [00:00<00:00, 1069.90it/s]
100%|██████████| 16/16 [00:00<00:00, 43.74it/s]
... storing 'tissue_cell_type' as categorical


5e5e7a2f-8f1c-42ac-90dc-b4f80f38e84c_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 34/34 [00:00<00:00, 143.11it/s]
100%|██████████| 27/27 [00:17<00:00,  1.52it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 33/33 [00:00<00:00, 1149.40it/s]
100%|██████████| 15/15 [00:00<00:00, 52.42it/s]
... storing 'tissue_cell_type' as categorical


d77ec7d6-ef2e-49d6-9e79-05b7f8881484_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 95/95 [00:00<00:00, 478.59it/s]
100%|██████████| 52/52 [00:13<00:00,  3.85it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 15/15 [00:00<00:00, 823.06it/s]
100%|██████████| 11/11 [00:00<00:00, 37.64it/s]
... storing 'tissue_cell_type' as categorical


a0754256-f44b-4c4a-962c-a552e47d3fdc_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 108/108 [00:00<00:00, 578.95it/s]
100%|██████████| 59/59 [00:10<00:00,  5.86it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 45/45 [00:00<00:00, 1211.44it/s]
100%|██████████| 14/14 [00:00<00:00, 71.88it/s]
... storing 'tissue_cell_type' as categorical


a2d4d33e-4c62-4361-b80a-9be53d2e50e8_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 56/56 [00:00<00:00, 244.01it/s]
100%|██████████| 32/32 [00:14<00:00,  2.26it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 16/16 [00:00<00:00, 849.90it/s]
100%|██████████| 12/12 [00:00<00:00, 30.04it/s]
... storing 'tissue_cell_type' as categorical


a357414d-2042-4eb5-95f0-c58604a18bdd_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 83/83 [00:00<00:00, 602.01it/s]
100%|██████████| 48/48 [00:09<00:00,  4.94it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 28/28 [00:00<00:00, 1042.73it/s]
100%|██████████| 17/17 [00:00<00:00, 61.63it/s]
... storing 'tissue_cell_type' as categorical


983d5ec9-40e8-4512-9e65-a572a9c486cb_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 129/129 [00:00<00:00, 330.37it/s]
100%|██████████| 78/78 [00:42<00:00,  1.84it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 54/54 [00:00<00:00, 1074.35it/s]
100%|██████████| 33/33 [00:00<00:00, 34.84it/s]
... storing 'tissue_cell_type' as categorical


7357cee7-9f7f-4ab0-8cec-90de8f047e38_pseuodobulk
(58604, 12)
(58604, 13)


100%|██████████| 74/74 [00:00<00:00, 536.60it/s]
100%|██████████| 44/44 [00:11<00:00,  3.94it/s]
... storing 'tissue_cell_type' as categorical


(58604, 12)
(58604, 13)


100%|██████████| 27/27 [00:00<00:00, 1119.30it/s]
100%|██████████| 15/15 [00:00<00:00, 73.51it/s]
... storing 'tissue_cell_type' as categorical


In [5]:
# Open the single-cell atlas (presumably Tabula Muris 10X, judging by the variable
# name and the matching pseudobulk path used in the next cell) with backed='r'.
# NOTE(review): `cellxgene_loc` is not defined in this cell; it must already exist
# in the kernel from an earlier cell.
atlas_adata_muris = ad.read_h5ad(cellxgene_loc / '48b37086-25f7-4ecd-be66-f5bb378e3aea.h5ad', backed='r')

In [6]:
# Load the pseudobulk AnnData stored under DATA_LOC/tabula_muris/10X for the same
# dataset id as the atlas opened above. Later cells add columns to padata.var and an
# entry to padata.uns.
# NOTE(review): the 'pseuodobulk' spelling matches the file names printed in earlier
# output, so it is the on-disk name; do not correct it here without renaming the files.
padata = ad.read_h5ad(settings.DATA_LOC / 'tabula_muris' / '10X' / '48b37086-25f7-4ecd-be66-f5bb378e3aea_pseuodobulk.h5ad')

In [8]:
# Pseudobulk the muris atlas using the 'tissue' column.
# NOTE(review): 'tissue' is passed for both positional label arguments; their exact
# meaning is defined in utils.pseudobulk_adata, which is not visible here — confirm
# that passing the same column twice is intended.
# Slow cell: the recorded progress bars took roughly 30 s and 3.5 min.
padata_muris_tissue = utils.pseudobulk_adata(
    atlas_adata_muris,
    'tissue',
    'tissue',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 16/16 [00:30<00:00,  1.92s/it]
100%|██████████| 16/16 [03:29<00:00, 13.11s/it]


In [9]:
# Pseudobulk the muris atlas using the 'cell_type' column, with the same thresholds
# and options as the other pseudobulk_adata calls in this notebook.
# NOTE(review): 'cell_type' is passed for both positional label arguments; see
# utils.pseudobulk_adata for their meaning (not visible here).
# Slow cell: the recorded progress bars took roughly 2 min and 10.5 min.
padata_muris_celltype = utils.pseudobulk_adata(
    atlas_adata_muris,
    'cell_type',
    'cell_type',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 122/122 [02:09<00:00,  1.06s/it]
100%|██████████| 119/119 [10:26<00:00,  5.26s/it]


In [10]:
# Pseudobulk the muris atlas using the 'louvain' column.
# NOTE(review): no cell visible here computes 'louvain' for the muris atlas, so the
# column is presumably already present in the file's obs — confirm.
# Slow cell: the recorded progress bars took roughly 2.5 min and 11 min.
padata_muris_louvain = utils.pseudobulk_adata(
    atlas_adata_muris,
    'louvain',
    'louvain',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 58/58 [02:19<00:00,  2.41s/it]
100%|██████████| 58/58 [11:04<00:00, 11.45s/it]


In [11]:
# Composite '<tissue>__<cell_type>' label per cell; the next cell uses it as the
# grouping column for the tissue-by-cell-type pseudobulk.
# Built with vectorized string concatenation rather than a row-wise
# DataFrame.apply(axis=1), which invokes a Python lambda once per cell.
# astype(str) renders each value the same way the f-string did.
atlas_adata_muris.obs['tissue_cell_type'] = (
    atlas_adata_muris.obs['tissue'].astype(str)
    + '__'
    + atlas_adata_muris.obs['cell_type'].astype(str)
)

In [12]:
# Pseudobulk the muris atlas using the composite 'tissue_cell_type' column, which
# must have been added to atlas_adata_muris.obs beforehand.
# NOTE(review): the column name is passed for both positional label arguments; see
# utils.pseudobulk_adata for their meaning (not visible here).
# Slow cell: the recorded progress bars took roughly 2 min and 10.5 min.
padata_muris_tissuecelltype = utils.pseudobulk_adata(
    atlas_adata_muris,
    'tissue_cell_type',
    'tissue_cell_type',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 169/169 [02:09<00:00,  1.30it/s]
100%|██████████| 166/166 [10:24<00:00,  3.76s/it]


In [13]:
# Store one tau column per grouping on padata.var. Each entry maps the destination
# column to the AnnData that calc_adata_tau is run on; the last one scores padata
# itself. Insertion order is the evaluation order.
# NOTE(review): calc_adata_tau is not defined in this cell and is not reached via
# `utils`; it must already exist in the kernel. The recorded output shows warnings
# pointing at `xi = l / l.max(axis=0)` — presumably a division by a zero maximum;
# confirm inside calc_adata_tau.
_tau_inputs = {
    'tau__tissue': padata_muris_tissue,
    'tau__cell_type': padata_muris_celltype,
    'tau__louvain': padata_muris_louvain,
    'tau__tissue_cell_type': padata_muris_tissuecelltype,
    'tau__pseudobulk': padata,
}
for _tau_column, _tau_source in _tau_inputs.items():
    padata.var[_tau_column] = calc_adata_tau(_tau_source, return_tau=True)

  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)


In [19]:
# Run utils.label_expression_similarity on the 'ntpm' layer of every muris
# pseudobulk grouping, then collect the results under padata.uns['specificity'].
# The call on padata itself passes no specificity_column, so the helper's default
# is used for it.
s_tissue = utils.label_expression_similarity(
    padata_muris_tissue.layers['ntpm'], padata_muris_tissue.obs, padata_muris_tissue.var,
    specificity_column='tissue',
)
s_celltype = utils.label_expression_similarity(
    padata_muris_celltype.layers['ntpm'], padata_muris_celltype.obs, padata_muris_celltype.var,
    specificity_column='cell_type',
)
s_tissuecelltype = utils.label_expression_similarity(
    padata_muris_tissuecelltype.layers['ntpm'], padata_muris_tissuecelltype.obs, padata_muris_tissuecelltype.var,
    specificity_column='tissue_cell_type',
)
s_louvain = utils.label_expression_similarity(
    padata_muris_louvain.layers['ntpm'], padata_muris_louvain.obs, padata_muris_louvain.var,
    specificity_column='louvain',
)
s_pseudobulk = utils.label_expression_similarity(
    padata.layers['ntpm'], padata.obs, padata.var,
)

# Replaces any existing 'specificity' entry; keys are inserted in this order.
padata.uns['specificity'] = {
    'tissue': s_tissue,
    'cell_type': s_celltype,
    'tissue_cell_type': s_tissuecelltype,
    'louvain': s_louvain,
    'pseudobulk': s_pseudobulk,
}

In [22]:
# Write the annotated pseudobulk (tau columns + specificity) to a 'tmp' subfolder so
# the file that was read from DATA_LOC/tabula_muris/10X is not overwritten.
# parents=True creates any missing intermediate directories instead of raising
# FileNotFoundError; exist_ok=True keeps the cell safe to re-run.
(settings.DATA_LOC / 'tabula_muris' / '10X' / 'tmp').mkdir(parents=True, exist_ok=True)
padata.write_h5ad(settings.DATA_LOC / 'tabula_muris' / '10X' / 'tmp' / '48b37086-25f7-4ecd-be66-f5bb378e3aea_pseuodobulk.h5ad')

In [8]:
# Load the second atlas (presumably Tabula Sapiens, judging by the variable name and
# the pseudobulk path used in the next cell). Unlike the muris load, no `backed`
# argument is passed here; a later cell runs scanpy neighbors/louvain on this object.
# NOTE(review): `cellxgene_loc` is not defined in this cell; it must already exist
# in the kernel from an earlier cell.
atlas_adata_sapiens = ad.read_h5ad(cellxgene_loc / '53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad')

In [9]:
# Rebinds `padata` to the pseudobulk AnnData stored under DATA_LOC/tabula_sapiens/10X
# for the same dataset id as the sapiens atlas; the muris pseudobulk previously held
# by this name is no longer referenced through it.
# NOTE(review): the 'pseuodobulk' spelling matches the file names printed in earlier
# output, so it is the on-disk name.
padata = ad.read_h5ad(settings.DATA_LOC / 'tabula_sapiens' / '10X' / '53d208b0-2cfd-4366-9866-c3c6114081bc_pseuodobulk.h5ad')

In [10]:
# Pseudobulk the sapiens atlas using the 'tissue' column.
# NOTE(review): 'tissue' is passed for both positional label arguments; their exact
# meaning is defined in utils.pseudobulk_adata, which is not visible here — confirm
# that passing the same column twice is intended.
# Slow cell: the recorded progress bars took roughly 6 s and 8 min.
padata_sapiens_tissue = utils.pseudobulk_adata(
    atlas_adata_sapiens,
    'tissue',
    'tissue',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 45/45 [00:06<00:00,  6.61it/s]
100%|██████████| 45/45 [08:02<00:00, 10.72s/it]


In [11]:
# Pseudobulk the sapiens atlas using the 'cell_type' column, with the same thresholds
# and options as the other pseudobulk_adata calls in this notebook.
# NOTE(review): 'cell_type' is passed for both positional label arguments; see
# utils.pseudobulk_adata for their meaning (not visible here).
# Slow cell: the recorded progress bars took roughly 4 s and 7.5 min.
padata_sapiens_celltype = utils.pseudobulk_adata(
    atlas_adata_sapiens,
    'cell_type',
    'cell_type',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 161/161 [00:04<00:00, 33.37it/s]
100%|██████████| 160/160 [07:39<00:00,  2.87s/it]


In [18]:
# Display the per-cell metadata table of the sapiens atlas for inspection.
# The recorded output includes `_sample_key` and `_cell_index` columns, presumably
# added by the preceding utils.pseudobulk_adata call — confirm in the helper.
atlas_adata_sapiens.obs

Unnamed: 0,tissue_in_publication,assay_ontology_term_id,donor_id,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,...,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,_sample_key,_cell_index
AAACCCACACTCCTGT_TSP6_Liver_NA_10X_1_1,Liver,EFO:0009922,TSP6,,7633.0,2259,macrophage,Monocyte/Macrophage,True,immune,...,macrophage,10x 3' v3,normal,Homo sapiens,male,liver,European,67-year-old human stage,macrophage__macrophage,0
AAACGAAGTACCAGAG_TSP6_Liver_NA_10X_1_1,Liver,EFO:0009922,TSP6,,2858.0,1152,monocyte,Monocyte,True,immune,...,monocyte,10x 3' v3,normal,Homo sapiens,male,liver,European,67-year-old human stage,monocyte__monocyte,1
AAACGCTCAACGGCTC_TSP6_Liver_NA_10X_1_1,Liver,EFO:0009922,TSP6,,7787.0,2983,endothelial cell of hepatic sinusoid,Endothelial,True,endothelial,...,endothelial cell of hepatic sinusoid,10x 3' v3,normal,Homo sapiens,male,liver,European,67-year-old human stage,endothelial cell of hepatic sinusoid__endothel...,2
AAAGAACAGCCTCTTC_TSP6_Liver_NA_10X_1_1,Liver,EFO:0009922,TSP6,,10395.0,2598,macrophage,Monocyte/Macrophage,True,immune,...,macrophage,10x 3' v3,normal,Homo sapiens,male,liver,European,67-year-old human stage,macrophage__macrophage,3
AAAGAACGTAGCACAG_TSP6_Liver_NA_10X_1_1,Liver,EFO:0009922,TSP6,,6610.0,2125,liver dendritic cell,Dendritic cell,True,immune,...,liver dendritic cell,10x 3' v3,normal,Homo sapiens,male,liver,European,67-year-old human stage,liver dendritic cell__liver dendritic cell,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P4_S364,Vasculature,EFO:0008931,TSP2,aorta,13205.0,579,endothelial cell,endothelial cell,True,endothelial,...,endothelial cell,Smart-seq2,normal,Homo sapiens,female,aorta,African American or Afro-Caribbean,61-year-old human stage,endothelial cell__endothelial cell,483147
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P5_S365,Vasculature,EFO:0008931,TSP2,aorta,9565.0,529,endothelial cell,endothelial cell,True,endothelial,...,endothelial cell,Smart-seq2,normal,Homo sapiens,female,aorta,African American or Afro-Caribbean,61-year-old human stage,endothelial cell__endothelial cell,483148
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P7_S367,Vasculature,EFO:0008931,TSP2,aorta,195639.0,2753,endothelial cell,endothelial cell,True,endothelial,...,endothelial cell,Smart-seq2,normal,Homo sapiens,female,aorta,African American or Afro-Caribbean,61-year-old human stage,endothelial cell__endothelial cell,483149
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P8_S368,Vasculature,EFO:0008931,TSP2,aorta,37260.0,984,endothelial cell,endothelial cell,True,endothelial,...,endothelial cell,Smart-seq2,normal,Homo sapiens,female,aorta,African American or Afro-Caribbean,61-year-old human stage,endothelial cell__endothelial cell,483150


In [19]:
# Build the neighbor graph from the 'X_scvi' representation and run Louvain
# clustering on it. The next pseudobulk cell groups by a 'louvain' column,
# presumably the one written by sc.tl.louvain under its default key — confirm.
# NOTE(review): no random_state is passed to either call, so scanpy's defaults apply.
# NOTE(review): n_pcs=50 is combined with use_rep='X_scvi'; check how scanpy applies
# n_pcs when a non-PCA representation is supplied.
sc.pp.neighbors(
    atlas_adata_sapiens,
    n_neighbors=15,
    n_pcs=50,
    use_rep='X_scvi',
)

sc.tl.louvain(
    atlas_adata_sapiens,
    resolution=1.0,
)

In [20]:
# Pseudobulk the sapiens atlas using the 'louvain' column; the Louvain clustering
# cell must have been run on atlas_adata_sapiens first.
# NOTE(review): 'louvain' is passed for both positional label arguments; see
# utils.pseudobulk_adata for their meaning (not visible here).
# Slow cell: the recorded progress bars took roughly 4 s and 7.5 min.
padata_sapiens_louvain = utils.pseudobulk_adata(
    atlas_adata_sapiens,
    'louvain',
    'louvain',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 78/78 [00:04<00:00, 16.49it/s]
100%|██████████| 78/78 [07:35<00:00,  5.84s/it]


In [21]:
# Composite '<tissue>__<cell_type>' label per cell; the next cell uses it as the
# grouping column for the tissue-by-cell-type pseudobulk.
# Built with vectorized string concatenation rather than a row-wise
# DataFrame.apply(axis=1), which invokes a Python lambda once per cell.
# astype(str) renders each value the same way the f-string did.
atlas_adata_sapiens.obs['tissue_cell_type'] = (
    atlas_adata_sapiens.obs['tissue'].astype(str)
    + '__'
    + atlas_adata_sapiens.obs['cell_type'].astype(str)
)

In [22]:
# Pseudobulk the sapiens atlas using the composite 'tissue_cell_type' column, which
# must have been added to atlas_adata_sapiens.obs beforehand.
# NOTE(review): the column name is passed for both positional label arguments; see
# utils.pseudobulk_adata for their meaning (not visible here).
# The recorded progress bars show 749 items in the first pass and 679 in the second;
# presumably groups failing min_cells/min_counts were dropped — confirm in the helper.
padata_sapiens_tissuecelltype = utils.pseudobulk_adata(
    atlas_adata_sapiens,
    'tissue_cell_type',
    'tissue_cell_type',
    min_cells=5,
    min_counts=200,
    umap_key='X_umap',
    calc_tpm=True,
    normalize_length=False,
)

100%|██████████| 749/749 [00:04<00:00, 156.68it/s]
100%|██████████| 679/679 [07:33<00:00,  1.50it/s]


In [28]:
# Store one tau column per grouping on padata.var. Each entry maps the destination
# column to the AnnData that calc_adata_tau is run on; the last one scores padata
# itself. Insertion order is the evaluation order.
# NOTE(review): calc_adata_tau is not defined in this cell and is not reached via
# `utils`; it must already exist in the kernel. The recorded output shows warnings
# pointing at `xi = l / l.max(axis=0)` — presumably a division by a zero maximum;
# confirm inside calc_adata_tau.
_tau_inputs = {
    'tau__tissue': padata_sapiens_tissue,
    'tau__cell_type': padata_sapiens_celltype,
    'tau__louvain': padata_sapiens_louvain,
    'tau__tissue_cell_type': padata_sapiens_tissuecelltype,
    'tau__pseudobulk': padata,
}
for _tau_column, _tau_source in _tau_inputs.items():
    padata.var[_tau_column] = calc_adata_tau(_tau_source, return_tau=True)

  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)
  xi = l / l.max(axis=0)


In [29]:
# Run utils.label_expression_similarity on the 'ntpm' layer of every sapiens
# pseudobulk grouping, then collect the results under padata.uns['specificity'].
# The call on padata itself passes no specificity_column, so the helper's default
# is used for it.
s_tissue = utils.label_expression_similarity(
    padata_sapiens_tissue.layers['ntpm'], padata_sapiens_tissue.obs, padata_sapiens_tissue.var,
    specificity_column='tissue',
)
s_celltype = utils.label_expression_similarity(
    padata_sapiens_celltype.layers['ntpm'], padata_sapiens_celltype.obs, padata_sapiens_celltype.var,
    specificity_column='cell_type',
)
s_tissuecelltype = utils.label_expression_similarity(
    padata_sapiens_tissuecelltype.layers['ntpm'], padata_sapiens_tissuecelltype.obs, padata_sapiens_tissuecelltype.var,
    specificity_column='tissue_cell_type',
)
s_louvain = utils.label_expression_similarity(
    padata_sapiens_louvain.layers['ntpm'], padata_sapiens_louvain.obs, padata_sapiens_louvain.var,
    specificity_column='louvain',
)
s_pseudobulk = utils.label_expression_similarity(
    padata.layers['ntpm'], padata.obs, padata.var,
)

# Replaces any existing 'specificity' entry; keys are inserted in this order.
padata.uns['specificity'] = {
    'tissue': s_tissue,
    'cell_type': s_celltype,
    'tissue_cell_type': s_tissuecelltype,
    'louvain': s_louvain,
    'pseudobulk': s_pseudobulk,
}