In [1]:
import scanpy as sc 
import numpy as np
from collections import Counter

#### Because the ATAC dataset has fewer nuclei than the RNA dataset, ATAC is the limiting modality when deciding which cell types have at least 750 nuclei. The cell types that meet this threshold are:

- Cardiomyocyte
- Endothelial
- Fibroblast
- Lymphoid
- Myeloid
- Pericyte

#### Since ATAC has fewer nuclei, it is the limiting factor: only the cell types listed above are kept when subsampling 750 nuclei per group

In [2]:
%%time
adata = sc.read_h5ad("../../../RNA/aggregated_analysis/07_final_RNA_without_scvi.h5ad")
adata

CPU times: user 17.7 s, sys: 1min 56s, total: 2min 14s
Wall time: 4min 56s


AnnData object with n_obs × n_vars = 2305964 × 16115
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'disease', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'barcode', 'sample_id', 'age_status', 'tech_plus_study', 'disease_binary', 'decade', 'age_group', '_scvi_batch', '_scvi_labels', 'leiden_scVI', 'scvi_cell_type', 'redo_leiden_0.5', 'UMAP1', 'UMAP2', 'v2_scvi_cell_type', 'final_cell_type'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'

In [3]:
# Tally how many observations fall under each age_status label.
Counter(adata.obs['age_status'])

Counter({'postnatal': 2235017, 'fetal': 70947})

In [4]:
# Tally how many observations fall under each disease_binary label.
Counter(adata.obs['disease_binary'])

Counter({'N': 1212319, 'Y': 1093645})

In [5]:
# Build a combined "<disease_binary>:<age_status>" label for every observation
# so that both annotations can be grouped on with a single column.
disease_labels = adata.obs['disease_binary'].astype(str)
age_labels = adata.obs['age_status'].astype(str)
adata.obs['disease_and_age_status'] = disease_labels + ":" + age_labels

In [6]:
# Shorthand handle for the per-observation metadata table (no copy is made here).
adata_metadata = adata.obs

In [7]:
# Count non-null barcodes for every (disease/age status, final cell type) pair.
(
    adata_metadata
    .groupby(["disease_and_age_status", "final_cell_type"])['barcode']
    .count()
)

disease_and_age_status  final_cell_type
N:fetal                 Adipocyte               3
                        Cardiomyocyte       24448
                        Endocardial          6997
                        Endothelial          4344
                        Epicardial           2748
                        Fibroblast          19818
                        LEC                   800
                        Lymphoid             4074
                        Mast                   82
                        Myeloid              2291
                        Neuronal             1182
                        Pericyte             2689
                        vSMC                 1471
N:postnatal             Adipocyte            6649
                        Cardiomyocyte      365026
                        Endocardial         22569
                        Endothelial        110727
                        Epicardial           3498
                        Fibroblast         274170
          

In [17]:
%%time 

cell_types_to_keep = ["Cardiomyocyte", "Endothelial", "Fibroblast", "Lymphoid", "Myeloid", "Pericyte"]

filtered_adata = adata[adata.obs['final_cell_type'].isin(cell_types_to_keep)].copy()

n_cells = 750 

filtered_adata.obs['group'] = filtered_adata.obs['disease_and_age_status'].astype(str) + "_" + filtered_adata.obs['final_cell_type'].astype(str)
groups = filtered_adata.obs.group.unique()

subsampled_indices = []
for group in groups:
    group_indices = filtered_adata.obs.index[filtered_adata.obs['group'] == group]
    sampled_indices = np.random.choice(group_indices, n_cells, replace=False)
    # add to growing list
    subsampled_indices.extend(sampled_indices)

adata_subsampled = filtered_adata[subsampled_indices].copy()
print(f"Total cells subsampled: {adata_subsampled.n_obs}")

Total cells subsampled: 13500
CPU times: user 1min 10s, sys: 14min 8s, total: 15min 19s
Wall time: 18min 32s


In [18]:
# Sanity check: number of subsampled cells in each group.
Counter(adata_subsampled.obs.group)

Counter({'N:fetal_Endothelial': 750,
         'Y:postnatal_Endothelial': 750,
         'N:postnatal_Endothelial': 750,
         'N:fetal_Lymphoid': 750,
         'Y:postnatal_Lymphoid': 750,
         'N:postnatal_Lymphoid': 750,
         'N:fetal_Fibroblast': 750,
         'Y:postnatal_Fibroblast': 750,
         'N:postnatal_Fibroblast': 750,
         'N:fetal_Cardiomyocyte': 750,
         'Y:postnatal_Cardiomyocyte': 750,
         'N:postnatal_Cardiomyocyte': 750,
         'N:fetal_Pericyte': 750,
         'Y:postnatal_Pericyte': 750,
         'N:postnatal_Pericyte': 750,
         'N:fetal_Myeloid': 750,
         'Y:postnatal_Myeloid': 750,
         'N:postnatal_Myeloid': 750})

In [19]:
# Save the subsampled AnnData object to an .h5ad file in the working directory.
adata_subsampled.write_h5ad("01_subsampled_RNA.h5ad")