In [1]:
import pandas as pd 
import scanpy as sc 
import numpy as np
import snapatac2 as snap
from collections import Counter

In [2]:
%%time
adata = sc.read_h5ad("../07_final_ATAC.h5ad")
adata

CPU times: user 6.38 s, sys: 50.2 s, total: 56.6 s
Wall time: 3min 8s


AnnData object with n_obs × n_vars = 690044 × 654221
    obs: 'ATAC_barcode', 'sample_id', 'leiden', 'donor_id', 'study', 'age_status', 'age', 'sex', 'region', 'disease_binary', 'technology', 'fragment_file', 'full_path', 'file', 'nfrag', 'tsse', 'cell_type', 'tech_plus_study', 'age_group', 'decade', 'final_cell_type', 'cell_or_nuclei', 'disease'
    var: 'count', 'selected'
    uns: 'age_status_colors', 'cell_type_colors', 'leiden', 'leiden_colors', 'neighbors', 'spectral_eigenvalue', 'study_colors'
    obsm: 'X_spectral', 'X_spectral_harmony', 'X_umap'
    obsp: 'connectivities', 'distances'

In [3]:
# Tally how many cells carry each `age_status` value.
Counter(adata.obs["age_status"])

Counter({'postnatal': 567322, 'fetal': 122722})

In [4]:
# Tally how many cells carry each `disease_binary` value.
Counter(adata.obs["disease_binary"])

Counter({'N': 662539, 'Y': 27505})

In [5]:
# Combine the disease flag and the age status into a single
# "<disease_binary>:<age_status>" label per cell (e.g. "N:fetal").
disease_labels = adata.obs["disease_binary"].astype(str)
age_labels = adata.obs["age_status"].astype(str)
adata.obs["disease_and_age_status"] = disease_labels + ":" + age_labels

In [6]:
# Tally how many cells fall into each combined disease/age-status label.
Counter(adata.obs["disease_and_age_status"])

Counter({'N:postnatal': 539817, 'N:fetal': 122722, 'Y:postnatal': 27505})

In [7]:
# Shorthand name for the per-cell metadata table (`adata.obs`) used by the
# per-group summary in the next cell.
# NOTE(review): presumably this is a reference to `adata.obs`, not a copy —
# confirm before mutating it.
adata_metadata = adata.obs 

In [8]:
# Number of cells (non-null ATAC barcodes) per disease/age-status x cell-type
# combination. `observed=False` is passed explicitly so that combinations with
# no cells are still listed with a count of 0 (as in the output below) and so
# pandas does not warn about the changing default for categorical groupers.
adata_metadata.groupby(["disease_and_age_status", "final_cell_type"], observed=False)['ATAC_barcode'].count()

  adata_metadata.groupby(["disease_and_age_status", "final_cell_type"])['ATAC_barcode'].count()


disease_and_age_status  final_cell_type
N:fetal                 Adipocyte               0
                        Cardiomyocyte       78897
                        Endothelial         14497
                        Epicardial              8
                        Fibroblast          20855
                        Lymphoid             2116
                        Mast                   12
                        Myeloid              2096
                        Neuronal              472
                        Pericyte             3758
                        vSMC                   11
N:postnatal             Adipocyte             935
                        Cardiomyocyte      199287
                        Endothelial         96908
                        Epicardial           2291
                        Fibroblast         113718
                        Lymphoid            15808
                        Mast                 1280
                        Myeloid             52336
          

### With a threshold of at least 750 cells in every disease/age-status group, the only cell types that qualify are:

- Cardiomyocyte
- Endothelial
- Fibroblast
- Lymphoid
- Myeloid
- Pericyte

### Sample equally (n = 750 cells) from each disease/age-status × cell-type group

In [9]:
%%time 

cell_types_to_keep = ["Cardiomyocyte", "Endothelial", "Fibroblast", "Lymphoid", "Myeloid", "Pericyte"]

filtered_adata = adata[adata.obs.cell_type.isin(cell_types_to_keep)].copy()

n_cells = 750 

filtered_adata.obs['group'] = filtered_adata.obs['disease_and_age_status'].astype(str) + "_" + filtered_adata.obs['cell_type'].astype(str)
groups = filtered_adata.obs.group.unique()

subsampled_indices = []
for group in groups:
    group_indices = filtered_adata.obs.index[filtered_adata.obs['group'] == group]
    sampled_indices = np.random.choice(group_indices, n_cells, replace=False)
    # add to growing list
    subsampled_indices.extend(sampled_indices)

adata_subsampled = filtered_adata[subsampled_indices].copy()
print(f"Total cells subsampled: {adata_subsampled.n_obs}")

Total cells subsampled: 13500
CPU times: user 31.9 s, sys: 4min 11s, total: 4min 43s
Wall time: 4min 47s


In [10]:
# Persist the balanced subsample to disk for downstream steps.
subsampled_h5ad_path = "01_subsampled_ATAC.h5ad"
adata_subsampled.write(subsampled_h5ad_path)

... storing 'disease_and_age_status' as categorical
... storing 'group' as categorical
