In [2]:
import os
# Move the process working directory two levels up; the relative "./data/..."
# paths used by later cells are resolved against the new directory.
# NOTE(review): this is not idempotent — re-running the cell climbs two more
# levels. Presumably the target is the project root; confirm before re-running.
os.chdir("../../")

In [3]:
import warnings
# Install a blanket 'ignore' filter: no category or module is given, so every
# Python warning raised after this point is suppressed.
warnings.filterwarnings('ignore')

In [4]:
import surgeon as sg
import scanpy as sc
import numpy as np

Using TensorFlow backend.


In [10]:
def subsample_selection(adata, study_key, frac=0.1, specific_cell_types=None, cell_type_key=None,
                        random_state=None):
    """Randomly subsample every study in ``adata``, keeping chosen cell types whole.

    Observations whose ``cell_type_key`` label is listed in
    ``specific_cell_types`` are all retained. The remaining observations are
    grouped by ``study_key`` and ``int(frac * n)`` of each group's ``n``
    observations are drawn without replacement.

    Parameters
    ----------
    adata
        Annotated data object. It must expose ``.obs`` (column access by key),
        ``.shape``, row indexing by boolean mask / integer array, and
        ``.concatenate(other)``.
    study_key : str
        Column of ``adata.obs`` identifying the study of each observation.
    frac : float, default 0.1
        Fraction in ``[0, 1]`` of each study to keep (rounded down per study).
    specific_cell_types : list, optional
        Labels whose observations are kept in full and excluded from the
        random draw. Requires ``cell_type_key``.
    cell_type_key : str, optional
        Column of ``adata.obs`` holding the labels matched against
        ``specific_cell_types``.
    random_state : int, optional
        Seed for a private ``np.random.RandomState``. When ``None`` (default)
        the global ``np.random`` state is used, exactly as before, so callers
        that seed NumPy globally keep getting the same draws.

    Returns
    -------
    The fully kept observations (if any) followed by each study's subsample,
    combined with successive ``concatenate`` calls.

    Raises
    ------
    ValueError
        If ``frac`` is outside ``[0, 1]`` or ``specific_cell_types`` is given
        without ``cell_type_key``.
    """
    if not 0 <= frac <= 1:
        raise ValueError("frac must be within [0, 1], got {!r}".format(frac))
    if specific_cell_types and not cell_type_key:
        raise ValueError("cell_type_key is required when specific_cell_types is given")

    studies = adata.obs[study_key].unique().tolist()

    if specific_cell_types and cell_type_key:
        # Build the mask once and use it for both halves of the split.
        keep_mask = adata.obs[cell_type_key].isin(specific_cell_types)
        subsampled_adata = adata[keep_mask]
        other_adata = adata[~keep_mask]
    else:
        # Bug fix: `other_adata` used to be left undefined on this path, so the
        # loop below raised UnboundLocalError. With nothing to keep whole,
        # every observation is eligible for subsampling.
        subsampled_adata = None
        other_adata = adata

    # `np.random` (module) and `RandomState` (instance) both provide `.choice`.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for study in studies:
        study_adata = other_adata[other_adata.obs[study_key] == study]
        n_samples = study_adata.shape[0]
        print(study, n_samples)
        subsample_idx = rng.choice(n_samples, int(frac * n_samples), replace=False)
        study_adata_subsampled = study_adata[subsample_idx, :]
        # Pairwise concatenation is kept on purpose so the layout of the result
        # matches what earlier runs of this function produced.
        if subsampled_adata is None:
            subsampled_adata = study_adata_subsampled
        else:
            subsampled_adata = subsampled_adata.concatenate(study_adata_subsampled)
    return subsampled_adata

In [6]:
# Read the dataset from the .h5ad file (path is relative to the working
# directory set at the top of the notebook).
# NOTE(review): the file name suggests the data are already normalized and
# restricted to highly variable genes — confirm against the preprocessing step.
adata = sc.read("./data/mouse_brain/mouse_brain_normalized_hvg_all.h5ad")
# Bare expression: display the object's summary (shape, obs/var columns).
adata

AnnData object with n_obs × n_vars = 978734 × 2000 
    obs: 'Age', 'Subclass', 'Taxonomy_group', 'Tissue', 'age', 'batch', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'class', 'cluster', 'cluster_id', 'louvain', 'reason', 'refined_class', 'region', 'region_subcluster', 'sample_type', 'study', 'subcluster'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [8]:
# Count observations per value of the 'study' column, most frequent first.
adata.obs['study'].value_counts()

Saunders        691489
Zeisel          145954
Rosenberg       133435
Tabula_muris      7856
Name: study, dtype: int64

In [9]:
# Count observations per value of the 'cell_type' column, most frequent first.
adata.obs['cell_type'].value_counts()

neuron                            628235
oligodendrocyte                   144260
astrocyte                          88530
endothelial cell                   55031
brain pericyte                     18692
oligodendrocyte precursor cell     18150
microglial cell                    15855
ependymal cell                      3851
macrophage                          3844
olfactory ensheathing cell          2286
Name: cell_type, dtype: int64

In [10]:
# Count observations for every (cell_type, study) combination.
adata.obs.groupby(['cell_type', 'study']).size()

cell_type                       study       
astrocyte                       Rosenberg        10683
                                Saunders         56915
                                Tabula_muris       472
                                Zeisel           20460
brain pericyte                  Rosenberg         1188
                                Saunders         12135
                                Tabula_muris       156
                                Zeisel            5213
endothelial cell                Rosenberg          474
                                Saunders         46455
                                Tabula_muris       715
                                Zeisel            7387
ependymal cell                  Rosenberg          400
                                Saunders          2083
                                Zeisel            1368
macrophage                      Rosenberg           53
                                Saunders          1701
                    

In [7]:
# Select the observations that belong to the two pre-training studies.
# Indexing with a boolean mask yields a view of `adata`, not a copy.
in_pretrain_studies = adata.obs['study'].isin(['Saunders', 'Rosenberg'])
pretrain_adata = adata[in_pretrain_studies]
pretrain_adata

View of AnnData object with n_obs × n_vars = 824924 × 2000 
    obs: 'Age', 'Subclass', 'Taxonomy_group', 'Tissue', 'age', 'batch', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'class', 'cluster', 'cluster_id', 'louvain', 'reason', 'refined_class', 'region', 'region_subcluster', 'sample_type', 'study', 'subcluster'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [14]:
# Cell types that must not be subsampled; per the 'cell_type' counts shown
# earlier these are the three least frequent labels in the dataset.
cell_types_to_keep = ['olfactory ensheathing cell', 'macrophage', 'ependymal cell']

In [15]:
# Seed NumPy's global RNG so the random draw inside `subsample_selection`
# (which uses np.random.choice) is identical on every run. The value 0 is
# arbitrary; any fixed integer works.
np.random.seed(0)
# Keep every observation of the cell types in `cell_types_to_keep` and draw
# 50% of the remaining observations from each pre-training study.
subsampled_adata = subsample_selection(pretrain_adata, 'study', 0.5, cell_types_to_keep, 'cell_type')
subsampled_adata

Saunders 687705
Rosenberg 132727


AnnData object with n_obs × n_vars = 414707 × 2000 
    obs: 'Age', 'Subclass', 'Taxonomy_group', 'Tissue', 'age', 'batch', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'class', 'cluster', 'cluster_id', 'louvain', 'reason', 'refined_class', 'region', 'region_subcluster', 'sample_type', 'study', 'subcluster'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [16]:
# Cell-type composition after subsampling; the labels listed in
# `cell_types_to_keep` should show the same counts as in `pretrain_adata`.
subsampled_adata.obs['cell_type'].value_counts()

neuron                            278659
oligodendrocyte                    56016
astrocyte                          33834
endothelial cell                   23463
oligodendrocyte precursor cell      8478
brain pericyte                      6696
microglial cell                     3069
ependymal cell                      2483
macrophage                          1754
olfactory ensheathing cell           255
Name: cell_type, dtype: int64

In [17]:
# Hold out every observation that is NOT from the two pre-training studies
# (per the study counts shown earlier, that leaves Zeisel and Tabula_muris).
held_out_mask = ~adata.obs['study'].isin(['Saunders', 'Rosenberg'])
out_of_sample_adata = adata[held_out_mask]

In [19]:
# Display the held-out subset's summary (a view of `adata`).
out_of_sample_adata

View of AnnData object with n_obs × n_vars = 153810 × 2000 
    obs: 'Age', 'Subclass', 'Taxonomy_group', 'Tissue', 'age', 'batch', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'class', 'cluster', 'cluster_id', 'louvain', 'reason', 'refined_class', 'region', 'region_subcluster', 'sample_type', 'study', 'subcluster'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [21]:
# Append the held-out studies to the subsampled pre-training data and write
# the combined object straight to disk; the result is deliberately not bound
# to a name, so the large temporary can be freed right after the write.
# NOTE(review): `concatenate` appears to replace the existing 'batch' obs
# column with its own batch labels ('batch' is the only obs column missing
# from the "storing ... as categorical" log) — confirm against the anndata
# docs if the original batch annotation is needed downstream.
subsampled_adata.concatenate(out_of_sample_adata).write_h5ad("./data/mouse_brain/mouse_brain_normalized_hvg_subsampled.h5ad")

... storing 'Age' as categorical
... storing 'Subclass' as categorical
... storing 'Taxonomy_group' as categorical
... storing 'Tissue' as categorical
... storing 'age' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'class' as categorical
... storing 'cluster' as categorical
... storing 'cluster_id' as categorical
... storing 'louvain' as categorical
... storing 'reason' as categorical
... storing 'refined_class' as categorical
... storing 'region' as categorical
... storing 'region_subcluster' as categorical
... storing 'sample_type' as categorical
... storing 'study' as categorical
... storing 'subcluster' as categorical
