In [13]:
import os

import cellxgene_census
import numpy as np
from tqdm import tqdm

In [5]:
# Pin the Census release so that re-running the notebook queries the same data
# even after the "stable" alias moves on to a newer release.
census = cellxgene_census.open_soma(census_version="2023-12-15")

The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.


In [9]:
# Build the obs filter: drop the three Smart-seq assay labels and keep only
# primary observations. The clauses are AND-ed together into one string.
obs_value_filter = " and ".join(
    [
        'assay != "Smart-seq2"',
        'assay != "Smart-seq v4"',
        'assay != "Smart-seq"',
        "is_primary_data == True",
    ]
)

obs_value_filter

'assay != "Smart-seq2" and assay != "Smart-seq v4" and assay != "Smart-seq" and is_primary_data == True'

In [10]:
# Probe query: apply the obs filter, restricted to obs coordinates 0-1000,
# to check what comes back before attempting a full download.
adata = cellxgene_census.get_anndata(
    census=census,
    organism="Mus musculus",
    obs_value_filter=obs_value_filter,
    obs_coords=slice(0, 1000),
)



In [11]:
# Show the AnnData summary (n_obs x n_vars plus obs/var columns) for the filtered probe query.
adata

AnnData object with n_obs × n_vars = 0 × 52417
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'

In [19]:
# Narrow the filter to primary observations from a single assay label.
obs_value_filter = " and ".join(
    [
        'assay == "10x 3\' v2"',
        "is_primary_data == True",
    ]
)

obs_value_filter

'assay == "10x 3\' v2" and is_primary_data == True'

In [22]:
# Sanity check without any value filter (filter deliberately left disabled).
# The slice bounds are inclusive: coordinates 0..10 give the 11 rows shown in
# the output.
adata = cellxgene_census.get_anndata(
    census=census,
    organism="Mus musculus",
    # obs_value_filter=obs_value_filter,
    obs_coords=slice(0, 10),
)

In [23]:
# Show the AnnData summary (n_obs x n_vars plus obs/var columns) for the unfiltered probe query.
adata

AnnData object with n_obs × n_vars = 11 × 52417
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'

In [24]:
# From https://github.com/theislab/scTab/blob/devel/notebooks/store_creation/01_download_data.ipynb

In [7]:
# 10x assay labels to keep. This must stay a list in this order: its repr is
# interpolated verbatim into the `assay in [...]` value filters.
PROTOCOLS = [
    "10x 5' v2",
    "10x 3' v3",
    "10x 3' v2",
    "10x 5' v1",
    "10x 3' v1",
    "10x 3' transcription profiling",
    "10x 5' transcription profiling",
]

In [27]:
# Probe query: primary observations whose assay is one of PROTOCOLS,
# restricted to obs coordinates 0-10. The list is interpolated via its repr.
adata = cellxgene_census.get_anndata(
    census=census,
    organism="Mus musculus",
    obs_value_filter=f"is_primary_data == True and assay in {PROTOCOLS}",
    obs_coords=slice(0, 10),
)

adata



AnnData object with n_obs × n_vars = 0 × 52417
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'

In [1]:
# adata = cellxgene_census.get_anndata(
#     census = census,
#     organism = 'Mus musculus',
#     obs_value_filter = f"is_primary_data == True and assay in {PROTOCOLS}",
# )

# adata

# Fetching every matching cell in a single call runs out of memory, so use the scTab strategy of batched downloads instead.
# Open question: does cellxgene_census download the data in dense format?

In [8]:
# obs columns to fetch when listing the cells that match the filter.
COLUMN_NAMES = ["soma_joinid", "is_primary_data", "dataset_id"]

In [9]:
# Read only the obs metadata (no expression matrix) for every primary cell
# whose assay is in PROTOCOLS, and materialise it as a pandas DataFrame.
mouse_experiment = census["census_data"]["mus_musculus"]
obs_query = mouse_experiment.obs.read(
    column_names=COLUMN_NAMES,
    value_filter=f"is_primary_data == True and assay in {PROTOCOLS}",
)
obs = obs_query.concat().to_pandas()

In [12]:
# Output directory for the downloaded .h5ad batches (relative to the working directory).
BASE_PATH = "data/"

In [14]:
# Download the raw counts in batches so that no single query runs out of
# memory; each batch is written to its own '<i>.h5ad' file under BASE_PATH.
N_BATCHES = 20

# Make sure the output directory exists before starting the slow downloads,
# so a missing folder cannot fail the run after the first batch is fetched.
os.makedirs(BASE_PATH, exist_ok=True)

soma_joinid_batches = np.array_split(obs['soma_joinid'].to_numpy(), N_BATCHES)

# Wrap the list of batches (not the enumerate iterator) so tqdm knows the
# total and can render a real progress bar with an ETA.
for i, idxs in enumerate(tqdm(soma_joinid_batches, desc='downloading batches')):
    adata = cellxgene_census.get_anndata(
        census=census,
        organism="Mus musculus",
        X_name='raw',
        obs_coords=idxs.tolist(),  # explicit soma_joinids for this batch
        column_names={"var": ['feature_id', 'feature_name']},
    )
    adata.write_h5ad(os.path.join(BASE_PATH, f'{i}.h5ad'))

20it [35:31, 106.57s/it]
