In [None]:
import warnings
from typing import List

import anndata
import cellxgene_census
import numpy as np
import scanpy as sc

# Suppress every warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings from the libraries used below — confirm this is intended.
warnings.filterwarnings("ignore")


def remove_missing_embedding_cells(adata: anndata.AnnData, emb_names: List[str]):
    """Drop cells that lack a value in any of the requested embeddings.

    A missing embedding is stored as a row made up entirely of NaN. A cell is
    kept only if, for every key in ``emb_names``, its row in
    ``adata.obsm[key]`` is not all-NaN.

    Parameters
    ----------
    adata
        AnnData whose ``obsm`` holds one matrix per name in ``emb_names``.
    emb_names
        Keys of ``adata.obsm`` to check.

    Returns
    -------
    A copy of ``adata`` restricted to the cells that passed every check.
    """
    keep = np.ones(adata.shape[0], dtype="bool")
    for name in emb_names:
        # True for rows in which every column of this embedding is NaN.
        row_is_all_nan = np.isnan(adata.obsm[name]).all(axis=1)
        keep = keep & ~row_is_all_nan
    return adata[keep].copy()

In [None]:
# Configuration for fetching human embeddings from the Census.
# CENSUS_VERSION is the release passed to `open_soma` below; EXPERIMENT_NAME is
# used both as the key under census["census_data"] and as the `organism`
# argument when fetching AnnData.
CENSUS_VERSION = "2023-12-15"
EXPERIMENT_NAME = "homo_sapiens"

# These are embeddings available to this Census version.
# The same list is requested as `obs_embeddings` and then used to drop cells
# whose embedding rows are entirely NaN.
embedding_names = ["geneformer", "scvi", "scgpt", "uce"]
# Module-level handle shared by every download below; it is not closed
# anywhere in the visible cells.
census = cellxgene_census.open_soma(census_version=CENSUS_VERSION)

In [None]:
def download_dataset(dataset_id, dataset_name, n_subset_cells=100000, seed=None):
    """Save a random subset of one Census dataset, with embeddings, as an h5ad file.

    Looks up the primary-data cells of ``dataset_id`` in the open ``census``
    handle, draws up to ``n_subset_cells`` of them at random, fetches them as an
    AnnData with the embeddings listed in ``embedding_names``, drops cells with
    a missing embedding, and writes the result to ``<dataset_name>.h5ad``.

    Parameters
    ----------
    dataset_id
        Value matched against the ``dataset_id`` obs column.
    dataset_name
        Stem of the output file; ``.h5ad`` is appended.
    n_subset_cells
        Maximum number of cells to draw. If the dataset has fewer matching
        cells, all of them are used.
    seed
        Seed for the random draw. With the default ``None`` the draw uses
        NumPy's global random state, as before; pass an integer to make the
        selected subset reproducible.

    Raises
    ------
    ValueError
        If no cell matches the filter, e.g. because of a mistyped
        ``dataset_id``.
    """
    obs_value_filter = f"dataset_id in ['{dataset_id}'] and is_primary_data == True"

    # Only the join ids are needed to choose the subset, so read just that column.
    obs_df = (
        census["census_data"][EXPERIMENT_NAME]
        .obs.read(value_filter=obs_value_filter, column_names=["soma_joinid"])
        .concat()
        .to_pandas()
    )
    n_available = obs_df.shape[0]

    print(n_available, "cells in", obs_value_filter)

    # Fail early with a clear message instead of fetching and writing an
    # empty dataset.
    if n_available == 0:
        raise ValueError(f"No cells matched the obs filter: {obs_value_filter}")

    # Cap the subset size at the number of cells actually available.
    n_selected = min(n_subset_cells, n_available)
    print("Selecting", n_selected, "random cells")

    # Without a seed, keep using the global NumPy state so any existing
    # np.random.seed(...) call is still honoured; with a seed, use an
    # independent generator so the draw does not depend on kernel history.
    rng = np.random if seed is None else np.random.default_rng(seed)
    idx_rand = rng.choice(n_available, size=n_selected, replace=False)
    soma_joinids_subset = obs_df["soma_joinid"].values[idx_rand].tolist()

    # Fetch expression data and the requested embeddings for the chosen cells.
    adata = cellxgene_census.get_anndata(
        census=census,
        organism=EXPERIMENT_NAME,
        obs_coords=soma_joinids_subset,
        obs_embeddings=embedding_names,
    )

    adata = remove_missing_embedding_cells(adata, embedding_names)
    adata.write_h5ad(f'{dataset_name}.h5ad')

In [None]:
# Writes Tabula_Sapiens_HS_2022_all.h5ad
download_dataset(
    dataset_id="53d208b0-2cfd-4366-9866-c3c6114081bc",
    dataset_name="Tabula_Sapiens_HS_2022_all",
)

In [None]:
# Writes Suo_ImmuneDev_HS_2022_all.h5ad
download_dataset(
    dataset_id="fd072bc3-2dfb-46f8-b4e3-467cb3223182",
    dataset_name="Suo_ImmuneDev_HS_2022_all",
)

In [None]:
# Writes Cao_dev_HS_2020_all.h5ad
download_dataset(
    dataset_id="f7c1c579-2dc0-47e2-ba19-8165c5a0e353",
    dataset_name="Cao_dev_HS_2020_all",
)

In [None]:
# Writes Han_HS_2020_all.h5ad
download_dataset(
    dataset_id="2adb1f8a-a6b1-4909-8ee8-484814e2d4bf",
    dataset_name="Han_HS_2020_all",
)

In [None]:
# Writes Triana_BoneMarrow_HS_2021_healthy.h5ad
download_dataset(
    dataset_id="cd4c96bb-ad66-4e83-ba9e-a7df8790eb12",
    dataset_name="Triana_BoneMarrow_HS_2021_healthy",
)

In [None]:
# Writes Travaglini_Lung_HS_2021_10x.h5ad
download_dataset(
    dataset_id="8c42cfd0-0b0a-46d5-910c-fc833d83c45e",
    dataset_name="Travaglini_Lung_HS_2021_10x",
)

In [None]:
# Writes Burclaff_intestine_HS_2022_all.h5ad
download_dataset(
    dataset_id="019c7af2-c827-4454-9970-44d5e39ce068",
    dataset_name="Burclaff_intestine_HS_2022_all",
)

In [None]:
# Writes Eraslan_MultiTissue_HS_2022_all.h5ad
download_dataset(
    dataset_id="4ed927e9-c099-49af-b8ce-a2652d069333",
    dataset_name="Eraslan_MultiTissue_HS_2022_all",
)