In [1]:
import sfaira
import os

2022-02-14 13:03:41.132379: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-14 13:03:41.132402: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# cache_path is handed to the dataset loader as its data directory;
# store_path (a sub-directory of it) is where the store is written and read.
cache_path = os.path.join(".", "data")
store_path = os.path.join(cache_path, "store")

# Download data

See also the tutorial ./cellxgene_download.ipynb for a detailed explanation of the cellxgene download API.

In [3]:
# Build the dataset super group over the database-backed data loaders; data_path points at the local cache.
# NOTE(review): cache_metadata=True presumably stores the database metadata locally - confirm.
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)

Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.


In [4]:
# Collection IDs to keep; matched against the "collection_id" key when subsetting dsg.
target_collections = ["9c8808ce-1138-4dbe-818c-171cff10e650"]

In [5]:
# Restrict dsg to the target collections; called for its effect on dsg, the return value is not used.
dsg.subset(key="collection_id", values=target_collections)

In [6]:
# Download the datasets that remain in dsg after subsetting (presumably into cache_path - TODO confirm).
dsg.download()

In [7]:
# Print one entry per dataset: its ID followed by a tuple of descriptive metadata.
dsg.show_summary()

26ae14da-9e5f-4d18-abae-18a5a328feef
	 ('cellxgene', 'Mus musculus', 'peripheral lymph node', "10x 3' v2", 'lymphadenitis (disease)')
cfa3c355-ee77-4fc8-9a00-78e61d23024c
	 ('cellxgene', 'Homo sapiens', 'cervical lymph node', "10x 3' v2", 'healthy')


# Create store

First, we streamline the data object to the desired format of the store.
Second, we write the streamlined object to disk in the DAO format.
We iterate over all objects in the dsg instance.

In [8]:
# Streamline every dataset in dsg and write it to the on-disk store.
for _dataset_id, dataset in dsg.datasets.items():
    # Only load datasets whose .adata has not been populated yet.
    if dataset.adata is None:
        dataset.load(load_raw=False, allow_caching=True)

    # Step 1: bring features and metadata into the format desired for the store.
    dataset.streamline_features(
        remove_gene_version=True,
        match_to_release={"Homo sapiens": "104", "Mus musculus": "104"},
        subset_genes_to_type="protein_coding",
    )
    dataset.streamline_metadata(
        schema="sfaira",
        clean_obs=True,
        clean_var=True,
        clean_uns=True,
        clean_obs_names=True,
    )

    # Step 2: write the streamlined object to store_path in the DAO format.
    dataset.write_distributed_store(
        dir_cache=store_path,
        store_format='dao',
        dense=True,
        chunks=128,
        compression_kwargs={"compressor": "default", "overwrite": True, "order": "C"},
    )

  self._set_arrayXarray_sparse(i, j, x)


# Load store

In [9]:
# .obs columns needed downstream; passed to load_store (columns=) and to checkout (obs_keys=).
obs_keys_to_load = ["cell_type"]

In [10]:
# Open the store written to store_path, requesting only the obs columns listed in obs_keys_to_load.
data_stores = sfaira.data.load_store(
    cache_path=store_path,
    store_format='dao',
    columns=obs_keys_to_load,
)

Let's select the Homo sapiens part of this store:

In [11]:
# Inspect the mapping of organism name to single-organism store.
data_stores.stores

{'Mus musculus': <sfaira.data.store.stores.single.StoreDao at 0x7fa1a238fd90>,
 'Homo sapiens': <sfaira.data.store.stores.single.StoreDao at 0x7fa1a238f190>}

In [12]:
# Continue with the human store only.
data_store = data_stores.stores['Homo sapiens']

# Create a Cart

In [13]:
# Check out a cart from the store, restricted to the obs columns in obs_keys_to_load.
cart = data_store.checkout(obs_keys=obs_keys_to_load)

In [14]:
# Shape of the cart's expression matrix; the row count matches cart.obs.
cart.x.shape

(4355, 19357)

Note that the Cart only has the target obs keys now and leaves the remainder of the .obs in the store on disk!

In [15]:
# Only one obs column is present because a single obs key was requested at checkout.
cart.obs.shape

(4355, 1)

In [16]:
# Preview the first rows of the cart's obs table.
cart.obs.head()

Unnamed: 0,cell_type
0,endothelial cell of lymphatic vessel
1,endothelial cell of lymphatic vessel
2,endothelial cell of lymphatic vessel
3,endothelial cell of lymphatic vessel
4,endothelial cell of lymphatic vessel


# Create generators 

Let's define a map_fn that determines the output of the generator:

In [17]:
import numpy as np


# Cell type ontology and the sfaira identifier constants used by encoder().
onto_cl = sfaira.consts.OCS.cell_type
adata_ids = sfaira.consts.AdataIdsSfaira()

# Lookup from a node ID to the leaf indices it maps to; encoder() indexes the one-hot matrix with these.
# NOTE(review): the cell output reports ~14 s for "precomputing ancestors", presumably from this call.
leave_maps = onto_cl.prepare_maps_to_leaves(include_self=True)


def encoder(x) -> np.ndarray:
    """Encode cell type labels as rows of weights over the ontology leaves.

    A single string is treated as a batch of one label. Labels are converted with
    ``onto_cl.convert_to_id``; each ID is then looked up in ``leave_maps`` and its row
    receives a uniform weight of ``1 / number_of_entries`` at the looked-up indices.
    IDs equal to the unknown-metadata or not-a-cell identifier keep an all-zero row.

    :param x: A label string or a sequence of label strings.
    :return: float32 array of shape ``(number of labels, onto_cl.n_leaves)``.
    """
    labels = [x] if isinstance(x, str) else x
    ids = onto_cl.convert_to_id(labels)
    # IDs that are never looked up in leave_maps; their rows stay empty.
    unmapped = [
        adata_ids.unknown_metadata_identifier,
        adata_ids.not_a_cell_celltype_identifier,
    ]
    encoded = np.zeros((len(ids), onto_cl.n_leaves,), dtype="float32")
    for row, cell_type_id in enumerate(ids):
        if cell_type_id in unmapped:
            continue
        leaves = leave_maps[cell_type_id]
        n_targets = len(leaves)
        # Spread the weight evenly over all mapped leaves so that each non-empty row sums to one.
        if n_targets > 0:
            encoded[row, leaves] = 1. / n_targets
    return encoded


def map_fn(x_sample, obs_sample):
    """Turn a sample drawn from the store into an ``(inputs, targets)`` pair.

    :param x_sample: Expression data of the sample; converted with ``np.asarray``.
    :param obs_sample: Obs data of the sample; its ``'cell_type'`` values are encoded.
    :return: ``((gene_expression,), cell_type_label)`` - a 1-tuple holding the expression
        array, followed by the array returned by ``encoder``.
    """
    inputs = (np.asarray(x_sample),)
    targets = encoder(obs_sample['cell_type'].values)
    return inputs, targets

time for precomputing ancestors: 13.703369140625


Next, we create a cart which is used to iterate over the dataset:

In [18]:
g = data_store.checkout(
    idx=None,  # indices to sample from; None means sample from all indices
    # idx = np.arange(0, 100)  # alternative: idx can also be a np.ndarray
    map_fn=map_fn, 
    obs_keys=['cell_type'],  # obs keys to load; map_fn reads 'cell_type'
)

#### python generator

.adaptor('python') returns a normal python generator


In [19]:
gen_py = g.adaptor(
    'python',
    repeat=5,  # repeat the data set 5 times before the generator runs out of samples
)
# Draw one sample: a tuple of (inputs, targets) as returned by map_fn.
next(gen_py)

((array([0., 0., 1., ..., 0., 0., 0.], dtype=float32),),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32))

#### pytorch DataLoader

.adaptor('torch-iter-loader') returns a PyTorch DataLoader that can be used to fit PyTorch models

kwargs are passed through to the constructor of the DataLoader

https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

In [20]:
# batch_size is forwarded to the DataLoader constructor; repeat is handled as in the python adaptor (presumably - confirm).
gen_torch = g.adaptor('torch-iter-loader', batch_size=128, repeat=5)
# Fetch the first batch of 128 samples.
next(iter(gen_torch))

[[tensor([[0., 0., 1.,  ..., 0., 0., 0.],
          [1., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.]])],
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]])]

#### tf.data.Dataset

.adaptor('tensorflow') returns a tf.data.Dataset that can be used to fit TensorFlow models

kwargs are passed through to the tf.data.Dataset.from_generator function

https://www.tensorflow.org/api_docs/python/tf/data/Dataset

In [21]:
import tensorflow as tf


# Per-sample signature of what the generator yields: (gene expression, cell type encoding).
# NOTE(review): the leading 1 in the first shape presumably stems from map_fn wrapping the
# expression vector in a 1-tuple - confirm before relying on the (batch, 1, n_vars) layout.
expression_spec = tf.TensorSpec(shape=(1, data_store.n_vars), dtype=tf.float32)
label_spec = tf.TensorSpec(shape=(onto_cl.n_leaves,), dtype=tf.float32)
output_spec = (expression_spec, label_spec)

# Build the unbatched dataset first, then group samples into batches of 128.
tf_dataset = g.adaptor('tensorflow', output_signature=output_spec, repeat=5)
gen_tf = tf_dataset.batch(128)

next(iter(gen_tf))

2022-02-14 13:06:32.241445: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-02-14 13:06:32.241484: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-02-14 13:06:32.241513: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pc): /proc/driver/nvidia/version does not exist
2022-02-14 13:06:32.243037: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(<tf.Tensor: shape=(128, 1, 19357), dtype=float32, numpy=
 array([[[0., 0., 1., ..., 0., 0., 0.]],
 
        [[1., 0., 1., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 1., ..., 0., 0., 0.]]], dtype=float32)>,
 <tf.Tensor: shape=(128, 1458), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>)