In [None]:
import tempfile

from torch.utils.data import DataLoader

from bionemo.scdl.io.single_cell_collection import SingleCellCollection
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
from bionemo.scdl.util.torch_dataloader_utils import collate_sparse_matrix_batch

First, copy the input data. This can be done by copying https://datasets.cellxgene.cziscience.com/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad to a directory named `hdf5s`.

In [None]:
# Create a SingleCellMemMapDataset named "97e_scmm" from the h5ad file in the hdf5s directory.
data = SingleCellMemMapDataset("97e_scmm", "hdf5s/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad")


In [None]:
# Save the dataset to disk so that it can be reloaded later by name alone.
data.save()

In [None]:
# Reload the saved dataset from "97e_scmm"; no h5ad path is passed this time.
reloaded_data = SingleCellMemMapDataset("97e_scmm")

The number of columns varies from observation to observation. However, for a batch size of 1
the data does not need to be collated. Each batch is then output as a torch tensor of shape
(1, 2, num_obs). The first row, of length num_obs, contains the column pointers, and the second row contains the corresponding values.

In [None]:
def model(x):
    """Placeholder model: hand the batch back unchanged."""
    return x


# Iterate over the dataset one observation at a time, in shuffled order.
n_epochs = 1
dataloader = DataLoader(
    data,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_sparse_matrix_batch,
)
for e in range(n_epochs):
    for batch in dataloader:
        model(batch)


The data can be collated with a batch size of 1 and must be collated with larger batch sizes.
The `collate_sparse_matrix_batch` function collates several sparse matrices into a single torch
tensor in the CSR (Compressed Sparse Row) format.

In [None]:
def model(x):
    """Placeholder model: hand the batch back unchanged."""
    return x


# Iterate over the dataset in shuffled batches of 8 observations; the collate
# function is what merges the per-observation sparse rows into one batch.
n_epochs = 1
dataloader = DataLoader(
    data,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_sparse_matrix_batch,
)
for e in range(n_epochs):
    for batch in dataloader:
        model(batch)


Alternatively, if there are multiple AnnData files, they can be converted into a single SingleCellMemMapDataset. If the
`hdf5s` directory contains one or more AnnData files, the SingleCellCollection class crawls the filesystem recursively to find
AnnData files (with the h5ad extension). The code below is taken from scripts/convert_h5ad_to_scdl.py. It will create a new
dataset at `example_dataset`. The same conversion can also be run with the convert_h5ad_to_scdl command.

In [None]:
# Path to the directory holding the input h5ad (AnnData) files.
hdf5s = "./hdf5s"

# Path to the output directory where the converted dataset will be stored.
example_dataset = "./scdataset_output"

In [None]:
# Convert every AnnData file found under `hdf5s` into one dataset at
# `example_dataset`. `SingleCellCollection` is imported in the notebook's
# top import cell together with the other bionemo.scdl imports.
#
# The collection itself is rooted in a temporary directory, which is deleted
# automatically when the `with` block exits.
with tempfile.TemporaryDirectory() as temp_dir:
    coll = SingleCellCollection(temp_dir)
    # Load the h5ad files with up to 4 workers, using processes.
    coll.load_h5ad_multi(hdf5s, max_workers=4, use_processes=True)
    # Write the combined dataset to `example_dataset`.
    # NOTE(review): destroy_on_copy=True presumably removes the collection's
    # intermediate data as it is copied - confirm against the SCDL docs.
    coll.flatten(example_dataset, destroy_on_copy=True)