# Details about Nvidia Merlin

Documentation links:

* General documentation: https://nvidia-merlin.github.io/Merlin/stable/README.html
* GitHub: https://github.com/NVIDIA-Merlin/dataloader
* Docker containers: https://nvidia-merlin.github.io/Merlin/stable/containers.html


Installation via pip:
1. Install cudf + dask-cudf: `python -m pip install cudf-cu11==23.08 rmm-cu11==23.08 dask-cudf-cu11==23.08 --extra-index-url https://pypi.nvidia.com/`
2. Install merlin dataloader: `python -m pip install merlin-dataloader`


# Details about data

The parquet files have the following columns:

In [1]:
from merlin.dtypes import boolean, float32, int64


# Mapping of parquet column name -> merlin dtype of that column.
# All categorical metadata columns are stored as integer codes (int64).
PARQUET_SCHEMA = {
    'X': float32,  # -> gene expression values (normalized to 10,000 counts per cell + log1p transformed)
    'soma_joinid': int64,  # soma_joinid from CELLxGENE
    'is_primary_data': boolean,  # binary indicator whether data is primary data or not (currently all data is primary data)
    'dataset_id': int64,  # name of the associated data set (integer-encoded)
    'donor_id': int64,  # name of the donor (caution! This might not be unique across datasets -> use tech_sample column instead)
    'assay': int64,  # name of the used assay (integer-encoded)
    'cell_type': int64,  # cell type label
    'development_stage': int64,  # development stage label
    'disease': int64,  # disease state label
    'tissue': int64,  # specific tissue label
    'tissue_general': int64,  # general tissue label
    'tech_sample': int64,  # batch indicator
    'idx': int64,  # consecutive enumeration of all cells in the train, val and test data
}

  warn(f"Tensorflow dtype mappings did not load successfully due to an error: {exc.msg}")
  from .autonotebook import tqdm as notebook_tqdm


All categorical meta data (['dataset_id', 'donor_id', 'assay', 'cell_type', 'development_stage', 'disease', 'tissue', 'tissue_general', 'tech_sample']) are encoded as integers. 

The lookup tables to map the integer labels to their corresponding string labels can be found under: `join(DATA_PATH, 'categorical_lookup')`

E.g. the mapping for the `cell_type` column can be found in the `cell_type.parquet` file.

# Use with PyTorch Lightning DataModule

In [2]:
from cellnet.datamodules import MerlinDataModule

In [3]:
# path to the merlin store (hard-coded for this tutorial -> point it at your own copy of the data)
DATA_PATH = '/mnt/dssmcmlfs01/merlin_cxg_2023_05_15_sf-log1p'

In [4]:
# wrap the merlin store in a PyTorch Lightning datamodule
datamodule = MerlinDataModule(
    path=DATA_PATH,
    columns=['cell_type'],
    batch_size=2048,
    # fraction of the data that is randomly subsampled (can be between (0., 1.])
    sub_sample_frac=1.0,
)

In [5]:
import gc


# one dataloader per split: train, validation and test set
train_loader = datamodule.train_dataloader()
val_loader = datamodule.val_dataloader()
test_loader = datamodule.test_dataloader()


# example of how to consume the dataloaders
for step, (batch, _) in enumerate(train_loader):
    # your training code goes here:
    print('X:', batch['X'])
    print('cell_type:', batch['cell_type'])

    # Merlin tends to hold on to a lot of GPU memory unless garbage collection runs regularly
    # -> trigger the python garbage collector manually every 10 steps
    if not step % 10:
        gc.collect()

    # this tutorial only looks at the first two batches instead of all training data
    if step >= 1:
        break


X: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
cell_type: tensor([  7, 127, 152,  ...,  22, 127,   4], device='cuda:0')
X: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
cell_type: tensor([  9, 132, 118,  ...,  14, 129, 127], device='cuda:0')


# Use as standalone PyTorch DataLoader

In [6]:
from os.path import join

from cellnet.datamodules import merlin_dataset_factory, set_default_kwargs_dataset
from merlin.dataloader.torch import Loader

In [7]:
# path to the merlin store (hard-coded for this tutorial -> point it at your own copy of the data)
DATA_PATH = '/mnt/dssmcmlfs01/merlin_cxg_2023_05_15_sf-log1p'

In [8]:
# gc is needed for the manual garbage collection in the loop below; import it here
# so that this standalone section also runs without the Lightning section above
import gc


# manually create data loaders for train and validation set
# (the training loader shuffles the data, the validation loader does not)
train_dataset = merlin_dataset_factory(
    join(DATA_PATH, 'train'),
    columns=['cell_type'],
    dataset_kwargs=set_default_kwargs_dataset(training=True)
)
train_loader = Loader(train_dataset, batch_size=2048, shuffle=True)


val_dataset = merlin_dataset_factory(
    join(DATA_PATH, 'val'),
    columns=['cell_type'],
    dataset_kwargs=set_default_kwargs_dataset(training=False)
)
val_loader = Loader(val_dataset, batch_size=2048, shuffle=False)


# how to use dataloaders
for ix, (batch, _) in enumerate(train_loader):
    # put your training code here:
    print('X:', batch['X'])
    print('cell_type:', batch['cell_type'])

    # Merlin tends to use a lot of GPU memory if the garbage collection isn't called regularly
    # -> manually call python garbage collection every 10 steps
    if ix % 10 == 0:
        gc.collect()

    # don't iterate over all training data for this tutorial
    if ix == 1:
        break


X: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
cell_type: tensor([46, 44, 38,  ..., 67, 19, 60], device='cuda:0')
X: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 1.4717, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
cell_type: tensor([ 14,  64, 131,  ...,  60,  63, 132], device='cuda:0')
