#### Importing all the required **Python** and **R** libraries 

In [1]:
import pandas as pd
import scanpy as sc
import warnings
import scarches as sca
warnings.filterwarnings("ignore")

import decoupler as dc

import sys
sys.path.append('../scripts')
%load_ext autoreload
%autoreload 2
#%load_ext lab_black

  from .autonotebook import tqdm as notebook_tqdm
 captum (see https://github.com/pytorch/captum).


In [2]:
sc.set_figure_params(frameon=False)
sc.settings.figdir = '/home/daniele/Code/scmouse_atlas/reports/figures/'

#### Read and bin

In [4]:
import numpy as np
from scipy.sparse import issparse, csr_matrix

def bin_data(adata, binning, key_to_process=None, result_binned_key="binned_data"):
    """
    Bins numerical data into discrete categories based on quantiles.

    Parameters:
        adata (AnnData): The input data object.
        key_to_process (str): Key in `adata.layers` to process.
        binning (int): Number of bins (must be an integer).
        result_binned_key (str): Key to store the binned results.

    Raises:
        ValueError: If `binning` is not an integer or data contains negative values.
    """
    if not isinstance(binning, int):
        raise ValueError(f"Binning must be an integer, but got {binning}.")

    layer_data = adata.layers[key_to_process] if key_to_process is not None else adata.X
    layer_data = layer_data.A if issparse(layer_data) else layer_data  # Convert sparse to dense if needed

    if layer_data.min() < 0:
        raise ValueError(f"Expecting non-negative data, but got min value {layer_data.min()}.")

    binned_rows = []
    bin_edges = []

    for row in layer_data:
        if row.max() == 0:
            binned_rows.append(np.zeros_like(row, dtype=np.int64))
            bin_edges.append(np.array([0] * binning))
            continue

        non_zero_ids = row.nonzero()
        non_zero_row = row[non_zero_ids]

        # Define bin thresholds based on quantiles
        bins = np.quantile(non_zero_row, np.linspace(0, 1, binning - 1))

        # Assign bin indices
        non_zero_digits = np.digitize(non_zero_row, bins)  # Converts values into bin indices
        binned_row = np.zeros_like(row, dtype=np.int64)
        binned_row[non_zero_ids] = non_zero_digits

        binned_rows.append(binned_row)
        bin_edges.append(np.concatenate([[0], bins]))

    # Convert binned data back to sparse format
    adata.layers[result_binned_key] = csr_matrix(np.stack(binned_rows))
    adata.obsm["bin_edges"] = np.stack(bin_edges)


In [38]:
adata_source = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/07_mouse_inhouse_integrated_scanvi_refined.h5ad')
adata_target = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/09_mouse_public_qced.h5ad')

In [10]:
gene_common = list(set(adata_source.var_names).intersection(adata_target.var_names))

In [11]:
manual_genes_human = pd.read_csv('../../../supplementary_data/human/human_manual_genes.csv')

In [12]:
manual_genes_human.columns = ['genesymbol','manual']
manual_genes_human['pathway'] = '_' #dummy for decoupler
manual_genes_human = manual_genes_human[manual_genes_human['manual']]

In [13]:
mouse_manual_genes = dc.translate_net(manual_genes_human, target_organism='mouse')

In [14]:
man_genes = list(set(mouse_manual_genes['genesymbol'].values).intersection(gene_common))

#### Reference

In [39]:
source_manual = adata_source[:, man_genes].copy()

In [40]:
batch_key = 'donor_id'
celltype_key = 'Level_1_refined'

In [41]:
sca.models.SCVI.setup_anndata(source_manual, layer='binned_data', batch_key=batch_key, labels_key=celltype_key)


In [42]:
vae = sca.models.SCVI(
    source_manual,
    n_layers=2,
    encode_covariates=True,
    deeply_inject_covariates=False,
    use_layer_norm="both",
    use_batch_norm="none",
)

In [43]:
vae.train()

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 20/20: 100%|█| 20/20 [11:23<00:00, 33.88s/it, v_num=1, train_loss_step=3.01e+3, train_loss_epoch=2.7

INFO: `Trainer.fit` stopped: `max_epochs=20` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 20/20: 100%|█| 20/20 [11:23<00:00, 34.18s/it, v_num=1, train_loss_step=3.01e+3, train_loss_epoch=2.7


In [44]:
scanvae = sca.models.SCANVI.from_scvi_model(vae, unlabeled_category = "Unknown")
scanvae.train()

[34mINFO    [0m Training for [1;36m7[0m epochs.                                                                                    


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 7/7: 100%|█| 7/7 [09:37<00:00, 82.31s/it, v_num=1, train_loss_step=2.71e+3, train_loss_epoch=2.77e+3

INFO: `Trainer.fit` stopped: `max_epochs=7` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=7` reached.


Epoch 7/7: 100%|█| 7/7 [09:37<00:00, 82.45s/it, v_num=1, train_loss_step=2.71e+3, train_loss_epoch=2.77e+3


In [45]:
source_manual.obs['predictions'] = scanvae.predict()
print("Acc: {}".format(np.mean(source_manual.obs.predictions == source_manual.obs.Level_1_refined)))

Acc: 0.9817819881627358


#### Target

In [46]:
adata_target = adata_target[:, gene_common].copy()
bin_data(adata_target, 50, key_to_process = None, result_binned_key="binned_data")
target_manual = adata_target[:, man_genes].copy()



In [72]:
target_manual.obs[celltype_key] = "Unknown"

In [73]:
target_manual.obs[batch_key] = target_manual.obs['Sample_unique'].astype('category')

In [76]:
model_surgery = sca.models.SCANVI.load_query_data(
    target_manual,
    scanvae,
    freeze_dropout = True,
)

In [77]:
model_surgery._unlabeled_indices = np.arange(target_manual.n_obs)
model_surgery._labeled_indices = []
print("Labelled Indices: ", len(model_surgery._labeled_indices))
print("Unlabelled Indices: ", len(model_surgery._unlabeled_indices))

Labelled Indices:  0
Unlabelled Indices:  238540


In [78]:
model_surgery.train(
    plan_kwargs=dict(weight_decay=1e-4),
    check_val_every_n_epoch=2,
)

[34mINFO    [0m Training for [1;36m34[0m epochs.                                                                                   


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 34/34: 100%|█| 34/34 [16:01<00:00, 27.73s/it, v_num=1, train_loss_step=1.98e+3, train_loss_epoch=2.2

INFO: `Trainer.fit` stopped: `max_epochs=34` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=34` reached.


Epoch 34/34: 100%|█| 34/34 [16:01<00:00, 28.29s/it, v_num=1, train_loss_step=1.98e+3, train_loss_epoch=2.2


In [85]:
adata_full = source_manual.concatenate(target_manual, batch_key = 'provenance', batch_categories = ['in_house_datasets', 'public_datasets'] )
adata_full.obs['Level_1_label_transfer'] = model_surgery.predict(adata_full)
adata_full.obsm['X_scANVI'] = model_surgery.get_latent_representation(adata_full)

[34mINFO    [0m Input AnnData not setup with scvi-tools. attempting to transfer AnnData setup                             


In [31]:
del source_manual, target_manual
import gc
gc.collect()


1406

In [88]:
#adata_full_all_genes = adata_source.concatenate(adata_target, batch_key = 'provenance', batch_categories = ['in_house_datasets', 'public_datasets'] )
adata_full_all_genes = adata_full_all_genes[adata_full.obs_names].copy()

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [89]:
adata_full_all_genes.obs['Level_1_label_transfer'] = adata_full.obs['Level_1_label_transfer']
adata_full_all_genes.obsm['X_scANVI'] = adata_full.obsm['X_scANVI']

In [90]:
adata_full_all_genes.write_h5ad('/mnt/storage/Daniele/atlases/mouse/10_mouse_all_integrated_scanvi.h5ad')