#### Importing all the required **Python** libraries

In [1]:
import pandas as pd
import scanpy as sc
import warnings
import scarches as sca
warnings.filterwarnings("ignore")

import decoupler as dc

import sys
# Add the project's ../scripts directory to the module search path so that
# code stored there can be imported from this notebook.
sys.path.append('../scripts')
%load_ext autoreload
%autoreload 2
#%load_ext lab_black

  from .autonotebook import tqdm as notebook_tqdm
 captum (see https://github.com/pytorch/captum).


In [2]:
# Global scanpy plotting defaults: draw figures without a frame.
sc.set_figure_params(frameon=False)
# Directory scanpy uses for saved figures.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
sc.settings.figdir = '/home/daniele/Code/scmouse_atlas/reports/figures/'

#### Read and bin

In [4]:
import numpy as np
from scipy.sparse import issparse, csr_matrix

def bin_data(adata, binning, key_to_process=None, result_binned_key="binned_data"):
    """
    Discretise every row of an expression matrix into quantile-based bins.

    Each row (observation) is binned independently. Zeros stay in bin 0; the
    non-zero values of the row are assigned to bins ``1 .. binning - 1`` using
    ``binning - 1`` evenly spaced quantiles of those non-zero values as
    thresholds.

    The results are written back to ``adata`` in place:
        * ``adata.layers[result_binned_key]``: CSR matrix of int64 bin indices
          with the same shape as the input matrix.
        * ``adata.obsm["bin_edges"]``: float array of shape ``(n_rows, binning)``
          holding ``[0, threshold_1, ..., threshold_{binning-1}]`` per row
          (all zeros for a row without non-zero values).

    Parameters:
        adata (AnnData): The input data object (anything exposing ``X``,
            ``layers`` and ``obsm`` works).
        binning (int): Number of bins, including the zero bin (must be an
            integer >= 2).
        key_to_process (str | None): Key in `adata.layers` to process;
            `adata.X` is used when None.
        result_binned_key (str): Key in `adata.layers` to store the binned results.

    Returns:
        None. ``adata`` is modified in place.

    Raises:
        ValueError: If `binning` is not an integer, is smaller than 2, or the
            data contains negative values.
    """
    # Accept NumPy integer scalars as well as built-in ints.
    if not isinstance(binning, (int, np.integer)):
        raise ValueError(f"Binning must be an integer, but got {binning}.")
    binning = int(binning)
    if binning < 2:
        # With fewer than two bins there is no threshold to compare against and
        # every value would collapse into bin 0.
        raise ValueError(f"Binning must be at least 2, but got {binning}.")

    layer_data = adata.layers[key_to_process] if key_to_process is not None else adata.X
    # `.toarray()` is the supported dense conversion for sparse matrices and
    # sparse arrays alike; the `.A` shorthand is deprecated in SciPy and is
    # not available on sparse arrays.
    # NOTE: the whole matrix is held densely in memory while binning.
    layer_data = layer_data.toarray() if issparse(layer_data) else np.asarray(layer_data)

    if layer_data.size:  # min() is undefined for an empty matrix
        min_value = layer_data.min()
        if min_value < 0:
            raise ValueError(f"Expecting non-negative data, but got min value {min_value}.")

    n_rows, n_cols = layer_data.shape
    quantile_levels = np.linspace(0, 1, binning - 1)  # identical for every row

    # Preallocate the outputs instead of collecting per-row arrays and stacking
    # them afterwards, which would keep two full copies alive at once.
    binned = np.zeros((n_rows, n_cols), dtype=np.int64)
    bin_edges = np.zeros((n_rows, binning), dtype=np.float64)

    for row_idx, row in enumerate(layer_data):
        non_zero_ids = np.flatnonzero(row)
        if non_zero_ids.size == 0:
            # All-zero (or zero-width) row: bins and edges stay at zero.
            continue

        non_zero_row = row[non_zero_ids]

        # Thresholds are quantiles of this row's non-zero values only.
        thresholds = np.quantile(non_zero_row, quantile_levels)

        # The smallest non-zero value equals the first threshold, so the bin
        # indices start at 1 and never collide with the zero bin.
        binned[row_idx, non_zero_ids] = np.digitize(non_zero_row, thresholds)
        bin_edges[row_idx, 1:] = thresholds

    # Store the binned matrix in sparse format.
    adata.layers[result_binned_key] = csr_matrix(binned)
    adata.obsm["bin_edges"] = bin_edges


In [5]:
# Reference atlas: used below to train the scVI/scANVI models.
adata_source = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/03_mouse_larry_barcoded_annotated.h5ad')
# Query atlas: mapped onto the trained reference further down.
adata_target = sc.read_h5ad('/mnt/storage/Daniele/atlases/mouse/02_mouse_no_larry_qced.h5ad')

In [6]:
# Genes present in both atlases. Sorting makes the gene order deterministic:
# iterating a bare set of strings yields a different order in every Python
# session (string hashing is randomised), which would change the column order
# of every object derived from this list between runs.
gene_common = sorted(set(adata_source.var_names).intersection(adata_target.var_names))

In [7]:
# Table of manually curated human genes, read from the supplementary data folder.
manual_genes_human = pd.read_csv('../../../supplementary_data/human/human_manual_genes.csv')

In [8]:
# Name the two columns, add a constant 'pathway' column and keep only the rows
# selected by the boolean 'manual' column.
manual_genes_human = (
    manual_genes_human
    .set_axis(['genesymbol', 'manual'], axis=1)
    .assign(pathway='_')  # dummy for decoupler
    .pipe(lambda table: table[table['manual']])
)

In [9]:
# Translate the curated human gene table to mouse with decoupler.
mouse_manual_genes = dc.translate_net(manual_genes_human, target_organism='mouse')

In [10]:
# Curated mouse genes that are also shared by both atlases. Sorted so that the
# gene order (and therefore the model's input feature order) is identical in
# every run; a bare set of strings iterates in a session-dependent order.
man_genes = sorted(set(mouse_manual_genes['genesymbol'].values).intersection(gene_common))

#### Reference

In [11]:
# Restrict the reference to the genes shared with the target atlas.
adata_source = adata_source[:, gene_common].copy()
# Bin adata_source.X row by row into 50 quantile bins, stored in
# layers["binned_data"]. This runs on all shared genes, i.e. before the
# curated-gene subset taken on the next line.
bin_data(adata_source, 50, key_to_process = None, result_binned_key="binned_data")
# Keep only the curated genes for model training.
source_manual = adata_source[:, man_genes].copy()

In [12]:
# obs column passed to the model setup as the batch key.
batch_key = 'donor_id'
# obs column passed to the model setup as the labels key.
celltype_key = 'Level_1'

In [13]:
# Register the reference object with the model: it reads the binned layer and
# uses the batch / label columns configured above.
sca.models.SCVI.setup_anndata(source_manual, layer='binned_data', batch_key=batch_key, labels_key=celltype_key)




In [14]:
# Build the scVI model on the registered reference data.
# NOTE(review): the architecture flags below presumably follow the settings
# recommended for scArches reference models — confirm against the scArches docs.
vae = sca.models.SCVI(
    source_manual,
    n_layers=2,
    encode_covariates=True,
    deeply_inject_covariates=False,
    use_layer_norm="both",
    use_batch_norm="none",
)

In [15]:
# Train the reference scVI model with the library defaults (no arguments are
# passed); in the recorded run training stopped at max_epochs=134.
vae.train()

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 134/134: 100%|██████████| 134/134 [16:04<00:00,  5.20s/it, v_num=1, train_loss_step=3.16e+3, train_loss_epoch=3.22e+3]

INFO: `Trainer.fit` stopped: `max_epochs=134` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=134` reached.


Epoch 134/134: 100%|██████████| 134/134 [16:04<00:00,  7.20s/it, v_num=1, train_loss_step=3.16e+3, train_loss_epoch=3.22e+3]


In [16]:
# Initialise a scANVI model from the trained scVI model; cells labelled
# "Unknown" are treated as the unlabeled category.
scanvae = sca.models.SCANVI.from_scvi_model(vae, unlabeled_category = "Unknown")
# Train with the library defaults (10 epochs in the recorded run).
scanvae.train()

[34mINFO    [0m Training for [1;36m10[0m epochs.                                                                                   


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 10/10: 100%|██████████| 10/10 [02:04<00:00, 12.49s/it, v_num=1, train_loss_step=3.15e+3, train_loss_epoch=3.22e+3]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 10/10: 100%|██████████| 10/10 [02:04<00:00, 12.48s/it, v_num=1, train_loss_step=3.15e+3, train_loss_epoch=3.22e+3]


In [17]:
# Predict labels for the reference cells and report how often the prediction
# matches the annotated label. The label column is looked up through
# `celltype_key`, the same key the model was registered with, so the check
# cannot drift from the training labels if that key is changed.
source_manual.obs['predictions'] = scanvae.predict()
accuracy = np.mean(source_manual.obs['predictions'] == source_manual.obs[celltype_key])
print(f"Acc: {accuracy}")

Acc: 0.9744022174726156


#### Target

In [18]:
# Same preparation as for the reference: restrict to the shared genes, ...
adata_target = adata_target[:, gene_common].copy()
# ... bin adata_target.X row by row into 50 quantile bins over all shared genes
# (stored in layers["binned_data"]), ...
bin_data(adata_target, 50, key_to_process = None, result_binned_key="binned_data")
# ... then keep only the curated genes the reference model was trained on.
target_manual = adata_target[:, man_genes].copy()



In [19]:
# Create the query model from the trained reference scANVI model and the
# target data, with dropout layers frozen.
model_surgery = sca.models.SCANVI.load_query_data(
    target_manual,
    scanvae,
    freeze_dropout = True,
)

In [20]:
# Mark every query cell as unlabeled and none as labeled, then report the
# size of both index sets.
model_surgery._unlabeled_indices = np.arange(target_manual.n_obs)
model_surgery._labeled_indices = []
for caption, indices in (
    ("Labelled Indices: ", model_surgery._labeled_indices),
    ("Unlabelled Indices: ", model_surgery._unlabeled_indices),
):
    print(caption, len(indices))

Labelled Indices:  0
Unlabelled Indices:  333953


In [21]:
# Fine-tune the query model for 20 epochs without weight decay, running
# validation every second epoch.
model_surgery.train(
    max_epochs=20,
    plan_kwargs=dict(weight_decay=0.0),
    check_val_every_n_epoch=2,
)

[34mINFO    [0m Training for [1;36m20[0m epochs.                                                                                   


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 20/20: 100%|██████████| 20/20 [17:11<00:00, 41.09s/it, v_num=1, train_loss_step=2.62e+3, train_loss_epoch=2.84e+3]

INFO: `Trainer.fit` stopped: `max_epochs=20` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 20/20: 100%|██████████| 20/20 [17:11<00:00, 51.55s/it, v_num=1, train_loss_step=2.62e+3, train_loss_epoch=2.84e+3]


In [23]:
# Stack reference and query cells into a single object (the combined object
# carries a 'batch' obs column, as shown in its printed summary).
# NOTE(review): AnnData.concatenate is deprecated in newer anndata releases in
# favour of anndata.concat — confirm before upgrading anndata.
adata_full = source_manual.concatenate(target_manual)
# Predict labels and compute the latent representation for all cells with the
# query-adapted model.
adata_full.obs['Level_1_label_transfer'] = model_surgery.predict(adata_full)
adata_full.obsm['X_scANVI'] = model_surgery.get_latent_representation(adata_full)

[34mINFO    [0m Input AnnData not setup with scvi-tools. attempting to transfer AnnData setup                             


In [26]:
# Display a summary of the combined curated-gene object.
adata_full

AnnData object with n_obs × n_vars = 393845 × 2272
    obs: 'donor_id', 'disease', 'model', 'barcoded', 'sex', 'strain', 'genotype', 'treatment', 'cell_filtering', 'sampleID', 'larry_positive', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'leiden', 'leiden_0.1', 'Level_0', 'leiden_0.5', 'Level_1', '_scvi_batch', '_scvi_labels', 'predictions', 'total_counts_malat', 'log1p_total_counts_malat', 'pct_counts_malat', 'batch', 'Level_1_label_transfer'
    var: 'feature_types', 'mt', 'ribo', 'hb', 'n_cells_by_counts-0', 'mean_counts-0', 'log1p_mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'log1p_total_counts-0', 'n_cells_by_counts-1', 'mean_counts-1', 'log1p_mean_counts-1', 'pct_dropout_by_counts-1', 'total_count

In [None]:
import gc

# The curated-gene subsets are no longer needed once adata_full exists; drop
# the references and run a garbage-collection pass.
del source_manual
del target_manual
gc.collect()


180

In [24]:
# Concatenate the all-gene versions of reference and query in the same order
# as adata_full ...
adata_full_all_genes = adata_source.concatenate(adata_target)
# ... and select the cells by adata_full's observation names so both objects
# list the same cells in the same order.
adata_full_all_genes = adata_full_all_genes[adata_full.obs_names].copy()

In [25]:
# Display a summary of the combined all-gene object.
adata_full_all_genes

AnnData object with n_obs × n_vars = 393841 × 21469
    obs: 'donor_id', 'disease', 'model', 'barcoded', 'sex', 'strain', 'genotype', 'treatment', 'cell_filtering', 'sampleID', 'larry_positive', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'leiden_0.5', 'Level_0', 'leiden_myeloid_level_1', 'Level_1', 'leiden_lymphoid_level_1', 'leiden_fibroblast_level_1', 'leiden_endothelial_level_1', 'leiden_epithelial_level_1', 'total_counts_malat', 'log1p_total_counts_malat', 'pct_counts_malat', 'batch'
    var: 'feature_types', 'mt', 'ribo', 'hb', 'n_cells_by_counts-0', 'mean_counts-0', 'log1p_mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'log1p_total_counts-0', 'n_cells_by_counts-1', 'mean_counts-1', 'log1p_mean_coun

In [26]:
# Both objects must list exactly the same cells in the same order: the obs
# column is aligned on cell names (cells missing from adata_full would silently
# become NaN), while the embedding array is attached by row position.
assert adata_full_all_genes.obs_names.equals(adata_full.obs_names), (
    "adata_full_all_genes and adata_full do not contain the same cells in the same order"
)
# Copy the transferred labels and the scANVI embedding onto the all-gene object.
adata_full_all_genes.obs['Level_1_label_transfer'] = adata_full.obs['Level_1_label_transfer']
adata_full_all_genes.obsm['X_scANVI'] = adata_full.obsm['X_scANVI']

In [27]:
# Persist the all-gene object, including the transferred labels and the
# scANVI embedding, to disk.
# NOTE(review): absolute, machine-specific output path.
adata_full_all_genes.write_h5ad('/mnt/storage/Daniele/atlases/mouse/06_mouse_inhouse_integrated_scanvi.h5ad')