**Author:** Elias Rafael Ruiz-Morales

**Institution:** Wellcome Sanger Institute

**May, 2022**

scVI integration.


----

# Notebook to run scVI integration


In [1]:
from __future__ import print_function
import torch

import sys, os
# Floating-point precision that is written into the Theano flags below.
data_type = 'float32'
# Theano reads its configuration from this environment variable: run on the GPU
# with the precision chosen above.
os.environ["THEANO_FLAGS"] = f'device=cuda,floatX={data_type},force_device=True'
# Put the scVI results folder right after the first entry of the import search path.
sys.path.insert(1, '../results/scVI/')

In [2]:
# Seed for reproducibility
import numpy as np
import pandas as pd
import scanpy as sc
from typing import Tuple

# scVI imports
import scvi
from scvi.dataset import AnnDatasetFromAnnData
from scvi.inference import UnsupervisedTrainer
from scvi.models.vae import VAE

# Seed the global PyTorch and NumPy random number generators with a fixed value.
torch.manual_seed(0)
np.random.seed(0)
# NOTE(review): this verbosity level is reassigned to 3 further down in this same cell.
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)



def MovePlots(plotpattern, subplotdir):
    """Move figures whose name contains ``plotpattern`` into a sub-folder of the figure directory.

    Entries directly inside ``sc.settings.figdir`` that match ``*<plotpattern>*``
    are moved into ``<figdir>/<subplotdir>``, which is created when missing.
    The work is done with Python's file APIs instead of shell commands, so
    paths containing spaces or shell metacharacters are handled correctly.

    Parameters
    ----------
    plotpattern : str
        Text that must occur in the entry name. It is embedded in a glob
        pattern, so wildcard characters in it keep their wildcard meaning.
    subplotdir : str
        Name of the destination folder, relative to ``sc.settings.figdir``.
    """
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    # Strip leading slashes so the destination always stays below figdir, as it
    # did when the path was built by plain string concatenation.
    destination = os.path.join(figdir, str(subplotdir).lstrip('/'))
    os.makedirs(destination, exist_ok=True)

    for source in glob.glob(os.path.join(figdir, '*' + plotpattern + '*')):
        # The destination folder can itself match the pattern; never move it into itself.
        if os.path.abspath(source) == os.path.abspath(destination):
            continue
        # Give the full target path so an existing file of the same name is replaced.
        shutil.move(source, os.path.join(destination, os.path.basename(source)))

# Set scanpy's log level for the rest of the notebook (overrides the 0 assigned earlier in this cell).
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Directory scanpy uses for saved figures; MovePlots operates inside it.
sc.settings.figdir = '../results/images/scVI_integration'
# Print the versions of the loaded packages into the cell output.
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Display the path of the Python interpreter this kernel is running.
sys.executable



-----
anndata     0.7.6
scanpy      1.7.0
sinfo       0.3.1
-----
PIL                 8.3.0
absl                NA
anndata             0.7.6
anyio               NA
attr                21.2.0
babel               2.9.1
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
brotli              NA
cairo               1.20.1
certifi             2021.05.30
cffi                1.14.4
chardet             4.0.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.1
debugpy             1.3.0
decorator           5.0.9
defusedxml          0.7.1
dunamai             1.5.5
fsspec              2021.06.1
get_version         3.2
google              NA
h5py                3.3.0
idna                2.10
igraph              0.8.3
ipykernel           6.0.0
ipython_genutils    0.2.0
ipywidgets          7.6.3
jedi                0.18.0
jinja2              3.0.1
joblib              1.0.1
json5               NA
jsonschema          3.2.0
jupyter_ser

'/opt/conda/envs/scvi-singularity/bin/python'

### Loading non-normalized data

In [5]:
#adata_original = sc.read('../data/S1_preprocessed_noNormalized_adata_20220523.h5ad')
#adata_original.X.shape

(196363, 32743)

In [6]:
#adata = sc.read('../data/S1_preprocessed_normalized_adata.h5ad')
#adata.X.shape

### Compute the scVI latent space

Based on the scVI documentation (3k PBMCs tutorial).

In [3]:
#del(adata_original)
# Read the preprocessed AnnData object (non-normalized, going by the file name) from disk.
adata = sc.read('../data/S1_preprocessed_noNormalized_adata_20220523.h5ad')
# Display the dimensions of the expression matrix.
adata.X.shape

In [4]:
# Basic preprocessing. Order matters: X is copied into a layer first, because the
# two scanpy calls below overwrite adata.X.
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)  # rescale each cell to a total of 10,000 counts
sc.pp.log1p(adata)  # log(1 + x) transform of the rescaled values
adata.raw = adata.copy()  # keep a copy of the log-normalized object in .raw

normalizing counts per cell
    finished (0:00:03)


In [5]:
# List the distinct donor labels; this column is used as the scVI batch key further down.
# NOTE(review): the displayed values include 'souporcell_doublet', so those cells form
# their own batch -- confirm they are meant to be kept for the integration.
np.unique(adata.obs['donor_souporcell'])

array(['Hrv100', 'Hrv106', 'Hrv107', 'Hrv111', 'Hrv124', 'Hrv141',
       'Hrv142', 'Hrv97', 'Hrv98', 'Hrv99', 'souporcell_doublet'],
      dtype=object)

In [6]:
# Inspect the per-cell sample labels; this column is registered as a categorical covariate further down.
adata.obs['sample']

Pla_HDBR10917729_AAACCCACATCGAACT    Pla_HDBR10917729
Pla_HDBR10917729_AAACCCAGTAAGACCG    Pla_HDBR10917729
Pla_HDBR10917729_AAACCCAGTGGGATTG    Pla_HDBR10917729
Pla_HDBR10917729_AAACGAAAGCCTAACT    Pla_HDBR10917729
Pla_HDBR10917729_AAACGAAAGCCTGGAA    Pla_HDBR10917729
                                           ...       
Pla_HDBR12808831_TTTGTTGTCCAAGAGG    Pla_HDBR12808831
Pla_HDBR12808831_TTTGTTGTCGGTCATA    Pla_HDBR12808831
Pla_HDBR12808831_TTTGTTGTCTCGACGG    Pla_HDBR12808831
Pla_HDBR12808831_TTTGTTGTCTGTGCTC    Pla_HDBR12808831
Pla_HDBR12808831_TTTGTTGTCTTTCTAG    Pla_HDBR12808831
Name: sample, Length: 196363, dtype: category
Categories (16, object): ['Pla_HDBR10917729', 'Pla_HDBR10917730', 'Pla_HDBR10917731', 'Pla_HDBR10917732', ..., 'Pla_HDBR12808826', 'Pla_HDBR12808827', 'Pla_HDBR12808829', 'Pla_HDBR12808831']

In [7]:
# Register with scVI which parts of adata the model should use.
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",  # the counts saved before normalization
    batch_key="donor_souporcell",  # donor (souporcell assignment) is used as the batch
    # sample is registered as an extra categorical covariate of the cells
    categorical_covariate_keys=['sample'] 
    #categorical_covariate_keys=['donor_souporcell',], # used in the 15 Nov attempt
    #continuous_covariate_keys=[""]
)

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"donor_souporcell"[0m[1m][0m                                    


Using batches from adata.obs["donor_souporcell"]


[34mINFO    [0m No label_key inputted, assuming all cells have same label                           


No label_key inputted, assuming all cells have same label


[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          


Using data from adata.layers["raw_counts"]


[34mINFO    [0m Computing library size prior per batch                                              


Computing library size prior per batch


[34mINFO    [0m Successfully registered anndata object containing [1;36m196363[0m cells, [1;36m32743[0m vars, [1;36m11[0m      
         batches, [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates   
         and [1;36m0[0m extra continuous covariates.                                                  


Successfully registered anndata object containing 196363 cells, 32743 vars, 11 batches, 1 labels, and 0 proteins. Also registered 1 extra categorical covariates and 0 extra continuous covariates.


[34mINFO    [0m Please do not further modify adata until model is trained.                          


Please do not further modify adata until model is trained.


In [8]:
#---- check #layers

In [9]:
# Latent dimensionalities to compare. Valentina reported that n_latent = 10 worked
# well for her, so a few larger values are tried as well.
n_latent_values = [10, 20, 30, 40]

# One SCVI model per latent size, keyed by that size; training happens in a later cell.
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent=n_latent_value)

n_latent_value 10
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [10]:
# Display the model built for n_latent = 10.
models[10]



In [None]:
# Latent embedding of every trained model, keyed by its number of latent dimensions.
latent_representations = {}

# Make sure the export folder exists before any training starts: otherwise a
# missing directory would only surface at the first to_csv call, after a full
# training run has already been spent.
os.makedirs('../results/scVI', exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    model = models[n_latent_value]
    model.train()

    # Keep the embedding both in the dictionary and in adata.obsm under a
    # key that records the latent size.
    latent = model.get_latent_representation()
    latent_representations[n_latent_value] = latent
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent

    # Export the embedding as CSV.
    # NOTE(review): the DataFrame is built from the bare array, so the rows are
    # written with a positional index, not the cell barcodes.
    curr_df = pd.DataFrame(adata.obsm[obsm_key])
    curr_df.to_csv(f'../results/scVI/20220523_obsm_with_scVI_latent_representation_n_{n_latent_value}_corrected_by_donor.csv')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 41/41: 100%|███████████████████████████████████████████████████████████████████████████| 41/41 [23:12<00:00, 33.96s/it, loss=1.05e+04, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 41/41: 100%|███████████████████████████████████████████████████████████████████████████| 41/41 [23:02<00:00, 33.73s/it, loss=1.01e+04, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 41/41: 100%|███████████████████████████████████████████████████████████████████████████| 41/41 [22:49<00:00, 33.41s/it, loss=1.03e+04, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 17/41:  39%|█████████████████████████████▎                                             | 16/41 [08:56<13:57, 33.51s/it, loss=1.05e+04, v_num=1]

## Saving data

In [None]:
# Write the AnnData object to disk, including whatever X_scVI_n_latent_* embeddings
# the training loop added to adata.obsm.
adata.write('../results/scVI/S2_scVIintegrated_adata_20220523.h5ad')