# Introduction to single-cell multi-view profiler (scMVP)
In this introductory tutorial, we present the different tasks of a scMVP workflow
1. Loading the multi-omics data
2. Training the multi-view model
3. Retrieving the common latent space and imputed multi-omics values
4. Performing cell clustering and differential expression
5. Visualizing the common latent space and clustering with UMAP

In [18]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scMVP.dataset import LoadData,GeneExpressionDataset, CellMeasurement
from scMVP.models import VAE_Attention, Multi_VAE_Attention, VAE_Peak_SelfAttention
from scMVP.inference import UnsupervisedTrainer
from scMVP.inference import MultiPosterior, MultiTrainer
import torch

import scanpy as sc
import anndata

import scipy.io as sp_io
from scipy.sparse import csr_matrix, issparse

In [2]:
# If your PC does not have a GPU, you can use multiple CPU threads to
# accelerate the training.
# Request at most 40 threads, but never more than (available cores - 1), so
# that the machine is not oversubscribed and one core stays free; always use
# at least one thread. `os.cpu_count()` may return None, hence the fallback.
torch.set_num_threads(max(1, min(40, (os.cpu_count() or 1) - 1)))

## Step1: Loading data

Loading the sci-CAR cell line dataset described in Junyue Cao et al. (2018).

* Junyue Cao, et al. "Joint profiling of chromatin accessibility and gene 
expression in thousands of single cells." Science 361.6409 (2018): 1380-1385. 
<br>

You can also download the processed input as described in **README.md**.

In [3]:
# Read the ATAC count table from CSV (first column is used as the index) and
# keep only the underlying NumPy array of values.
atac_csv_path = "/home/xcx/MYBenchmark-datas/1469/counts_atac.csv"
counts_atac = pd.read_csv(atac_csv_path, index_col=0).values

# from test_unioncom_acc import lsi_ATAC

# # counts_atac = np.where(counts_atac <1, 0, 1)
# counts_atac = lsi_ATAC(counts_atac, k = min(101, counts_atac.shape[0]))

# df = pd.DataFrame(data=counts_atac.todense().T)
# df.to_csv(os.path.join("/home/xcx/MYBenchmark-codes/5-scMVP/mydata/1469/tfidf_counts_atac.csv"), index=False, header=None)

In [4]:
# Read the RNA count table from CSV (first column is used as the index) and
# keep only the underlying NumPy array of values.
rna_csv_path = "/home/xcx/MYBenchmark-datas/1469/counts_rna.csv"
counts_rna = pd.read_csv(rna_csv_path, index_col=0).values

In [5]:
from test_unioncom_acc import lsi_ATAC

# Optional binarisation of the ATAC counts (currently disabled):
# counts_atac = np.where(counts_atac <1, 0, 1)

# Transform the ATAC counts with the project helper `lsi_ATAC`; `k` is capped
# at the size of the first axis of `counts_atac`.
# NOTE(review): presumably a TF-IDF/LSI transform, judging by the names only.
lsi_k = min(101, counts_atac.shape[0])
counts_atac_tfidf = lsi_ATAC(counts_atac, k=lsi_k)

# Display the shape of the transformed matrix as the cell output.
counts_atac_tfidf.shape

(1469, 15857)

In [6]:
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
import scipy.io as sio

# Export both count tables as Matrix Market (.mtx) files. Each table is
# transposed first, then converted dense -> CSR -> COO before writing.
# NOTE(review): the file named "tfidf_counts_atac.mtx" is written from the raw
# `counts_atac`, not from `counts_atac_tfidf` (which is unused here) -- confirm
# that this is intended.
mtx_exports = (
    ("/home/xcx/MYBenchmark-codes/5-scMVP/mydata/1469/raw_counts_rna.mtx", counts_rna),
    ("/home/xcx/MYBenchmark-codes/5-scMVP/mydata/1469/tfidf_counts_atac.mtx", counts_atac),
)
for mtx_path, count_table in mtx_exports:
    sparse_counts = coo_matrix(csr_matrix(count_table.T))
    sio.mmwrite(mtx_path, sparse_counts)

In [6]:
# Directory holding the prepared input files, and directory for all results.
input_path = "/home/xcx/MYBenchmark-codes/5-scMVP/mydata/1469/"
output_path = "/home/xcx/results/1469/5-scMVP/"

# File names (looked up under `input_path`) for each part of the joint
# RNA + ATAC profiling dataset.
snare_p0_dataset = dict(
    gene_expression="raw_counts_rna.mtx",
    atac_expression="tfidf_counts_atac.mtx",
    gene_names='rna_features.txt',
    gene_barcodes='rna_barcodes.txt',
    atac_names='atac_features.txt',
    atac_barcodes='atac_barcodes.txt',
)

# Load the joint dataset from uncompressed, sparse (.mtx) inputs.
dataset = LoadData(
    dataset=snare_p0_dataset,
    data_path=input_path,
    dense=False,
    gzipped=False,
    atac_threshold=0.001,
    file_separator=",",
    cell_threshold=1,
)

[2023-08-11 14:24:32,153] INFO - scMVP.dataset.scMVP_dataloader | Preprocessing joint profiling dataset.
[2023-08-11 14:24:32,692] INFO - scMVP.dataset.scMVP_dataloader | hereeee
[2023-08-11 14:24:32,695] INFO - scMVP.dataset.scMVP_dataloader | 1111111111111
[2023-08-11 14:24:32,741] INFO - scMVP.dataset.scMVP_dataloader | [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[2023-08-11 14:24:32,798] INFO - scMVP.dataset.scMVP_dataloader | <class 'numpy.ndarray'>
[2023-08-11 14:24:32,799] INFO - scMVP.dataset.scMVP_dataloader | (1469, 10081)
[2023-08-11 14:24:32,847] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2023-08-11 14:24:32,848] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2023-08-11 14:24:32,891] INFO - scMVP.dataset.dataset | Computing the library size for the new data
[2023-08-11 14:24:32,893] INFO - scMVP.dataset.dataset | Downsampled from 1469 to 1469 cells


## Step2: train your scMVP model, and get trained output

* __n_epochs__: Maximum number of epochs to train the model. If the likelihood change is smaller than a set threshold, training will stop automatically. 
* __lr__: learning rate. Set to 0.001 here. 
* __use_batches__: If true, batch information is used in the training. Here it is set to false because the cortex data only contains one batch. 
* __use_cuda__: Set to true to use CUDA (GPU required) 
* __n_centroids__: Set the number of cell types
* __n_alfa__: Set the weight of KL loss

In [7]:
# Training hyper-parameters (see the Step 2 notes above for their meaning).
n_epochs = 10
lr = 1e-3
use_batches = False
use_cuda = True # False if using CPU
n_centroids = 5 
n_alfa = 1.0

import datetime
starttime = datetime.datetime.now()

multi_vae = Multi_VAE_Attention(dataset.nb_genes, len(dataset.atac_names), n_batch=0, n_latent=10, n_centroids=n_centroids, n_alfa = n_alfa, mode="mm-vae") # should provide ATAC num, alfa, mode and loss type
trainer = MultiTrainer(
    multi_vae,
    dataset,
    train_size=0.9,
    use_cuda=use_cuda,
    frequency=5,
)

# Use one path for the existence check, the load and the save. Previously the
# check looked for "multi_vae_traine.pkl" (typo) while the load/save used
# "multi_vae_trainer.pkl", so a saved checkpoint was never reused.
os.makedirs(output_path, exist_ok=True)  # torch.save needs the directory to exist
multi_vae_checkpoint = os.path.join(output_path, "multi_vae_trainer.pkl")

if os.path.isfile(multi_vae_checkpoint):
    # Reuse the weights saved by an earlier run instead of retraining.
    trainer.model.load_state_dict(torch.load(multi_vae_checkpoint))
else:
    # around 1-3 min on 1080Ti
    trainer.train(n_epochs=n_epochs, lr=lr)
    torch.save(trainer.model.state_dict(), multi_vae_checkpoint)
# Switch to evaluation mode in both branches, as the pretraining cells do.
trainer.model.eval()

#long running
endtime = datetime.datetime.now()
print((endtime-starttime).seconds)

0


In [6]:
import datetime
starttime = datetime.datetime.now()

# Build a MultiPosterior over every cell of the dataset from the trained model.
all_cell_indices = np.arange(len(dataset))
full = trainer.create_posterior(
    trainer.model,
    dataset,
    indices=all_cell_indices,
    type_class=MultiPosterior,
)

# get_latent() yields seven values, unpacked here in the order it returns them.
(
    latent,
    latent_rna,
    latent_atac,
    cluster_gamma,
    cluster_index,
    batch_indices,
    labels,
) = full.sequential().get_latent()
batch_indices = batch_indices.ravel()  # flatten to 1-D

# Imputed values from the same posterior, iterated sequentially.
imputed_values = full.sequential().imputation()

#long running
endtime = datetime.datetime.now()
elapsed = endtime - starttime
print(elapsed.seconds)

0


In [7]:
# Write the joint, RNA and ATAC latent representations to CSV files
# (with a header row, without an index column).
latent_exports = (
    (latent, "/home/xcx/results/1469/5-scMVP/1469_latent.csv"),
    (latent_rna, "/home/xcx/results/1469/5-scMVP/1469_z_rna.csv"),
    (latent_atac, "/home/xcx/results/1469/5-scMVP/1469_z_atac.csv"),
)
for embedding, csv_path in latent_exports:
    df = pd.DataFrame(data=embedding)
    df.to_csv(csv_path, index=False)

In [8]:
def _read_embedding(csv_path):
    """Return the values of a saved embedding CSV as a NumPy array."""
    return pd.read_csv(csv_path, index_col=None).values


# Reload the latent representations from their CSV files.
latent_rna = _read_embedding("/home/xcx/results/1469/5-scMVP/1469_z_rna.csv")
latent_atac = _read_embedding("/home/xcx/results/1469/5-scMVP/1469_z_atac.csv")
latent = _read_embedding("/home/xcx/results/1469/5-scMVP/1469_latent.csv")

In [11]:
# UMAP visualization of the joint latent space
# NOTE(review): this switches away from the inline backend selected by
# `%matplotlib inline` at the top of the notebook -- confirm TkAgg is
# available where this runs.
matplotlib.use('TkAgg')

# The latent matrix serves both as X and as the "X_multi_vi" representation.
prior_adata = anndata.AnnData(X=latent, dtype=latent.dtype)
prior_adata.obsm["X_multi_vi"] = latent

# Despite its name, `cell_embeddings` holds the contents of anno.txt (read
# without a header row); they are stored as the per-cell 'cell_type' column.
cell_embeddings = pd.read_csv("/home/xcx/SNARE-GSE126074/1469/anno.txt", header = None)
prior_adata.obs['cell_type'] = np.array(cell_embeddings)

# Neighbour graph on the latent space, then the UMAP layout and Louvain clusters.
sc.pp.neighbors(prior_adata, use_rep="X_multi_vi", n_neighbors=30)
sc.tl.umap(prior_adata, min_dist=0.3)
sc.tl.louvain(prior_adata)

# One UMAP plot per colouring: Louvain clusters first, then the original labels.
for colour_key in ('louvain', 'cell_type'):
    sc.pl.umap(prior_adata, color=[colour_key])
    plt.show()

# KMeans clustering on the same latent representation
from sklearn.cluster import KMeans
latent_code = prior_adata.obsm["X_multi_vi"]

kmeans = KMeans(n_clusters=3, random_state=0)
kmeans = kmeans.fit(latent_code)
# Store the labels as strings.
prior_adata.obs['kmeans'] = kmeans.labels_.astype(str)
sc.pl.umap(prior_adata, color=['kmeans'])
plt.show()

## (Optional) Pre-training
- For large and complex real-world joint profiling datasets, such as the SHARE-seq skin dataset, we recommend performing pre-training before scMVP training.

- We next give a demo for pretraining of scRNA and scATAC, and then perform scMVP training.

### Step1: Construct separate dataset objects

In [7]:
# Build separate ATAC and RNA dataset objects from the joint dataset
import datetime
starttime = datetime.datetime.now()

# ATAC-only dataset: the ATAC matrix is passed as X and the ATAC feature names
# as gene names; it carries no additional measurements.
atac_dataset = GeneExpressionDataset()
atac_dataset.populate_from_data(
    X=dataset.atac_expression, # notice the normalization
    batch_indices=None,
    gene_names=dataset.atac_names,
    cell_attributes_dict={"barcodes": dataset.barcodes},
    Ys=[],
)

# RNA dataset: gene expression as X, with the ATAC matrix attached as an extra
# per-cell measurement named "atac_expression" (columns named by "atac_names").
measurement = CellMeasurement(
    name="atac_expression",
    data=atac_dataset.X,
    columns_attr_name="atac_names",
    columns=atac_dataset.gene_names,
)
Ys = [measurement]
cell_attributes_dict = {"barcodes": dataset.barcodes}

rna_dataset = GeneExpressionDataset()
rna_dataset.populate_from_data(
    X=dataset.X,
    batch_indices=None,
    gene_names=dataset.gene_names,
    cell_attributes_dict=cell_attributes_dict,
    Ys=Ys,
)

# Print a summary of each dataset.
print(atac_dataset)
print(rna_dataset)

#long running
endtime = datetime.datetime.now()
elapsed = endtime - starttime
print(elapsed.seconds)

[2023-08-11 14:24:46,916] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2023-08-11 14:24:46,918] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]
[2023-08-11 14:24:46,922] INFO - scMVP.dataset.dataset | Remapping batch_indices to [0,N]
[2023-08-11 14:24:46,923] INFO - scMVP.dataset.dataset | Remapping labels to [0,N]


GeneExpressionDataset object with n_cells x nb_genes = 1469 x 10081
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'barcodes', 'labels', 'local_means', 'batch_indices', 'local_vars'
    cell_categorical_attribute_names: 'batch_indices', 'labels'
GeneExpressionDataset object with n_cells x nb_genes = 1469 x 933
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'barcodes', 'labels', 'local_means', 'batch_indices', 'atac_expression', 'local_vars'
    cell_categorical_attribute_names: 'batch_indices', 'labels'
    cell_measurements_columns: {'atac_expression': 'atac_names'}
0


### Step2: Pretrain and visualize ATAC dataset
(approximately 1 min on a 1080Ti)

In [8]:
# ATAC pretraining
import datetime
starttime = datetime.datetime.now()

# Hyper-parameters (re-declared so that this cell can run on its own).
# NOTE(review): `n_epochs` is not used below -- training runs for a hard-coded
# 3 epochs; confirm which value is intended.
n_epochs = 10
lr = 1e-3
use_batches = False
use_cuda = True # False if using CPU
n_centroids = 5 
n_alfa = 1.0

pre_atac_vae = VAE_Peak_SelfAttention(
    atac_dataset.nb_genes,
    n_latent=20,
    n_batch=0,
    n_layers=1,
    log_variational=True,
    reconstruction_loss="nb",
)
pre_atac_trainer = UnsupervisedTrainer(
    pre_atac_vae,
    atac_dataset,
    train_size=0.9,
    use_cuda=use_cuda,
    frequency=5,
)

# Train and save the weights unless a checkpoint already exists, in which case
# load it instead. Either way the model ends up in evaluation mode.
pre_atac_checkpoint = '%s/pre_atac_trainer.pkl' % output_path
if not os.path.isfile(pre_atac_checkpoint):
    pre_atac_trainer.train(n_epochs=3, lr=lr)
    torch.save(pre_atac_trainer.model.state_dict(), pre_atac_checkpoint)
else:
    pre_atac_trainer.model.load_state_dict(torch.load(pre_atac_checkpoint))
pre_atac_trainer.model.eval()

#long running
endtime = datetime.datetime.now()
elapsed = endtime - starttime
print(elapsed.seconds)

1


In [9]:
# Posterior of the pretrained ATAC model, followed by UMAP and Louvain clustering
import datetime
starttime = datetime.datetime.now()

# Posterior over every cell of the ATAC dataset.
atac_cell_indices = np.arange(len(atac_dataset))
full = pre_atac_trainer.create_posterior(
    pre_atac_trainer.model,
    atac_dataset,
    indices=atac_cell_indices,
)
latent, batch_indices, labels = full.sequential().get_latent()
batch_indices = batch_indices.ravel()  # flatten to 1-D

# AnnData holding the ATAC matrix, the latent representation ("X_multi_vi")
# and the dataset labels as a column tensor in 'cell_type'.
prior_adata = anndata.AnnData(X=atac_dataset.X)
prior_adata.obsm["X_multi_vi"] = latent
label_column = labels.reshape(-1,1)
prior_adata.obs['cell_type'] = torch.tensor(label_column)

# Neighbour graph on the latent space, UMAP layout, Louvain clusters, then plot.
sc.pp.neighbors(prior_adata, use_rep="X_multi_vi", n_neighbors=30)
sc.tl.umap(prior_adata, min_dist=0.3)
sc.tl.louvain(prior_adata)
sc.pl.umap(prior_adata, color=['louvain'])
plt.show()

#long running
endtime = datetime.datetime.now()
elapsed = endtime - starttime
print(elapsed.seconds)

22


### Step3: Perform similar pretraining for scRNA dataset
(less than 1min)

In [10]:
# RNA pretraining
import datetime
starttime = datetime.datetime.now()

pre_vae = VAE_Attention(rna_dataset.nb_genes, n_latent=20,n_batch=0, n_layers=1, log_variational=True, reconstruction_loss="nb")
pre_trainer = UnsupervisedTrainer(
    pre_vae,
    rna_dataset,
    train_size=0.9,
    # Honour the notebook-wide `use_cuda` switch like every other trainer in
    # this notebook; it was hard-coded to True here, which breaks CPU-only runs.
    use_cuda=use_cuda,
    frequency=5,
)

# Reuse a saved checkpoint when one exists; otherwise train and save one.
pre_rna_checkpoint = '%s/pre_trainer.pkl' % output_path
if os.path.isfile(pre_rna_checkpoint):
    pre_trainer.model.load_state_dict(torch.load(pre_rna_checkpoint))
else:
    pre_trainer.train(n_epochs=10, lr=lr)
    torch.save(pre_trainer.model.state_dict(), pre_rna_checkpoint)
# Evaluation mode in both branches.
pre_trainer.model.eval()

#long running
endtime = datetime.datetime.now()
print((endtime-starttime).seconds)

0


In [11]:
# Posterior of the pretrained RNA model, followed by UMAP and Louvain clustering
import datetime
starttime = datetime.datetime.now()

# Posterior over every cell of the RNA dataset.
rna_cell_indices = np.arange(len(rna_dataset))
full = pre_trainer.create_posterior(
    pre_trainer.model,
    rna_dataset,
    indices=rna_cell_indices,
)
latent, batch_indices, labels = full.sequential().get_latent()
batch_indices = batch_indices.ravel()  # flatten to 1-D
imputed_values = full.sequential().imputation()

# UMAP visualization
# AnnData holding the RNA matrix, the latent representation ("X_multi_vi")
# and the dataset labels as a column tensor in 'cell_type'.
prior_adata = anndata.AnnData(X=rna_dataset.X)
prior_adata.obsm["X_multi_vi"] = latent
label_column = labels.reshape(-1,1)
prior_adata.obs['cell_type'] = torch.tensor(label_column)

# Neighbour graph on the latent space, UMAP layout, Louvain clusters, then plot.
sc.pp.neighbors(prior_adata, use_rep="X_multi_vi", n_neighbors=30)
sc.tl.umap(prior_adata, min_dist=0.3)
sc.tl.louvain(prior_adata)
sc.pl.umap(prior_adata, color=['louvain'])
plt.show()

#long running
endtime = datetime.datetime.now()
elapsed = endtime - starttime
print(elapsed.seconds)

4


### Step4:  Perform scMVP training with two pretrained models
(few minutes)

In [12]:
# Number of centroids = number of distinct Louvain clusters found on the
# scRNA embedding.
n_centroids=len(np.unique(prior_adata.obs['louvain'].tolist()))

import datetime
starttime = datetime.datetime.now()


multi_vae = Multi_VAE_Attention(rna_dataset.nb_genes, len(rna_dataset.atac_names), n_batch=0, n_latent=20, n_centroids=n_centroids, n_alfa = n_alfa, mode="mm-vae") # should provide ATAC num, alfa, mode and loss type
trainer = MultiTrainer(
    multi_vae,
    rna_dataset,
    train_size=0.9,
    use_cuda=use_cuda,
    frequency=5,
)

# Dedicated checkpoint for the pretrained-initialised model. The direct
# training cell saves a differently configured model (n_latent=10,
# n_centroids=5) as "multi_vae_trainer.pkl"; reusing that name here would load
# incompatible weights into this n_latent=20 model or skip this training.
pretrained_multi_vae_checkpoint = '%s/multi_vae_pretrained_trainer.pkl' % output_path

if os.path.isfile(pretrained_multi_vae_checkpoint):
    trainer.model.load_state_dict(torch.load(pretrained_multi_vae_checkpoint))
    trainer.model.eval()
else:
    # Rebuild both pretraining trainers and restore their saved weights.
    pre_trainer = UnsupervisedTrainer(
        pre_vae,
        rna_dataset,
        train_size=0.9,
        use_cuda=use_cuda,
        frequency=5,
    )
    pre_trainer.model.load_state_dict(torch.load('%s/pre_trainer.pkl' % output_path))

    pre_atac_trainer = UnsupervisedTrainer(
        pre_atac_vae,
        atac_dataset,
        train_size=0.9,
        use_cuda=use_cuda,
        frequency=5,
    )
    pre_atac_trainer.model.load_state_dict(torch.load('%s/pre_atac_trainer.pkl' % output_path))

    # joint RNA and ATAC embedding
    # Initialise the mixture parameters from the RNA latent space and its
    # Louvain cluster assignments (`latent` comes from the RNA posterior cell).
    louvain_labels = np.array(prior_adata.obs['louvain'].tolist()).astype(int)
    trainer.model.init_gmm_params_with_louvain(latent, louvain_labels)

    # Copy the pretrained encoder weights into the joint model and keep the
    # encoders trainable (not frozen) during joint training.
    trainer.model.RNA_encoder.load_state_dict(pre_trainer.model.z_encoder.state_dict())
    for param in trainer.model.RNA_encoder.parameters():
        param.requires_grad = True
    trainer.model.ATAC_encoder.load_state_dict(pre_atac_trainer.model.z_encoder.state_dict())
    for param in trainer.model.ATAC_encoder.parameters():
        param.requires_grad = True

    trainer.train(n_epochs=15, lr=lr)
    torch.save(trainer.model.state_dict(), pretrained_multi_vae_checkpoint)
    trainer.model.eval()

#long running
endtime = datetime.datetime.now()
print((endtime-starttime).seconds)

0


In [None]:
#### The remaining steps are the same as for direct scMVP training

In [13]:
import datetime
starttime = datetime.datetime.now()

# Create a MultiPosterior from the trained model, covering every cell of `dataset`.
n_cells = len(dataset)
full = trainer.create_posterior(
    trainer.model, dataset, indices=np.arange(n_cells), type_class=MultiPosterior
)

# Joint, RNA and ATAC latent representations, cluster outputs, batch indices
# and labels, in the order returned by get_latent().
latent_outputs = full.sequential().get_latent()
latent, latent_rna, latent_atac, cluster_gamma, cluster_index, batch_indices, labels = latent_outputs
batch_indices = batch_indices.ravel()  # flatten to 1-D

# Imputed values from a sequential pass over the same posterior.
imputed_values = full.sequential().imputation()

#long running
endtime = datetime.datetime.now()
elapsed = endtime - starttime
print(elapsed.seconds)

1
