# Iterative single-cell multi-omic integration using online learning

In [None]:
# Please load the following modules
import pyliger
import numpy as np

# Scenario 1: sampling minibatches from fully observed datasets

We first create a Liger object from the raw count data. In this notebook, each dataset is opened as a disk-backed AnnData object (`.h5ad`, an HDF5-based format), and the list of objects is passed to `create_liger`. The data can be downloaded [here](https://www.dropbox.com/sh/d7fnebmjfv9ueqi/AAA9vlwlCqe-P6T_iPYDier9a?dl=0). Liger assumes by default that the HDF5 files are formatted by the 10X CellRanger pipeline. Large datasets are often generated over multiple 10X runs (for example, multiple biological replicates). In such cases, it may be necessary to merge the HDF5 files from each run into a single HDF5 file. We provide the mergeH5 function for this purpose (see below for details).

In [None]:
from anndata import read_h5ad
# Open the control and stimulated PBMC datasets as AnnData objects.
# backed='r+' opens each file in backed mode (data stays on disk rather than
# being loaded fully into memory) and allows the backed data to be modified.
ctrl_dge = read_h5ad('./src/pyliger/datasets/pbmc_ctrl.h5ad', backed='r+')
stim_dge = read_h5ad('./src/pyliger/datasets/pbmc_stim.h5ad', backed='r+')

In [None]:
# Collect the datasets (stimulated first, then control) and build one Liger
# object from them.
# NOTE(review): presumably this list order is the order that per-dataset
# arguments such as V_init are matched against later — confirm.
adata_list = [stim_dge, ctrl_dge]
pbmcs = pyliger.create_liger(adata_list)

We then perform the normalization, gene selection, and gene scaling in an online fashion, reading the data from disk in small batches.

In [None]:

# Preprocess in three steps, in this order: normalization, gene selection,
# then gene scaling.
pyliger.normalize(pbmcs)
# var_thresh=0.2 is the threshold used for gene selection;
# do_plot=False presumably skips the diagnostic plot — confirm in pyliger docs.
pyliger.select_genes(pbmcs, var_thresh=0.2, do_plot=False)
pyliger.scale_not_center(pbmcs)

In [None]:
# Load precomputed matrices from plain-text files. They are used below as the
# initial values for online iNMF (W as W_init; V1 and V2 as V_init).
# NOTE: these are absolute paths on the original author's machine; point them
# at your own copies of the files before running this cell.
W = np.loadtxt('/Users/lulu/Desktop/W1.txt')
V1 = np.loadtxt('/Users/lulu/Desktop/V1.txt')
# Fixed: the extension previously contained a stray space ('V2.tx t').
V2 = np.loadtxt('/Users/lulu/Desktop/V2.txt')

In [None]:
# Display the dimensions of the W matrix loaded from disk.
W.shape

In [None]:
# Display the datasets stored on the Liger object (its `adata_list` attribute).
pbmcs.adata_list

# Online Integrative Nonnegative Matrix Factorization

Now we can use online iNMF to factorize the data, again using only minibatches that we read from the HDF5 files on demand (default mini-batch size = 5000). A sufficient number of iterations is crucial for obtaining a good factorization. If the mini-batch size is set close to the size of the whole dataset (i.e., an epoch contains only one iteration), `max_epochs` needs to be increased accordingly to allow more iterations.

In [None]:
# Run online iNMF on the PBMC data, starting from the W, V1 and V2 matrices
# loaded from disk rather than from a fresh initialization.
pyliger.online_iNMF(
    pbmcs,
    k=20,
    miniBatch_size=5000,
    max_epochs=5,
    W_init=W,
    V_init=[V1, V2],
)

In [None]:
#%%timeit -n1 -r10
# Same factorization settings, but without supplying W_init / V_init and with
# verbose=False. The cell magic on the first line is left commented out;
# uncomment it to time this call.
pyliger.online_iNMF(
    pbmcs,
    k=20,
    miniBatch_size=5000,
    max_epochs=5,
    verbose=False,
)

# Quantile Normalization and Downstream Analysis

After performing the factorization, we can perform quantile normalization to align the datasets.

In [None]:
# Align the datasets by quantile normalization.
# NOTE(review): max_sample=2000 presumably caps how many cells are sampled
# when computing quantiles — confirm against the pyliger documentation.
pyliger.quantile_norm(pbmcs, max_sample=2000)

We can also visualize the cell factor loadings in two dimensions using t-SNE or UMAP.

In [None]:
# Compute the UMAP embedding used for the two-dimensional visualization,
# with cosine distance, 30 neighbors and a minimum distance of 0.3.
pyliger.run_umap(
    pbmcs,
    distance='cosine',
    n_neighbors=30,
    min_dist=0.3,
)

In [None]:
%matplotlib notebook
all_plots = pyliger.plot_by_dataset_and_cluster(pbmcs, axis_labels = ['UMAP 1', 'UMAP 2'], return_plots = True)
all_plots

# Scenario 2: iterative refinement by incorporating new datasets

In [None]:
# Please load the following modules
import pyliger
import numpy as np
import pandas as pd
import scipy.io
from scipy.sparse import csr_matrix

In [None]:
# Load the two Allen datasets (cells and nuclei) through pyliger's dataset
# loaders.
# NOTE(review): backed='r+' presumably opens them disk-backed in read/write
# mode, as with anndata's read_h5ad — confirm against the pyliger docs.
allen_smarter_cells = pyliger.datasets.allen_smarter_cells(backed='r+')
allen_smarter_nuclei = pyliger.datasets.allen_smarter_nuclei(backed='r+')

In [None]:
# Build the Liger object from the cells dataset only; the nuclei dataset is
# introduced later in this scenario via the X_new argument of online_iNMF.
MOp = pyliger.create_liger([allen_smarter_cells])
# Preprocess: normalization, gene selection (var_thresh=2), then gene scaling.
pyliger.normalize(MOp)
pyliger.select_genes(MOp, var_thresh=2)
pyliger.scale_not_center(MOp)

In [None]:
# Initial factorization of the cells dataset: k=40, a single epoch.
pyliger.online_iNMF(
    MOp,
    k=40,
    max_epochs=1,
)

In [None]:
# Quantile-normalize the factorization result, then compute the UMAP embedding
# (both with the functions' default settings).
pyliger.quantile_norm(MOp)
pyliger.run_umap(MOp)

In [None]:
%matplotlib notebook
all_plots = pyliger.plot_by_dataset_and_cluster(MOp, axis_labels = ['UMAP 1', 'UMAP 2'], return_plots = True)
all_plots

In [None]:
# Continue the factorization of MOp, passing the nuclei dataset as new data
# through X_new.
# NOTE(review): V2 is the matrix loaded from V2.txt in Scenario 1, where k=20
# was used; confirm it is the intended initial value here with k=40.
pyliger.online_iNMF(
    MOp,
    X_new=[allen_smarter_nuclei],
    k=40,
    max_epochs=1,
    V_init=[V2],
)

In [None]:
# Re-run quantile normalization and UMAP after the nuclei dataset was added.
# NOTE(review): the effect of use_raw=True is not visible from this notebook —
# confirm against the pyliger documentation.
pyliger.quantile_norm(MOp, max_sample=2000)
pyliger.run_umap(MOp, use_raw=True)

In [None]:
%matplotlib notebook
all_plots = pyliger.plot_by_dataset_and_cluster(MOp, axis_labels = ['UMAP 1', 'UMAP 2'], return_plots = True)
all_plots

# Scenario 3: projecting new datasets

In [None]:
# Please load the following modules
import pyliger
import numpy as np
import pandas as pd
import scipy.io
from scipy.sparse import csr_matrix

In [None]:
# Load the two Allen datasets (cells and nuclei) through pyliger's dataset
# loaders.
# NOTE(review): backed='r+' presumably opens them disk-backed in read/write
# mode, as with anndata's read_h5ad — confirm against the pyliger docs.
allen_smarter_cells = pyliger.datasets.allen_smarter_cells(backed='r+')
allen_smarter_nuclei = pyliger.datasets.allen_smarter_nuclei(backed='r+')

In [None]:
# Build the Liger object from the cells dataset only.
MOp = pyliger.create_liger([allen_smarter_cells])
# Preprocess: normalization, gene selection (var_thresh=2), then gene scaling.
pyliger.normalize(MOp)
pyliger.select_genes(MOp, var_thresh=2)
pyliger.scale_not_center(MOp)

In [None]:
# Factorize the cells dataset starting from the W and V1 matrices loaded in
# Scenario 1, then quantile-normalize and compute the UMAP embedding.
# NOTE(review): W and V1 were used with k=20 in Scenario 1; confirm they are
# the intended initial values here with k=40.
pyliger.online_iNMF(
    MOp,
    k=40,
    max_epochs=1,
    W_init=W,
    V_init=[V1],
)
pyliger.quantile_norm(MOp, max_sample=2000)
pyliger.run_umap(MOp)

In [None]:
%matplotlib notebook
all_plots = pyliger.plot_by_dataset_and_cluster(MOp, axis_labels = ['UMAP 1', 'UMAP 2'], return_plots = True)
all_plots

In [None]:
# Write the second returned plot (index 1) to a 10 x 12 inch PNG at 500 dpi.
# NOTE: the destination is an absolute path on the original author's machine;
# change it before running this cell elsewhere.
all_plots[1].save(
    filename='/Users/lulu/Desktop/1.png',
    height=12,
    width=10,
    units='in',
    dpi=500,
)