### Imports

In [None]:
import sys
sys.path.append('../popalign/')
import popalign as PA
import importlib

In [None]:
# Display the filesystem path the popalign module was imported from,
# to confirm which copy of the library is in use
PA.__file__

### Load data
Run the first cell if samples are stored individually  
Run the second cell if samples are stored in the same matrix (requires meta data file)

In [None]:
'''
Load samples from distinct matrix files

Parameters
----------
samples : dict
    Dictionary of sample names (keys) and paths to their respective matrix files (values)
genes : str
    Path to a .tsv 10X gene file. Optional if existing_obj is provided
outputfolder : str
    Path (or name) of the output folder to create
existing_obj : dict, optional
    Object previously returned by either load_samples() or load_screen(). New samples will be added to that object
'''

# Sample label -> path of that sample's matrix file
mysamples = dict([
    ('CTRL', '../data/samples/PBMC.mtx'),
    ('GMCSF_1ng/ml', '../data/samples/GMCSF.mtx'),
    ('IFNG_1ng/ml', '../data/samples/IFNG.mtx'),
    ('IL2_10ng/ml', '../data/samples/IL2.mtx'),
    ('CD40L_20ng/ml', '../data/samples/CD40L.mtx'),
])
# Gene file shared by all the samples above
mygenes = '../data/samples/genes.tsv'

# Build a new PopAlign object (existing_obj=None: no previous object to extend)
pop = PA.load_samples(
    samples=mysamples,
    genes=mygenes,
    outputfolder='output_samples',
    existing_obj=None,
)

In [None]:
'''
Load data from a multiplexed experiment

Parameters
----------
matrix : str
    Path to a sparse matrix
barcodes : str
    Path to a .tsv 10X barcodes file
metafile : str
    Path to a metadata file. Must contain `cell_barcodes` and `sample_id` columns
genes : str
    Path to a .tsv 10X gene file. Optional if existing_obj is provided
outputfolder : str, optional
    Path (or name) of the output folder to create
existing_obj : dict, optional
    Object previously returned by either load_samples() or load_screen(). New samples will be added to that object
only : list, optional
    List of sample names to load (other samples with names not in list will not be loaded)
col : str, optional
    Name of a specific column in the meta data to use. Defaults to None
value : str or int, optional
    Value in the specified meta data column `col` to use to filter samples to load. Defaults to None
'''
# Input files of the multiplexed experiment
mygenes = '../data/screen/PBMC-MULT-6/genes.tsv'
mybarcodes = '../data/screen/PBMC-MULT-6/barcodes.tsv'
mymatrix = '../data/screen/PBMC-MULT-6/PBMC-MULT-6.mtx'
mymetadata = '../data/screen/PBMC-MULT-6/updated_meta.csv'

load_options = dict(
    matrix=mymatrix,
    barcodes=mybarcodes,
    genes=mygenes,
    metafile=mymetadata,
    outputfolder='output_pbmc6_matt',
    only=[],  # list of sample names to only load the specified samples
    col=None,  # either None or a column name from the meta data
    value=None,  # if col != None, specify value in column to filter samples
    existing_obj=None,  # no previously loaded object to add the samples to
)
pop = PA.load_multiplexed(**load_options)

In [None]:
'''
PICKLE TO SAVE THE DATA
LOAD THE DATA VERY EFFICIENTLY
'''

import pickle

# Serialize the whole PopAlign object so the loading step above can be skipped next time
pickle_path = 'mult6_data.pickle'
with open(pickle_path, 'wb') as fh:
    pickle.dump(pop, fh)

# To restore the object later, uncomment the two lines below.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
#with open('mult6_data.pickle', 'rb') as handle:
#    pop = pickle.load(handle)

In [None]:
'''
Display number of cells in each sample, total number of cells

Parameters
----------
pop : dict
    Popalign object
'''
# Display the per-sample and total cell counts of the loaded object
PA.print_ncells(pop)

### Normalize and gene filter

In [None]:
'''
Normalize the samples of object `pop` and apply a normalization factor

Parameters
----------
pop : dict
    Popalign object
scaling_factor : int or None, optional
    Number used to scale the data values. If None, that factor is computed automatically
ncells : int or None
    Number of cells to randomly subsample to try different normalization factors to use less memory. If None, all cells are used.
'''

# scaling_factor=None: the factor is computed automatically,
# using a random subsample of 30000 cells to limit memory use
PA.normalize(
    pop,
    scaling_factor=None,
    ncells=30000,
)

In [None]:
'''
Plot genes by their log(mean) and log(coefficient of variation)
Can be used multiple times to find best offset

Parameters
----------
offset: float
    Value (its log) will be added to the intercept of the linear fit to filter genes
'''

# Re-run this cell with different offsets until the filter line looks right
gene_filter_offset = 1.13
PA.plot_gene_filter(pop, offset=gene_filter_offset)

In [None]:
'''
Filter genes from data in `pop`
Can discard ribosomal genes that start with RPS or RPL
Can discard mitochondrial genes that start with MT-

Parameters
----------
pop : dict
    Popalign object
remove_ribsomal : bool
    Whether or not to remove the ribosomal genes
remove_mitochondrial : bool
    Whether or not to remove the mitochondrial genes
'''
# Discard the ribosomal genes but keep the mitochondrial ones
PA.filter(
    pop,
    remove_ribsomal=True,
    remove_mitochondrial=False,
)

In [None]:
'''
Remove red blood cells from data

Parameters
----------
pop : dict
    Popalign object
species : str
    name of experiment species. Can be human or mouse
'''
#PA.removeRBC(pop, species='human')

### Dimensionality reduction

In [None]:
'''
Compute feature spaces and minimize the reconstruction error to pick a final feature space
Run Gene Set Enrichment Analysis (GSEA)

Parameters
----------
pop : dict
    Popalign object
ncells : int
    Number of cells to use
nfeats : int or list of ints
    Number(s) of features to use
nreps : int
    Number of repetitions to perform for each k in nfeats
niter : int
    Maximum number of iterations to perform for each instance of the algorithm
'''

importlib.reload(PA)  # re-execute the library module so edits to its source are picked up
onmf_options = dict(
    ncells=5000,
    nfeats=[7,9],  # candidate numbers of features to try
    nreps=3,
    niter=500,
)
PA.onmf(pop, **onmf_options)

### Grid of samples in embedding space

In [None]:
'''
Generate a grid plot of sample plots in an embedding space

Parameters
----------
pop : dict
    Popalign object
method : str
    Embedding method. One of tsne, umap
figsize : tuple
    Figure size
size_background : float, int
    Point size for the embedding scatter in the background
size_samples : float, int
    Point size for the highlighted samples
'''

PA.samples_grid(
    pop,
    method='tsne',  # one of tsne, umap
    figsize=(20,20),
)

### Build probabilistic models

In [None]:
'''
Build a Gaussian Mixture Model on feature projected data for each sample
Type model populations

Parameters
----------
pop : dict
    Popalign object
ks : int or tuple
    Number or range of components to use
niters : int
    number of replicates to build for each k in `ks`
training : int or float
    If training is float, the value will be used as a percentage to select cells for the training set. Must follow 0<value<1
    If training is int, that number of cells will be used for the training set.
nreplicates : int
    Number of replicates to generate. These replicate models will be used to provide confidence intervals later in the analysis.
reg_covar : str or float
    If 'auto', the regularization value will be computed from the feature data
    If float, value will be used as reg_covar parameter to build GMMs
rendering : str
    One of grouped, individual
types : dict, str or None
    Dictionary of cell types. Keys should be cell types labels and values should be lists of valid genes
    If None, a default PBMC cell types dictionary is provided
figsizegrouped : tuple, optional
    Size of the figure for the renderings together. Default is (20,20)
figsizesingle : tuple, optional
    Size of the figure for each single sample rendering. Default is (5,5)
only: list or str, optional
    Sample label or list of sample labels. Will force GMM construction for specified samples only. Defaults to None
'''

importlib.reload(PA)  # re-execute the library module so edits to its source are picked up
gmm_options = dict(
    ks=(3,12),
    niters=2,
    training=.8,
    nreplicates=0,
    reg_covar='auto',
    rendering='grouped',
    types='defaultpbmc',  # either None, 'defaultpbmc' or a dictionary
    figsizegrouped=(20,20),
    figsizesingle=(6,5),
    only=None,
)
PA.build_gmms(pop, **gmm_options)

In [None]:
# If the renderings do not have the right figure size, regenerate the renderings with:
'''
Parameters
----------
pop : dict
    Popalign object
figsizegrouped : tuple
    Figure size for the grid rendering plot of all samples together
figsizesingle : tuple
    Figure size of an individual sample rendering plot
mode : str
    One of grouped, individual or unique.
    Grouped will render the models individually and together in a separate grid
    Individual will only render the models individually
    Unique will render the data's unique model
'''
importlib.reload(PA)  # re-execute the library module so edits to its source are picked up
# Render the samples in the order stored in the PopAlign object.
# NOTE(review): assumes `samples` accepts a list of sample names such as pop['order'] - confirm
# against PA.render_models
PA.render_models(
    pop,
    figsizegrouped=(30,30),
    samples=pop['order'],
    figsizesingle=(6,5),
    mode='grouped',
)

### Align subpopulations

In [None]:
'''
Align the components of each sample's model to the components of a reference model

Parameters
----------
pop : dict
    Popalign object
ref : str
    Name of reference sample
method : str
    Method to perform the alignment
    If 'conservative', the reference component and the test component have to be each other's best match to align
    If 'test2ref', the closest reference component is found for each test component
    If 'ref2test', the closest test component is found for each reference component
figsizedeltas : tuple, optional
    Size of the figure for the delta plot. Default is (10,5)
figsizeentropy : tuple, optional
    Size of the figure for the entropy plot. Default is (10,5)
'''
importlib.reload(PA)  # re-execute the library module so edits to its source are picked up
align_options = dict(
    ref='CONTROL_CD3',  # name of the reference sample
    method='test2ref',  # one of: test2ref, ref2test, conservative
    figsizedeltas=(10,10),
    figsizeentropy=(10,10),
)
PA.align(pop, **align_options)

In [None]:
importlib.reload(PA)  # re-execute the library module so edits to its source are picked up
delta_pointsize = 20
delta_figsize = (15,15)
# Positional arguments, in order: pop, pointsize, figsize
PA.plot_deltas_test(pop, delta_pointsize, delta_figsize)

In [None]:
'''
SAVE ALIGNMENT MATRICES TO CSV FILES
'''

import pandas as pd
import os

ref = 'CONTROL_CD3'  # reference sample whose components label the columns
dname = 'alignment_matrices_control_cd3'
outdir = os.path.join(pop['output'], dname)
PA.mkdir(outdir)

# Column labels: '<component index> <cell type>' for each component of the reference model
cols = ['%d %s' % (i,s) for i,s in enumerate(pop['samples'][ref]['gmm_types'])]
for x in pop['order']:
    try:
        arr = pop['samples'][x]['fullalignments']
        rows = ['%d %s' % (i,s) for i,s in enumerate(pop['samples'][x]['gmm_types'])]
        df = pd.DataFrame(arr, index=rows, columns=cols)
        # Sample labels may contain path separators (e.g. 'IL2_10ng/ml'); replace them so the
        # CSV lands inside `outdir` instead of pointing at a sub-folder that does not exist
        fname = '%s.csv' % str(x).replace('/', '_').replace(os.sep, '_')
        df.to_csv(os.path.join(outdir, fname))
    except (KeyError, ValueError, OSError) as err:
        # Best effort: a sample without alignment data (or with a mismatching matrix shape)
        # is skipped, but reported rather than silently ignored
        print('Skipping %s: %s: %s' % (x, type(err).__name__, err))

In [None]:
'''
Find differentially expressed genes between a reference subpopulation
and the subpopulation of a sample that aligned to it

Parameters
----------
pop : dict
    Popalign object
refcomp : int
    Subpopulation number of the reference sample's GMM
sample : str
    Name of the sample to compare
nbins : int, optional
    Number of histogram bins to use
nleft : int
    Number of underexpressed genes to retrieve
nright : int
    Number of overexpressed genes to retrieve
renderhists : bool
    Render histograms or not for the top differentially expressed genes
usefiltered : bool
    Whether to use filtered genes or not. If False, all genes will be used to run the differential expression
'''

refcomp = 1  # subpopulation number in the reference sample's GMM
sample = 'Budesonide'  # sample to compare against that subpopulation
diffexp_options = dict(
    refcomp=refcomp,
    sample=sample,
    nbins=20,
    cutoff=.6,
    renderhists=True,
    usefiltered=True,
)
genelist = PA.diffexp(pop, **diffexp_options)

In [None]:
'''
Plot specific genes for cells of a reference subpopulation S and subpopulations that aligned to S

Parameters
----------
pop : dict
    Popalign dict
refcomp : int
    Reference subpopulation number
genelist : list
    List of genes to plot
clustersamples : bool
    Cluster the samples
clustercells : bool
    Cluster the cells within each sample subpopulation
savename : str, optional
    The user can specify a name for the file to be written. When savename is None, a filename is computed with the reference component number. Default is None
figsize : tuple, optional
    Size of the figure. Default is (15,15)
cmap : str, optional
    Name of the Matplotlib colormap to use. Default is Purples
samplelimits : bool, optional
    Whether to draw vertical lines on the heatmap to visually separate cells from different samples
scalegenes : bool, optional
    Whether to scale the genes by subtracting the min and dividing by the max for each gene
only: str, optional
    Sample label
equalncells: bool, optional
    Randomly subsets cells so that subpopulations have the same number of cells
'''

# File name built from the reference component number and the sample label chosen above
heatmap_name = '%d_%s' % (refcomp,sample)
heatmap_options = dict(
    refcomp=refcomp,
    genelist=genelist,
    clustersamples=True,
    clustercells=False,
    savename=heatmap_name,  # either None or a string
    figsize=(20,20),
    cmap='Purples',
    samplelimits=False,
    scalegenes=True,
    only=None,
    equalncells=False,
)
PA.plot_heatmap(pop, **heatmap_options)

### Visualize genes in subpopulations

In [None]:
'''
Plot a heatmap of genes ~ GMM subpopulations cells for a given model

Parameters
----------
pop : dict
    Popalign object
sample : str
    Sample name to select model from pop dictionary
genelist : list
    List of gene names. If empty, the filtered genes will be used.
savename : str
    File name to use
metric : str
    Metric to use to cluster
method : str
    Method to use to cluster
clustergenes : boolean
    Whether or not to cluster the genes (rows)
cmap : str
    Name of the colormap to use
figsize : tuple
    Figure size
'''
genes_cells_options = dict(
    sample='unique',
    genelist=genelist,  # gene list returned by the differential expression cell
    savename='',
    metric='correlation',
    method='single',
    clustergenes=False,
    clustercells=False,
    cmap='magma',
    figsize=(15,15),
)
PA.plot_genes_gmm_cells(pop, **genes_cells_options)

### Rank samples

In [None]:
'''
Generate a ranking plot of the samples against a reference model

Parameters
----------
pop : dict
    Popalign object
ref : str
    Reference sample name
k : int
    Number of random cells to use
niter : int
    Number of iterations to perform
method : str
    Scoring method to use. 'LLR' for log-likelihood ratio, 'LL' for log-likelihood.
mincells : int
    If a sample has fewer than `mincells` cells, it is discarded
figsize : tuple, optional
    Size of the figure. Default is (10,5)
'''

rank_options = dict(
    ref='CONTROL',  # label of the reference sample
    k=100,  # number of cells to randomly sample from sample
    niter=200,  # number of iterations
    method='LLR',  # LLR for log-likelihood ratio or LL for log-likelihood
    mincells=50,  # sample's minimum number of cells to be ranked
    figsize=(10,5),  # plot figure size
)
PA.rank(pop, **rank_options)

### Build unique GMM for all samples

In [None]:
'''
Build a unique Gaussian Mixture Model on the feature projected data

Parameters
----------
pop : dict
    Popalign object
ks : int or tuple
    Number or range of components to use
niters : int
    number of replicates to build for each k in `ks`
training : int or float
    If training is float, the value will be used as a percentage to select cells for the training set. Must follow 0<value<1
    If training is int, that number of cells will be used for the training set.
reg_covar : boolean or float
    If True, the regularization value will be computed from the feature data
    If False, 1e-6 default value is used
    If float, value will be used as reg_covar parameter to build GMMs
types : dict, str or None
    Dictionary of cell types.
    If None, a default PBMC cell types dictionary is provided
figsize : tuple, optional
    Size of the figure. Default is (6,5)
'''
unique_gmm_options = dict(
    ks=(5,20),
    niters=3,
    training=0.4,
    reg_covar=True,  # True: regularization value computed from the feature data
    types='defaultpbmc',  # either None, 'defaultpbmc' or a dictionary
    figsize=(6,5),
)
PA.build_unique_gmm(pop, **unique_gmm_options)

In [None]:
'''
Plot proportions of the samples in each of the GMM components

Parameters
----------
pop: dict
    Popalign object
pcells: float
    Percentage of random cells to use for each query repetition
nreps: int
    Number of times to repeat the query process with different cells for each sample
figsize : tuple, optional
    Size of the figure. Default is (5,20)
'''
# Each of the 10 query repetitions uses a random 20% of the cells of each sample
PA.plot_query(
    pop,
    pcells=.2,
    nreps=10,
    figsize=(10,20),
    sharey=False,
)

### 2D visualization

In [None]:
'''
Run an embedding algorithm and plot the data in a scatter plot
Highlight specific gene expression

pop: dict
    Popalign object
method : str
    Embedding method. One of umap, tsne. Defaults to tsne
sample: str, optional
    Highlight specified sample
compnumber: int, optional
    Component number of specified sample
marker : str
    Valid gene name. Defaults to None
size : float or int
    Point size. Defaults to .1
'''
# One embedding plot per marker gene, over all cells
for marker_gene in ('CD3D', 'LYZ', 'CD163'):
    PA.scatter(pop, method='tsne', marker=marker_gene)
# Same marker as the last plot, with the Budesonide sample highlighted
PA.scatter(pop, method='tsne', sample='Budesonide', marker='CD163')

In [None]:
'''
Generate a grid plot of sample plots in an embedding space

Parameters
----------
pop : dict
    Popalign object
method : str
    Embedding method. One of tsne, umap
figsize : tuple
    Figure size
size_background : float, int
    Point size for the embedding scatter in the background
size_samples : float, int
    Point size for the highlighted samples
'''

grid_figsize = (20,20)
# Same grid of samples, once per embedding method
PA.samples_grid(pop, method='tsne', figsize=grid_figsize)
PA.samples_grid(pop, method='umap', figsize=grid_figsize)

In [None]:
'''
Generate grid plots of sample subpopulations in an embedding space

pop : dict
    Popalign object
method : str
    Embedding method. One of tsne, umap
figsize : tuple
    Size of the figures to be generated
size_background : float, int
    Point size for the embedding scatter in the background
size_subpops : float, int
    Point size for the highlighted subpopulations
'''
PA.subpopulations_grid(
    pop,
    method='tsne',
    figsize=(20,20),
    size_subpops=20,  # point size of the highlighted subpopulation cells
)
# umap variant, currently disabled:
#PA.subpopulations_grid(pop, method='umap', figsize=(20,20))

### 3D visualization

In [None]:
# Interactive 3D view of the data in feature space, rendered with plotly inside the notebook
import plotly
importlib.reload(PA)  # re-execute the library module so edits to its source are picked up
plotly.offline.init_notebook_mode()  # must run before the plot so it can be shown inline
PA.plotfeatures(pop)

### Work in progress

In [None]:
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#-------------- WORK IN PROGRESS BELOW ----------------
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
from multiprocessing import Pool
import numpy as np
import os

def diffexp_two_matrices(M1, M2, genes, nbins=20, cutoff=.7, ncores=None):
    '''
    Find differentially expressed genes between two matrices
    Computes one PA.l1norm value per gene and plots the sorted values with the cutoff lines

    NOTE(review): relies on `plt` (matplotlib.pyplot) being available in the notebook namespace;
    it is not imported in this cell

    Parameters
    ----------
    M1 : sparse matrix
        First matrix. Must provide .toarray(); rows are iterated as genes
    M2 : sparse matrix
        Second matrix, indexed with the same gene rows as M1
    genes : list
        Gene labels, one per row of M1
    nbins : int, optional
        Passed to PA.l1norm for each gene
    cutoff : float, optional
        Genes with a value below -cutoff are reported as downregulated,
        genes with a value above cutoff as upregulated
    ncores : int or None, optional
        Number of worker processes. If None, pop['ncores'] of the global Popalign object is used

    Returns
    -------
    downregulated : list
        Labels of the genes with values below -cutoff, by increasing value
    upregulated : list
        Labels of the genes with values above cutoff, by increasing value
    '''
    if ncores is None:
        ncores = pop['ncores'] # fall back on the notebook's global Popalign object

    M1 = M1.toarray()
    M2 = M2.toarray()

    with Pool(ncores) as p:
        q = np.array(p.starmap(PA.l1norm, [(ig, M1[ig,:], M2[ig,:], nbins) for ig in range(M1.shape[0])])) # for each gene idx ig, call the l1norm function

    # sort the values and keep the gene labels in the same order
    idx = np.argsort(q)
    q = q[idx]
    genes = np.array(genes)[idx]

    downregulated = list(genes[np.where(q<-cutoff)[0]]) # labels of genes with low l1-norm values
    upregulated = list(genes[np.where(q>cutoff)[0]]) # labels of genes with high l1-norm values

    # render l1norm values
    plt.scatter(np.arange(len(q)), q, s=.1, alpha=1)
    plt.axhline(y=cutoff, color='red', linewidth=.5, label='Cutoff')
    plt.axhline(y=-cutoff, color='red', linewidth=.5)
    plt.xticks([])
    plt.ylabel('l1-norm')
    plt.xlabel('Genes')
    plt.legend()

    return downregulated, upregulated

# Differential expression between two sparse matrices
# NOTE(review): M1, M2 and genes are not defined in the visible notebook cells;
# they must be created before this cell is run
downregulated, upregulated = diffexp_two_matrices(M1, M2, genes, cutoff=.9)

In [None]:
#def subpopulations_grid(pop, method='tsne', figsize=(20,20), size_background=.1, size_subpops=1):
'''
Generate grid plots of sample subpopulations in an embedding space

pop : dict
    Popalign object
method : str
    Embedding method. One of tsne, umap
figsize : tuple
    Size of the figures to be generated
size_background : float, int
    Point size for the embedding scatter in the background
size_subpops : float, int
    Point size for the highlighted subpopulations
'''

method='tsne'
figsize=(20,20)
size_background=.1
size_subpops=1

# NOTE(review): `plt`, `umap` and `TSNE` are not imported in any visible notebook cell;
# they must be in scope before this cell is run (np and os are imported in earlier cells)

C = PA.cat_data(pop, 'C') # retrieve feature space data (used for the embedding and the GMM)

if method not in pop: # if method not run before
    if method == 'umap': # if method is umap
        X = umap.UMAP().fit_transform(C) # run umap
    elif method == 'tsne': # if method is tsne
        X = TSNE(n_components=2).fit_transform(C) # run tsne
    else: # if method not valid
        raise Exception('Method value not supported. Must be one of tsne, umap.') # raise exception
    pop[method] = X # store embedded coordinates
else: # if method has been run before
    X = pop[method] # retrieve embedded coordinates

x = X[:,0] # get x coordinates
y = X[:,1] # get y coordinates

gmm = pop['gmm'] # model stored at the top level of the Popalign object (presumably the unique GMM)
poplabels = pop['gmm_types'] # get subpopulations labels
prediction = gmm.predict(C) # get subpopulation assignments for the cells
ncomp = gmm.n_components # number of subpopulations to plot

# NOTE(review): nr_nc is assumed to be a popalign module-level helper, like cat_data and mkdir - confirm
nr, nc = PA.nr_nc(ncomp) # get number of rows and columns for the grid plot
# squeeze=False always returns a 2D array of axes, so flatten() also works for a 1x1 grid
fig, axes = plt.subplots(nr,nc,figsize=figsize,squeeze=False) # create figure and subaxes
axes = axes.flatten()

for i in range(ncomp): # for each subpopulation
    ax = axes[i] # assign sub axis
    idx = np.where(prediction==i)[0] # get cell indices for that subpopulation
    ax.scatter(x, y, c='lightgrey', s=size_background) # plot all cells as background
    ax.scatter(x[idx], y[idx], c='purple', s=size_subpops) # plot subpopulation cells on top
    ax.set(xticks=[]) # remove x ticks
    ax.set(yticks=[]) # remove y ticks
    ax.set(title='Subpopulation #%d\n%s' % (i, poplabels[i])) # set title
    if i % nc == 0: # first column of the grid
        ax.set(ylabel='%s2' % method) # set y label
    if i >= ncomp-nc: # last nc plots; the grid is over components, not over samples
        ax.set(xlabel='%s1' % method) # set x label

for ax in axes[ncomp:]: # extra sub axes left empty in the grid
    ax.axis('off') # clear empty axis from plot

label = 'unique' # the plotted model covers all samples, so the file name does not depend on one sample
dname = 'embedding/subpopulations/' # folder name
PA.mkdir(os.path.join(pop['output'], dname)) # create folder if does not exist
plt.savefig(os.path.join(pop['output'], dname, '%s_subpopulations.png' % label), dpi=200) # save plot
plt.close() # close plot