In [1]:
#import sys
#sys.path.append('../popalign/')
import popalign as PA
import importlib

In [2]:
# Display the path of the imported popalign package's __init__ file,
# to check which installation of the package is in use.
PA.__file__

'/anaconda3/envs/py3/lib/python3.7/site-packages/popalign/__init__.py'

In [3]:
# Load data (example)
# LOAD selects which example dataset is read into `pop`:
#   'samples' -> one .mtx matrix per named sample plus a shared genes file (PA.load_samples)
#   'screen'  -> a single matrix with barcodes, features and a metadata file (PA.load_screen)
LOAD = 'screen'

if LOAD == 'samples':
    # Keys are the sample names; values are the paths to each sample's matrix.
    mysamples = {
        'CTRL' : '../data/samples/PBMC.mtx',
        'GMCSF_1ng/ml' : '../data/samples/GMCSF.mtx',
        'IFNG_1ng/ml' : '../data/samples/IFNG.mtx',
        'IL2_10ng/ml' : '../data/samples/IL2.mtx',
        'CD40L_20ng/ml' : '../data/samples/CD40L.mtx',
    }
    mygenes = '../data/samples/genes.tsv'
    pop = PA.load_samples(samples=mysamples,
                          genes=mygenes)

elif LOAD == 'screen':
    mymatrix = '../data/screen/drug_screen/pbmcmult4cd3minus.mtx'
    mybarcodes = '../data/screen/drug_screen/barcodes.tsv'
    mygenes = '../data/screen/drug_screen/features.tsv'
    mymetadata = '../data/screen/drug_screen/meta.csv'
    pop = PA.load_screen(matrix=mymatrix,
                         barcodes=mybarcodes,
                         metafile=mymetadata,
                         genes=mygenes)

else:
    # Without this branch a mistyped LOAD leaves `pop` undefined and the
    # failure only surfaces as a NameError in a later cell.
    raise ValueError("LOAD must be 'samples' or 'screen', got {!r}".format(LOAD))

In [None]:
# Column-normalize the loaded data held in `pop`.
# Per the original note, this step also finds the best normalization factor.
# NOTE(review): the call's result is not assigned, so `pop` is presumably updated in place — confirm.
PA.normalize(pop)

In [None]:
# Plot the genes together with the filtering line.
# Re-run this cell with different `offset` values to find the best one
# (the original note says it is usually between .7 and 1.5).
# NOTE(review): the axes were described as "log cv ~ log cv"; presumably one axis is the log mean — confirm.
PA.plot_gene_filter(pop, offset=1.1)

In [None]:
# Apply the gene filter to the data.
# No offset is passed here: per the original note, the last offset value used in the
# PA.plot_gene_filter step is the one applied, so run that cell first.
PA.filter(pop)

In [None]:
# Remove red blood cells from the data.
# NOTE(review): the second argument ('human') presumably names the species of the samples — confirm.
PA.removeRBC(pop, 'human')

In [None]:
# Build several candidate feature spaces and keep the best one,
# judged by reconstruction error.
# Per the original notes, this step also runs GSEA on each feature
# and generates QC plots.
PA.onmf(
    pop,
    ncells=5000,
    nfeats=[5, 7],
    nreps=3,
    niter=500,
)

In [None]:
# Fit a Gaussian Mixture model for each sample, then type the
# subpopulations of those models.
# The popalign module is reloaded first, so source edits made since it was
# imported are picked up (presumably a development convenience).
importlib.reload(PA)
PA.build_gmms(
    pop,
    ks=(5, 20),
    nreps=3,
    reg_covar=True,
    rendering='grouped',
    types=None,
)

In [None]:
# Calculate the entropy of every subpopulation, for each sample.
# NOTE(review): presumably relies on the per-sample models built by PA.build_gmms — confirm.
PA.entropy(pop)

In [None]:
# Align the subpopulations of each sample against the subpopulations
# of a reference sample's model.
# NOTE(review): 'CTRL' is a sample name in the LOAD == 'samples' example;
# confirm it also exists in the metadata when LOAD == 'screen'.
# The popalign module is reloaded before the call.
importlib.reload(PA)
PA.align(
    pop,
    ref='CTRL',
    method='conservative',
)

In [None]:
# Rank every sample against the model of the reference sample.
# NOTE(review): 'CTRL' is a sample name in the LOAD == 'samples' example;
# confirm it also exists in the metadata when LOAD == 'screen'.
PA.rank(
    pop,
    ref='CTRL',
    k=100,
    niter=200,
    mincells=50,
)

In [None]:
# Fit a single GMM on all the samples concatenated together.
PA.build_unique_gmm(
    pop,
    ks=(5, 20),
    nreps=3,
    reg_covar=True,
    types=None,
)

In [None]:
# Generate a query plot.
# The popalign module is reloaded first, so source edits made since it was
# imported are picked up (presumably a development convenience).
importlib.reload(PA)
PA.plot_query(pop)

In [None]:
# Interactive 3D visualization of the data in feature space.
# Import the submodule explicitly: a bare `import plotly` only guarantees the
# top-level package, so `plotly.offline` would resolve only if the package
# loads that submodule itself. This form still binds the name `plotly`.
import plotly.offline
# Initialise plotly's offline notebook mode before plotting.
plotly.offline.init_notebook_mode()
PA.plotfeatures(pop)

In [None]:
# Disabled scratch code: the whole cell is a single string literal, so nothing in it runs.
# The snippet builds a (cell type x GMM component) dataframe `df` holding, for the 'CTRL'
# sample, the mean of the normalized data over each component's cells and each type's genes.
# NOTE(review): before re-enabling, fix the statement order -- `types` is read on the first
# line but only assigned from PA.default_types() several lines later -- and import `np` and
# `pd`, which no visible cell imports.
"""
typelist = list(types.keys())
genelist = np.concatenate([types[t] for t in typelist])

gmm = pop['samples']['CTRL']['gmm'] # get gmm
prediction = gmm.predict(pop['samples']['CTRL']['C']) # prediction the cells assignments for that sample
types = PA.default_types()
genes = pop['genes']

df = pd.DataFrame(columns=range(gmm.n_components), index=typelist) # create empty dataframe

for t in types: # for each cell type in the dictionary
    gidx = [np.where(genes==x)[0][0] for x in types[t] if x in genes] # get the indices of the valid genes for that cell type
    for i in range(gmm.n_components): # for each component of the sample
        cidx = np.where(prediction==i)[0] # get the matching cell indices
        sub = pop['samples']['CTRL']['M'][:,cidx] # subset the normalized data
        sub = sub[gidx,:] # subset the desired genes
        df.at[t,i] = sub.mean() # update the dataframe with the mean of those cells for those genes
"""

In [None]:
# Disabled scratch code: the whole cell is a single string literal, so nothing in it runs.
# The snippet min-max scales the values of `df` with sklearn's MinMaxScaler, fitting on the
# transposed array and transposing back, then rebuilds `df` with the same index and columns.
# NOTE(review): it depends on `df`, `gmm`, `typelist` and `pd`, which are only defined inside
# the previous disabled cell or not imported in the visible cells.
"""
from sklearn import preprocessing

x = df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x.T)
df = pd.DataFrame(columns=range(gmm.n_components), index=typelist, data=x_scaled.T)
"""