In [1]:
import sys
sys.path.append('/home/xinyiz/pamrats')

import time
import os

import scanpy
import numpy as np
import scipy.sparse as sp

import torch
from torch import optim

# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import average_precision_score

import image.loadImage as loadImage
import gae.gae.optimizer as optimizer
import image.modelsCNN as modelsCNN

import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN,MiniBatchKMeans,AgglomerativeClustering
from sklearn import metrics

import anndata as ad
import gc

In [2]:
# Load per-sample cell coordinates from the raw AnnData file.
# (No model is loaded in this cell; the pretrained checkpoint is restored in the "Create model" cell.)

datadir='/home/xinyiz/2021-01-13-mAD-test-dataset'
sampleidx={'disease13':'AD_mouse9494','control13':'AD_mouse9498','disease8':'AD_mouse9723','control8':'AD_mouse9735'}


scaleddata=scanpy.read_h5ad(datadir+'/2020-12-27-starmap-mAD-raw.h5ad')

# For every sample, take the ['y','x'] columns of its cells, divide by 0.3 and truncate to int.
# NOTE(review): 0.3 is presumably the coordinate-to-pixel scale of the images -- confirm.
cellCoord={}
for s,sampleidx_s in sampleidx.items():
    in_sample=(scaleddata.obs['sample']==sampleidx_s)
    coords_yx=scaleddata.obs.loc[in_sample,['y','x']].to_numpy()
    cellCoord[s]=(coords_yx/0.3).astype(int)
# Drop the reference to the AnnData object; any later cell that needs `scaleddata` must reload it.
scaleddata=None


In [3]:
# Restrict this process to GPU index 2 (takes effect only if CUDA has not been initialised yet).
os.environ["CUDA_VISIBLE_DEVICES"] = "2" 
# Toggles read by the plotting / clustering cells below.
ifplot=True
ifcluster=True

# Set to 'leakyRelu' to apply inverseLeakyRelu to the latent means in the embedding cell;
# any truthy value also adds a '_beforeAct' suffix to the output directory names.
inverseAct=None
# inverseAct=None
# 2-D projection used for plotting: 'umap' or 'pca' (the only two values handled below).
plottype='umap'
# Shared PCA instance; the *_single clustering helpers refit it on every call.
pca=PCA()
minCells=15 #min number of cells for analysis
# clustermethod=['kmeanbatch']
# Clustering methods to run; recognised entries are 'leiden', 'dbscan', 'agglomerative', 'kmeanbatch'.
clustermethod=['leiden','agglomerative','kmeanbatch']
# clustermethod=['leiden']
#umap/leiden clustering parameters
n_neighbors=10
min_dist=0.25
n_pcs=40 #for clustering
# resolution=[0.5,0.8,1,1.5]
resolution=[0.05,0.1,0.2,0.3,0.5,0.8,1,1.5]
# Checkpoint to load ('<plotepoch>.pt' under modelsavepath); also embedded in every saved file name.
plotepoch=53
savenameAdd=''
#DBscan
epslist= [6,8,10]
min_sampleslist=[15,30,45] 
#agglomerative
# Cluster counts; used for both the agglomerative and the mini-batch k-means runs.
nclusterlist=[2,3,4,5,8,10,15]
aggMetric=['euclidean']


# NOTE(review): only referenced from commented-out code in the visible cells.
combineCelltype={'glia':['Astro','Micro', 'OPC', 'Oligo'],'CA':['CA1', 'CA2', 'CA3']}

use_cuda=True
# NOTE(review): fastmode is not read by any visible cell.
fastmode=False #Validate during training pass
seed=3
# Convolution hyper-parameters, passed positionally to modelsCNN.CNN_VAE.
kernel_size=4
stride=2
padding=1

hidden1=64 #Number of channels in hidden layer 1
hidden2=128 
hidden3=256
hidden4=256
hidden5=96
# NOTE(review): presumably hidden5 * 25 * 25 (flattened conv output) -- confirm against modelsCNN.CNN_VAE.
fc_dim1=96*25*25
# Also the width of the per-cell latent-mean buffer allocated in the embedding cell.
fc_dim2=6000
# fc_dim3=128
# fc_dim4=128
# gcn_dim1=2600
# NOTE(review): adv_hidden, targetBatch, minThresh_mul, overlap and logsavepath are not read by any visible cell.
adv_hidden=128

model_str='cnn_vae'
targetBatch=None
# Passed to loadImage.load_cellCentroid in the embedding cell.
diamThresh_mul=800
minThresh_mul=12
overlap=int(diamThresh_mul*0.5)
# Run name; selects the log / model / plot sub-directories below.
name='all_thresh25_01'
logsavepath='/mnt/external_ssd/xinyi/log/train_jointGAEcnn_starmap/'+name
modelsavepath='/mnt/external_ssd/xinyi/models/train_jointGAEcnn_starmap/'+name
plotsavepath='/mnt/external_ssd/xinyi/plots/train_jointGAEcnn_starmap/'+name

# Samples to embed and plot (nothing is loaded here). Same mapping as `sampleidx` in the loading cell;
# `datadir` is likewise re-assigned to the same value.
plot_samples={'disease13':'AD_mouse9494','control13':'AD_mouse9498','disease8':'AD_mouse9723','control8':'AD_mouse9735'}
datadir='/home/xinyiz/2021-01-13-mAD-test-dataset'    

In [4]:
# Seed NumPy and PyTorch; fall back to CPU when CUDA was requested but is not available.
np.random.seed(seed)
torch.manual_seed(seed)
if use_cuda:
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.enabled = True
    else:
        print('cuda not available')
        use_cuda=False


In [5]:
# Create model
if model_str=='cnn_vae':
    model = modelsCNN.CNN_VAE(kernel_size, stride, padding, 1, hidden1, hidden2, hidden3, hidden4, hidden5, fc_dim1,fc_dim2)
else:
    # Fail loudly instead of hitting a NameError (or silently reusing a stale `model`) below.
    raise ValueError('unsupported model_str: '+str(model_str))

if use_cuda:
    model.cuda()
# map_location='cpu' lets a checkpoint saved on a GPU be read on any host; load_state_dict then
# copies the weights into the model's own parameters, wherever they live.
model.load_state_dict(torch.load(os.path.join(modelsavepath,str(plotepoch)+'.pt'),map_location='cpu'))


<All keys matched successfully>

In [6]:
np.random.seed(seed)
def plotembeddingbyCT(ctlist,savename,excludelist,embedding,savepath,plotname,plotdimx=0,plotdimy=1,savenameAdd=''):
    """Scatter-plot a 2-D embedding with one colour per label and save it as a JPEG.

    Args:
        ctlist: per-point labels, aligned with the rows of `embedding`.
        savename: file-name stem; the figure is written to
            `<savepath>/<savename><savenameAdd>.jpg`.
        excludelist: labels that are not drawn (they still reserve a colour).
        embedding: 2-D array; columns `plotdimx` / `plotdimy` are plotted.
        savepath: existing output directory (it is not created here).
        plotname: text placed before ' embedding' in the title.
        plotdimx, plotdimy: column indices for the x and y axes.
        savenameAdd: suffix appended to the file-name stem.
    """
    celltypes=np.unique(ctlist)
    # One "husl" colour per unique label, assigned in sorted-label order.
    palette=sns.color_palette("husl", celltypes.size)

    fig, ax = plt.subplots(dpi=400)
    for color_idx,ct in enumerate(celltypes):
        if ct in excludelist:
            continue
        members=(ctlist==ct)
        xs=embedding[members, plotdimx]
        ys=embedding[members, plotdimy]
        ax.scatter(xs,ys,color=palette[color_idx],label=ct,s=1.5,alpha=0.5)

    ax.set_aspect('equal', 'datalim')
    fig.set_figheight(5)
    fig.set_figwidth(5)

    # Shrink the axes vertically by 10% so the legend fits below them.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.1,
                     box.width, box.height * 0.9])
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
              fancybox=True, shadow=True, ncol=5)

    ax.set_title(plotname+' embedding', fontsize=24)
    fig.savefig(os.path.join(savepath,savename+savenameAdd+'.jpg'))

    # Release every open figure; this helper is called many times in loops.
    plt.close('all')
    gc.collect()

In [7]:
np.random.seed(seed)
def plotembeddingbyCT_contrast(ctlist,savename,excludelist,embedding,savepath,plotname,plotdimx=0,plotdimy=1,savenameAdd='',maxplot=None): 
    """Save one "this label vs. everything else" scatter plot per label.

    For each unique label a figure is written to
    `<savepath>/<savename>_<label><savenameAdd>.jpg`, with the label's points
    drawn over all other points.

    Args:
        ctlist: per-point labels, aligned with the rows of `embedding`.
        savename: file-name stem.
        excludelist: accepted for signature parity with plotembeddingbyCT; not used.
        embedding: 2-D array; columns `plotdimx` / `plotdimy` are plotted.
        savepath: output directory, created if missing.
        plotname: text placed before ' embedding' in the title.
        plotdimx, plotdimy: column indices for the x and y axes.
        savenameAdd: suffix appended to the file name.
        maxplot: if truthy, labels whose integer value exceeds it are skipped
            (labels must then be convertible with int()).
    """
    celltypes=np.unique(ctlist)
    colortest=sns.color_palette("tab10")
    # exist_ok avoids the check-then-create race and is a no-op when the directory exists.
    os.makedirs(savepath,exist_ok=True)

    for ct in celltypes:
        # Decide whether to skip BEFORE creating a figure, so skipped labels do not
        # leave an open figure behind, and before int(ct), which cannot parse 'Unassigned'.
        if ct == 'Unassigned':
            continue
        if maxplot and int(ct)>maxplot:
            continue

        fig, ax = plt.subplots()

        # Background: every point that does not carry this label.
        idx=(ctlist!=ct)
        ax.scatter(
            embedding[idx, plotdimx],
            embedding[idx, plotdimy],
            color=colortest[1],label='others',s=1,alpha=0.5
            )

        # Foreground: the points of this label, drawn larger and on top.
        idx=(ctlist==ct)
        ax.scatter(
            embedding[idx, plotdimx],
            embedding[idx, plotdimy],
            color=colortest[0],label=ct,s=3,alpha=0.5
            )

        ax.set_aspect('equal', 'datalim')
        fig.set_figheight(10)
        fig.set_figwidth(10)
        ax.legend()
        ax.set_title(plotname+' embedding', fontsize=24)
        fig.savefig(os.path.join(savepath,savename+'_'+str(ct)+savenameAdd+'.jpg'))

        plt.close('all')
        gc.collect()

In [8]:
np.random.seed(seed)
def inverseLeakyRelu(v,slope=0.01):
    """Undo a leaky-ReLU element-wise: negative entries are multiplied by 1/slope.

    The input is left untouched; a modified copy is returned. (The previous
    version scaled the negatives in place, which also altered the caller's
    array and anything sharing its memory.)

    Args:
        v: numeric array. Objects exposing `.clone()` (e.g. torch tensors) are
            cloned; anything else is copied into a new numpy array.
        slope: negative-side slope of the activation being inverted.

    Returns:
        A new array of the same shape with the negative entries rescaled.
    """
    out=v.clone() if hasattr(v,'clone') else np.array(v,copy=True)
    vnegidx=(out<0)
    out[vnegidx]=1/slope*out[vnegidx]
    return out

In [9]:
np.random.seed(seed)
def clusterLeiden_single(inArray,n_neighbors,n_pcs,min_dist,resolution,randseed=seed):
    """Run scanpy's PCA -> neighbours -> UMAP -> Leiden pipeline and return the labels.

    Args:
        inArray: 2-D array (observations x features).
        n_neighbors: neighbourhood size for the kNN graph.
        n_pcs: number of principal components for the graph; capped at
            min(rows, columns) - 1.
        min_dist: UMAP `min_dist`.
        resolution: Leiden resolution.
        randseed: random state for UMAP and Leiden.

    Returns:
        numpy array with the `leiden` label of every observation. The UMAP
        coordinates computed on the way are not returned.
    """
    n_obs,n_feat=inArray.shape[0],inArray.shape[1]
    n_pcs=np.min([n_obs-1,n_feat-1,n_pcs])

    adata=ad.AnnData(inArray)
    scanpy.tl.pca(adata, svd_solver='arpack')
    scanpy.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
    scanpy.tl.umap(adata,min_dist=min_dist,random_state=randseed)
    scanpy.tl.leiden(adata,resolution=resolution,random_state=randseed)

    leiden_labels=adata.obs['leiden']
    return leiden_labels.to_numpy()

def clusterLeiden(inArray,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,randseed=seed):
    """Leiden-cluster one sample at each resolution; pickle the labels and plot them.

    Reads the notebook globals `clustersavedir`, `savedir`, `embedding`, `s`,
    `plottype` and `plotepoch`, which the calling cell must have set for the
    current sample.

    Args:
        inArray: 2-D array of latent vectors (cells x features).
        n_neighbors, n_pcs, min_dist: forwarded to clusterLeiden_single.
        resolution: iterable of Leiden resolutions to try.
        sobj_coord_np: per-cell 2-D spatial coordinates, aligned with `inArray`.
        randseed: random state forwarded to clusterLeiden_single.
    """
    for r in resolution:
        # Forward the caller's seed; previously the global `seed` was passed and `randseed` was ignored.
        clusterRes=clusterLeiden_single(inArray,n_neighbors,n_pcs,min_dist,r,randseed=randseed)
        savenamecluster='leiden_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'n_pcs'+str(n_pcs)+'res'+str(r)+'epoch'+str(plotepoch)
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as output:
            pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

        # Clusters shown on the 2-D embedding ...
        plotembeddingbyCT(clusterRes,'leiden',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'leiden',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+s,savenameAdd=savenamecluster)

        # ... and on the spatial coordinates of the cells.
        plotembeddingbyCT(clusterRes,'leiden_location',[],sobj_coord_np,savedir,'location'+' of '+s,savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'leiden_location',[],sobj_coord_np,os.path.join(savedir,'contrast'),'location'+' of '+s,savenameAdd=savenamecluster)

def clusterLeiden_allsample(embedding,savedir,clustersavedir,inArray,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,samplenameList,randseed=seed):
    """Leiden-cluster the pooled cells of all samples at each resolution and plot the result.

    Labels are pickled to `clustersavedir`. One plot (plus per-cluster contrast
    plots, limited to labels <= 50) is drawn on the joint embedding, and one
    spatial plot set is drawn for every sample in the global `plot_samples`.
    Also reads the globals `plottype` and `plotepoch`.

    Args:
        embedding: 2-D projection of `inArray`, used only for plotting.
        savedir: directory for figures ('contrast' sub-directory for contrast plots).
        clustersavedir: directory for the pickled label arrays.
        inArray: 2-D array of latent vectors (cells x features).
        n_neighbors, n_pcs, min_dist: forwarded to clusterLeiden_single.
        resolution: iterable of Leiden resolutions to try.
        sobj_coord_np: per-cell 2-D spatial coordinates, aligned with `inArray`.
        samplenameList: per-cell sample name, aligned with `inArray`.
        randseed: random state forwarded to clusterLeiden_single.
    """
    for r in resolution:
        # Forward the caller's seed; previously the global `seed` was passed and `randseed` was ignored.
        clusterRes=clusterLeiden_single(inArray,n_neighbors,n_pcs,min_dist,r,randseed=randseed)
        savenamecluster='leiden_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'n_pcs'+str(n_pcs)+'res'+str(r)+'epoch'+str(plotepoch)
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as output:
            pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)
        plotembeddingbyCT(clusterRes,'leiden',[],embedding,savedir,plottype+' of all samples',savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'leiden',[],embedding,os.path.join(savedir,'contrast'),plottype+' of all samples',savenameAdd=savenamecluster,maxplot=50)

        # Spatial view, one sample at a time.
        for s in plot_samples.keys():
            sidx=(samplenameList==s)
            plotembeddingbyCT(clusterRes[sidx],'leiden_location'+s,[],sobj_coord_np[sidx],savedir,'location'+' of '+s,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes[sidx],'leiden_location'+s,[],sobj_coord_np[sidx],os.path.join(savedir,'contrast'),'location'+' of '+s,savenameAdd=savenamecluster,maxplot=50)

         

In [10]:
np.random.seed(seed)
def clusterDBscan_single(inArray,eps,min_samples,n_pcs):
    """DBSCAN on the leading principal components of `inArray`.

    The global `pca` instance is refit on `inArray`; the first
    min(n_pcs, rows - 1, columns - 1) components are clustered.

    Returns:
        The label array produced by DBSCAN.fit_predict.
    """
    n_obs,n_feat=inArray.shape[0],inArray.shape[1]
    n_pcs=np.min([n_obs-1,n_feat-1,n_pcs])
    projected=pca.fit_transform(inArray)
    dbscan=DBSCAN(eps=eps, min_samples=min_samples)
    return dbscan.fit_predict(projected[:,:n_pcs])

def clusterDBscan(inArray,epsL,min_samplesL,n_pcs,sobj_coord_np):
    """Run DBSCAN for every (eps, min_samples) pair on one sample; pickle and plot the labels.

    Reads the notebook globals `clustersavedir`, `savedir`, `embedding`, `s`,
    `plottype` and `plotepoch`, which the calling cell must have set for the
    current sample.
    """
    for eps,min_samples in ((e,m) for e in epsL for m in min_samplesL):
        clusterRes=clusterDBscan_single(inArray,eps,min_samples,n_pcs)
        savenamecluster=f'dbscan_eps{eps}msamples{min_samples}n_pcs{n_pcs}epoch{plotepoch}'
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as fh:
            pickle.dump(clusterRes, fh, pickle.HIGHEST_PROTOCOL)

        # (file-name stem, 2-D points, title): the embedding view first, then the spatial view.
        views=(
            ('dbscan',embedding,plottype+' of '+s),
            ('dbscan_location',sobj_coord_np,'location'+' of '+s),
        )
        for stem,points,title in views:
            plotembeddingbyCT(clusterRes,stem,[],points,savedir,title,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes,stem,[],points,os.path.join(savedir,'contrast'),title,savenameAdd=savenamecluster)

def clusterDBscan_allsample(embedding,savedir,clustersavedir,inArray,epsL,min_samplesL,n_pcs,sobj_coord_np,samplenameList):
    """Run DBSCAN for every (eps, min_samples) pair on the pooled cells of all samples.

    Labels are pickled to `clustersavedir`; plots go to `savedir` (contrast
    plots, limited to labels <= 50, to its 'contrast' sub-directory). One
    spatial plot set is drawn per sample in the global `plot_samples`. Also
    reads the globals `plottype` and `plotepoch`.

    Args:
        embedding: 2-D projection of `inArray`, used only for plotting.
        savedir: directory for figures.
        clustersavedir: directory for the pickled label arrays.
        inArray: 2-D array of latent vectors (cells x features).
        epsL, min_samplesL: DBSCAN parameter grids.
        n_pcs: number of principal components to cluster on.
        sobj_coord_np: per-cell 2-D spatial coordinates, aligned with `inArray`.
        samplenameList: per-cell sample name, aligned with `inArray`.
    """
    for eps in epsL:
        for min_samples in min_samplesL:
            clusterRes=clusterDBscan_single(inArray,eps,min_samples,n_pcs)
            # 'msamples' (with the trailing s) is the spelling used by clusterDBscan and looked up by
            # subclusterDBscan; the previous 'msample' meant these results were never found again.
            savenamecluster='dbscan_eps'+str(eps)+'msamples'+str(min_samples)+'n_pcs'+str(n_pcs)+'epoch'+str(plotepoch)
            with open(os.path.join(clustersavedir,savenamecluster), 'wb') as output:
                pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)
            plotembeddingbyCT(clusterRes,'dbscan',[],embedding,savedir,plottype+' of all samples',savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes,'dbscan',[],embedding,os.path.join(savedir,'contrast'),plottype+' of all samples',savenameAdd=savenamecluster,maxplot=50)

            # Spatial view, one sample at a time.
            for s in plot_samples.keys():
                sidx=(samplenameList==s)
                plotembeddingbyCT(clusterRes[sidx],'dbscan_location'+s,[],sobj_coord_np[sidx],savedir,'location'+' of '+s,savenameAdd=savenamecluster)
                plotembeddingbyCT_contrast(clusterRes[sidx],'dbscan_location'+s,[],sobj_coord_np[sidx],os.path.join(savedir,'contrast'),'location'+' of '+s,savenameAdd=savenamecluster,maxplot=50)


In [11]:
np.random.seed(seed)
def clusterAgg_single(inArray,ncluster,aggmetric,n_pcs):
    """Agglomerative clustering on the leading principal components of `inArray`.

    The global `pca` instance is refit on `inArray`; the first
    min(n_pcs, rows - 1, columns - 1) components are clustered.

    Args:
        inArray: 2-D array (observations x features).
        ncluster: number of clusters to form.
        aggmetric: distance metric name, e.g. 'euclidean'.
        n_pcs: requested number of principal components.

    Returns:
        The label array produced by AgglomerativeClustering.fit_predict.
    """
    import inspect  # local import: only needed for the keyword lookup below

    n_pcs=np.min([inArray.shape[0]-1,inArray.shape[1]-1,n_pcs])
    projected=pca.fit_transform(inArray)

    # scikit-learn renamed `affinity` to `metric` (deprecated in 1.2, removed in 1.4).
    # Use whichever keyword the installed version accepts so the notebook runs on both.
    ctor_params=inspect.signature(AgglomerativeClustering.__init__).parameters
    metric_kw='metric' if 'metric' in ctor_params else 'affinity'
    agg=AgglomerativeClustering(n_clusters=ncluster,**{metric_kw:aggmetric})
    return agg.fit_predict(projected[:,:n_pcs])

def clusterAgg(inArray,nclusterL,aggmetricL,n_pcs,sobj_coord_np):
    """Run agglomerative clustering for every (n_clusters, metric) pair on one sample.

    Labels are pickled and plotted. Reads the notebook globals
    `clustersavedir`, `savedir`, `embedding`, `s`, `plottype` and `plotepoch`,
    which the calling cell must have set for the current sample.
    """
    for ncluster,aggmetric in ((k,m) for k in nclusterL for m in aggmetricL):
        clusterRes=clusterAgg_single(inArray,ncluster,aggmetric,n_pcs)
        savenamecluster=f'agg_ncluster{ncluster}{aggmetric}n_pcs{n_pcs}epoch{plotepoch}'
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as fh:
            pickle.dump(clusterRes, fh, pickle.HIGHEST_PROTOCOL)

        # (file-name stem, 2-D points, title): the embedding view first, then the spatial view.
        views=(
            ('agg',embedding,plottype+' of '+s),
            ('agg_location',sobj_coord_np,'location'+' of '+s),
        )
        for stem,points,title in views:
            plotembeddingbyCT(clusterRes,stem,[],points,savedir,title,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes,stem,[],points,os.path.join(savedir,'contrast'),title,savenameAdd=savenamecluster)

def clusterAgg_allsample(embedding,savedir,clustersavedir,inArray,nclusterL,aggmetricL,n_pcs,sobj_coord_np,samplenameList):
    """Run agglomerative clustering for every (n_clusters, metric) pair on the pooled cells.

    Labels are pickled to `clustersavedir`; plots go to `savedir` (contrast
    plots, limited to labels <= 50, to its 'contrast' sub-directory). One
    spatial plot set is drawn per sample in the global `plot_samples`. Also
    reads the globals `plottype` and `plotepoch`.
    """
    for ncluster,aggmetric in ((k,m) for k in nclusterL for m in aggmetricL):
        clusterRes=clusterAgg_single(inArray,ncluster,aggmetric,n_pcs)
        savenamecluster=f'agg_ncluster{ncluster}{aggmetric}n_pcs{n_pcs}epoch{plotepoch}'
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as fh:
            pickle.dump(clusterRes, fh, pickle.HIGHEST_PROTOCOL)

        # Joint embedding of all samples.
        joint_title=plottype+' of all samples'
        plotembeddingbyCT(clusterRes,'agg',[],embedding,savedir,joint_title,savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'agg',[],embedding,os.path.join(savedir,'contrast'),joint_title,savenameAdd=savenamecluster,maxplot=50)

        # Spatial view, one sample at a time.
        for s in plot_samples.keys():
            sidx=(samplenameList==s)
            labels_s=clusterRes[sidx]
            coords_s=sobj_coord_np[sidx]
            plotembeddingbyCT(labels_s,'agg_location'+s,[],coords_s,savedir,'location'+' of '+s,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(labels_s,'agg_location'+s,[],coords_s,os.path.join(savedir,'contrast'),'location'+' of '+s,savenameAdd=savenamecluster,maxplot=50)


In [12]:
np.random.seed(seed)
def clusterMinibatchKmean_single(inArray,ncluster,n_pcs,batchsize=100):
    """Mini-batch k-means on the leading principal components of `inArray`.

    The global `pca` instance is refit on `inArray`; the first
    min(n_pcs, rows - 1, columns - 1) components are clustered. The batch size
    is capped at a third of (rows - 1) and of (columns - 1), truncated to int.
    The k-means random state is the global `seed`.

    Returns:
        The label array produced by MiniBatchKMeans.fit_predict.
    """
    n_obs,n_feat=inArray.shape[0],inArray.shape[1]
    n_pcs=np.min([n_obs-1,n_feat-1,n_pcs])
    batchsize=int(np.min([(n_obs-1)/3,(n_feat-1)/3,batchsize]))
    projected=pca.fit_transform(inArray)
    kmeans=MiniBatchKMeans(n_clusters=ncluster,random_state=seed,batch_size=batchsize)
    return kmeans.fit_predict(projected[:,:n_pcs])

def clusterMinibatchKmean(inArray,nclusterL,n_pcs,sobj_coord_np):
    """Run mini-batch k-means for every cluster count on one sample; pickle and plot the labels.

    Reads the notebook globals `clustersavedir`, `savedir`, `embedding`, `s`,
    `plottype` and `plotepoch`, which the calling cell must have set for the
    current sample.
    """
    for ncluster in nclusterL:
        clusterRes=clusterMinibatchKmean_single(inArray,ncluster,n_pcs)
        savenamecluster=f'minibatchkmean_ncluster{ncluster}n_pcs{n_pcs}epoch{plotepoch}'
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as fh:
            pickle.dump(clusterRes, fh, pickle.HIGHEST_PROTOCOL)

        # (file-name stem, 2-D points, title): the embedding view first, then the spatial view.
        views=(
            ('minibatchkmean',embedding,plottype+' of '+s),
            ('minibatchkmean_location',sobj_coord_np,'location'+' of '+s),
        )
        for stem,points,title in views:
            plotembeddingbyCT(clusterRes,stem,[],points,savedir,title,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes,stem,[],points,os.path.join(savedir,'contrast'),title,savenameAdd=savenamecluster)

def clusterMinibatchKmean_allsample(embedding,savedir,clustersavedir,inArray,nclusterL,n_pcs,sobj_coord_np,samplenameList):
    """Run mini-batch k-means for every cluster count on the pooled cells of all samples.

    Labels are pickled to `clustersavedir`; plots go to `savedir` (contrast
    plots, limited to labels <= 50, to its 'contrast' sub-directory). One
    spatial plot set is drawn per sample in the global `plot_samples`. Also
    reads the globals `plottype` and `plotepoch`.
    """
    for ncluster in nclusterL:
        clusterRes=clusterMinibatchKmean_single(inArray,ncluster,n_pcs)
        savenamecluster=f'minibatchkmean_ncluster{ncluster}n_pcs{n_pcs}epoch{plotepoch}'
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as fh:
            pickle.dump(clusterRes, fh, pickle.HIGHEST_PROTOCOL)

        # Joint embedding of all samples.
        joint_title=plottype+' of all samples'
        plotembeddingbyCT(clusterRes,'minibatchkmean',[],embedding,savedir,joint_title,savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'minibatchkmean',[],embedding,os.path.join(savedir,'contrast'),joint_title,savenameAdd=savenamecluster,maxplot=50)

        # Spatial view, one sample at a time.
        for s in plot_samples.keys():
            sidx=(samplenameList==s)
            labels_s=clusterRes[sidx]
            coords_s=sobj_coord_np[sidx]
            plotembeddingbyCT(labels_s,'minibatchkmean_location'+s,[],coords_s,savedir,'location'+' of '+s,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(labels_s,'minibatchkmean_location'+s,[],coords_s,os.path.join(savedir,'contrast'),'location'+' of '+s,savenameAdd=savenamecluster,maxplot=50)


In [13]:
# Compute the latent mean (mu) of every cell image, one sample at a time.
# mulist maps sample key -> array of shape (number of cells, fc_dim2).
mulist={}
model.eval()
# Inference only: no_grad avoids building an autograd graph for every forward pass.
with torch.no_grad():
    for s in plot_samples.keys():
        # plot_samples[s] is the sample id; it is used instead of the `sampleidx` dict because
        # later cells rebind `sampleidx` to a string, which would break a re-run of this cell.
        imgInputnp=loadImage.load_cellCentroid(cellCoord[s],plot_samples[s],datadir,diamThresh_mul,ifFlip=False,seed=seed,imagename='pi_sum.tif',minmaxscale=True,nchannels=1)
        muplot_all=np.zeros((imgInputnp.shape[0],fc_dim2))
        for i in range(imgInputnp.shape[0]):
            # Always build a tensor (previously the input stayed a numpy array when use_cuda was False).
            imgInput=torch.tensor(imgInputnp[[i]])
            if use_cuda:
                imgInput=imgInput.cuda()
            imgInput=imgInput.float()

            # The model returns (recon, z, mu, logvar); only mu is kept.
            mu=model(imgInput)[2].cpu().detach().numpy()
            if inverseAct=='leakyRelu':
                mu=inverseLeakyRelu(mu)
            muplot_all[i]=mu
        mulist[s]=muplot_all

(22210, 22344)
(22355, 18953)
(22294, 19552)
(22452, 19616)


In [14]:
# Alias (same dict object, not a copy) of the per-sample integer cell coordinates built in the
# loading cell from the ['y','x'] columns divided by 0.3. The combined-sample cells below use it
# as the spatial coordinates of the cells.
coordlist=cellCoord

In [17]:
#all cells
# Per-sample pass: project each sample's latent means to 2-D, plot them coloured by cell type /
# region, then run the selected clustering methods.
#
# NOTE(review): this cell does not run from a fresh kernel with the cells visible above:
#   * `scaleddata` was set to None at the end of the loading cell, so `scaleddata.obs` fails;
#   * `plot_sample_X` and `plotRecon` are not defined in any visible cell;
#   * `mulist` is keyed by plain sample name, so `mulist[s+'X_'+xcorr]` would raise KeyError.
# The printed output below presumably comes from an earlier kernel state -- confirm before relying on it.
# ifcluster=False
np.random.seed(seed)
for s in plot_samples.keys():
    print(s)
    # NOTE(review): rebinds the global `sampleidx` (a dict in the loading cell) to a string.
    sampleidx=plot_samples[s]
    
    # Per-cell annotations of the current sample.
    celltype_broad=scaleddata.obs.loc[scaleddata.obs['sample']==sampleidx,'top_level']
    celltype_sub=scaleddata.obs.loc[scaleddata.obs['sample']==sampleidx,'cell_type_label']
    region=scaleddata.obs.loc[scaleddata.obs['sample']==sampleidx,'region']
    # Spatial coordinates in ['x','y'] order and unscaled (cellCoord uses ['y','x'] divided by 0.3).
    sobj_coord_np=scaleddata.obs.loc[scaleddata.obs['sample']==sampleidx,['x','y']].to_numpy()
    for xcorr in plot_sample_X:
        samplename=s+'X_'+xcorr
        muplot=np.copy(mulist[samplename])
        
        if inverseAct:
            samplename+='_beforeAct'
       
        # Output layout: <plotsavepath>/<samplename><plotRecon>/embedding_<plottype> for figures,
        # <plotsavepath>/<samplename>/cluster for pickled labels.
        # NOTE(review): os.mkdir does not create parents; the cluster directory's parent differs
        # from `sampledir` whenever plotRecon is non-empty -- confirm it exists.
        sampledir=os.path.join(plotsavepath,samplename+plotRecon)
        savedir=os.path.join(sampledir,'embedding_'+plottype)
        clustersavedir=os.path.join(plotsavepath,samplename,'cluster')
        if not os.path.exists(sampledir):
            os.mkdir(sampledir)
        if not os.path.exists(savedir):
            os.mkdir(savedir)
        if not os.path.exists(clustersavedir):
            os.mkdir(clustersavedir)
            
        # 2-D projection used for plotting; `embedding` is left unset for any other plottype.
        if plottype=='umap':
            reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
            embedding = reducer.fit_transform(muplot)
            savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
        elif plottype=='pca':
            embedding=pca.fit_transform(muplot)
            savenameAdd='_epoch'+str(plotepoch)
        
        if ifplot:
            plotembeddingbyCT(celltype_broad,'celltype_broad',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenameAdd)
            plotembeddingbyCT(celltype_sub,'celltype_sub',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenameAdd)
            plotembeddingbyCT(region,'region',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenameAdd)

            plotembeddingbyCT_contrast(celltype_sub,'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+s,savenameAdd=savenameAdd)
        
        # Too few cells: skip clustering (the plots above have already been written).
        if embedding.shape[0]<minCells:
            continue
        # The per-sample cluster helpers read `embedding`, `savedir`, `clustersavedir` and `s`
        # from the globals assigned in this loop.
        if ifcluster:
            if 'leiden' in clustermethod:
                clusterLeiden(muplot,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,randseed=seed)
            if 'dbscan' in clustermethod:
                clusterDBscan(muplot,epslist,min_sampleslist,n_pcs,sobj_coord_np)
            if 'agglomerative' in clustermethod:
                clusterAgg(muplot,nclusterlist,aggMetric,n_pcs,sobj_coord_np)
            if 'kmeanbatch' in clustermethod:
                clusterMinibatchKmean(muplot,nclusterlist,n_pcs,sobj_coord_np)


disease13
control13
disease8
control8


In [None]:
# Combine the latents of all samples into one embedding, plot it and cluster the pooled cells.
np.random.seed(seed)

# Collect per-sample pieces in plot_samples order and concatenate once, so that row i of
# `latents`, `sobj_coord_np` and `samplenameList` all describe the same cell.
# (The unused `sampleidx=plot_samples[s]` assignment was dropped: it rebound the global
# `sampleidx` dict to a string.)
latent_parts=[]
coord_parts=[]
name_parts=[]
for s in plot_samples.keys():
    muplot=mulist[s]
    latent_parts.append(muplot)
    coord_parts.append(coordlist[s])
    name_parts.append(np.repeat(s,muplot.shape[0]))
latents=np.vstack(latent_parts)
sobj_coord_np=np.concatenate(coord_parts,axis=0)
samplenameList=np.concatenate(name_parts,axis=None)

# Output layout: <plotsavepath>/combined[_beforeAct]/{embedding_<plottype>,cluster}
sampledir=os.path.join(plotsavepath,'combined')
if inverseAct:
    sampledir+='_beforeAct'
savedir=os.path.join(sampledir,'embedding_'+plottype)
clustersavedir=os.path.join(sampledir,'cluster')
# makedirs also creates missing parents (e.g. plotsavepath on a fresh machine).
os.makedirs(savedir,exist_ok=True)
os.makedirs(clustersavedir,exist_ok=True)

if plottype=='umap':
    reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
    embedding = reducer.fit_transform(latents)
    savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
elif plottype=='pca':
    embedding=pca.fit_transform(latents)
    savenameAdd='_epoch'+str(plotepoch)
else:
    # Otherwise a stale `embedding` from another cell would be plotted silently.
    raise ValueError('unsupported plottype: '+str(plottype))

if ifplot:
    plotembeddingbyCT(samplenameList,'sample',[],embedding,savedir,plottype+' of all samples',savenameAdd=savenameAdd)

if ifcluster:
    if 'leiden' in clustermethod:
        clusterLeiden_allsample(embedding,savedir,clustersavedir,latents,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,samplenameList,randseed=seed)
    if 'dbscan' in clustermethod:
        clusterDBscan_allsample(embedding,savedir,clustersavedir,latents,epslist,min_sampleslist,n_pcs,sobj_coord_np,samplenameList)
    if 'agglomerative' in clustermethod:
        clusterAgg_allsample(embedding,savedir,clustersavedir,latents,nclusterlist,aggMetric,n_pcs,sobj_coord_np,samplenameList)
    if 'kmeanbatch' in clustermethod:
        clusterMinibatchKmean_allsample(embedding,savedir,clustersavedir,latents,nclusterlist,n_pcs,sobj_coord_np,samplenameList)


In [18]:
# Sub-clustering of the combined-sample clusters: settings for this cell.
# combine all latents to one plot -- subcluster 
np.random.seed(seed)

ifplot=True
ifcluster=True

#leiden
# Resolutions of the PARENT Leiden runs whose saved labels are sub-clustered
# (passed to subclusterLeiden at the bottom of this cell).
subResolution=[0.2,0.1,0.3]

plottype='umap'
# Fresh shared PCA instance; the *_single clustering helpers refit it on every call.
pca=PCA()
minCells_sub=10 #min number of cells for analysis
# clustermethod=['leiden','agglomerative','kmeanbatch']
# NOTE(review): rebinds the global list; within this cell the method is chosen by the
# `clustermethod` argument of subcluster(), not by this variable.
clustermethod=['leiden']
#umap/leiden clustering parameters
# The *_sub values below are the parameters used when clustering INSIDE a parent cluster.
n_neighbors_sub=5
min_dist_sub=0.25
n_pcs_sub=10 #for clustering
# resolution_sub=[0.12,0.15,0.17]
resolution_sub=[0.05,0.075,0.1,0.15,0.2,0.25,0.3]
#DBscan
epslist_sub= [6,8,10]
min_sampleslist_sub=[15,30,45] 
#agglomerative
# Cluster counts for both the agglomerative and the mini-batch k-means sub-clustering.
nclusterlist_sub=[2,4,8,12]
aggMetric_sub=['euclidean']

# Directories of the combined-sample run; the subcluster* helpers read the parent label files
# from the global `clustersavedir`.
sampledir=os.path.join(plotsavepath,'combined')
if inverseAct:
    sampledir+='_beforeAct'
savedir=os.path.join(sampledir,'embedding_'+plottype)
clustersavedir=os.path.join(sampledir,'cluster')

def subcluster(clustermethod,labels,savepath,addname):
    """Re-embed and re-cluster the cells of every parent cluster separately.

    The latents, coordinates and sample names of all samples in the global
    `plot_samples` are pooled (in that order), so `labels` must be aligned
    with that pooled order. For each unique parent label the member cells are
    projected to 2-D, plotted by sample, and - if there are at least
    `minCells_sub` of them - clustered with the `*_sub` parameters.

    Output goes to `<savepath>/combined/embedding_<plottype>/<label>` (figures)
    and `<savepath>/combined/cluster/<label>` (pickled labels).

    Reads the globals `plot_samples`, `mulist`, `coordlist`, `plottype`,
    `n_neighbors`, `min_dist`, `seed`, `pca`, `plotepoch`, `ifplot`,
    `ifcluster`, `minCells_sub` and the `*_sub` parameter lists.

    Args:
        clustermethod: one of 'leiden', 'dbscan', 'agglomerative', 'kmeanbatch';
            any other value plots without clustering.
        labels: 1-D array of parent cluster labels (strings or integers).
        savepath: root output directory for this parent clustering.
        addname: unused; kept for interface compatibility.
    """
    # Pool all samples; row i of the three arrays describes the same cell.
    sample_keys=list(plot_samples.keys())
    latents=np.vstack([mulist[k] for k in sample_keys])
    sobj_coord_np=np.concatenate([coordlist[k] for k in sample_keys],axis=0)
    samplenameList=np.concatenate([np.repeat(k,mulist[k].shape[0]) for k in sample_keys],axis=None)

    sampledir=os.path.join(savepath,'combined')
    savedir_all=os.path.join(sampledir,'embedding_'+plottype)
    clustersavedir_all=os.path.join(sampledir,'cluster')
    # makedirs creates savepath and sampledir as needed and tolerates existing directories.
    os.makedirs(savedir_all,exist_ok=True)
    os.makedirs(clustersavedir_all,exist_ok=True)

    for l in np.unique(labels):
        clusteridx=(labels==l)
        # str(): DBSCAN / agglomerative / k-means labels are integers, which os.path.join rejects.
        savedir=os.path.join(savedir_all,str(l))
        clustersavedir=os.path.join(clustersavedir_all,str(l))
        os.makedirs(savedir,exist_ok=True)
        os.makedirs(clustersavedir,exist_ok=True)

        # NOTE(review): the 2-D projection uses the global n_neighbors / min_dist rather than the
        # *_sub values, and runs before the minCells_sub check -- confirm both are intended.
        if plottype=='umap':
            reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
            embedding = reducer.fit_transform(latents[clusteridx])
            savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
        elif plottype=='pca':
            embedding=pca.fit_transform(latents[clusteridx])
            savenameAdd='_epoch'+str(plotepoch)
        if ifplot:
            plotembeddingbyCT(samplenameList[clusteridx],'sample',[],embedding,savedir,plottype+' of all samples',savenameAdd=savenameAdd)

        # Too few cells to sub-cluster meaningfully.
        if embedding.shape[0]<minCells_sub:
            continue
        if ifcluster:
            if clustermethod=='leiden':
                clusterLeiden_allsample(embedding,savedir,clustersavedir,latents[clusteridx],n_neighbors_sub,n_pcs_sub,min_dist_sub,resolution_sub,sobj_coord_np[clusteridx],samplenameList[clusteridx],randseed=seed)
            elif clustermethod=='dbscan':
                clusterDBscan_allsample(embedding,savedir,clustersavedir,latents[clusteridx],epslist_sub,min_sampleslist_sub,n_pcs_sub,sobj_coord_np[clusteridx],samplenameList[clusteridx])
            elif clustermethod=='agglomerative':
                clusterAgg_allsample(embedding,savedir,clustersavedir,latents[clusteridx],nclusterlist_sub,aggMetric_sub,n_pcs_sub,sobj_coord_np[clusteridx],samplenameList[clusteridx])
            elif clustermethod=='kmeanbatch':
                clusterMinibatchKmean_allsample(embedding,savedir,clustersavedir,latents[clusteridx],nclusterlist_sub,n_pcs_sub,sobj_coord_np[clusteridx],samplenameList[clusteridx])

def subclusterLeiden(n_neighbors,n_pcs,min_dist,resolution,addname=''):
    """Sub-cluster (with Leiden) every saved parent Leiden clustering.

    For each resolution the parent label file is looked up in the global
    `clustersavedir` under the name the Leiden helpers save it with. Missing
    files are reported and skipped; clusterings with a single label are
    skipped. Results go to `<clustersavedir>/<parent file name>_subcluster`.

    Args:
        n_neighbors, n_pcs, min_dist: parameters of the PARENT run (they are
            part of its file name).
        resolution: iterable of parent resolutions.
        addname: forwarded to subcluster().
    """
    for r in resolution:
        savenamecluster=f'leiden_nn{n_neighbors}mdist0{int(min_dist*100)}n_pcs{n_pcs}res{r}epoch{plotepoch}'
        readpath=os.path.join(clustersavedir,savenamecluster)
        if os.path.exists(readpath):
            # pickle.load executes arbitrary code; only read files written by this notebook.
            with open(readpath, 'rb') as fh:
                labels=np.array(pickle.load(fh))
            if np.unique(labels).shape[0]!=1:
                subcluster('leiden',labels,os.path.join(clustersavedir,savenamecluster+'_subcluster'),addname)
        else:
            print('DNE: '+readpath)

def subclusterDBscan(epsL,min_samplesL,n_pcs,addname=''):
    """Sub-cluster (with DBSCAN) every saved parent DBSCAN clustering.

    For each (eps, min_samples) pair the parent label file is looked up in the
    global `clustersavedir`. Besides the 'msamples' spelling, the legacy
    'msample' spelling - under which combined-sample DBSCAN results have been
    saved - is accepted, so those results are found as well. Missing files are
    reported and skipped; clusterings with a single label are skipped. Results
    go to `<clustersavedir>/<parent file name>_subcluster`.

    Args:
        epsL, min_samplesL: parameter grids of the PARENT runs.
        n_pcs: n_pcs of the parent runs (part of the file name).
        addname: forwarded to subcluster().
    """
    for eps in epsL:
        for min_samples in min_samplesL:
            tail=str(min_samples)+'n_pcs'+str(n_pcs)+'epoch'+str(plotepoch)
            # Preferred spelling first, legacy spelling second.
            candidates=['dbscan_eps'+str(eps)+tag+tail for tag in ('msamples','msample')]
            savenamecluster=next((c for c in candidates if os.path.exists(os.path.join(clustersavedir,c))),None)
            if savenamecluster is None:
                print('DNE: '+os.path.join(clustersavedir,candidates[0]))
                continue

            readpath=os.path.join(clustersavedir,savenamecluster)
            # pickle.load executes arbitrary code; only read files written by this notebook.
            with open(readpath, 'rb') as fh:
                labels = pickle.load(fh)
            labels=np.array(labels)
            if np.unique(labels).shape[0]==1:
                continue

            savepath=os.path.join(clustersavedir,savenamecluster+'_subcluster')
            subcluster('dbscan',labels,savepath,addname)

def subclusterAgg(nclusterL,aggmetricL,n_pcs,addname=''):
    """Sub-cluster (agglomeratively) every saved parent agglomerative clustering.

    For each (n_clusters, metric) pair the parent label file is looked up in
    the global `clustersavedir`. Missing files are reported and skipped;
    clusterings with a single label are skipped. Results go to
    `<clustersavedir>/<parent file name>_subcluster`.

    Args:
        nclusterL, aggmetricL: parameter grids of the PARENT runs.
        n_pcs: n_pcs of the parent runs (part of the file name).
        addname: forwarded to subcluster().
    """
    for ncluster,aggmetric in ((k,m) for k in nclusterL for m in aggmetricL):
        savenamecluster=f'agg_ncluster{ncluster}{aggmetric}n_pcs{n_pcs}epoch{plotepoch}'
        readpath=os.path.join(clustersavedir,savenamecluster)
        if os.path.exists(readpath):
            # pickle.load executes arbitrary code; only read files written by this notebook.
            with open(readpath, 'rb') as fh:
                labels=np.array(pickle.load(fh))
            if np.unique(labels).shape[0]!=1:
                subcluster('agglomerative',labels,os.path.join(clustersavedir,savenamecluster+'_subcluster'),addname)
        else:
            print('DNE: '+readpath)
            
def subclusterMinibatchKmean(nclusterL,n_pcs,addname=''):
    """Sub-cluster (with mini-batch k-means) every saved parent k-means clustering.

    For each cluster count the parent label file is looked up in the global
    `clustersavedir`. Missing files are reported and skipped; clusterings with
    a single label are skipped. Results go to
    `<clustersavedir>/<parent file name>_subcluster`.

    Args:
        nclusterL: cluster counts of the PARENT runs.
        n_pcs: n_pcs of the parent runs (part of the file name).
        addname: forwarded to subcluster().
    """
    for ncluster in nclusterL:
        savenamecluster=f'minibatchkmean_ncluster{ncluster}n_pcs{n_pcs}epoch{plotepoch}'
        readpath=os.path.join(clustersavedir,savenamecluster)
        if os.path.exists(readpath):
            # pickle.load executes arbitrary code; only read files written by this notebook.
            with open(readpath, 'rb') as fh:
                labels=np.array(pickle.load(fh))
            if np.unique(labels).shape[0]!=1:
                subcluster('kmeanbatch',labels,os.path.join(clustersavedir,savenamecluster+'_subcluster'),addname)
        else:
            print('DNE: '+readpath)
        
# Sub-cluster the combined-sample Leiden results. The arguments are the PARENT run's parameters
# (n_neighbors, n_pcs, min_dist from the config cell) and are only used to rebuild the saved file
# names; subResolution selects which parent resolutions to process.
subclusterLeiden(n_neighbors,n_pcs,min_dist,subResolution,addname='')

In [19]:
# NOTE(review): leftover inspection cell. At module level `embedding` is only assigned by the
# per-sample and combined-plot cells (inside subcluster() it is a local), so the NameError below
# suggests neither had run in this kernel session -- consider deleting this cell.
embedding

NameError: name 'embedding' is not defined