In [18]:
import sys
sys.path.append('/home/xinyiz/pamrats')

import time
import os

import scanpy
import numpy as np
import scipy.sparse as sp

import torch
from torch import optim

# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import average_precision_score

import gae.gae.optimizer as optimizer
import gae.gae.model
import gae.gae.preprocessing as preprocessing

import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN,MiniBatchKMeans,AgglomerativeClustering
from sklearn import metrics

import anndata as ad
import gc

In [19]:
# ---- Run configuration: every tunable read by the cells below lives here ----
# Expose only GPU index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1" 
# ifplot gates the cell-type/region scatter plots; ifcluster gates the clustering sweeps.
ifplot=True
ifcluster=True

# When set to 'leakyRelu' the latent means are passed through inverseLeakyRelu before
# plotting; any truthy value also appends '_beforeAct' to the output directory names.
inverseAct='leakyRelu'
# inverseAct=None
# 2-D projection used for the embedding plots: 'umap' or 'pca'.
plottype='umap'
# Shared PCA instance; it is re-fit on every call by the clustering helpers.
pca=PCA()
minCells=15 #min number of cells for analysis
# clustermethod=['kmeanbatch']
# Clustering back-ends to run; each name is checked with `in clustermethod`.
clustermethod=['leiden','dbscan','agglomerative','kmeanbatch']
#umap/leiden clustering parameters
n_neighbors=10
min_dist=0.25
n_pcs=40 #for clustering
# resolution=[0.5,1,1.5,2]
# Leiden resolutions swept by clusterLeiden.
resolution=[0.05,0.1,0.2,0.3]
# Training epoch whose checkpoint ('<plotepoch>.pt') is loaded; also embedded in output file names.
plotepoch=8320
savenameAdd=''
#DBscan
# Every (eps, min_samples) pair from these two lists is tried.
epslist= [6,8,10]
min_sampleslist=[15,30,45] 
#agglomerative
# Cluster counts swept by both the agglomerative and the mini-batch k-means helpers.
nclusterlist=[2,3,4,5,8,10]
aggMetric=['euclidean']


# Extra pooled groups: each key is plotted as the union of the listed 'top_level' cell types.
combineCelltype={'glia':['Astro','Micro', 'OPC', 'Oligo'],'CA':['CA1', 'CA2', 'CA3']}

use_cuda=True
# NOTE(review): fastmode and useSavedMaskedEdges are not read by any cell visible in this notebook.
fastmode=False #Validate during training pass
seed=3
useSavedMaskedEdges=False
# Prefix of the per-sample adjacency .npz files.
maskedgeName='physicalDist_gradientGeom'
hidden1=1024 #Number of units in hidden layer 1
hidden2=1024 #Number of units in hidden layer 2
# hidden3=16
fc_dim1=1024
# NOTE(review): fc_dim2/fc_dim3/fc_dim4 and gcn_dim1 are commented out but are still referenced by
# the 'gcn_vae_xa' and 'gcn_vae_gcnX_inprA' branches of the model cell; selecting those models
# raises NameError until they are defined again.
# fc_dim2=2112
# fc_dim3=2112
# fc_dim4=2112
# gcn_dim1=2048

protein=None #'scaled_binary'
# NOTE(review): proteinWeights is required by the data-loading cell whenever `protein` is set.
# proteinWeights=0.05
dropout=0.01
# randFeatureSubset=None
# Selects which gae.gae.model class is instantiated.
model_str='gcn_vae_xa_e2_d1_dca'
adj_decodeName=None #gala or None
# Short sample label -> value of the 'sample' column in the AnnData obs table.
plot_samples={'disease13':'AD_mouse9494','control13':'AD_mouse9498','disease8':'AD_mouse9723','control8':'AD_mouse9735'}
# Feature representation(s) to load; only the first entry decides which h5ad file is read.
plot_sample_X=['logminmax']
# plot_sample_X=['corrected','scaled']
standardizeX=False
# Run name; it is the last path component of the log/model/plot directories below.
name='allphygradientGeomXA_01_dca'
logsavepath='/mnt/xinyi/pamrats/log/train_gae_starmap/'+name
modelsavepath='/mnt/xinyi/pamrats/models/train_gae_starmap/'+name
plotsavepath='/mnt/xinyi/pamrats/plots/train_gae_starmap/'+name
    

In [20]:
# Seed the NumPy and PyTorch RNGs, and drop back to CPU when CUDA was requested but is missing.
np.random.seed(seed)
torch.manual_seed(seed)
if use_cuda:
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.enabled = True
    else:
        print('cuda not available')
        use_cuda=False


In [21]:
#Load data
# Builds two dicts keyed for the later cells:
#   featureslist[<sample label>+'X_'+<representation>] -> torch tensor of per-cell features
#   adj_list[<sample label>]                           -> scipy sparse adjacency matrix
# NOTE: `savedir` is reassigned by the plotting cells further down.
savedir=os.path.join('/mnt/xinyi/','starmap')
adj_dir=os.path.join(savedir,'a')

featureslist={}
if plot_sample_X[0] in ['corrected','scaled']:
    scaleddata=scanpy.read_h5ad('/mnt/xinyi/2021-01-13-mAD-test-dataset/2020-12-27-starmap-mAD-scaled.h5ad')

    for s in plot_samples.keys():
        # Boolean row mask selecting the cells of this sample.
        samplemask=scaleddata.obs['sample']==plot_samples[s]
        featureslist[s+'X_'+'corrected']=torch.tensor(scaleddata.layers['corrected'][samplemask])
        featureslist[s+'X_'+'scaled']=torch.tensor(scaleddata.layers['scaled'][samplemask])

else:
    scaleddata=scanpy.read_h5ad('/mnt/xinyi/2021-01-13-mAD-test-dataset/2020-12-27-starmap-mAD-raw.h5ad')

    for s in plot_samples.keys():
        scaleddata_train=scaleddata.X[scaleddata.obs['sample']==plot_samples[s]]

        if plot_sample_X[0]=='logminmax':
            # log2 with a 1/2 pseudo-count, then min-max scale each cell (row):
            # MinMaxScaler works per column, hence the transpose before and after.
            featurelog_train=np.log2(scaleddata_train+1/2)
            scaler = MinMaxScaler()
            featurelog_train_minmax=np.transpose(scaler.fit_transform(np.transpose(featurelog_train)))
            featureslist[s+'X_'+plot_sample_X[0]]=torch.tensor(featurelog_train_minmax)

# Fail here with a clear message instead of an IndexError in the model cell.
if len(featureslist)==0:
    raise ValueError('no features loaded for plot_sample_X[0]='+repr(plot_sample_X[0]))

if protein: ##adjust for scaled/corrected
    # NOTE(review): `proteinWeights` is commented out in the config cell; define it before enabling `protein`.
    proteinsavepath=os.path.join('/mnt/xinyi/','starmap','protein')
    for s in plot_samples.keys():
        pmtx=sp.load_npz(os.path.join(proteinsavepath,plot_samples[s]+'_'+protein+'.npz'))
        pmtx=preprocessing.sparse_mx_to_torch_sparse_tensor(pmtx)
        pmtx=pmtx.to_dense()
        # Rescale the protein block so its total mass is proteinWeights times the expression total.
        scalefactor=torch.sum(featureslist[s+'X_'+plot_sample_X[0]])/torch.sum(pmtx)*proteinWeights
        featureslist[s+'X_'+plot_sample_X[0]]=torch.cat((featureslist[s+'X_'+plot_sample_X[0]],pmtx*scalefactor),dim=1)

# One adjacency matrix per configured sample; the file name is derived from plot_samples so the
# graphs always match the samples whose features were loaded above.
adj_list={}
for s in plot_samples.keys():
    adj_list[s]=sp.load_npz(os.path.join(adj_dir,maskedgeName+'_'+plot_samples[s]+'.npz'))

In [22]:
# load model
# Instantiate the architecture named by model_str and restore the weights saved at `plotepoch`.
# The if/elif chain is kept (rather than a lookup table) because several branches reference
# hyper-parameters that are only defined when that model is actually selected.
num_nodes,num_features = list(featureslist.values())[0].shape
if model_str=='gcn_vae_xa':
    model  = gae.gae.model.GCNModelVAE_XA(num_features, hidden1, hidden2,fc_dim1,fc_dim2,fc_dim3,fc_dim4, dropout)
elif model_str=='fc1':
    model  = gae.gae.model.FCVAE1(num_features, hidden1,dropout)
elif model_str == 'gcn_vae_xa_e2_d1':
    model  = gae.gae.model.GCNModelVAE_XA_e2_d1(num_features, hidden1,hidden2, dropout)
elif model_str == 'gcn_vae_gcnX_inprA':
    model = gae.gae.model.GCNModelVAE_gcnX_inprA(num_features, hidden1, hidden2,gcn_dim1, dropout)
elif model_str=='fc1_dca':
    model = gae.gae.model.FCVAE1_DCA(num_features, hidden1,fc_dim1, dropout)
elif model_str=='gcn_vae_xa_e2_d1_dca':
    model = gae.gae.model.GCNModelVAE_XA_e2_d1_DCA(num_features, hidden1,hidden2,fc_dim1, dropout)
elif model_str=='gcn_vae_xa_e2_d1_dcaFork':
    model = gae.gae.model.GCNModelVAE_XA_e2_d1_DCAfork(num_features, hidden1,hidden2,fc_dim1, dropout)
elif model_str=='gcn_vae_xa_e2_d1_dcaElemPi':
    model = gae.gae.model.GCNModelVAE_XA_e2_d1_DCAelemPi(num_features, hidden1,hidden2,fc_dim1, dropout,shareGenePi)
elif model_str=='gcn_vae_xa_e2_d1_dcaConstantDisp':
    model = gae.gae.model.GCNModelVAE_XA_e2_d1_DCA_constantDisp(num_features, hidden1,hidden2,fc_dim1, dropout,shareGenePi)
else:
    # Stop here: continuing would hit a NameError on `model` (or silently reuse a stale one).
    raise ValueError('model not found: '+str(model_str))
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only host; the embedding cell
# moves the model to the GPU when use_cuda is set.
model.load_state_dict(torch.load(os.path.join(modelsavepath,str(plotepoch)+'.pt'),map_location='cpu'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [23]:
np.random.seed(seed)
def plotembeddingbyCT(ctlist,savename,excludelist,embedding,savepath,plotname,plotdimx=0,plotdimy=1,savenameAdd=''):
    """Scatter-plot a 2-D embedding with one colour per label and save it as a JPEG.

    Parameters:
        ctlist: per-point labels (array-like, one entry per row of `embedding`).
        savename: base file name; the file written is savepath/<savename><savenameAdd>.jpg.
        excludelist: labels that are not drawn (they still reserve their colour slot).
        embedding: 2-D array indexed as embedding[mask, column].
        savepath: output directory; created if it does not exist.
        plotname: title prefix; the title is '<plotname> embedding'.
        plotdimx, plotdimy: columns of `embedding` used for the x and y axes.
        savenameAdd: suffix appended to `savename`.

    Returns:
        None. Closes every open matplotlib figure and runs the garbage collector
        before returning.
    """
    labels=np.asarray(ctlist)
    celltypes=np.unique(labels)
    # One "husl" colour per distinct label, assigned in sorted-label order.
    colortest=sns.color_palette("husl", celltypes.size)

    # Same guarantee as plotembeddingbyCT_contrast: never fail on a missing directory.
    os.makedirs(savepath,exist_ok=True)

    fig, ax = plt.subplots(dpi=400)
    for coloridx,ct in enumerate(celltypes):
        if ct in excludelist:
            continue
        idx=(labels==ct)
        ax.scatter(
            embedding[idx, plotdimx],
            embedding[idx, plotdimy],
            color=colortest[coloridx],label=ct,s=1.5,alpha=0.5
            )

    ax.set_aspect('equal', 'datalim')
    fig.set_figheight(5)
    fig.set_figwidth(5)
    # Shrink the axes vertically to leave room for the legend underneath.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.1,
                     box.width, box.height * 0.9])
    # Put a legend below current axis
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
              fancybox=True, shadow=True, ncol=5)
    ax.set_title(plotname+' embedding', fontsize=24)
    fig.savefig(os.path.join(savepath,savename+savenameAdd+'.jpg'))

    # Figures are produced in long loops; release them eagerly to keep memory flat.
    plt.close('all')
    gc.collect()

In [24]:
np.random.seed(seed)
def plotembeddingbyCT_contrast(ctlist,savename,excludelist,embedding,savepath,plotname,plotdimx=0,plotdimy=1,savenameAdd='',maxplot=None):
    """Save one "this label vs. everything else" scatter plot per label.

    For every distinct label in `ctlist` a figure is written to
    savepath/<savename>_<label><savenameAdd>.jpg, with the label's points drawn on
    top of all remaining points.

    Parameters:
        ctlist: per-point labels (array-like, one entry per row of `embedding`).
        savename: base file name.
        excludelist: labels for which no figure is produced.
        embedding: 2-D array indexed as embedding[mask, column].
        savepath: output directory; created if it does not exist.
        plotname: title prefix; the title is '<plotname> embedding'.
        plotdimx, plotdimy: columns of `embedding` used for the x and y axes.
        savenameAdd: suffix appended after the label in the file name.
        maxplot: when truthy, labels whose integer value exceeds it are skipped.

    Raises:
        ValueError: if `maxplot` is set and a label other than 'Unassigned'
            cannot be converted with int().
    """
    labels=np.asarray(ctlist)
    celltypes=np.unique(labels)

    colortest=sns.color_palette("tab10")
    os.makedirs(savepath,exist_ok=True)

    for ct in celltypes:
        # Decide whether to skip BEFORE creating a figure, so skipped labels do not leave
        # an unclosed figure behind; checking 'Unassigned' first also keeps it away from int().
        if ct == 'Unassigned' or ct in excludelist:
            continue
        if maxplot and int(ct)>maxplot:
            continue

        fig, ax = plt.subplots()

        # Background: every point that does not carry this label.
        idx=(labels!=ct)
        ax.scatter(
            embedding[idx, plotdimx],
            embedding[idx, plotdimy],
            color=colortest[1],label='others',s=1,alpha=0.5
            )

        # Foreground: the highlighted label, drawn larger and on top.
        idx=(labels==ct)
        ax.scatter(
            embedding[idx, plotdimx],
            embedding[idx, plotdimy],
            color=colortest[0],label=ct,s=3,alpha=0.5
            )

        ax.set_aspect('equal', 'datalim')
        fig.set_figheight(10)
        fig.set_figwidth(10)
        ax.legend()
        ax.set_title(plotname+' embedding', fontsize=24)
        fig.savefig(os.path.join(savepath,savename+'_'+str(ct)+savenameAdd+'.jpg'))

        # Release the figure right away; this runs once per label.
        plt.close('all')
        gc.collect()

In [25]:
# Re-seed NumPy's global RNG with the configured seed each time this cell is run.
np.random.seed(seed)
def inverseLeakyRelu(v,slope=0.01):
    """Undo a leaky-ReLU activation: negative entries are multiplied by 1/slope.

    Non-negative entries are returned unchanged. The input is NOT modified; a copy
    is transformed and returned, so arrays that share memory with a tensor (e.g.
    the result of `tensor.numpy()`) are left intact.

    Parameters:
        v: NumPy array (or any object exposing `.clone()`, such as a torch tensor).
        slope: negative-side slope of the leaky ReLU being inverted.

    Returns:
        A new array/tensor of the same shape as `v`.
    """
    # Copy first: the boolean-mask assignment below writes in place.
    out=v.clone() if hasattr(v,'clone') else np.array(v,copy=True)
    vnegidx=(out<0)
    out[vnegidx]=1/slope*out[vnegidx]
    return out

In [26]:
np.random.seed(seed)
def clusterLeiden_single(inArray,n_neighbors,n_pcs,min_dist,resolution,randseed=seed):
    """Run scanpy's PCA -> neighbours -> UMAP -> Leiden pipeline on `inArray`.

    `n_pcs` is capped at one less than the smaller dimension of `inArray`.
    Returns the Leiden labels from `adata.obs['leiden']` as a NumPy array.
    """
    nrows,ncols=inArray.shape[0],inArray.shape[1]
    n_pcs=np.min([nrows-1,ncols-1,n_pcs])

    adata=ad.AnnData(inArray)
    scanpy.tl.pca(adata, svd_solver='arpack')
    scanpy.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
    # The UMAP layout is computed on the AnnData object but is not part of the return value.
    scanpy.tl.umap(adata,min_dist=min_dist,random_state=randseed)
    scanpy.tl.leiden(adata,resolution=resolution,random_state=randseed)

    leiden_labels=adata.obs['leiden']
    return leiden_labels.to_numpy()

def clusterLeiden(inArray,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,randseed=seed):
    """Leiden-cluster `inArray` at every resolution, pickle the labels and plot them.

    For each resolution the labels are written to `clustersavedir` and drawn both
    on the current 2-D embedding and on the spatial coordinates.

    Parameters:
        inArray: matrix to cluster (one row per cell).
        n_neighbors, n_pcs, min_dist: forwarded to clusterLeiden_single.
        resolution: iterable of Leiden resolutions.
        sobj_coord_np: per-cell spatial coordinates, plotted with the cluster labels.
        randseed: random seed forwarded to clusterLeiden_single.

    Reads the notebook globals `clustersavedir`, `savedir`, `embedding`,
    `plottype`, `plotepoch` and `s` (current sample label), which the calling
    cell must have set.
    """
    contrastdir=os.path.join(savedir,'contrast')
    for r in resolution:
        # Forward the caller's seed (previously the global `seed` was always used).
        clusterRes=clusterLeiden_single(inArray,n_neighbors,n_pcs,min_dist,r,randseed=randseed)
        savenamecluster='leiden_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'n_pcs'+str(n_pcs)+'res'+str(r)+'epoch'+str(plotepoch)
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as output:
            pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

        # Clusters on the latent embedding.
        plotembeddingbyCT(clusterRes,'leiden',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'leiden',[],embedding,contrastdir,plottype+' of '+s,savenameAdd=savenamecluster)

        # Clusters on the physical cell positions.
        plotembeddingbyCT(clusterRes,'leiden_location',[],sobj_coord_np,savedir,'location'+' of '+s,savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'leiden_location',[],sobj_coord_np,contrastdir,'location'+' of '+s,savenameAdd=savenamecluster)

def clusterLeiden_allsample(inArray,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,samplenameList,randseed=seed):
    """Leiden-cluster the pooled latents of all samples, pickle the labels and plot them.

    The pooled clustering is drawn on the current 2-D embedding, then once per
    sample on that sample's spatial coordinates. Contrast plots are limited to
    clusters with integer label <= 50.

    Parameters:
        inArray: pooled matrix to cluster (one row per cell).
        n_neighbors, n_pcs, min_dist: forwarded to clusterLeiden_single.
        resolution: iterable of Leiden resolutions.
        sobj_coord_np: per-cell spatial coordinates, row-aligned with `inArray`.
        samplenameList: per-cell sample label, row-aligned with `inArray`.
        randseed: random seed forwarded to clusterLeiden_single.

    Reads the notebook globals `clustersavedir`, `savedir`, `embedding`,
    `plottype`, `plotepoch` and `plot_samples`.
    """
    contrastdir=os.path.join(savedir,'contrast')
    for r in resolution:
        # Forward the caller's seed (previously the global `seed` was always used).
        clusterRes=clusterLeiden_single(inArray,n_neighbors,n_pcs,min_dist,r,randseed=randseed)
        savenamecluster='leiden_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'n_pcs'+str(n_pcs)+'res'+str(r)+'epoch'+str(plotepoch)
        with open(os.path.join(clustersavedir,savenamecluster), 'wb') as output:
            pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)
        plotembeddingbyCT(clusterRes,'leiden',[],embedding,savedir,plottype+' of all samples',savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'leiden',[],embedding,contrastdir,plottype+' of all samples',savenameAdd=savenamecluster,maxplot=50)

        # Spatial view: one set of plots per sample, restricted to that sample's rows.
        for s in plot_samples.keys():
            sidx=(samplenameList==s)
            plotembeddingbyCT(clusterRes[sidx],'leiden_location'+s,[],sobj_coord_np[sidx],savedir,'location'+' of '+s,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes[sidx],'leiden_location'+s,[],sobj_coord_np[sidx],contrastdir,'location'+' of '+s,savenameAdd=savenamecluster,maxplot=50)

         

In [27]:
np.random.seed(seed)
def clusterDBscan_single(inArray,eps,min_samples,n_pcs):
    """DBSCAN on the leading principal components of `inArray`.

    The shared module-level `pca` is re-fit on `inArray`; the first `n_pcs`
    components (capped at one less than the smaller dimension) are clustered.
    Returns the label array produced by DBSCAN.fit_predict.
    """
    n_keep=np.min([inArray.shape[0]-1,inArray.shape[1]-1,n_pcs])
    projected=pca.fit_transform(inArray)
    dbscan=DBSCAN(eps=eps, min_samples=min_samples)
    return dbscan.fit_predict(projected[:,:n_keep])

def clusterDBscan(inArray,epsL,min_samplesL,n_pcs,sobj_coord_np):
    """Sweep DBSCAN over every (eps, min_samples) pair; pickle and plot each result.

    Uses the notebook globals `clustersavedir`, `savedir`, `embedding`,
    `plottype`, `plotepoch` and `s` set by the calling cell.
    """
    contrastdir=os.path.join(savedir,'contrast')
    for eps in epsL:
        for min_samples in min_samplesL:
            clusterRes=clusterDBscan_single(inArray,eps,min_samples,n_pcs)
            savenamecluster=''.join(['dbscan_eps',str(eps),'msamples',str(min_samples),'n_pcs',str(n_pcs),'epoch',str(plotepoch)])
            picklepath=os.path.join(clustersavedir,savenamecluster)
            with open(picklepath, 'wb') as output:
                pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

            # First the latent embedding, then the physical cell positions.
            views=(
                ('dbscan',embedding,plottype+' of '+s),
                ('dbscan_location',sobj_coord_np,'location'+' of '+s),
            )
            for prefix,coords,title in views:
                plotembeddingbyCT(clusterRes,prefix,[],coords,savedir,title,savenameAdd=savenamecluster)
                plotembeddingbyCT_contrast(clusterRes,prefix,[],coords,contrastdir,title,savenameAdd=savenamecluster)

def clusterDBscan_allsample(inArray,epsL,min_samplesL,n_pcs,sobj_coord_np,samplenameList):
    """Sweep DBSCAN over the pooled latents; pickle each result and plot it.

    Each clustering is drawn on the pooled embedding and then, per sample, on the
    spatial coordinates of that sample's rows. Uses the notebook globals
    `clustersavedir`, `savedir`, `embedding`, `plottype`, `plotepoch` and
    `plot_samples`.
    """
    contrastdir=os.path.join(savedir,'contrast')
    pooledtitle=plottype+' of all samples'
    for eps in epsL:
        for min_samples in min_samplesL:
            clusterRes=clusterDBscan_single(inArray,eps,min_samples,n_pcs)
            # NOTE(review): this tag says 'msample' while clusterDBscan writes 'msamples'.
            savenamecluster=''.join(['dbscan_eps',str(eps),'msample',str(min_samples),'n_pcs',str(n_pcs),'epoch',str(plotepoch)])
            picklepath=os.path.join(clustersavedir,savenamecluster)
            with open(picklepath, 'wb') as output:
                pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

            plotembeddingbyCT(clusterRes,'dbscan',[],embedding,savedir,pooledtitle,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes,'dbscan',[],embedding,contrastdir,pooledtitle,savenameAdd=savenamecluster,maxplot=50)

            for s in plot_samples.keys():
                sidx=(samplenameList==s)
                samplelabels=clusterRes[sidx]
                samplecoords=sobj_coord_np[sidx]
                prefix='dbscan_location'+s
                title='location'+' of '+s
                plotembeddingbyCT(samplelabels,prefix,[],samplecoords,savedir,title,savenameAdd=savenamecluster)
                plotembeddingbyCT_contrast(samplelabels,prefix,[],samplecoords,contrastdir,title,savenameAdd=savenamecluster,maxplot=50)

         

In [28]:
np.random.seed(seed)
def clusterAgg_single(inArray,ncluster,aggmetric,n_pcs):
    """Agglomerative clustering on the leading principal components of `inArray`.

    Parameters:
        inArray: matrix to cluster (one row per cell).
        ncluster: number of clusters to produce.
        aggmetric: distance metric name handed to AgglomerativeClustering.
        n_pcs: number of principal components to keep (capped at one less than
            the smaller dimension of `inArray`).

    Returns:
        The label array from AgglomerativeClustering.fit_predict.
    """
    n_pcs=np.min([inArray.shape[0]-1,inArray.shape[1]-1,n_pcs])
    inArray=pca.fit_transform(inArray)
    # Newer scikit-learn names this keyword `metric`; older releases only know `affinity`.
    # Try the current name first and fall back so the notebook runs on both.
    try:
        clusterer=AgglomerativeClustering(n_clusters=ncluster,metric=aggmetric)
    except TypeError:
        clusterer=AgglomerativeClustering(n_clusters=ncluster,affinity=aggmetric)
    labels = clusterer.fit_predict(inArray[:,:n_pcs])
    return labels

def clusterAgg(inArray,nclusterL,aggmetricL,n_pcs,sobj_coord_np):
    """Sweep agglomerative clustering over cluster counts and metrics; pickle and plot.

    Uses the notebook globals `clustersavedir`, `savedir`, `embedding`,
    `plottype`, `plotepoch` and `s` set by the calling cell.
    """
    contrastdir=os.path.join(savedir,'contrast')
    for ncluster in nclusterL:
        for aggmetric in aggmetricL:
            clusterRes=clusterAgg_single(inArray,ncluster,aggmetric,n_pcs)
            savenamecluster=''.join(['agg_ncluster',str(ncluster),aggmetric,'n_pcs',str(n_pcs),'epoch',str(plotepoch)])
            picklepath=os.path.join(clustersavedir,savenamecluster)
            with open(picklepath, 'wb') as output:
                pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

            # First the latent embedding, then the physical cell positions.
            views=(
                ('agg',embedding,plottype+' of '+s),
                ('agg_location',sobj_coord_np,'location'+' of '+s),
            )
            for prefix,coords,title in views:
                plotembeddingbyCT(clusterRes,prefix,[],coords,savedir,title,savenameAdd=savenamecluster)
                plotembeddingbyCT_contrast(clusterRes,prefix,[],coords,contrastdir,title,savenameAdd=savenamecluster)

def clusterAgg_allsample(inArray,nclusterL,aggmetricL,n_pcs,sobj_coord_np,samplenameList):
    """Sweep agglomerative clustering over the pooled latents; pickle and plot.

    Each clustering is drawn on the pooled embedding and then, per sample, on the
    spatial coordinates of that sample's rows. Uses the notebook globals
    `clustersavedir`, `savedir`, `embedding`, `plottype`, `plotepoch` and
    `plot_samples`.
    """
    contrastdir=os.path.join(savedir,'contrast')
    pooledtitle=plottype+' of all samples'
    for ncluster in nclusterL:
        for aggmetric in aggmetricL:
            clusterRes=clusterAgg_single(inArray,ncluster,aggmetric,n_pcs)
            savenamecluster=''.join(['agg_ncluster',str(ncluster),aggmetric,'n_pcs',str(n_pcs),'epoch',str(plotepoch)])
            picklepath=os.path.join(clustersavedir,savenamecluster)
            with open(picklepath, 'wb') as output:
                pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

            plotembeddingbyCT(clusterRes,'agg',[],embedding,savedir,pooledtitle,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes,'agg',[],embedding,contrastdir,pooledtitle,savenameAdd=savenamecluster,maxplot=50)

            for s in plot_samples.keys():
                sidx=(samplenameList==s)
                samplelabels=clusterRes[sidx]
                samplecoords=sobj_coord_np[sidx]
                prefix='agg_location'+s
                title='location'+' of '+s
                plotembeddingbyCT(samplelabels,prefix,[],samplecoords,savedir,title,savenameAdd=savenamecluster)
                plotembeddingbyCT_contrast(samplelabels,prefix,[],samplecoords,contrastdir,title,savenameAdd=savenamecluster,maxplot=50)

         

In [29]:
np.random.seed(seed)
def clusterMinibatchKmean_single(inArray,ncluster,n_pcs,batchsize=100):
    """Mini-batch k-means on the leading principal components of `inArray`.

    `n_pcs` is capped at one less than the smaller dimension of `inArray`, and the
    batch size at a third of (rows-1) and of (columns-1). The shared module-level
    `pca` is re-fit on `inArray`. Returns the labels from fit_predict.
    """
    nrows,ncols=inArray.shape[0],inArray.shape[1]
    n_keep=np.min([nrows-1,ncols-1,n_pcs])
    batchsize=int(np.min([(nrows-1)/3,(ncols-1)/3,batchsize]))
    projected=pca.fit_transform(inArray)
    kmeans=MiniBatchKMeans(n_clusters=ncluster,random_state=seed,batch_size=batchsize)
    return kmeans.fit_predict(projected[:,:n_keep])

def clusterMinibatchKmean(inArray,nclusterL,n_pcs,sobj_coord_np):
    """Sweep mini-batch k-means over the given cluster counts; pickle and plot.

    Uses the notebook globals `clustersavedir`, `savedir`, `embedding`,
    `plottype`, `plotepoch` and `s` set by the calling cell.
    """
    contrastdir=os.path.join(savedir,'contrast')
    for ncluster in nclusterL:
        clusterRes=clusterMinibatchKmean_single(inArray,ncluster,n_pcs)
        savenamecluster=''.join(['minibatchkmean_ncluster',str(ncluster),'n_pcs',str(n_pcs),'epoch',str(plotepoch)])
        picklepath=os.path.join(clustersavedir,savenamecluster)
        with open(picklepath, 'wb') as output:
            pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

        # First the latent embedding, then the physical cell positions.
        views=(
            ('minibatchkmean',embedding,plottype+' of '+s),
            ('minibatchkmean_location',sobj_coord_np,'location'+' of '+s),
        )
        for prefix,coords,title in views:
            plotembeddingbyCT(clusterRes,prefix,[],coords,savedir,title,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(clusterRes,prefix,[],coords,contrastdir,title,savenameAdd=savenamecluster)

def clusterMinibatchKmean_allsample(inArray,nclusterL,n_pcs,sobj_coord_np,samplenameList):
    """Sweep mini-batch k-means over the pooled latents; pickle and plot.

    Each clustering is drawn on the pooled embedding and then, per sample, on the
    spatial coordinates of that sample's rows. Uses the notebook globals
    `clustersavedir`, `savedir`, `embedding`, `plottype`, `plotepoch` and
    `plot_samples`.
    """
    contrastdir=os.path.join(savedir,'contrast')
    pooledtitle=plottype+' of all samples'
    for ncluster in nclusterL:
        clusterRes=clusterMinibatchKmean_single(inArray,ncluster,n_pcs)
        savenamecluster=''.join(['minibatchkmean_ncluster',str(ncluster),'n_pcs',str(n_pcs),'epoch',str(plotepoch)])
        picklepath=os.path.join(clustersavedir,savenamecluster)
        with open(picklepath, 'wb') as output:
            pickle.dump(clusterRes, output, pickle.HIGHEST_PROTOCOL)

        plotembeddingbyCT(clusterRes,'minibatchkmean',[],embedding,savedir,pooledtitle,savenameAdd=savenamecluster)
        plotembeddingbyCT_contrast(clusterRes,'minibatchkmean',[],embedding,contrastdir,pooledtitle,savenameAdd=savenamecluster,maxplot=50)

        for s in plot_samples.keys():
            sidx=(samplenameList==s)
            samplelabels=clusterRes[sidx]
            samplecoords=sobj_coord_np[sidx]
            prefix='minibatchkmean_location'+s
            title='location'+' of '+s
            plotembeddingbyCT(samplelabels,prefix,[],samplecoords,savedir,title,savenameAdd=savenamecluster)
            plotembeddingbyCT_contrast(samplelabels,prefix,[],samplecoords,contrastdir,title,savenameAdd=savenamecluster,maxplot=50)

         

In [30]:
#compute embeddings
# Run the trained model once per (sample, feature representation) and keep the latent means
# in mulist[<sample label>+'X_'+<representation>] as NumPy arrays.
mulist={}
# Device placement and eval mode are properties of the model, not of a sample: set them once.
if use_cuda:
    model.cuda()
model.eval()
for s in plot_samples.keys():
    adj=adj_list[s]
    adj_norm = preprocessing.preprocess_graph(adj)
    adj_decode=None
    if adj_decodeName == 'gala':
        adj_decode=preprocessing.preprocess_graph_sharp(adj)
    if use_cuda:
        adj_norm=adj_norm.cuda()
        # Guard on the object itself so an unrecognised adj_decodeName cannot call .cuda() on None.
        if adj_decode is not None:
            adj_decode=adj_decode.cuda()
    for xcorr in plot_sample_X:
        samplename=s+'X_'+xcorr
        features=featureslist[samplename]
        if standardizeX:
            features=torch.tensor(scale(features,axis=0, with_mean=True, with_std=True, copy=True))
        # Cast to float32 on both CPU and GPU paths (previously only the CUDA path cast).
        features=features.float()
        if use_cuda:
            features=features.cuda()

        # Inference only: no autograd graph is needed, which saves memory on large samples.
        with torch.no_grad():
            if adj_decodeName is None:
                adj_recon,mu,logvar,z, features_recon = model(features, adj_norm)
            else:
                adj_recon,mu,logvar,z, features_recon = model(features, adj_norm,adj_decode)
        if inverseAct=='leakyRelu':
            muplot=inverseLeakyRelu(mu.cpu().detach().numpy())
        else:
            muplot=mu.cpu().detach().numpy()
        mulist[samplename]=muplot

In [14]:
#all cells
# For every sample: project the latent means to 2-D, plot them coloured by annotation,
# then run each enabled clustering method on the full latent matrix.
# The clustering helpers read the globals `s`, `embedding`, `savedir` and `clustersavedir`
# assigned inside this loop, so those names must not be renamed.
np.random.seed(seed)
for s in plot_samples.keys():
    print(s)
    sampleidx=plot_samples[s]

    # Per-cell annotations and spatial coordinates of this sample.
    samplemask=scaleddata.obs['sample']==sampleidx
    celltype_broad=scaleddata.obs.loc[samplemask,'top_level']
    celltype_sub=scaleddata.obs.loc[samplemask,'cell_type_label']
    region=scaleddata.obs.loc[samplemask,'region']
    sobj_coord_np=scaleddata.obs.loc[samplemask,['x','y']].to_numpy()
    for xcorr in plot_sample_X:
        latentkey=s+'X_'+xcorr
        samplename=latentkey
        # Work on a copy so the stored latents can be compared against it afterwards.
        muplot=np.copy(mulist[latentkey])

        if inverseAct:
            samplename+='_beforeAct'

        sampledir=os.path.join(plotsavepath,samplename)
        savedir=os.path.join(plotsavepath,samplename,'embedding_'+plottype)
        clustersavedir=os.path.join(plotsavepath,samplename,'cluster')
        # makedirs also creates missing parents (os.mkdir fails when plotsavepath is absent).
        os.makedirs(savedir,exist_ok=True)
        os.makedirs(clustersavedir,exist_ok=True)

        if plottype=='umap':
            reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
            embedding = reducer.fit_transform(muplot)
            savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
        elif plottype=='pca':
            embedding=pca.fit_transform(muplot)
            savenameAdd='_epoch'+str(plotepoch)
        else:
            # Without this, a stale `embedding` from an earlier iteration would be plotted.
            raise ValueError('unknown plottype: '+str(plottype))

        if ifplot:
            plotembeddingbyCT(celltype_broad,'celltype_broad',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenameAdd)
            plotembeddingbyCT(celltype_sub,'celltype_sub',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenameAdd)
            plotembeddingbyCT(region,'region',[],embedding,savedir,plottype+' of '+s,savenameAdd=savenameAdd)

            plotembeddingbyCT_contrast(celltype_sub,'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+s,savenameAdd=savenameAdd)

        # Too few cells to cluster meaningfully.
        if embedding.shape[0]<minCells:
            continue
        if ifcluster:
            # After each method, verify the helper did not modify the latents in place.
            # array_equal compares element-wise; a sum of differences could cancel to zero.
            if 'leiden' in clustermethod:
                clusterLeiden(muplot,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,randseed=seed)
                assert np.array_equal(muplot,mulist[latentkey])
            if 'dbscan' in clustermethod:
                clusterDBscan(muplot,epslist,min_sampleslist,n_pcs,sobj_coord_np)
                assert np.array_equal(muplot,mulist[latentkey])
            if 'agglomerative' in clustermethod:
                clusterAgg(muplot,nclusterlist,aggMetric,n_pcs,sobj_coord_np)
                assert np.array_equal(muplot,mulist[latentkey])
            if 'kmeanbatch' in clustermethod:
                clusterMinibatchKmean(muplot,nclusterlist,n_pcs,sobj_coord_np)
                assert np.array_equal(muplot,mulist[latentkey])


disease13
control13
disease8


KeyboardInterrupt: 

In [None]:
# testmtx=metrics.pairwise_distances(pca.fit_transform(muplot)[:,:40])
# muplot=np.copy(mulist[s+'X_'+xcorr])
# clusterDBscan(muplot,epslist,min_sampleslist,n_pcs,sobj_coord_np)
# np.sum(muplot-np.copy(mulist[s+'X_'+xcorr]))
# plt.hist(testmtx)
# plt.savefig(os.path.join(sampledir,'distancePC40hist.jpg'))


In [14]:
# separate plots by cell type
# Same pipeline as the all-cells cell, but restricted to one 'top_level' cell type at a time,
# plus the pooled groups defined in combineCelltype.
# The clustering helpers read the globals `s`, `embedding`, `savedir` and `clustersavedir`
# assigned inside this loop, so those names must not be renamed.
np.random.seed(seed)
for s in plot_samples.keys():
    # NOTE(review): hard-coded skip, apparently left over from resuming an interrupted run;
    # only 'control8' is processed while it is in place.
    if s in ['disease13','control13','disease8']:
        continue
    print(s)
    sampleidx=plot_samples[s]
    samplemask=scaleddata.obs['sample']==sampleidx
    celltype_broad=scaleddata.obs.loc[samplemask,'top_level']
    celltype_sub=scaleddata.obs.loc[samplemask,'cell_type_label']
    region=scaleddata.obs.loc[samplemask,'region']
    sobj_coord_np=scaleddata.obs.loc[samplemask,['x','y']].to_numpy()

    # Individual cell types present in this sample, followed by the pooled group names.
    origCT=np.unique(celltype_broad)
    celltypeplot=np.concatenate((origCT,list(combineCelltype.keys())),axis=None)
    for xcorr in plot_sample_X:
        latentkey=s+'X_'+xcorr
        samplename=latentkey
        # Work on a copy so the stored latents can be compared against it afterwards.
        muplot=np.copy(mulist[latentkey])

        if inverseAct:
            samplename+='_beforeAct'
        sampledir=os.path.join(plotsavepath,samplename)
        os.makedirs(sampledir,exist_ok=True)

        for ct in celltypeplot:
            # NOTE(review): hard-coded skip of an already-processed type (see note above).
            if s=='control8' and ct in ['Astro']:
                continue
            print(ct)

            savedir=os.path.join(plotsavepath,samplename,'embedding_'+plottype+'_'+ct)
            clustersavedir=os.path.join(plotsavepath,samplename,'cluster'+'_'+ct)
            os.makedirs(savedir,exist_ok=True)
            os.makedirs(clustersavedir,exist_ok=True)

            # Boolean mask over this sample's cells: one type, or the union of a pooled group.
            if ct in origCT:
                ct_idx=celltype_broad==ct
            else:
                ct_idx=celltype_broad.isin(combineCelltype[ct])

            if plottype=='umap':
                reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
                embedding = reducer.fit_transform(muplot[ct_idx])
                savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
            elif plottype=='pca':
                embedding=pca.fit_transform(muplot[ct_idx])
                savenameAdd='_epoch'+str(plotepoch)
            else:
                # Without this, a stale `embedding` from an earlier iteration would be plotted.
                raise ValueError('unknown plottype: '+str(plottype))

            if ifplot:
                plotembeddingbyCT(celltype_sub[ct_idx],'celltype_sub',[],embedding,savedir,plottype+' of '+s+' '+ct,savenameAdd=savenameAdd)
                plotembeddingbyCT(region[ct_idx],'region',[],embedding,savedir,plottype+' of '+s+' '+ct,savenameAdd=savenameAdd)

                plotembeddingbyCT_contrast(celltype_sub[ct_idx],'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+s+' '+ct,savenameAdd=savenameAdd)

            # Too few cells of this type to cluster meaningfully.
            if embedding.shape[0]<minCells:
                continue
            if ifcluster:
                # After each method, verify the helper did not modify the latents in place.
                # array_equal compares element-wise; a sum of differences could cancel to zero.
                if 'leiden' in clustermethod:
                    clusterLeiden(muplot[ct_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[ct_idx],randseed=seed)
                    assert np.array_equal(muplot,mulist[latentkey])
                if 'dbscan' in clustermethod:
                    clusterDBscan(muplot[ct_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[ct_idx])
                    assert np.array_equal(muplot,mulist[latentkey])
                if 'agglomerative' in clustermethod:
                    clusterAgg(muplot[ct_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[ct_idx])
                    assert np.array_equal(muplot,mulist[latentkey])
                if 'kmeanbatch' in clustermethod:
                    clusterMinibatchKmean(muplot[ct_idx],nclusterlist,n_pcs,sobj_coord_np[ct_idx])
                    assert np.array_equal(muplot,mulist[latentkey])

control8
CA1
CA2
CA3
DG
Endo
Ex
Inhi
LHb


  "n_neighbors is larger than the dataset size; truncating to "


Micro
OPC
Oligo
SMC
glia
CA


In [15]:
# separate plots by region
# Same pipeline as the all-cells cell, but restricted to one anatomical region at a time.
# The clustering helpers read the globals `s`, `embedding`, `savedir` and `clustersavedir`
# assigned inside this loop, so those names must not be renamed.
np.random.seed(seed)
for s in plot_samples.keys():
    print(s)
    sampleidx=plot_samples[s]
    samplemask=scaleddata.obs['sample']==sampleidx
    celltype_broad=scaleddata.obs.loc[samplemask,'top_level']
    celltype_sub=scaleddata.obs.loc[samplemask,'cell_type_label']
    region=scaleddata.obs.loc[samplemask,'region']
    sobj_coord_np=scaleddata.obs.loc[samplemask,['x','y']].to_numpy()
    for xcorr in plot_sample_X:
        latentkey=s+'X_'+xcorr
        samplename=latentkey
        # Work on a copy so the stored latents can be compared against it afterwards.
        muplot=np.copy(mulist[latentkey])

        if inverseAct:
            samplename+='_beforeAct'
        sampledir=os.path.join(plotsavepath,samplename)
        os.makedirs(sampledir,exist_ok=True)

        for reg in np.unique(region):
            print(reg)
            savedir=os.path.join(plotsavepath,samplename,'embedding_'+plottype+'_'+reg)
            clustersavedir=os.path.join(plotsavepath,samplename,'cluster'+'_'+reg)
            os.makedirs(savedir,exist_ok=True)
            os.makedirs(clustersavedir,exist_ok=True)

            # Boolean mask over this sample's cells selecting the current region.
            reg_idx=region==reg

            if plottype=='umap':
                reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
                embedding = reducer.fit_transform(muplot[reg_idx])
                savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
            elif plottype=='pca':
                embedding=pca.fit_transform(muplot[reg_idx])
                savenameAdd='_epoch'+str(plotepoch)
            else:
                # Without this, a stale `embedding` from an earlier iteration would be plotted.
                raise ValueError('unknown plottype: '+str(plottype))

            if ifplot:
                plotembeddingbyCT(celltype_broad[reg_idx],'celltype_broad',[],embedding,savedir,plottype+' of '+s+' '+reg,savenameAdd=savenameAdd)
                plotembeddingbyCT(celltype_sub[reg_idx],'celltype_sub',[],embedding,savedir,plottype+' of '+s+' '+reg,savenameAdd=savenameAdd)

                plotembeddingbyCT_contrast(celltype_sub[reg_idx],'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+s+' '+reg,savenameAdd=savenameAdd)

            # Too few cells in this region to cluster meaningfully.
            if embedding.shape[0]<minCells:
                continue
            if ifcluster:
                # After each method, verify the helper did not modify the latents in place.
                # array_equal compares element-wise; a sum of differences could cancel to zero.
                if 'leiden' in clustermethod:
                    clusterLeiden(muplot[reg_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[reg_idx],randseed=seed)
                    assert np.array_equal(muplot,mulist[latentkey])
                if 'dbscan' in clustermethod:
                    clusterDBscan(muplot[reg_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[reg_idx])
                    assert np.array_equal(muplot,mulist[latentkey])
                if 'agglomerative' in clustermethod:
                    clusterAgg(muplot[reg_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[reg_idx])
                    assert np.array_equal(muplot,mulist[latentkey])
                if 'kmeanbatch' in clustermethod:
                    clusterMinibatchKmean(muplot[reg_idx],nclusterlist,n_pcs,sobj_coord_np[reg_idx])
                    assert np.array_equal(muplot,mulist[latentkey])

disease13
Cortex
Hippocampus
White Matter
control13
Cortex
Hippocampus
White Matter
disease8
Cortex
Hippocampus
White Matter
control8
Cortex
Hippocampus
White Matter


In [33]:
# combine all latents to one plot
#
# For every feature set in `plot_sample_X`, the latent matrices of all samples
# in `plot_samples` are stacked and embedded (UMAP or PCA) at three levels:
#   1. all cells, 2. the cells of one region, 3. the cells of one
#   (region, cell type) subset.
# Plots are produced at every level; clustering is only active at level 3
# (the coarser clustering calls are kept commented out below).
#
# NOTE(review): `savedir`, `clustersavedir`, `embedding` and `savenameAdd` are
# reassigned as notebook-level names at every level and are never passed to the
# plotting/clustering helpers, so those helpers presumably read them as
# globals -- confirm before moving any of this into functions.
if plottype not in ('umap','pca'):
    # Fail early: otherwise `embedding` would silently keep a stale value
    # (or be undefined) for an unsupported plot type.
    raise ValueError("plottype must be 'umap' or 'pca', got "+repr(plottype))

np.random.seed(seed)
for xcorr in plot_sample_X:
    # Per-sample pieces are collected here and concatenated once after the loop.
    latent_parts=[]
    celltype_broad_parts=[]
    celltype_sub_parts=[]
    region_parts=[]
    coord_parts=[]
    samplename_parts=[]

    for s in plot_samples.keys():
        sampleidx=plot_samples[s]
        samplename=s+'X_'+xcorr
        muplot=np.copy(mulist[samplename])
        # Rows of the annotation table that belong to this sample.
        sample_mask=scaleddata.obs['sample']==sampleidx

        latent_parts.append(muplot)
        celltype_broad_parts.append(scaleddata.obs.loc[sample_mask,'top_level'])
        celltype_sub_parts.append(scaleddata.obs.loc[sample_mask,'cell_type_label'])
        region_parts.append(scaleddata.obs.loc[sample_mask,'region'])
        coord_parts.append(scaleddata.obs.loc[sample_mask,['x','y']].to_numpy())
        samplename_parts.append(np.repeat(s,muplot.shape[0]))

    # Row i of every array below refers to the same cell.
    latents=np.vstack(latent_parts)
    celltype_broad=np.concatenate(celltype_broad_parts,axis=None)
    celltype_sub=np.concatenate(celltype_sub_parts,axis=None)
    region=np.concatenate(region_parts,axis=None)
    sobj_coord_np=np.concatenate(coord_parts,axis=0)
    samplenameList=np.concatenate(samplename_parts,axis=None)

    origCT=np.unique(celltype_broad)
    # Cell types to iterate over: the observed labels plus the merged groups
    # defined in `combineCelltype`.
    celltypeplot=np.concatenate((origCT,list(combineCelltype.keys())),axis=None)
    sampledir=os.path.join(plotsavepath,'combined'+xcorr)
    if inverseAct:
        sampledir+='_beforeAct'
    savedir=os.path.join(sampledir,'embedding_'+plottype)
    clustersavedir=os.path.join(sampledir,'cluster')
    # makedirs also creates `sampledir` itself when it is missing.
    os.makedirs(savedir,exist_ok=True)
    os.makedirs(clustersavedir,exist_ok=True)

    if plottype=='umap':
        reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
        embedding = reducer.fit_transform(latents)
        savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
    elif plottype=='pca':
        embedding=pca.fit_transform(latents)
        savenameAdd='_epoch'+str(plotepoch)
    if ifplot:
        # NOTE(review): the plot names below lack a space after `plottype`
        # ("umapof all samples"); left unchanged in case they feed file names.
        plotembeddingbyCT(samplenameList,'sample',[],embedding,savedir,plottype+'of all samples',savenameAdd=savenameAdd)
        plotembeddingbyCT(celltype_broad,'celltype_broad',[],embedding,savedir,plottype+'all samples',savenameAdd=savenameAdd)
        plotembeddingbyCT(celltype_sub,'celltype_sub',[],embedding,savedir,plottype+'all samples',savenameAdd=savenameAdd)
        plotembeddingbyCT(region,'region',[],embedding,savedir,plottype+'all samples',savenameAdd=savenameAdd)

        plotembeddingbyCT_contrast(celltype_sub,'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+'all samples',savenameAdd=savenameAdd)

    # Too few cells overall: skip the per-region analysis for this feature set.
    if embedding.shape[0]<minCells:
        continue
#     if ifcluster:
#         if 'leiden' in clustermethod:
#             clusterLeiden_allsample(latents,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,samplenameList,randseed=seed)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#         if 'dbscan' in clustermethod:
#             clusterDBscan_allsample(latents,epslist,min_sampleslist,n_pcs,sobj_coord_np,samplenameList)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#         if 'agglomerative' in clustermethod:
#             clusterAgg_allsample(latents,nclusterlist,aggMetric,n_pcs,sobj_coord_np,samplenameList)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#         if 'kmeanbatch' in clustermethod:
#             clusterMinibatchKmean_allsample(latents,nclusterlist,n_pcs,sobj_coord_np,samplenameList)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
    #by region
    for reg in np.unique(region):
        savedir=os.path.join(sampledir,'embedding_'+plottype+'_'+reg)
        clustersavedir=os.path.join(sampledir,'cluster'+'_'+reg)
        os.makedirs(savedir,exist_ok=True)
        os.makedirs(clustersavedir,exist_ok=True)

        reg_idx=region==reg

        if plottype=='umap':
            reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
            embedding = reducer.fit_transform(latents[reg_idx])
            savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
        elif plottype=='pca':
            embedding=pca.fit_transform(latents[reg_idx])
            savenameAdd='_epoch'+str(plotepoch)
        if ifplot:
            plotembeddingbyCT(samplenameList[reg_idx],'sample',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)
            plotembeddingbyCT(celltype_broad[reg_idx],'celltype_broad',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)
            plotembeddingbyCT(celltype_sub[reg_idx],'celltype_sub',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)
    #             plotembeddingbyCT(region,'region',[],embedding[reg_idx],savedir,'UMAP of '+s)

            plotembeddingbyCT_contrast(celltype_sub[reg_idx],'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)

        # Too few cells in this region: skip its per-cell-type analysis.
        if embedding.shape[0]<minCells:
            continue
#         if ifcluster:
#             if 'leiden' in clustermethod:
#                 clusterLeiden_allsample(latents[reg_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[reg_idx],samplenameList[reg_idx],randseed=seed)
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#             if 'dbscan' in clustermethod:
#                 clusterDBscan_allsample(latents[reg_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[reg_idx],samplenameList[reg_idx])
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#             if 'agglomerative' in clustermethod:
#                 clusterAgg_allsample(latents[reg_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[reg_idx],samplenameList[reg_idx])
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#             if 'kmeanbatch' in clustermethod:
#                 clusterMinibatchKmean_allsample(latents[reg_idx],nclusterlist,n_pcs,sobj_coord_np[reg_idx],samplenameList[reg_idx])
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
        #by region and celltype
        for ct in celltypeplot:
            # Only these (region, cell type) pairs are analysed.
            if not ((reg=='Cortex' and ct in ['Ex']) or (reg=='Hippocampus' and ct in ['CA1','DG','Micro','CA'])):
#             if not (reg=='Hippocampus' and ct in ['CA']):
                continue
            print(reg+ct)
            savedir=os.path.join(sampledir,'embedding_'+plottype+'_'+reg+ct)
            clustersavedir=os.path.join(sampledir,'cluster'+'_'+reg+ct)
            os.makedirs(savedir,exist_ok=True)
            os.makedirs(clustersavedir,exist_ok=True)

            if ct in origCT:
                ct_idx=celltype_broad==ct
            else:
                # Merged group: union of the member cell types.
                ct_idx=False
                for i in combineCelltype[ct]:
                    ct_idx=np.logical_or(ct_idx,celltype_broad==i)
            # Restrict the cell-type mask to the current region.
            ct_idx=np.logical_and(reg_idx,ct_idx)

            if np.sum(ct_idx)<3:
                continue
            if plottype=='umap':
                reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
                embedding = reducer.fit_transform(latents[ct_idx])
                savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
            elif plottype=='pca':
                embedding=pca.fit_transform(latents[ct_idx])
                savenameAdd='_epoch'+str(plotepoch)

            if ifplot:
                plotembeddingbyCT(samplenameList[ct_idx],'sample',[],embedding,savedir,plottype+' of '+reg+' all samples'+' '+ct,savenameAdd=savenameAdd)
        #         plotembeddingbyCT(celltype_broad[reg_idx],'celltype_broad',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg)
                plotembeddingbyCT(celltype_sub[ct_idx],'celltype_sub',[],embedding,savedir,plottype+' of '+reg+' all samples'+' '+ct,savenameAdd=savenameAdd)
    #             plotembeddingbyCT(region[ct_idx],'region',[],embedding,savedir,plottype+' of '+reg+' all samples'+' '+ct)

                plotembeddingbyCT_contrast(celltype_sub[ct_idx],'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+reg+' all samples'+' '+ct,savenameAdd=savenameAdd)

            if embedding.shape[0]<minCells:
                continue
            if ifcluster:
                # After each clustering call, check that the source latents were
                # not modified. `muplot` and `s` are left over from the loading
                # loop, so only the last loaded sample is compared.
                # array_equal is used instead of sum(diff)==0, which can pass
                # when positive and negative differences cancel.
                if 'leiden' in clustermethod:
                    clusterLeiden_allsample(latents[ct_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[ct_idx],samplenameList[ct_idx],randseed=seed)
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during leiden clustering'
                if 'dbscan' in clustermethod:
                    clusterDBscan_allsample(latents[ct_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during dbscan clustering'
                if 'agglomerative' in clustermethod:
                    clusterAgg_allsample(latents[ct_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during agglomerative clustering'
                if 'kmeanbatch' in clustermethod:
                    clusterMinibatchKmean_allsample(latents[ct_idx],nclusterlist,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during kmeanbatch clustering'
#     #by celltype
#     for ct in celltypeplot:
#         print(ct)
#         savedir=os.path.join(sampledir,'embedding_'+plottype+'_'+ct)
#         clustersavedir=os.path.join(sampledir,'cluster'+'_'+ct)
#         if not os.path.exists(savedir):
#             os.mkdir(savedir)
#         if not os.path.exists(clustersavedir):
#             os.mkdir(clustersavedir)

#         if ct in origCT:
#             ct_idx=celltype_broad==ct
#         else:
#             ct_idx=False
#             for i in combineCelltype[ct]:
#                 ct_idx=np.logical_or(ct_idx,celltype_broad==i)

#         if plottype=='umap':
#             reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
#             embedding = reducer.fit_transform(latents[ct_idx])
#             savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
#         elif plottype=='pca':
#             embedding=pca.fit_transform(latents[ct_idx])
#             savenameAdd='_epoch'+str(plotepoch)
#         if ifplot:
#             plotembeddingbyCT(samplenameList[ct_idx],'sample',[],embedding,savedir,plottype+' of '+'all samples'+' '+ct,savenameAdd=savenameAdd)
#     #         plotembeddingbyCT(celltype_broad[reg_idx],'celltype_broad',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg)
#             plotembeddingbyCT(celltype_sub[ct_idx],'celltype_sub',[],embedding,savedir,plottype+' of '+'all samples'+' '+ct,savenameAdd=savenameAdd)
#             plotembeddingbyCT(region[ct_idx],'region',[],embedding,savedir,plottype+' of '+'all samples'+' '+ct,savenameAdd=savenameAdd)

#             plotembeddingbyCT_contrast(celltype_sub[ct_idx],'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+'all samples'+' '+ct,savenameAdd=savenameAdd)
        
#         if embedding.shape[0]<minCells:
#             continue
#         if ifcluster:
#             if 'leiden' in clustermethod:
#                 clusterLeiden_allsample(latents[ct_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[ct_idx],samplenameList[ct_idx],randseed=seed)
#                 assert np.sum(muplot-np.copy(mulist[s+'X_'+xcorr]))==0
#             if 'dbscan' in clustermethod:
#                 clusterDBscan_allsample(latents[ct_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
#                 assert np.sum(muplot-np.copy(mulist[s+'X_'+xcorr]))==0
#             if 'agglomerative' in clustermethod:
#                 clusterAgg_allsample(latents[ct_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
#                 assert np.sum(muplot-np.copy(mulist[s+'X_'+xcorr]))==0
#             if 'kmeanbatch' in clustermethod:
#                 clusterMinibatchKmean_allsample(latents[ct_idx],nclusterlist,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
#                 assert np.sum(muplot-np.copy(mulist[s+'X_'+xcorr]))==0

CortexEx
HippocampusCA1
HippocampusDG
HippocampusMicro
HippocampusCA


In [31]:
ifcluster=True
#for testing: combine latent of some samples
#
# Same three-level analysis as for the full sample set (all cells, per region,
# per (region, cell type)), restricted to the two control samples and written
# under a 'controls<xcorr>' directory. Clustering is only active at the
# (region, cell type) level.
#
# NOTE(review): `savedir`, `clustersavedir`, `embedding` and `savenameAdd` are
# reassigned as notebook-level names at every level and are never passed to the
# plotting/clustering helpers, so those helpers presumably read them as
# globals -- confirm before moving any of this into functions.
if plottype not in ('umap','pca'):
    # Fail early: otherwise `embedding` would silently keep a stale value
    # (or be undefined) for an unsupported plot type.
    raise ValueError("plottype must be 'umap' or 'pca', got "+repr(plottype))

np.random.seed(seed)
# This rebinds the notebook-level `plot_samples`; the restriction stays in
# effect for any cell run afterwards until `plot_samples` is assigned again.
plot_samples={'control13':'AD_mouse9498','control8':'AD_mouse9735'}
for xcorr in plot_sample_X:
    # Per-sample pieces are collected here and concatenated once after the loop.
    latent_parts=[]
    celltype_broad_parts=[]
    celltype_sub_parts=[]
    region_parts=[]
    coord_parts=[]
    samplename_parts=[]

    for s in plot_samples.keys():
        sampleidx=plot_samples[s]
        samplename=s+'X_'+xcorr
        muplot=np.copy(mulist[samplename])
        # Rows of the annotation table that belong to this sample.
        sample_mask=scaleddata.obs['sample']==sampleidx

        latent_parts.append(muplot)
        celltype_broad_parts.append(scaleddata.obs.loc[sample_mask,'top_level'])
        celltype_sub_parts.append(scaleddata.obs.loc[sample_mask,'cell_type_label'])
        region_parts.append(scaleddata.obs.loc[sample_mask,'region'])
        coord_parts.append(scaleddata.obs.loc[sample_mask,['x','y']].to_numpy())
        samplename_parts.append(np.repeat(s,muplot.shape[0]))

    # Row i of every array below refers to the same cell.
    latents=np.vstack(latent_parts)
    celltype_broad=np.concatenate(celltype_broad_parts,axis=None)
    celltype_sub=np.concatenate(celltype_sub_parts,axis=None)
    region=np.concatenate(region_parts,axis=None)
    sobj_coord_np=np.concatenate(coord_parts,axis=0)
    samplenameList=np.concatenate(samplename_parts,axis=None)

    origCT=np.unique(celltype_broad)
    # Cell types to iterate over: the observed labels plus the merged groups
    # defined in `combineCelltype`.
    celltypeplot=np.concatenate((origCT,list(combineCelltype.keys())),axis=None)
    sampledir=os.path.join(plotsavepath,'controls'+xcorr)
    if inverseAct:
        sampledir+='_beforeAct'
    savedir=os.path.join(sampledir,'embedding_'+plottype)
    clustersavedir=os.path.join(sampledir,'cluster')
    # makedirs also creates `sampledir` itself when it is missing.
    os.makedirs(savedir,exist_ok=True)
    os.makedirs(clustersavedir,exist_ok=True)

    if plottype=='umap':
        reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
        embedding = reducer.fit_transform(latents)
        savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
    elif plottype=='pca':
        embedding=pca.fit_transform(latents)
        savenameAdd='_epoch'+str(plotepoch)
    if ifplot:
        # NOTE(review): the plot names below lack a space after `plottype` and
        # say "all samples" although only controls are loaded; left unchanged
        # in case they feed file names.
        plotembeddingbyCT(samplenameList,'sample',[],embedding,savedir,plottype+'of all samples',savenameAdd=savenameAdd)
        plotembeddingbyCT(celltype_broad,'celltype_broad',[],embedding,savedir,plottype+'all samples',savenameAdd=savenameAdd)
        plotembeddingbyCT(celltype_sub,'celltype_sub',[],embedding,savedir,plottype+'all samples',savenameAdd=savenameAdd)
        plotembeddingbyCT(region,'region',[],embedding,savedir,plottype+'all samples',savenameAdd=savenameAdd)

        plotembeddingbyCT_contrast(celltype_sub,'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+'all samples',savenameAdd=savenameAdd)

    # Too few cells overall: skip the per-region analysis for this feature set.
    if embedding.shape[0]<minCells:
        continue
#     if ifcluster:
#         if 'leiden' in clustermethod:
#             clusterLeiden_allsample(latents,n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np,samplenameList,randseed=seed)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#         if 'dbscan' in clustermethod:
#             clusterDBscan_allsample(latents,epslist,min_sampleslist,n_pcs,sobj_coord_np,samplenameList)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#         if 'agglomerative' in clustermethod:
#             clusterAgg_allsample(latents,nclusterlist,aggMetric,n_pcs,sobj_coord_np,samplenameList)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#         if 'kmeanbatch' in clustermethod:
#             clusterMinibatchKmean_allsample(latents,nclusterlist,n_pcs,sobj_coord_np,samplenameList)
#             assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
    #by region
    for reg in np.unique(region):
        savedir=os.path.join(sampledir,'embedding_'+plottype+'_'+reg)
        clustersavedir=os.path.join(sampledir,'cluster'+'_'+reg)
        os.makedirs(savedir,exist_ok=True)
        os.makedirs(clustersavedir,exist_ok=True)

        reg_idx=region==reg

        if plottype=='umap':
            reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
            embedding = reducer.fit_transform(latents[reg_idx])
            savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
        elif plottype=='pca':
            embedding=pca.fit_transform(latents[reg_idx])
            savenameAdd='_epoch'+str(plotepoch)
        if ifplot:
            plotembeddingbyCT(samplenameList[reg_idx],'sample',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)
            plotembeddingbyCT(celltype_broad[reg_idx],'celltype_broad',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)
            plotembeddingbyCT(celltype_sub[reg_idx],'celltype_sub',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)
    #             plotembeddingbyCT(region,'region',[],embedding[reg_idx],savedir,'UMAP of '+s)

            plotembeddingbyCT_contrast(celltype_sub[reg_idx],'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+'all samples'+' '+reg,savenameAdd=savenameAdd)

        # Too few cells in this region: skip its per-cell-type analysis.
        if embedding.shape[0]<minCells:
            continue
#         if ifcluster:
#             if 'leiden' in clustermethod:
#                 clusterLeiden_allsample(latents[reg_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[reg_idx],samplenameList[reg_idx],randseed=seed)
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#             if 'dbscan' in clustermethod:
#                 clusterDBscan_allsample(latents[reg_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[reg_idx],samplenameList[reg_idx])
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#             if 'agglomerative' in clustermethod:
#                 clusterAgg_allsample(latents[reg_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[reg_idx],samplenameList[reg_idx])
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
#             if 'kmeanbatch' in clustermethod:
#                 clusterMinibatchKmean_allsample(latents[reg_idx],nclusterlist,n_pcs,sobj_coord_np[reg_idx],samplenameList[reg_idx])
#                 assert np.array_equal(muplot,mulist[s+'X_'+xcorr])
        #by region and celltype
        for ct in celltypeplot:
            # Only these (region, cell type) pairs are analysed.
            if not ((reg=='Cortex' and ct in ['Ex']) or (reg=='Hippocampus' and ct in ['CA1','DG','Micro','CA'])):
#             if not (reg=='Hippocampus' and ct in ['CA']):
                continue
            print(reg+ct)
            savedir=os.path.join(sampledir,'embedding_'+plottype+'_'+reg+ct)
            clustersavedir=os.path.join(sampledir,'cluster'+'_'+reg+ct)
            os.makedirs(savedir,exist_ok=True)
            os.makedirs(clustersavedir,exist_ok=True)

            if ct in origCT:
                ct_idx=celltype_broad==ct
            else:
                # Merged group: union of the member cell types.
                ct_idx=False
                for i in combineCelltype[ct]:
                    ct_idx=np.logical_or(ct_idx,celltype_broad==i)
            # Restrict the cell-type mask to the current region.
            ct_idx=np.logical_and(reg_idx,ct_idx)

            if np.sum(ct_idx)<3:
                continue
            if plottype=='umap':
                reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
                embedding = reducer.fit_transform(latents[ct_idx])
                savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
            elif plottype=='pca':
                embedding=pca.fit_transform(latents[ct_idx])
                savenameAdd='_epoch'+str(plotepoch)

            if ifplot:
                plotembeddingbyCT(samplenameList[ct_idx],'sample',[],embedding,savedir,plottype+' of '+reg+' all samples'+' '+ct,savenameAdd=savenameAdd)
        #         plotembeddingbyCT(celltype_broad[reg_idx],'celltype_broad',[],embedding,savedir,plottype+' of '+'all samples'+' '+reg)
                plotembeddingbyCT(celltype_sub[ct_idx],'celltype_sub',[],embedding,savedir,plottype+' of '+reg+' all samples'+' '+ct,savenameAdd=savenameAdd)
    #             plotembeddingbyCT(region[ct_idx],'region',[],embedding,savedir,plottype+' of '+reg+' all samples'+' '+ct)

                plotembeddingbyCT_contrast(celltype_sub[ct_idx],'celltype_sub',[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+reg+' all samples'+' '+ct,savenameAdd=savenameAdd)

            if embedding.shape[0]<minCells:
                continue
            if ifcluster:
                # After each clustering call, check that the source latents were
                # not modified. `muplot` and `s` are left over from the loading
                # loop, so only the last loaded sample is compared.
                # array_equal is used instead of sum(diff)==0, which can pass
                # when positive and negative differences cancel.
                if 'leiden' in clustermethod:
                    clusterLeiden_allsample(latents[ct_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[ct_idx],samplenameList[ct_idx],randseed=seed)
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during leiden clustering'
                if 'dbscan' in clustermethod:
                    clusterDBscan_allsample(latents[ct_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during dbscan clustering'
                if 'agglomerative' in clustermethod:
                    clusterAgg_allsample(latents[ct_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during agglomerative clustering'
                if 'kmeanbatch' in clustermethod:
                    clusterMinibatchKmean_allsample(latents[ct_idx],nclusterlist,n_pcs,sobj_coord_np[ct_idx],samplenameList[ct_idx])
                    assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during kmeanbatch clustering'
    

CortexEx
HippocampusCA1
HippocampusDG
HippocampusMicro
HippocampusCA


In [32]:
plot_samples={'disease13':'AD_mouse9494','control13':'AD_mouse9498','disease8':'AD_mouse9723','control8':'AD_mouse9735'}

ifcluster=True
# separate plots by region and cell types
#
# For every sample and feature set, the sample's latents are subset to one
# (region, cell type) pair, embedded (UMAP or PCA), plotted and clustered.
# Naming caveat: in this cell `r` is the region and `reg` is the cell type
# (or merged cell-type group); `reg_idx` is the combined row mask.
#
# NOTE(review): `savedir`, `clustersavedir`, `embedding` and `savenameAdd` are
# notebook-level names that are never passed to the plotting/clustering
# helpers, so those helpers presumably read them as globals -- confirm before
# renaming or moving any of this into functions.
if plottype not in ('umap','pca'):
    # Fail early: otherwise `embedding` would silently keep a stale value
    # (or be undefined) for an unsupported plot type.
    raise ValueError("plottype must be 'umap' or 'pca', got "+repr(plottype))

np.random.seed(seed)
for s in plot_samples.keys():
#     if s in ['disease13']:
#         continue
    print(s)
    sampleidx=plot_samples[s]
    # Rows of the annotation table that belong to this sample.
    sample_mask=scaleddata.obs['sample']==sampleidx
    celltype_broad=scaleddata.obs.loc[sample_mask,'top_level']
    celltype_sub=scaleddata.obs.loc[sample_mask,'cell_type_label']
    region=scaleddata.obs.loc[sample_mask,'region']
    sobj_coord_np=scaleddata.obs.loc[sample_mask,['x','y']].to_numpy()

    origCT=np.unique(celltype_broad)
    # Cell types to iterate over: the observed labels plus the merged groups
    # defined in `combineCelltype`.
    celltypeplot=np.concatenate((origCT,list(combineCelltype.keys())),axis=None)
    for xcorr in plot_sample_X:
        samplename=s+'X_'+xcorr
        muplot=np.copy(mulist[samplename])

        # From here on `samplename` is only used to build output paths.
        if inverseAct:
            samplename+='_beforeAct'
        sampledir=os.path.join(plotsavepath,samplename)
        os.makedirs(sampledir,exist_ok=True)

        for r in np.unique(region):
            print(r)
            # `region` already holds this sample's region column, so there is
            # no need to query scaleddata.obs again.
            ridx=region==r
            for reg in celltypeplot:
                # Only these (region, cell type) pairs are analysed.
                if not ((r=='Cortex' and reg in ['Ex']) or (r=='Hippocampus' and reg in ['CA1','DG','Micro','CA'])):
                    continue
#                 if not (r=='Hippocampus' and reg in ['CA']):
#                     continue
#                 if s=='disease8' and r=='Cortex' and (reg in ['Astro','CA1','CA2','CA3','DG','Endo','Ex','Inhi','LHb','Micro','OPC','Oligo','SMC']):
#                     continue
                print(reg)
                # Layout: <sampledir>/embedding_<plottype>_<celltype>/<region>
                #         <sampledir>/cluster_<celltype>/<region>
                savedir0=os.path.join(sampledir,'embedding_'+plottype+'_'+reg)
                savedir=os.path.join(savedir0,r)
                clustersavedir0=os.path.join(sampledir,'cluster'+'_'+reg)
                clustersavedir=os.path.join(clustersavedir0,r)
                # makedirs creates the intermediate *0 directories as well.
                os.makedirs(savedir,exist_ok=True)
                os.makedirs(clustersavedir,exist_ok=True)

                if reg in origCT:
                    ct_idx=celltype_broad==reg
                else:
                    # Merged group: union of the member cell types.
                    ct_idx=False
                    for i in combineCelltype[reg]:
                        ct_idx=np.logical_or(ct_idx,celltype_broad==i)

                # Cells of this cell type that lie in region `r`.
                reg_idx=np.logical_and(ridx,ct_idx)
                if np.sum(reg_idx)<=3:
                    print('skipped')
                    continue

                if plottype=='umap':
                    reducer = umap.UMAP(n_neighbors=n_neighbors,min_dist=min_dist,random_state=seed)
                    embedding = reducer.fit_transform(muplot[reg_idx])
                    savenameAdd='_nn'+str(n_neighbors)+'mdist0'+str(int(min_dist*100))+'epoch'+str(plotepoch)
                elif plottype=='pca':
                    embedding=pca.fit_transform(muplot[reg_idx])
                    savenameAdd='_epoch'+str(plotepoch)
                if ifplot:
    #                 plotembeddingbyCT(celltype_broad[reg_idx],'celltype_broad_'+r,[],embedding,savedir,plottype+' of '+r+' '+s+' '+reg)
                    plotembeddingbyCT(celltype_sub[reg_idx],'celltype_sub_'+r,[],embedding,savedir,plottype+' of '+r+' '+s+' '+reg,savenameAdd=savenameAdd)
        #             plotembeddingbyCT(region,'region',[],embedding[reg_idx],savedir,s)
                    plotembeddingbyCT_contrast(celltype_sub[reg_idx],'celltype_sub_'+r,[],embedding,os.path.join(savedir,'contrast'),plottype+' of '+r+' '+s+' '+reg,savenameAdd=savenameAdd)

#                     plotembeddingbyCT(celltype_sub[reg_idx],'celltype_sub_location_'+r,[],sobj_coord_np[reg_idx],savedir,'location'+' of '+r+' '+s,savenameAdd='')

                if embedding.shape[0]<minCells:
                    continue
                if ifcluster:
                    # After each clustering call, check that this sample's
                    # latents still match the stored copy. array_equal is used
                    # instead of sum(diff)==0, which can pass when positive and
                    # negative differences cancel.
                    if 'leiden' in clustermethod:
                        clusterLeiden(muplot[reg_idx],n_neighbors,n_pcs,min_dist,resolution,sobj_coord_np[reg_idx],randseed=seed)
                        assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during leiden clustering'
                    if 'dbscan' in clustermethod:
                        clusterDBscan(muplot[reg_idx],epslist,min_sampleslist,n_pcs,sobj_coord_np[reg_idx])
                        assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during dbscan clustering'
                    if 'agglomerative' in clustermethod:
                        clusterAgg(muplot[reg_idx],nclusterlist,aggMetric,n_pcs,sobj_coord_np[reg_idx])
                        assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during agglomerative clustering'
                    if 'kmeanbatch' in clustermethod:
                        clusterMinibatchKmean(muplot[reg_idx],nclusterlist,n_pcs,sobj_coord_np[reg_idx])
                        assert np.array_equal(muplot,mulist[s+'X_'+xcorr]),'latents changed during kmeanbatch clustering'

disease13
Cortex
Ex
Hippocampus
CA1
DG
Micro
CA
White Matter
control13
Cortex
Ex
Hippocampus
CA1
DG
Micro
CA
White Matter
disease8
Cortex
Ex
Hippocampus
CA1
DG
Micro
CA
White Matter
control8
Cortex
Ex
Hippocampus
CA1
DG
Micro
CA
White Matter


In [15]:
# Interactive inspection of the notebook-level loop variable `s`.
# NOTE(review): this looks like a leftover debugging cell; its execution count
# is out of sequence with the cells above -- consider removing it.
s

'control8'