In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

import neuro_morpho_toolbox as nmt
%matplotlib inline
#ns=nmt.neuron_set('/home/penglab/Documents/Janelia_1000')
import pickle
# Load the pickled object and take its first element as the working neuron set
# (presumably an nmt.neuron_set, judging by the commented-out constructor above -- confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager guarantees the file handle is closed even if unpickling fails.
with open("/home/penglab/FeaCal/ns.pickle", "rb") as pickle_in:
    example_ = pickle.load(pickle_in)
ns = example_[0]

/home/penglab/anaconda3/lib/python3.7/site-packages/neuro_morpho_toolbox/
Loading CCF Atlas data...
Loading time: 0.90
Loading CCF brain structure data...
Loading time: 0.01


In [2]:
def getDuplicateColumns(df):
    '''
    Find columns whose contents repeat an earlier column.
    Every pair of column positions (earlier, later) is compared; when the two
    columns are equal, the name of the later one is reported.
    :param df: Dataframe object
    :return: List of column names whose contents duplicate an earlier column
             (built from a set, so each name appears once).
    '''
    n_cols = df.shape[1]
    # Walk all position pairs with earlier < later and keep the later column's name on a match.
    repeated = {
        df.columns.values[later]
        for earlier in range(n_cols)
        for later in range(earlier + 1, n_cols)
        if df.iloc[:, earlier].equals(df.iloc[:, later])
    }
    return list(repeated)

### function readVaa3DFeature(addr, nameF)
* **addr** is the path of the plain-text feature file generated by Vaa3D
* **nameF** is the name of the feature set being loaded

from IPython.display import display

def readVaa3DFeature(addr, nameF):
    """Read a tab-separated Vaa3D feature file into a DataFrame.

    The table is transposed, the values of its 'ID' row become the column
    labels, and columns whose contents duplicate an earlier column are dropped.
    If any NaN remains, the affected rows/columns are displayed (nothing is
    removed).

    NOTE(review): a second function with the same name is defined later in
    this notebook and replaces this one when both cells are run.

    :param addr: path of the plain-text file generated by Vaa3D
    :param nameF: name of the feature set, used only in the log message
    :return: the cleaned DataFrame
    """
    df = pd.read_csv(addr, sep=r'\t', engine='python').transpose()
    # After the transpose the 'ID' row holds the labels that should name the columns.
    df = df.rename(columns=df.loc['ID', :]).drop(index='ID')

    # Scan for duplicates once and reuse the result for both the message and the drop.
    duplicate_cols = getDuplicateColumns(df)
    print('Drop duplicate columns '+ str(duplicate_cols))
    df = df.drop(columns=duplicate_cols)
    print('Loading ' + str(nameF) +' features successfully, the shape of that dataframe is '+ str(df.shape))

    col_mask = df.isnull().any(axis=0)
    row_mask = df.isnull().any(axis=1)
    nan_view = df.loc[row_mask, col_mask]
    # A DataFrame has no single truth value, so test emptiness explicitly;
    # the report is shown only when NaN cells actually exist.
    if not nan_view.empty:
        print('NAN value exists for the following feature:')
        display(nan_view)
    return df

In [3]:
from IPython.display import display

def readVaa3DFeature(addr, nameF):
    """Load a Vaa3D feature table and return the selected, prefixed features.

    The file is read as tab-separated text with the first row as header and
    the first column as index, then transposed so that features become
    columns. Sixteen features are kept and renamed to
    ``<prefix><name with underscores>``, where the prefix depends on ``nameF``.
    Rows containing NaN are displayed and removed.

    :param addr: path of the plain-text file generated by Vaa3D
    :param nameF: one of 'axon' (prefix 'A_'), 'proximal axon' ('AL_') or
                  'dendrite' ('D_')
    :return: DataFrame with the 16 renamed feature columns and no NaN rows
    :raises ValueError: if ``nameF`` is not one of the supported names
    """
    prefix_by_name = {'axon': 'A_', 'proximal axon': 'AL_', 'dendrite': 'D_'}
    if nameF not in prefix_by_name:
        # Previously this fell through to an unbound local variable further down.
        raise ValueError(
            "nameF must be one of " + str(sorted(prefix_by_name)) + ", got " + repr(nameF)
        )

    print('Loading ' + str(nameF) +' features') 
    # Give the separator exactly once: current pandas rejects sep= together with delimiter=.
    df = pd.read_csv(addr, sep='\t', header=[0], index_col=[0]).transpose()
    # The input spells this feature 'Bifurcatons'; normalise it before selecting columns.
    df = df.rename(columns={'Number of Bifurcatons':'Number of Bifurcations'})
    use_cols = ['Number of Stems', 
            'Overall Width', 'Overall Height', 'Overall Depth', 
            'Total Length', 
            'Max Euclidean Distance', 'Max Path Distance', 
            'Number of Bifurcations', 'Number of Branches', 'Number of Tips',
            'Max Branch Order','Average Contraction', 'Average Fragmentation',
            'Average Bifurcation Angle Local', 'Average Bifurcation Angle Remote', 
            'Hausdorff Dimension'
           ]
    prefix = prefix_by_name[nameF]
    renamed = {name: prefix + name.replace(' ', '_') for name in use_cols}
    # Select and rename without in-place mutation so we never write into a slice of df.
    df = df[use_cols].rename(columns=renamed)

    col_mask = df.isnull().any(axis=0) 
    row_mask = df.isnull().any(axis=1) 
    nan_view = df.loc[row_mask, col_mask]
    if not nan_view.empty:
        print('NAN value exists for the following feature:')
        display(nan_view)
        df = df.drop(index=nan_view.index)
        print('Related .swc files were removed')
    print('Loading successfully, the shape of that dataframe is '+ str(df.shape))
    return df

## Dendrite features

In [4]:
# Load the Vaa3D dendrite feature table; the 'dendrite' tag selects the 'D_' column prefix.
dendriteFea = readVaa3DFeature('/home/penglab/FeaCal/Jmorpho_features/dendrite.features/temp', 'dendrite')


Loading dendrite features
NAN value exists for the following feature:


ID,D_Average_Contraction,D_Average_Fragmentation,D_Average_Bifurcation_Angle_Local,D_Average_Bifurcation_Angle_Remote
AA0309,0.814347,58.0,,
AA0411,,,,


Related .swc files were removed
Loading successfully, the shape of that dataframe is (989, 16)


# Deal with data 

In [5]:

def scale(df, log=True):
    """Divide every row of ``df`` by that row's sum.

    :param df: DataFrame of numeric values (one sample per row)
    :param log: not used by the function; kept so existing calls still work
    :return: 2-D numpy array of the row-normalised values, with NaN entries
             (for example 0/0 on an all-zero row) replaced by 0
    """
    # Column vector of row totals so the division broadcasts across each row.
    row_totals = np.sum(df, axis=1).values.reshape(-1, 1)
    normalized = np.array(df) / row_totals
    nan_positions = np.isnan(normalized)
    normalized[nan_positions] = 0
    return normalized


### Dendrite features

In [6]:
# Register the loaded dendrite table with an nmt feature container.
lm_dendrite = nmt.features("L-measure of dendrite")
lm_dendrite.add_raw_data(dendriteFea)

# Features kept for the dendrite analysis.
use_cols = [
    'D_Number_of_Stems', 
    'D_Overall_Width', 
    'D_Overall_Height', 
    'D_Overall_Depth', 
    'D_Total_Length',
    'D_Max_Euclidean_Distance', 
    'D_Max_Path_Distance', 
    'D_Number_of_Branches', 
    'D_Max_Branch_Order',
]
# Take an explicit copy of the selected columns so the new ratio column is written
# to an independent frame rather than to a slice of the raw data.
lm_dendrite_df = lm_dendrite.raw_data[use_cols].copy()
lm_dendrite_df["D_Depth_Width-Ratio"] = lm_dendrite_df["D_Overall_Depth"] / lm_dendrite_df["D_Overall_Width"]

# Report and remove any sample that has a NaN in one of the kept features.
col_mask = lm_dendrite_df.isnull().any(axis=0) 
row_mask = lm_dendrite_df.isnull().any(axis=1) 
lm_dendrite_nan = lm_dendrite_df.loc[row_mask, col_mask]
if not lm_dendrite_nan.empty:
    print('NAN value exists for the following feature:')
    display(lm_dendrite_nan)
    lm_dendrite_df = lm_dendrite_df.drop(index=lm_dendrite_nan.index)
    print('Related feature samples were removed')
print('Loading successfully, the shape of that dataframe is '+ str(lm_dendrite_df.shape))

# Row-normalise the features and keep the original sample/feature labels.
lm_dendrite_df_scale = pd.DataFrame(scale(lm_dendrite_df), 
                                    index=lm_dendrite_df.index, 
                                    columns=lm_dendrite_df.columns
                                   )


Number of input neurons: 989
Number of input features: 16
Loading successfully, the shape of that dataframe is (989, 10)


In [11]:
# Persist the row-normalised dendrite features; a later cell reloads this file with pd.read_excel.
lm_dendrite_df_scale.to_excel('/home/penglab/FeaCal/Jmorpho_features/lm_dendrite_df_scale.xlsx')

# Calculate co-clustering matrix


## Using axon morphology as features
The number of clusters is constrained to the range 8 to 40.

### Hierarchy Clustering
For Hierarchy method
* the most suitable parameter is {'L_method': 'weighted', 'L_metric': 'mahalanobis', 'criterionH': 'distance', 'depth': 2, 'R': None, 't': 0.9, 'optimal_ordering': False}
* the ARI is 0.08785414289406115
* The setting cluster number's limit is satisfied, the final number of cluster is 8

### Kmeans Clustering
For Kmeans method
* the most suitable parameter is {'n_clusters': 8, 'init': 'k-means++', 'n_init': 21, 'max_iter': 300, 'tol': 0.0001, 'precompute_distances': False, 'verbose': 0, 'random_state': None, 'copy_x': True, 'n_jobs': None, 'algorithm': 'auto'}
* the ARI is 0.08343509698163794
* The setting cluster number's limit is satisfied, the final number of cluster is 8

### DBSCAN Clustering
For DBSCAN method
* the most suitable parameter is {'eps': 0.31, 'min_samples': 5, 'metric': 'euclidean', 'metric_params': None, 'algorithm': 'auto', 'leaf_size': 30, 'p': None, 'n_jobs': None}
* the ARI is 0.10187925011450155
* The setting cluster number's limit is satisfied, the final number of cluster is 8

### HDBSCAN Clustering
For HDBSCAN method, 
* the most suitable parameter is {'min_cluster_size': 5, 'metric': 'manhattan', 'alpha': 0.8, 'min_samples': 3, 'p': 2, 'algorithm': 'best', 'leaf_size': 40, 'approx_min_span_tree': True, 'gen_min_span_tree': False, 'core_dist_n_jobs': 4, 'cluster_selection_method': 'eom', 'allow_single_cluster': False, 'prediction_data': False, 'match_reference_implementation': False}
* the ARI is 0.05177167199451809
* The setting cluster number's limit is satisfied, the final number of cluster is 39

### SNN Clustering
For SNN_community method
* the most suitable parameter is {'knn': 5, 'metric': 'minkowski', 'method': 'FastGreedy'}
* the ARI is 0.06673692576743263
* The setting cluster number's limit is satisfied, the final number of cluster is 23

In [4]:
# Parameter sets copied from the "axon morphology" tuning summary above, one dict per
# clustering method. They are not referenced by the cells visible in this notebook,
# which use the par_*2 variants instead.

# Hierarchical clustering parameters.
par_hier =  {'L_method': 'weighted', 'L_metric': 'mahalanobis', 'criterionH': 'distance', 'depth': 2, 'R': None, 
             't': 0.9, 'optimal_ordering': False}
# K-means parameters.
par_kmeans = {'n_clusters': 8, 'init': 'k-means++', 'n_init': 21, 'max_iter': 300, 'tol': 0.0001, 
              'precompute_distances': False, 'verbose': 0, 'random_state': None, 'copy_x': True, 'n_jobs': None,
              'algorithm': 'auto'}

# DBSCAN parameters.
par_dbscan = {'eps': 0.31, 'min_samples': 5, 'metric': 'euclidean', 'metric_params': None, 'algorithm': 'auto', 
              'leaf_size': 30, 'p': None, 'n_jobs': None}

# HDBSCAN parameters.
par_hdbscan = {'min_cluster_size': 5, 'metric': 'manhattan', 'alpha': 0.8, 'min_samples': 3, 'p': 2, 'algorithm':
               'best', 'leaf_size': 40, 'approx_min_span_tree': True, 'gen_min_span_tree': False, 
               'core_dist_n_jobs': 4, 'cluster_selection_method': 'eom', 'allow_single_cluster': False, 
               'prediction_data': False, 'match_reference_implementation': False}

# SNN community-detection parameters.
par_snn = {'knn':5,'metric':'minkowski','method':'FastGreedy'}


In [5]:
def get_clusters(inputUMAP,method='SNN_community',karg_dict={'knn':5, 'metric':'minkowski','method':'FastGreedy'}):
    """Cluster a copy of ``inputUMAP`` and attach the labels as a 'Cluster' column.

    :param inputUMAP: DataFrame to cluster; it is copied, never modified
    :param method: one of 'SNN_community', 'Hierarchy', 'Kmeans', 'DBSCAN', 'HDBSCAN'
    :param karg_dict: parameter dictionary for the chosen method. For
        'SNN_community' the keys 'knn', 'metric' and 'method' are read
        (defaults 5, 'minkowski', 'FastGreedy'); for every other method the
        dictionary is handed to the corresponding nmt routine unchanged.
    :return: the copy of ``inputUMAP`` with a 'Cluster' column holding 'C<label>' strings
    :raises AssertionError: if ``method`` is not one of the allowed names
    """
    # Names of the nmt routines that take (data, parameter dict); looked up lazily below.
    dict_backends = {
        'Hierarchy': 'get_clusters_Hierarchy_clustering',
        'Kmeans': 'get_clusters_kmeans_clustering',
        'DBSCAN': 'get_clusters_dbscan_clustering',
        'HDBSCAN': 'get_clusters_hdbscan_clustering',
    }
    methods_allowed = ['SNN_community'] + list(dict_backends)
    assert method in methods_allowed, "Please set 'method' as one of the following: 'SNN_community', 'Hierarchy', 'Kmeans', 'DBSCAN', 'HDBSCAN'"

    labelled = inputUMAP.copy()
    if method == 'SNN_community':
        # SNN takes keyword arguments; fall back to the defaults for missing keys.
        cur_clusters = nmt.get_clusters_SNN_community(
            labelled,
            knn=karg_dict.get('knn', 5),
            metric=karg_dict.get('metric', 'minkowski'),
            method=karg_dict.get('method', 'FastGreedy'),
        )
    else:
        cluster_fn = getattr(nmt, dict_backends[method])
        cur_clusters = cluster_fn(labelled, karg_dict)

    labelled.loc[:, 'Cluster'] = ['C' + str(label) for label in cur_clusters]
    return labelled

## function fre_Matrix(fre_M, cluster_method, para_test)
* **fre_M** is the square matrix that records how many times each pair of samples was clustered together
* **cluster_method** can be 'Hierarchy', 'Kmeans', 'DBSCAN', 'HDBSCAN' or 'SNN_community'
* **para_test** is the input parameter dictionary for the clustering method
* **iternum** (passed as `run_num` to `para_cocluster`) is the number of iterations used to generate the co-clustering matrix

In [6]:
import random
import ast
from scipy.spatial.distance import pdist, squareform
import numpy as np
import matplotlib as mpl
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import multiprocessing
import time
def fre_Matrix(fre_M, cluster_method,para_test):
    """Run one subsampled clustering and add its co-membership counts to ``fre_M``.

    A random 95% of the rows of the module-level ``ns.UMAP`` is clustered with
    ``get_clusters``; for every resulting cluster, each (row, column) cell of
    ``fre_M`` addressed by two members of that cluster is incremented by one
    (including the diagonal).

    :param fre_M: square DataFrame of counts whose index and columns contain
                  the sample labels of ``ns.UMAP``; it is modified in place
    :param cluster_method: method name forwarded to ``get_clusters``
    :param para_test: parameter dictionary forwarded to ``get_clusters``
    :return: ``fre_M.values`` after the update
    """
    n_total = ns.UMAP.shape[0]
    sampled_rows = random.sample(range(n_total), int(n_total * 0.95))
    umapDF = ns.UMAP.iloc[sampled_rows, :].copy()
    resultDF = get_clusters(umapDF, method=cluster_method, karg_dict=para_test)

    # One block update per cluster replaces the element-by-element .loc loop,
    # which paid pandas indexing overhead for every pair of members.
    for members in resultDF.groupby("Cluster").groups.values():
        fre_M.loc[members, members] += 1
    return fre_M.values
def para_cocluster(cluster_method,para_test,corenum, run_num,ns_input):
    """Accumulate a co-clustering count matrix over repeated subsampled runs.

    ``run_num`` calls of ``fre_Matrix`` are submitted to a pool of ``corenum``
    worker processes. Every call receives the same all-zero matrix, so each
    result holds the counts of a single run; the results are summed.

    NOTE(review): only ``ns_input.UMAP.index`` is used here; ``fre_Matrix``
    reads the embedding from the module-level ``ns`` -- confirm both refer to
    the same neuron set.

    :param cluster_method: method name forwarded to ``fre_Matrix``
    :param para_test: parameter dictionary forwarded to ``fre_Matrix``
    :param corenum: number of worker processes
    :param run_num: number of clustering runs to accumulate
    :param ns_input: object whose ``UMAP.index`` labels the rows/columns of the matrix
    :return: element-wise sum of the per-run count matrices (``0`` if ``run_num`` is 0)
    """
    start = time.time()
    sample_ids = ns_input.UMAP.index
    # All-zero template handed to every task.
    fre_M_t = pd.DataFrame(0, index=sample_ids, columns=sample_ids)

    # The context manager shuts the pool down even when a task raises.
    with multiprocessing.Pool(processes=corenum) as pool:
        # Submit everything first; calling get() inside this loop would block
        # and serialise the runs.
        pending = [
            pool.apply_async(fre_Matrix, (fre_M_t, cluster_method, para_test))
            for _ in range(run_num)
        ]
        # Collect the results only after all tasks have been queued.
        result_list = [task.get() for task in pending]

    co_matrix = sum(result_list)
    print(co_matrix)
    elapsed = (time.time() - start)
    # Report the method that was actually run instead of a fixed 'Hierarchy' label.
    print('Time needed to run ' + str(cluster_method) + ' is '+ str(elapsed))
    return co_matrix

### For dendrite morphology features


## Using soma morphology as features
The number of clusters is constrained to the range 8 to 40.

### Hierarchy Clustering
For Hierarchy method
* the most suitable parameter is {'L_method': 'weighted', 'L_metric': 'mahalanobis', 'criterionH': 'distance', 'depth': 2, 'R': None, 't': 0.9, 'optimal_ordering': False}
* the ARI is 0.05355799672049869
* The setting cluster number's limit is satisfied, the final number of cluster is 8

### Kmeans Clustering
For Kmeans method
* the most suitable parameter is {'n_clusters': 13, 'init': 'random', 'n_init': 12, 'max_iter': 300, 'tol': 0.0001, 'precompute_distances': True, 'verbose': 0, 'random_state': None, 'copy_x': True, 'n_jobs': None, 'algorithm': 'auto'}
* the ARI is 0.036079814213255956
* The setting cluster number's limit is satisfied, the final number of cluster is 13

### DBSCAN Clustering
For DBSCAN method
* the most suitable parameter is {'eps': 0.22, 'min_samples': 5, 'metric': 'euclidean', 'metric_params': None, 'algorithm': 'auto', 'leaf_size': 30, 'p': None, 'n_jobs': None}
* the ARI is 0.039530673163787826
* The setting cluster number's limit is satisfied, the final number of cluster is 20

### HDBSCAN Clustering
For HDBSCAN method, 
* the most suitable parameter is {'min_cluster_size': 5, 'metric': 'manhattan', 'alpha': 0.9, 'min_samples': 9, 'p': 2, 'algorithm': 'generic', 'leaf_size': 40, 'approx_min_span_tree': True, 'gen_min_span_tree': False, 'core_dist_n_jobs': 4, 'cluster_selection_method': 'eom', 'allow_single_cluster': False, 'prediction_data': False, 'match_reference_implementation': False}
* the ARI is 0.020481889417205506
* The setting cluster number's limit is satisfied, the final number of cluster is 21

### SNN Clustering
For SNN_community method
* the most suitable parameter is {'knn': 5, 'metric': 'minkowski', 'method': 'FastGreedy'}
* the ARI is 0.045224072138369056
* The setting cluster number's limit is satisfied, the final number of cluster is 18

In [7]:
# Reload the scaled dendrite features from the Excel file, using its first column as the index.
lm_dendrite_df_scale= pd.read_excel('/home/penglab/FeaCal/Jmorpho_features/lm_dendrite_df_scale.xlsx', index_col=0)
# Remember the sample order of the neuron-set metadata before ns.UMAP is replaced.
index_origin = ns.metadata.index.tolist()
# Overwrite the neuron set's UMAP table with the result of nmt.UMAP_wrapper on the dendrite
# features (presumably a 2-D embedding, given n_components=2 -- confirm against nmt).
ns.UMAP = nmt.UMAP_wrapper(lm_dendrite_df_scale, n_neighbors=100, min_dist=0.1, n_components=2, metric='euclidean',PCA_first=True,n_PC=100)
print('Store UMAP for concated Umap')

index_after = ns.UMAP.index.tolist()
# Keep the labels present in both tables, in metadata order, and reindex both tables
# with them so ns.metadata and ns.UMAP share the same rows.
index_update = [i for i in index_origin if i in index_after ]
ns.metadata = ns.metadata.loc[index_update,:]
ns.UMAP = ns.UMAP .loc[index_update,:]

Store UMAP for concated Umap


In [8]:
# Parameter sets passed to para_cocluster in the cells below, one dict per clustering
# method; the values match the second tuning summary listed above.

# Hierarchical clustering parameters.
par_hier2 =  {'L_method': 'weighted', 'L_metric': 'mahalanobis', 'criterionH': 'distance', 'depth': 2, 'R': None, 
              't': 0.9, 'optimal_ordering': False}
# K-means parameters.
par_kmeans2 = {'n_clusters': 13, 'init': 'random', 'n_init': 12, 'max_iter': 300, 'tol': 0.0001,
               'precompute_distances': True, 'verbose': 0, 'random_state': None, 'copy_x': True, 'n_jobs': None,
               'algorithm': 'auto'}

# DBSCAN parameters.
par_dbscan2 = {'eps': 0.22, 'min_samples': 5, 'metric': 'euclidean', 'metric_params': None, 'algorithm': 'auto', 
               'leaf_size': 30, 'p': None, 'n_jobs': None}

# HDBSCAN parameters.
par_hdbscan2 =  {'min_cluster_size': 5, 'metric': 'manhattan', 'alpha': 0.9, 'min_samples': 9, 'p': 2, 'algorithm': 'generic', 'leaf_size': 40, 'approx_min_span_tree': True, 'gen_min_span_tree': False, 'core_dist_n_jobs': 4, 'cluster_selection_method': 'eom', 'allow_single_cluster': False, 'prediction_data': False, 'match_reference_implementation': False}

# SNN community-detection parameters.
par_snn2 = {'knn':5,'metric':'minkowski','method':'FastGreedy'}


In [None]:
# Co-clustering count matrices for hierarchical and k-means clustering:
# 30 worker processes, 5000 subsampled runs each.
DM_hier = para_cocluster('Hierarchy', par_hier2,30, 5000,ns)
DM_kmeans = para_cocluster('Kmeans', par_kmeans2,30, 5000,ns)

In [None]:
# Label the count matrices with the sample index of ns.UMAP and save them as Excel files.
DM_hierDF = pd.DataFrame(data=DM_hier, index=ns.UMAP.index, columns=ns.UMAP.index)
DM_hierDF.to_excel('/home/penglab/FeaCal/dataSource/denMor/DM_hierDF.xlsx')
DM_kmeansDF = pd.DataFrame(data=DM_kmeans, index=ns.UMAP.index, columns=ns.UMAP.index)
DM_kmeansDF.to_excel('/home/penglab/FeaCal/dataSource/denMor/DM_kmeansDF.xlsx')



In [12]:
# Co-clustering count matrix for DBSCAN (30 worker processes, 5000 subsampled runs),
# labelled with the sample index of ns.UMAP and saved as an Excel file.
DM_dbscan = para_cocluster('DBSCAN', par_dbscan2,30, 5000,ns)
DM_dbscanDF = pd.DataFrame(data=DM_dbscan, index=ns.UMAP.index, columns=ns.UMAP.index)
DM_dbscanDF.to_excel('/home/penglab/FeaCal/dataSource/denMor/DM_dbscanDF.xlsx')

[[4740    0    0 ...    0    0    0]
 [   0 4705 4481 ...    0    0    0]
 [   0 4481 4766 ...    0    0    0]
 ...
 [   0    0    0 ... 4736    0    0]
 [   0    0    0 ...    0 4758    0]
 [   0    0    0 ...    0    0 4745]]
Time needed to run Hierarchy is 3709.1338732242584


In [9]:


# Co-clustering count matrix for HDBSCAN (30 worker processes, 5000 subsampled runs).
DM_hdbscan = para_cocluster('HDBSCAN', par_hdbscan2,30, 5000,ns)

# Label the matrix with the sample index of ns.UMAP and save it as an Excel file.
DM_hdbscanDF = pd.DataFrame(data=DM_hdbscan, index=ns.UMAP.index, columns=ns.UMAP.index)
DM_hdbscanDF.to_excel('/home/penglab/FeaCal/dataSource/denMor/DM_hdbscanDF.xlsx')

[[4737   26   28 ...    0    0   84]
 [  26 4734 3910 ...    0   11   26]
 [  28 3910 4760 ...   17  369   28]
 ...
 [   0    0   17 ... 4749   97    0]
 [   0   11  369 ...   97 4757    3]
 [  84   26   28 ...    0    3 4766]]
Time needed to run Hierarchy is 7875.647224187851


In [10]:
# Co-clustering count matrix for SNN community detection (30 worker processes,
# 5000 subsampled runs), labelled with the sample index of ns.UMAP and saved as an Excel file.
DM_snn = para_cocluster('SNN_community', par_snn2,30, 5000,ns)
DM_snnDF = pd.DataFrame(data=DM_snn, index=ns.UMAP.index, columns=ns.UMAP.index)
DM_snnDF.to_excel('/home/penglab/FeaCal/dataSource/denMor/DM_snnDF.xlsx')

[[4756    0    0 ...    0    0    2]
 [   0 4765 4482 ...    0    0    0]
 [   0 4482 4771 ...    0    0    0]
 ...
 [   0    0    0 ... 4762    0    0]
 [   0    0    0 ...    0 4738    0]
 [   2    0    0 ...    0    0 4744]]
Time needed to run Hierarchy is 5015.162358999252
