In [1]:
import numpy as np 
import pandas as pd 
import networkx as nx
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import scipy as sp
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
from matplotlib.colors import to_rgba
from pathlib import Path
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import to_hex
from matplotlib.pyplot import cm
import matplotlib.colors as mcolors

from sklearn.metrics import silhouette_score, silhouette_samples

In [2]:
from module.utils import dendrogram_clustering
from module.CenVec.vectorise import get_centrality_df
from module.graphpeeler import layer_realisation
from module.Ceval.dendrogram_eval import get_clustering_stats

# SHS network

In [3]:
# Load the SHS node and edge tables.
s_hs_nodes = pd.read_csv('../../data/s_hs_6/nodes.csv')
s_hs_edges = pd.read_csv('../../data/s_hs_6/edges.csv')

# Keep only the last whitespace-separated token of every column header.
s_hs_nodes.columns = [header.split()[-1] for header in s_hs_nodes.columns]
s_hs_edges.columns = [header.split()[-1] for header in s_hs_edges.columns]

# Name the node index 'node' and store the labels as strings.
s_hs_nodes.index.name = 'node'
s_hs_nodes.index = s_hs_nodes.index.astype(str)

# s_hs_edges = s_hs_edges.groupby(['source', 'target'], as_index=False)['count'].sum()
s_hs_edges['weight'] = s_hs_edges['weight'].astype(int)


In [4]:

# Build the weighted SHS edge list and the log-rank centrality table.
#
# Re-run safety: this cell adds an 'attr' column further down, so on a second
# execution the positional rename below would fail with a length mismatch.
# Dropping 'attr' first (a no-op on the first run) keeps the cell idempotent.
s_hs_edges = s_hs_edges.drop(columns='attr', errors='ignore')

# Use absolute weights, then rename the columns positionally.
s_hs_edges['weight'] = s_hs_edges['weight'].abs()
s_hs_edges.columns = ['pre', 'post', 'weight']

# Per-edge attribute dict holding the weight and its reciprocal.
# NOTE(review): a weight of 0 would raise ZeroDivisionError here - confirm that
# zero-weight edges cannot occur in the input.
s_hs_edges['attr'] = s_hs_edges['weight'].apply(lambda w: {'weight': w, 'inv_weight': 1 / w})
s_hs_centralities = get_centrality_df(edges=s_hs_edges, weighted=True)

# Rescale every column to the log of its descending rank (ties share the
# average rank), so the largest value in a column has the smallest log-rank.
rank_s_hs_centralities = np.log(s_hs_centralities.rank(ascending=False, method='average'))
to_rank_cols = rank_s_hs_centralities.columns

# Keep node labels as strings.
s_hs_nodes.index = s_hs_nodes.index.astype(str)

In [5]:
# Cluster on the SHS log-rank centralities. This is an alias, not a copy, so the
# index cast below also changes rank_s_hs_centralities.
df_to_cluster = rank_s_hs_centralities  # pn_rank_hs_centralities #
df_to_cluster.index = df_to_cluster.index.astype(str)
# df_to_cluster = rank_larva_centralities

# Node ids listed for every (Sexo, prosocial, crttotal) combination.
attribute_groups = s_hs_nodes.reset_index().groupby(['Sexo', 'prosocial', 'crttotal'])
which_df = attribute_groups['node'].apply(list)

In [None]:
# Centrality columns to leave out of the clustering (none by default).
exclude = []
cents_for_clustering = set(df_to_cluster.columns) - set(exclude)

# Select the columns in the frame's own order. Iterating the set directly gives
# an order that can change between kernel sessions (string hashing is
# randomised), which made the column layout of the clustering input
# non-reproducible.
ordered_cents = [col for col in df_to_cluster.columns if col in cents_for_clustering]
df_to_cluster_cent = df_to_cluster.loc[:, ordered_cents]

# Row position -> node id lookup.
ind_to_id = df_to_cluster_cent.index.values

In [7]:
# Hierarchical clustering settings.
method = 'ward'
metric = 'euclidean'
dist_func = sp.spatial.distance.pdist

# Missing centrality values are replaced by 0 before clustering.
# NOTE(review): the inputs are log-ranks, where 0 equals log(rank 1); a missing
# value therefore looks like a top-ranked node - confirm this is intended.
vector_df = df_to_cluster_cent.fillna(0)
ind_to_id = vector_df.index.values

Z = linkage(vector_df, method=method, metric=metric)  ####### linkage


In [8]:
# Evaluate the dendrogram Z against the 'crttotal' node attribute.
crt_nodes = s_hs_nodes.loc[ind_to_id].reset_index()
crt_total_dict = crt_nodes.groupby('crttotal')['node'].apply(list).to_dict()
crt_stats_df = get_clustering_stats(Z=Z, ctype_to_id=crt_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of m_p + s_p.
crt_stats_df['av_size'] = crt_stats_df['m_p'].div(crt_stats_df['n_p'])
crt_stats_df['coverage'] = crt_stats_df['m_p'].div(crt_stats_df['s_p'] + crt_stats_df['m_p'])


In [9]:
crt_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,0,55,0.054545,0.102996,5,11,44,2.2,0.2
1,1,65,0.030769,0.121723,6,12,53,2.0,0.184615
2,2,159,0.037736,0.297753,17,40,119,2.352941,0.251572
3,3,255,0.023529,0.477528,45,116,139,2.577778,0.454902


In [10]:
crt_stats_df.m_p.sum() / crt_stats_df.len.sum()

0.3352059925093633

In [11]:
crt_stats_df.pms.mean()

0.25

In [12]:
# Evaluate the dendrogram Z against the 'Curso' node attribute.
curso_nodes = s_hs_nodes.loc[ind_to_id].reset_index()
curso_total_dict = curso_nodes.groupby('Curso')['node'].apply(list).to_dict()
curso_stats_df = get_clustering_stats(Z=Z, ctype_to_id=curso_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of m_p + s_p.
curso_stats_df['av_size'] = curso_stats_df['m_p'].div(curso_stats_df['n_p'])
curso_stats_df['coverage'] = curso_stats_df['m_p'].div(curso_stats_df['s_p'] + curso_stats_df['m_p'])


In [13]:
curso_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,1,124,0.048387,0.23221,21,59,65,2.809524,0.475806
1,2,129,0.031008,0.241573,29,67,62,2.310345,0.51938
2,3,155,0.025806,0.290262,26,62,93,2.384615,0.4
3,6,126,0.047619,0.235955,24,62,64,2.583333,0.492063


In [14]:
curso_stats_df.m_p.sum() / curso_stats_df.len.sum()

0.4681647940074906

In [15]:
curso_stats_df.pms.mean()

0.25

In [16]:
# Evaluate the dendrogram Z against the 'Sexo' node attribute.
sex_nodes = s_hs_nodes.loc[ind_to_id].reset_index()
sex_total_dict = sex_nodes.groupby('Sexo')['node'].apply(list).to_dict()
sex_stats_df = get_clustering_stats(Z=Z, ctype_to_id=sex_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of m_p + s_p.
sex_stats_df['av_size'] = sex_stats_df['m_p'].div(sex_stats_df['n_p'])
sex_stats_df['coverage'] = sex_stats_df['m_p'].div(sex_stats_df['s_p'] + sex_stats_df['m_p'])


In [17]:
sex_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,Female,252,0.02381,0.47191,56,142,110,2.535714,0.563492
1,Male,282,0.035461,0.52809,55,149,133,2.709091,0.528369


In [18]:
sex_stats_df.m_p.sum() / sex_stats_df.len.sum()

0.5449438202247191

In [19]:
sex_stats_df.pms.mean()

0.5

In [20]:
# Evaluate the dendrogram Z against the 'Grupo' node attribute.
group_nodes = s_hs_nodes.loc[ind_to_id].reset_index()
group_total_dict = group_nodes.groupby('Grupo')['node'].apply(list).to_dict()
group_stats_df = get_clustering_stats(Z=Z, ctype_to_id=group_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of m_p + s_p.
group_stats_df['av_size'] = group_stats_df['m_p'].div(group_stats_df['n_p'])
group_stats_df['coverage'] = group_stats_df['m_p'].div(group_stats_df['s_p'] + group_stats_df['m_p'])


In [21]:
group_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,A,120,0.058333,0.224719,20,55,65,2.75,0.458333
1,B,119,0.033613,0.222846,16,38,81,2.375,0.319328
2,C,121,0.024793,0.226592,13,28,93,2.153846,0.231405
3,D,118,0.025424,0.220974,21,46,72,2.190476,0.389831
4,E,56,0.053571,0.104869,10,22,34,2.2,0.392857


In [22]:
group_stats_df.m_p.sum() / group_stats_df.len.sum()

0.3539325842696629

In [23]:
group_stats_df.pms.mean()

0.2

In [24]:
# Combined label: the string form of 'Curso' followed by the string form of 'Grupo'.
cols_to_concat = ['Curso', 'Grupo']
first_col, second_col = cols_to_concat
s_hs_nodes['grouped'] = s_hs_nodes[first_col].astype(str).add(s_hs_nodes[second_col].astype(str))

In [25]:
# Evaluate the dendrogram Z against the combined 'grouped' node attribute.
grouped_nodes = s_hs_nodes.loc[ind_to_id].reset_index()
grouped_total_dict = grouped_nodes.groupby('grouped')['node'].apply(list).to_dict()
grouped_stats_df = get_clustering_stats(Z=Z, ctype_to_id=grouped_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of m_p + s_p.
grouped_stats_df['av_size'] = grouped_stats_df['m_p'].div(grouped_stats_df['n_p'])
grouped_stats_df['coverage'] = grouped_stats_df['m_p'].div(grouped_stats_df['s_p'] + grouped_stats_df['m_p'])


In [64]:
grouped_stats_df.m_p.sum() / grouped_stats_df.len.sum()

0.23595505617977527

In [27]:
grouped_stats_df.pms.mean()

0.0566403223306989

# C. elegans

In [28]:
# C. elegans inputs: the two edge lists first, then the metadata table, which
# is indexed by the first column of its file.
ce_chem_edges = pd.read_csv('../../data/celegans/processed/ce_chem_edges.csv')
ce_all_edges = pd.read_csv('../../data/celegans/processed/ce_all_edges.csv')
ce_meta = pd.read_csv('../../data/celegans/processed/ce_meta.csv', index_col=0)

In [29]:
# ce_chem_edges['attr'] = ce_chem_edges.weight.apply(lambda x: {'weight':x, 'inv_weight':1/x})
# chem_centralities = get_centrality_df(edges=ce_chem_edges, weighted=True)
# # chem_centralities.to_csv('../data/celegans/processed/chem_centralities.csv')

# # rescale these by ranking and taking the log of the ranking 
# rank_chem_centralities =chem_centralities.copy(True)

# to_rank_cols = rank_chem_centralities.columns

# for i in to_rank_cols:
#     rank_chem_centralities[i] = np.log(rank_chem_centralities[i].rank(ascending=False, method='average'))

# # rank_chem_centralities.to_csv('../data/celegans/processed/rank_chem_centralities.csv')


In [30]:
# Per-edge attribute dict holding the weight and its reciprocal.
ce_all_edges['attr'] = ce_all_edges['weight'].apply(lambda w: {'weight': w, 'inv_weight': 1 / w})
alledge_centralities = get_centrality_df(edges=ce_all_edges, weighted=True)
# alledge_centralities.to_csv('../data/celegans/processed/alledge_centralities.csv')

# Rescale every column to the log of its descending rank (ties share the
# average rank), computed for all columns in one call.
to_rank_cols = alledge_centralities.columns
rank_alledge_centralities = np.log(
    alledge_centralities.rank(ascending=False, method='average')
)

# rank_alledge_centralities.to_csv('../data/celegans/processed/rank_alledge_centralities.csv')


In [31]:
# Switch the clustering input to the C. elegans all-edge log-rank centralities
# (alias, not a copy).
df_to_cluster = rank_alledge_centralities


In [32]:
# Centrality columns to leave out of the clustering (none by default).
exclude = []
cents_for_clustering = set(df_to_cluster.columns) - set(exclude)

# Select the columns in the frame's own order. Iterating the set directly gives
# an order that can change between kernel sessions (string hashing is
# randomised), which made the column layout of the clustering input
# non-reproducible.
ordered_cents = [col for col in df_to_cluster.columns if col in cents_for_clustering]
df_to_cluster_cent = df_to_cluster.loc[:, ordered_cents]

# Row position -> neuron id lookup.
ind_to_id = df_to_cluster_cent.index.values


In [33]:
# Hierarchical clustering settings.
method = 'ward'
metric = 'euclidean'
dist_func = sp.spatial.distance.pdist

# The selected centrality table is clustered as-is (no missing-value filling here).
vector_df = df_to_cluster_cent
ind_to_id = vector_df.index.values

Z = linkage(vector_df, method=method, metric=metric)  ####### linkage


In [34]:
# Evaluate the dendrogram Z against the 'Final classification' labels.
fc_meta = ce_meta.loc[ind_to_id].reset_index()
fc_total_dict = fc_meta.groupby('Final classification')['Neuron'].apply(list).to_dict()
fcstats_df = get_clustering_stats(Z=Z, ctype_to_id=fc_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of m_p + s_p.
fcstats_df['av_size'] = fcstats_df['m_p'].div(fcstats_df['n_p'])
fcstats_df['coverage'] = fcstats_df['m_p'].div(fcstats_df['s_p'] + fcstats_df['m_p'])


In [35]:
fcstats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,Pharynx,20,0.2,0.072202,7,19,1,2.714286,0.95
1,interneuron,80,0.125,0.267559,15,58,22,3.866667,0.725
2,motor neuron,120,0.083333,0.401338,19,98,22,5.157895,0.816667
3,sensory neuron,79,0.101266,0.264214,20,57,22,2.85,0.721519


In [36]:
fcstats_df.m_p.sum() / fcstats_df.len.sum()

0.7759197324414716

In [37]:
fcstats_df.pms.mean()

0.2513281334894896

In [38]:
# Evaluate the dendrogram Z against the 'Cell Class' labels.
cc_meta = ce_meta.loc[ind_to_id].reset_index()
cc_total_dict = cc_meta.groupby('Cell Class')['Neuron'].apply(list).to_dict()
cc_stats_df = get_clustering_stats(Z=Z, ctype_to_id=cc_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p (NaN where both are 0), and m_p as a share of
# m_p + s_p.
cc_stats_df['av_size'] = cc_stats_df['m_p'].div(cc_stats_df['n_p'])
cc_stats_df['coverage'] = cc_stats_df['m_p'].div(cc_stats_df['s_p'] + cc_stats_df['m_p'])


In [39]:
cc_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,1,2,1.000000,1.000000,1,2,0,2.0,1.000000
1,2,2,1.000000,1.000000,1,2,0,2.0,1.000000
2,8,2,1.000000,1.000000,1,2,0,2.0,1.000000
3,9,2,1.000000,1.000000,1,2,0,2.0,1.000000
4,12,2,1.000000,1.000000,1,2,0,2.0,1.000000
...,...,...,...,...,...,...,...,...,...
86,114,12,0.166667,0.040134,2,4,8,2.0,0.333333
87,115,11,0.272727,0.039711,1,3,8,3.0,0.272727
88,116,13,0.307692,0.046931,2,6,7,3.0,0.461538
89,117,2,0.500000,0.250000,0,0,2,,0.000000


In [40]:
cc_stats_df.m_p.sum() / cc_stats_df.len.sum()

0.46153846153846156

In [41]:
cc_stats_df.pms.mean()

0.4474740295028808

In [42]:
# Evaluate the dendrogram Z against the 'Cook cell category' labels.
ccc_meta = ce_meta.loc[ind_to_id].reset_index()
ccc_total_dict = ccc_meta.groupby('Cook cell category')['Neuron'].apply(list).to_dict()
ccc_stats_df = get_clustering_stats(Z=Z, ctype_to_id=ccc_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p (NaN where both are 0), and m_p as a share of
# m_p + s_p.
ccc_stats_df['av_size'] = ccc_stats_df['m_p'].div(ccc_stats_df['n_p'])
ccc_stats_df['coverage'] = ccc_stats_df['m_p'].div(ccc_stats_df['s_p'] + ccc_stats_df['m_p'])


In [43]:
ccc_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,SN1,26,0.153846,0.093863,7,18,8,2.571429,0.692308
1,SN2,6,0.333333,0.09375,1,2,4,2.0,0.333333
2,SN3,15,0.133333,0.050167,1,2,13,2.0,0.133333
3,SN4,12,0.166667,0.043321,2,4,8,2.0,0.333333
4,SN5,4,0.25,0.142857,0,0,4,,0.0
5,SN6,20,0.2,0.072202,5,12,8,2.4,0.6
6,category 4 interneuron,8,0.25,0.028881,1,2,6,2.0,0.25
7,head motor neuron,18,0.222222,0.064982,4,11,7,2.75,0.611111
8,layer 1 interneuron,13,0.307692,0.043478,4,10,3,2.5,0.769231
9,layer 2 interneuron,27,0.259259,0.090301,6,17,10,2.833333,0.62963


In [44]:
ccc_stats_df.m_p.sum() / ccc_stats_df.len.sum()

0.6287625418060201

In [45]:
ccc_stats_df.pms.mean()

0.13316142585459206

# Food web

In [46]:
# Food-web metadata at three granularities, each indexed by its first column.
fw_meta_df = pd.read_csv('../../data/foodweb/fw_meta_df.csv', index_col=0)
fw_meta_fine_df = pd.read_csv('../../data/foodweb/fw_meta_fine_df.csv', index_col=0)
fw_meta_finest_df = pd.read_csv('../../data/foodweb/fw_meta_finest_df.csv', index_col=0)

# Edge list: rename the two columns to pre/post and give every edge weight 1.
fw_df = pd.read_csv('../../data/foodweb/fw_df.csv', index_col=0)
fw_df = fw_df.set_axis(['pre', 'post'], axis=1).assign(weight=1)

In [47]:
# Per-node food-web metadata, indexed by the first column of the file; later
# cells group it by its 'coarse_type' and 'layer' columns.
rel_meta_df = pd.read_csv('../../data/foodweb/processed/layer_coarse_meta.csv', index_col=0)

In [48]:
# Per-edge attribute dict holding the weight and its reciprocal; the
# centralities themselves are requested unweighted.
fw_df['attr'] = fw_df['weight'].apply(lambda w: {'weight': w, 'inv_weight': 1 / w})
fw_centralities = get_centrality_df(edges=fw_df, weighted=False)

# Rescale every column to the log of its descending rank (ties share the
# average rank), computed for all columns in one call.
rank_fw_centralities = np.log(fw_centralities.rank(ascending=False, method='average'))
to_rank_cols = rank_fw_centralities.columns


In [49]:
# Node ids listed per 'type' at the finest metadata granularity.
# cell_labelling = 'Cell Class'
# which_df = fw_meta_df.groupby('type')['node'].apply(list)
# which_df = fw_meta_fine_df.groupby('type')['node'].apply(list)
finest_type_groups = fw_meta_finest_df.groupby('type')
which_df = finest_type_groups['node'].apply(list)

# Cluster on the food-web log-rank centralities (alias, not a copy).
df_to_cluster = rank_fw_centralities

In [50]:
# Centrality columns to leave out of the clustering (none by default).
exclude = []
cents_for_clustering = set(df_to_cluster.columns) - set(exclude)

# Select the columns in the frame's own order. Iterating the set directly gives
# an order that can change between kernel sessions (string hashing is
# randomised), which made the column layout of the clustering input
# non-reproducible.
ordered_cents = [col for col in df_to_cluster.columns if col in cents_for_clustering]
df_to_cluster_cent = df_to_cluster.loc[:, ordered_cents]
ind_to_id = df_to_cluster_cent.index.values

vector_df = df_to_cluster_cent

# Hierarchical clustering settings.
method = 'ward'
metric = 'euclidean'

dist_func = sp.spatial.distance.pdist

# Row position -> node id lookup, in the row order passed to linkage.
ind_to_id = vector_df.index.values
Z = linkage(vector_df, method=method, metric=metric)  ####### linkage


In [51]:
# Node -> type lookup, applied to the clustered nodes in row order.
meta_dict = fw_meta_df.set_index('node')['type'].to_dict()
types_unique = fw_meta_df['type'].unique()
ind_to_meta = list(map(meta_dict.get, ind_to_id))

# Reassign the types to a numeric value.
# NOTE(review): the codes are paired positionally with the order in which types
# first appear in fw_meta_df, and zip stops at the shorter sequence - confirm the
# file still has these 19 types in this order.
coarse_codes = [0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 4, 5, 5, 5, 5]
types_to_numeric = dict(zip(types_unique, coarse_codes))
types_unique = set(types_to_numeric.values())

# Numeric code per clustered node, then the node ids listed under each code.
ind_to_numeric_meta = [types_to_numeric.get(meta) for meta in ind_to_meta]
node_code_df = pd.DataFrame(zip(ind_to_id, ind_to_numeric_meta))
coarse_type_dict = node_code_df.groupby(1)[0].apply(list).to_dict()



In [52]:
# Evaluate the dendrogram Z against the 'type' labels in fw_meta_df.
otype_meta = fw_meta_df.set_index('node').loc[ind_to_id].reset_index()
otype_total_dict = otype_meta.groupby('type')['node'].apply(list).to_dict()
otype_stats_df = get_clustering_stats(Z=Z, ctype_to_id=otype_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p (NaN where both are 0), and m_p as a share of len.
otype_stats_df['av_size'] = otype_stats_df['m_p'].div(otype_stats_df['n_p'])
otype_stats_df['coverage'] = otype_stats_df['m_p'].div(otype_stats_df['len'])


In [53]:
otype_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,Algae,3,0.333333,0.040541,0,0,3,,0.0
1,Bacteria,4,0.25,0.054054,0,0,4,,0.0
2,Birds,16,0.375,0.179775,4,12,4,3.0,0.75
3,Copepods,4,0.75,0.5,1,3,1,3.0,0.75
4,Crabs,5,0.2,0.041322,0,0,5,,0.0
5,Crustaceans,4,0.5,0.033058,1,2,2,2.0,0.5
6,Fish,48,0.125,0.396694,14,41,7,2.928571,0.854167
7,Macrobenthos,6,0.166667,0.049587,0,0,6,,0.0
8,Plankton,7,0.142857,0.057851,0,0,7,,0.0
9,Plants,4,0.5,0.25,1,2,2,2.0,0.5


In [54]:
otype_stats_df.m_p.sum() / otype_stats_df.len.sum()

0.5555555555555556

In [55]:
# # coarsetype_total_dict = fw_meta_df.set_index('node').loc[ind_to_id].reset_index().groupby('type').node.apply(list).to_dict()
# coarsetype_stats_df = get_clustering_stats(Z=Z, ctype_to_id=coarse_type_dict, ind_to_id=ind_to_id)
# coarsetype_stats_df['av_size'] = coarsetype_stats_df['m_p']/coarsetype_stats_df['n_p']
# coarsetype_stats_df['coverage'] = coarsetype_stats_df['m_p']/(coarsetype_stats_df['len'])


In [56]:
# Evaluate the dendrogram Z against the 'coarse_type' labels in rel_meta_df.
coarsetype_total_dict = (
    rel_meta_df.loc[ind_to_id]
    .reset_index()
    .groupby('coarse_type')['index']
    .apply(list)
    .to_dict()
)

# Fix: pass the dict built in this cell. The call previously received
# `coarse_type_dict` from an earlier cell, so `coarsetype_total_dict` was never
# used and the result silently depended on that other cell's hand-written mapping.
coarsetype_stats_df = get_clustering_stats(Z=Z, ctype_to_id=coarsetype_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of len.
coarsetype_stats_df['av_size'] = coarsetype_stats_df['m_p'] / coarsetype_stats_df['n_p']
coarsetype_stats_df['coverage'] = coarsetype_stats_df['m_p'] / coarsetype_stats_df['len']


In [57]:
coarsetype_stats_df.m_p.sum() / coarsetype_stats_df.len.sum()

0.7272727272727273

In [58]:
coarsetype_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,0,21,0.761905,0.173554,2,18,3,9.0,0.857143
1,1,11,0.272727,0.090909,1,3,8,3.0,0.272727
2,2,19,0.157895,0.157025,5,12,7,2.4,0.631579
3,3,48,0.125,0.396694,14,41,7,2.928571,0.854167
4,4,16,0.375,0.179775,4,12,4,3.0,0.75
5,5,6,0.333333,0.067416,1,2,4,2.0,0.333333


In [59]:
coarsetype_stats_df.pms.mean()

0.17756213823629555

In [60]:
# Evaluate the dendrogram Z against the 'layer' labels in rel_meta_df.
layer_meta = rel_meta_df.loc[ind_to_id].reset_index()
layer_total_dict = layer_meta.groupby('layer')['index'].apply(list).to_dict()
layer_stats_df = get_clustering_stats(Z=Z, ctype_to_id=layer_total_dict, ind_to_id=ind_to_id)

# Derived columns: m_p per n_p, and m_p as a share of len.
layer_stats_df['av_size'] = layer_stats_df['m_p'].div(layer_stats_df['n_p'])
layer_stats_df['coverage'] = layer_stats_df['m_p'].div(layer_stats_df['len'])


In [61]:
layer_stats_df.m_p.sum() / layer_stats_df.len.sum()

0.6528925619834711

In [62]:
layer_stats_df

Unnamed: 0,labels,len,lps,pms,n_p,m_p,s_p,av_size,coverage
0,0,14,0.214286,0.189189,4,10,4,2.5,0.714286
1,1,8,0.25,0.066116,1,2,6,2.0,0.25
2,2,27,0.148148,0.22314,5,14,13,2.8,0.518519
3,3,41,0.146341,0.338843,10,31,10,3.1,0.756098
4,4,31,0.290323,0.348315,7,22,9,3.142857,0.709677


In [63]:
layer_stats_df.pms.mean()

0.23312059389689624