# Imports

In [1]:
import numpy as np 
import scipy as sp
import pandas as pd
import os
import time
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import pickle
import itertools
# import graphviz
from matplotlib.pyplot import cm
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import re
# import plotly.express as px

# import networkx as nx
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, to_tree



In [14]:
from module.dendorgram_utils import dendrogram_clustering
from module.data_prep import prep_grn_data, prep_go_meta, prep_foodweb_data, prep_recipe_data
# from module.bipartite_vectorisation import get_bpt_dict, all_ud_bpt_vectors
# from module.mat_bpt_vec import *

from fvec.fvec import bipartite_cooarray, adjacency_cooarray, csr_row_norm, csr_col_norm

In [3]:
# Master switch for disk writes: every directory creation and every `to_parquet`
# call in the cells below is guarded by this flag, so with False nothing is written.
save_things = False

# Load data:

In [4]:
# Arabidopsis gene regulatory network: the regulatory edge list (`pre` -> `post`)
# together with the gene/GO annotation table, plus the GO metadata.
g_reg_edges, GO_bipartite_df = prep_grn_data()
go_meta_df = prep_go_meta()

# Dummy metadata frame in which each GO term is used both as node id and as its own type.
GO_meta = pd.DataFrame(dict.fromkeys(('node', 'type'), GO_bipartite_df['GO']))

  data = pd.read_csv('../../data/grn/AtRegNet.csv', index_col=0)


In [5]:
# Preview the regulatory edge list (columns `pre` and `post`).
g_reg_edges

Unnamed: 0,pre,post
0,AT5G10140,AT1G65480
1,AT5G11260,AT1G27480
2,AT5G11260,AT5G53370
3,AT5G11260,AT1G03630
4,AT5G11260,AT1G13600
...,...,...
1638795,AT5G13080,AT5G67540
1638796,AT5G13080,AT5G67570
1638797,AT5G13080,AT5G67620
1638798,AT5G13080,AT5G67630


# Vectorisation:

## GRN vectorisation:

In [6]:
# Only touch the filesystem when saving is enabled.
if save_things:
    grn_datadir = '../../data/grn'
    grn_processed_dir = f'{grn_datadir}/processed'
    # exist_ok=True turns this into a no-op when the directory is already present
    os.makedirs(grn_processed_dir, exist_ok=True)

In [7]:
# Node sets of the regulatory network and of the GO annotation table.
tfs = g_reg_edges.pre.unique()  # all transcription factors
regulated_genes = g_reg_edges.post.unique()  # all genes that are regulated

# Build each id set once and reuse it for the unions below.
_grn_id_set = set(regulated_genes) | set(tfs)
_go_gene_set = set(GO_bipartite_df.gene)

# Sort instead of list(set(...)): iteration order of a set of strings depends on the
# per-process hash seed, so the unsorted lists (and therefore the row order of every
# matrix/dataframe built from them) changed between kernel restarts.
# key=str keeps the sort well-defined even if a non-string value (e.g. NaN) slips in.
g_reg_ids = sorted(_grn_id_set, key=str)  # all nodes in the regulatory network
gene_ids = sorted(_go_gene_set, key=str)  # genes that have GO labelling

all_gene_ids = sorted(_grn_id_set | _go_gene_set, key=str)

In [8]:
# Number of distinct `pre` nodes (transcription factors) in the edge list.
len(tfs)

580

In [9]:
# Number of distinct genes that carry at least one GO annotation row.
len(gene_ids)

31031

In [10]:
# # aggregate all GO terms for each gene and save it
# pre_post= ['gene', 'GO', 'gene', 'GO']
# fpath_prefix = grn_processed_dir + '/go_list_dict'
# go_dict = get_bpt_dict(ids=gene_ids, bpt_df=GO_bipartite_df,pre_post=pre_post, fpath_prefix=fpath_prefix)

# # ~4.5 mins

In [11]:
# How many annotation rows each GO term has.
v = GO_bipartite_df['GO'].value_counts()

# Partition the terms by frequency.
single_GOs = v.index[(v <= 1).to_numpy()].tolist()  # GO terms that only appear once
multi_GOs = v.index[(v > 1).to_numpy()].tolist()  # GO terms that appear more than once


# only consider GO terms that appear more than once:
# prefix = grn_processed_dir + '/grn_GO_vectors_mutligos' # all the gene ids in the regulatory network, in/out GO vectors, ignore single GO terms.
# grn_go_vectors = all_ud_bpt_vectors(ids=g_reg_ids, edges=g_reg_edges, bpt_dict=go_dict, keep=multi_GOs, prefix=prefix)

# # ~28 mins

In [12]:
# Keep only annotation rows whose GO term occurs more than once, with a fresh 0..n-1 index.
GO_bipartite_multigos_df = (
    GO_bipartite_df
    .loc[GO_bipartite_df['GO'].isin(multi_GOs)]
    .reset_index(drop=True)
)

In [15]:
# Turn the two edge lists into sparse COO arrays that share `all_gene_ids` as row order.

# gene x GO annotation matrix (unweighted)
bpt_gG_coo, gene_row, GO_col = bipartite_cooarray(
    df=GO_bipartite_multigos_df,
    row_col=['gene', 'GO'],
    weight=False,
    row_order=all_gene_ids,
)

# gene x gene regulatory adjacency (directed, unweighted)
a_gg_coo, _ = adjacency_cooarray(
    df=g_reg_edges,
    row_col=['pre', 'post'],
    id_order=all_gene_ids,
    weight=False,
    directed=True,
)

In [21]:
# NOTE(review): the following cell repeats this computation and reassigns every name
# set here (including `all_norm_vec_df`), so this unfiltered version is superseded
# as soon as that cell runs.

# Construct vector elements from the bipartite annotations:
a_gG_out, a_gG_in = (
    (adjacency @ bpt_gG_coo).tocsr()
    for adjacency in (a_gg_coo, a_gg_coo.T)  # out matrix, then in matrix
)

# Normalise the out and in matrices independently with csr_row_norm.
a_gG_out_normalised, a_gG_in_normalised = (
    csr_row_norm(mat) for mat in (a_gG_out, a_gG_in)
)

# Column labels: every GO term once with an '_out' and once with an '_in' suffix.
all_col_names = np.concatenate(
    [np.char.add(GO_col.astype(str), suffix) for suffix in ('_out', '_in')]
)

# Dense dataframe with the out block followed by the in block, one row per gene id.
all_norm_vec_df = pd.DataFrame(
    sp.sparse.hstack([a_gG_out_normalised, a_gG_in_normalised]).toarray(),
    index=all_gene_ids,
    columns=all_col_names,
)




In [30]:
# Build the normalised GO in/out vectors of the gene regulatory network and drop
# all-zero rows and columns before the result is densified into a dataframe.

# Construct vector elements from the bipartite annotations (sparse matrix products):
a_gG_out = (a_gg_coo @ bpt_gG_coo).tocsr()  # out matrix
a_gG_in = (a_gg_coo.T @ bpt_gG_coo).tocsr()  # in matrix

# Normalise the out and in matrices independently.
a_gG_out_normalised = csr_row_norm(a_gG_out)
a_gG_in_normalised = csr_row_norm(a_gG_in)

#######
# Remove redundant (all-zero) columns and rows.
out_col_sums = np.asarray(a_gG_out_normalised.sum(axis=0)).ravel()
in_col_sums = np.asarray(a_gG_in_normalised.sum(axis=0)).ravel()
out_col_keep = out_col_sums != 0
in_col_keep = in_col_sums != 0

out_row_sums = np.asarray(a_gG_out_normalised.sum(axis=1)).ravel()
in_row_sums = np.asarray(a_gG_in_normalised.sum(axis=1)).ravel()
out_row_keep = out_row_sums != 0
in_row_keep = in_row_sums != 0
# A row is kept when it is non-empty in the out OR the in matrix; `|` states the
# logical OR explicitly instead of relying on `+` between boolean arrays.
row_keep = out_row_keep | in_row_keep

# Columns and rows to keep:
a_gG_out_normalised = a_gG_out_normalised[:, out_col_keep][row_keep, :]
a_gG_in_normalised = a_gG_in_normalised[:, in_col_keep][row_keep, :]

# Column and row names to keep. The labels are cast to str first (as the unfiltered
# version of this cell already does) because np.char.add only accepts string arrays.
go_labels = np.asarray(GO_col).astype(str)
all_col_names = np.concatenate([
    np.char.add(go_labels[out_col_keep], '_out'),
    np.char.add(go_labels[in_col_keep], '_in'),
])
all_row_names = np.asarray(all_gene_ids)[row_keep]
#######

# Construct a dataframe: out block followed by in block.
all_norm_vec_df = pd.DataFrame(
    sp.sparse.hstack([a_gG_out_normalised, a_gG_in_normalised]).toarray(),
    columns=all_col_names,
    index=all_row_names,
)


In [31]:
# Write the GRN vectors to parquet when saving is enabled.
# (Author's note: saving/loading takes longer than recalculating.)
if save_things:
    all_norm_vec_df.to_parquet(f'{grn_processed_dir}/grn_GO_vector_multigos.parquet')

## Recipe network vectorisation:

In [32]:
# Only touch the filesystem when saving is enabled.
if save_things:
    rn_datadir = '../../data/rn'
    rn_processed_dir = f'{rn_datadir}/processed'
    # exist_ok=True turns this into a no-op when the directory is already present
    os.makedirs(rn_processed_dir, exist_ok=True)

In [33]:
# Recipe network: recipe/ingredient rows (rn_df) and recipe/cuisine rows (meta_df).
rn_df, meta_df = prep_recipe_data()

# Distinct labels in order of first appearance.
cuisine_list = meta_df.cuisine.dropna().unique()
recipe_list = rn_df.r_id.dropna().unique()

# Distinct, non-empty ingredients. Sorted so the column order of the ingredient
# matrices is reproducible: the order of list(set(...)) over strings depends on the
# per-process hash seed and therefore changed between kernel restarts.
ingredient_list = sorted(set(rn_df.ingredient.dropna()) - {''}, key=str)

# Drop rows whose ingredient is missing or the empty string.
rn_df = rn_df[rn_df['ingredient'].isin(ingredient_list)].reset_index(drop=True)

In [34]:
# Show the distinct cuisine labels found in the recipe metadata.
cuisine_list

array(['Vietnamese', 'Indian', 'French', 'Jewish', 'Spanish_Portuguese',
       'Central_SouthAmerican', 'Cajun_Creole', 'Thai', 'Scandinavian',
       'Greek', 'American', 'African', 'MiddleEastern',
       'EasternEuropean_Russian', 'Italian', 'Irish', 'Mexican',
       'Chinese', 'German', 'Mediterranean', 'Japanese', 'Moroccan',
       'Southern_SoulFood', 'English_Scottish', 'Asian', 'Southwestern'],
      dtype=object)

In [35]:
# recipe x cuisine bipartite array (unweighted), rows and columns in the given orders.
# Line continuations are unnecessary inside the call parentheses.
a_RC_coo, recipe_row, cuisine_col = bipartite_cooarray(
    df=meta_df.sort_values(['r_id', 'cuisine']),
    row_col=['r_id', 'cuisine'],
    weight=False,
    row_order=list(recipe_list),
    col_order=list(cuisine_list),
)

In [36]:
# recipe x ingredient bipartite array (unweighted), rows and columns in the given orders.
# Note that `recipe_row` from the recipe x cuisine cell is rebound here.
a_RI_coo, recipe_row, ingredient_col = bipartite_cooarray(
    df=rn_df.sort_values(['r_id', 'ingredient']),
    row_col=['r_id', 'ingredient'],
    weight=False,
    row_order=list(recipe_list),
    col_order=list(ingredient_list),
)

In [37]:
# cuisine x ingredient matrix: (recipe x cuisine)^T @ (recipe x ingredient)
a_CI_csr = (a_RC_coo.T @ a_RI_coo).tocsr()  # csr array

# Normalisation via csr_row_norm:
a_CI_csr_normalised = csr_row_norm(a_CI_csr)

# Dense dataframe: one row per cuisine, one column per ingredient.
norm_is_c_vec_df = pd.DataFrame(
    a_CI_csr_normalised.toarray(),
    index=cuisine_col,
    columns=ingredient_col,
)

In [38]:
# Ingredient x cuisine view of the cuisine x ingredient matrix.
a_IC_csr = a_CI_csr.T

# Standardisation - correct for the number of recipes in each cuisine by scaling
# every cuisine column with 1 / (number of recipe rows of that cuisine).
r_sums = np.array(a_RC_coo.tocsr().sum(axis=0)).ravel()
r_sums[r_sums == 0] = 1  # divide by 1 instead of zero
inv_r_sums = sp.sparse.diags(1 / r_sums)
standardised_a_IC_csr = a_IC_csr @ inv_r_sums

# Column normalisation is deliberately NOT used here: it would divide by the total
# number of ingredients used in a cuisine, which is not the same as dividing by the
# number of recipes in each cuisine.
# standardised_a_IC_csr = csr_col_norm(a_IC_csr)

# Normalisation via csr_row_norm:
a_IC_csr_s_normalised = csr_row_norm(standardised_a_IC_csr)

# Dense dataframe: one row per ingredient, one column per cuisine.
cnorm_cs_i_bpt_df = pd.DataFrame(
    a_IC_csr_s_normalised.toarray(),
    index=ingredient_col,
    columns=cuisine_col,
)

In [39]:
# standardised_a_IC_df = pd.DataFrame(standardised_a_IC_csr.toarray(), columns= cuisine_col, index=ingredient_col)

In [40]:
# Write both recipe-network dataframes to parquet when saving is enabled.
if save_things:
    for frame, stem in ((norm_is_c_vec_df, 'is_c_vectors'), (cnorm_cs_i_bpt_df, 'cs_i_vectors')):
        frame.to_parquet(f'{rn_processed_dir}/{stem}.parquet')

In [41]:
# old_bpt_df = pd.read_parquet('../data/recipe/old/cs_i_bpt_counts_bp_counts_vec.parquet')

In [42]:

# df = pd.DataFrame(a_IC_csr.toarray(), columns= cuisine_col, index=ingredient_col)

In [43]:
# df.sort_index(axis=1).sort_index(axis=0)

In [44]:
# old_bpt_df.sort_index(axis=1).sort_index(axis=0).iloc[1:,:].fillna(0)

In [45]:
# abs(old_bpt_df.sort_index(axis=1).sort_index(axis=0).iloc[1:,:].fillna(0) - df.sort_index(axis=1).sort_index(axis=0))

In [46]:
# abs(old_bpt_df.sort_index(axis=1).sort_index(axis=0).iloc[1:,:].fillna(0) - df.sort_index(axis=1).sort_index(axis=0)).max()

In [47]:
# abs(old_bpt_df.sort_index(axis=1).sort_index(axis=0).iloc[1:,:].fillna(0) - df.sort_index(axis=1).sort_index(axis=0)).sort_values('American')

In [48]:
# A_recipes = meta_df.query('cuisine=="American"').r_id.unique()

In [49]:
# rn_df[rn_df.r_id.isin(A_recipes)].query('ingredient=="garlic"')

In [50]:
# meta_df.sort_values('r_id')

In [51]:
# len(rn_df[rn_df.r_id.isin(A_recipes)].query('ingredient=="garlic"').r_id.unique())

In [52]:
# df.loc['garlic', 'American']

## Scratch / testing (the cells in this section are commented out)

In [53]:
# def edgelist_to_adjmat(edgelist):
#     '''Turn an edgelist into an adjacency matrix
    
#     edgelist is a numpy array of source-sink pairs. 

#     indices is the order in which the columns/rows are ordered. 
#     '''

#     sources, sinks = edgelist.T

#     all_ids = set(sources) | set(sinks)

#     n_all_ids = len(all_ids)

#     adj_mat = np.zeros((n_all_ids,n_all_ids))

#     i = 0
#     ind_dict = {}
#     ind_to_id = []
#     for edge in edgelist:
        
#         source_edge = ind_dict.get(edge[0])
#         if source_edge==None:
#             ind_dict[edge[0]] = i
#             source_edge = i
#             ind_to_id.append(edge[0])
#             i+=1
#         sink_edge = ind_dict.get(edge[1])
#         if sink_edge==None:
#             ind_dict[edge[1]] = i
#             sink_edge = i
#             ind_to_id.append(edge[1])

#             i+=1

#         adj_mat[source_edge, sink_edge] = 1

#     return adj_mat, ind_dict, ind_to_id
        
# def edgelist_to_bptadjmat(edgelist):
#     '''Turn an edgelist into an bipartite adjacency matrix

#     First column and second column are treated independently in the adjacency matrix
#     DOESN't necessarily produce a square matrix.
    
#     edgelist is a numpy array of source-sink pairs. 

#     indices is the order in which the columns/rows are ordered. 
#     '''

#     sources, sinks = edgelist.T

#     all_ids = set(sources) | set(sinks)

#     # n_all_ids = len(all_ids)

#     adj_mat = np.zeros((len(set(sources)),len(set(sinks)))) # sources/sinks on first/second axis

#     so_i = 0
#     si_i = 0

#     source_ind_dict = {}
#     sink_ind_dict = {}

#     source_ind_to_id = []
#     sink_ind_to_id = []

#     for edge in edgelist:
        
#         source_edge = source_ind_dict.get(edge[0])
#         if source_edge==None:
#             source_ind_dict[edge[0]] = so_i
#             source_edge = so_i
#             source_ind_to_id.append(edge[0])
#             so_i+=1
#         sink_edge = sink_ind_dict.get(edge[1])
#         if sink_edge==None:
#             sink_ind_dict[edge[1]] = si_i
#             sink_edge = si_i
#             sink_ind_to_id.append(edge[1])

#             si_i+=1

#         adj_mat[source_edge, sink_edge] = int(1)

#     return adj_mat, source_ind_to_id, sink_ind_to_id
        
# # sample_edges = np.array([['a', 'x'], ['b', 'c']])
# # edgelist_to_bptadjmat(edgelist=sample_edges)


In [54]:
# a_RC, a_RC_source_inds, a_RC_sink_inds = edgelist_to_bptadjmat(edgelist=meta_df.sort_values(['r_id', 'cuisine']).to_numpy())

In [55]:
# metric = 'euclidean'
# method = 'ward'
# ind_to_id = meta_df.cuisine.unique()
# Z = linkage(norm_is_c_vec_df.loc[ind_to_id, ingredient_list], metric=metric, method=method) ####### linkage

# fig = plt.figure(figsize=(6, 15))
# # fig.suptitle(f'{M} neighbours', fontsize=16)

# gs0 = GridSpec(1,2, figure=fig,width_ratios=[35,1], wspace=0.05)
# gs1 = GridSpecFromSubplotSpec(2,1, subplot_spec=gs0[0],
#                                                   height_ratios=[7,1],
#                                                   hspace=0.05)
# ax_col_dendrogram = fig.add_subplot(gs1[0])
# col_dendrogram = dendrogram(Z, ax=ax_col_dendrogram, color_threshold=0, above_threshold_color='black', labels=ind_to_id, orientation='right')
# xind = col_dendrogram['leaves']
# xmin,xmax = ax_col_dendrogram.get_xlim()
# plt.gca()
# plt.rcParams['axes.spines.left'] = False
# plt.rcParams['axes.spines.right'] = False
# plt.rcParams['axes.spines.top'] = False
# plt.rcParams['axes.spines.bottom'] = False
# ax_col_dendrogram.set_xticks([])
# plt.yticks(size=25)

# # plt.xticks(rotation=90, ha='center', size=30)

# plt.show()

In [56]:
# metric = 'euclidean'
# method = 'ward'
# ind_to_id = ingredient_list
# Z = linkage(cnorm_cs_i_bpt_df.loc[ingredient_list].sort_index(axis=1), metric=metric, method=method) ####### linkage

# ax = sns.clustermap(

#     cnorm_cs_i_bpt_df.loc[ingredient_list].sort_index(axis=1), metric=metric, method=method,
#     figsize=(10, 35),
#     # row_cluster=False,
#     dendrogram_ratio=(.5, .0),
#     yticklabels=True, xticklabels=True,cmap='rocket_r', standard_scale=None,
#     cbar_kws={'ticks':[0,0.5, 1]},
#     cbar_pos=(0.15, 0.07, .1, .15), col_cluster=False
# )

# cbar = ax.ax_cbar

# # here set the labelsize by 20
# cbar.tick_params(labelsize=15)
# # plt.savefig('./figs/2024/s_cs_i_clustermap.png', bbox_inches='tight')
# plt.show()

# Food web:

In [57]:
# Only touch the filesystem when saving is enabled.
if save_things:
    fw_datadir = '../../data/fw'
    fw_processed_dir = f'{fw_datadir}/processed'
    # exist_ok=True turns this into a no-op when the directory is already present
    os.makedirs(fw_processed_dir, exist_ok=True)

In [58]:
# def vectorisation(edge_df, meta_df, edge_row_cols, meta_row_cols, \
#                   edge_weight=True, meta_weight=True, id_order=None, \
#                   redundant_remove=False):

#     adj_AA_coo, _id_order = adjacency_cooarray(df=edge_df, 
#                                                row_col=edge_row_cols, 
#                                                id_order=id_order, 
#                                                weight=edge_weight, 
#                                                directed=True) 
    
#     bpt_AB_coo, row_order, col_order = bipartite_cooarray(df=meta_df, 
#                                                           row_col=meta_row_cols, 
#                                                           weight=meta_weight, 
#                                                           row_order=_id_order)

#     adj_AA_out = (adj_AA_coo @ bpt_AB_coo).tocsr() # out matrix
#     adj_AA_in = (adj_AA_coo.T @ bpt_AB_coo).tocsr() # in matrix

#     # out matrix normalisation:
#     adj_AA_out_normalised = csr_row_norm(adj_AA_out)

#     # in matrix normalisation:
#     adj_AA_in_normalised = csr_row_norm(adj_AA_in)

#     # # put together into one dataframe:
#     # all_col_names = np.concatenate([col_order + '_out', col_order + '_in'])
#     # all_norm_vec_df = pd.DataFrame(sp.sparse.hstack([adj_AA_out_normalised, adj_AA_in_normalised]).toarray(), \
#     #                             columns=all_col_names, index=row_order)

#     # by default, keep all:
#     out_col_keep, in_col_keep = [[True for _ in col_order] for i in range(2)]
#     row_keep = [True for _ in row_order]

#     if redundant_remove:
#         # remove: redundant columns and rows that are empty
#         out_col_sums = np.array(adj_AA_out_normalised.sum(axis=0)).flatten()  
#         out_col_keep = (out_col_sums != 0)
#         in_col_sums = np.array(adj_AA_in_normalised.sum(axis=0)).flatten()  
#         in_col_keep = (in_col_sums != 0)

#         out_row_sums = np.array(adj_AA_out_normalised.sum(axis=1)).flatten()  
#         in_row_sums = np.array(adj_AA_in_normalised.sum(axis=1)).flatten()  
#         out_row_keep = (out_row_sums != 0)
#         in_row_keep = (in_row_sums != 0)
#         row_keep = out_row_keep + in_row_keep # only false if both in/out are false...

#     # columns and rows to keep:
#     adj_AA_out_normalised = adj_AA_out_normalised[:, out_col_keep][row_keep,:]
#     adj_AA_in_normalised = adj_AA_in_normalised[:, in_col_keep][row_keep,:]

#     # column and row names to keep:
#     all_col_names = np.concatenate([col_order[out_col_keep] + '_out', 
#                                     col_order[in_col_keep] + '_in'])
#     all_row_names = np.array(row_order)[row_keep]

#     # construct a dataframe:
#     all_norm_vec_df = pd.DataFrame(sp.sparse.hstack([adj_AA_out_normalised, adj_AA_in_normalised]).toarray(), \
#                                    columns=all_col_names, index=all_row_names)
#     return all_norm_vec_df

In [59]:
# Food-web edge list plus three metadata tables (used below under the labels 'f', 'ff', 'fff').
(
    fw_df,
    meta_df_filtered,
    meta_fine_df_filtered,
    meta_fine_df_2_filtered,
) = prep_foodweb_data()

In [60]:
# Every node that occurs as prey or as predator, in sorted order.
preys = set(fw_df['prey'])
predators = set(fw_df['predator'])

all_animals = sorted(preys.union(predators))

In [61]:
# (metadata table, label) pairs for the three metadata resolutions:
# f for fine
# ff for finer
# fff for finest

# Materialised as a list: a bare zip object is a one-shot iterator, so re-running the
# loop cell that consumes `prep` would otherwise silently iterate over nothing.
prep = list(zip(
    (meta_df_filtered, meta_fine_df_filtered, meta_fine_df_2_filtered),
    ('f', 'ff', 'fff'),
))

In [63]:
# Vectorise the food web once per (metadata table, label) pair in `prep`.

# The prey/predator adjacency does not depend on the metadata table, so it is built
# once here instead of being rebuilt on every loop iteration.
a_AA_coo, _ = adjacency_cooarray(
    df=fw_df,
    row_col=['prey', 'predator'],
    id_order=all_animals,
    weight=False,
    directed=True,
)

for meta, label in prep:
    # node x type annotation matrix for this metadata resolution
    bpt_AS_coo, animal_row, type_col = bipartite_cooarray(
        df=meta.sort_values(['node', 'type']),
        row_col=['node', 'type'],
        weight=False,
        row_order=all_animals,
    )

    a_AS_out = (a_AA_coo @ bpt_AS_coo).tocsr()  # out matrix
    a_AS_in = (a_AA_coo.T @ bpt_AS_coo).tocsr()  # in matrix

    # out matrix normalisation:
    a_AS_out_normalised = csr_row_norm(a_AS_out)

    # in matrix normalisation:
    a_AS_in_normalised = csr_row_norm(a_AS_in)

    # Put together into one dataframe. Labels are cast to str first because
    # np.char.add only accepts string arrays (same as the GRN vectorisation).
    type_labels = np.asarray(type_col).astype(str)
    all_col_names = np.concatenate([
        np.char.add(type_labels, '_out'),
        np.char.add(type_labels, '_in'),
    ])
    all_norm_vec_df = pd.DataFrame(
        sp.sparse.hstack([a_AS_out_normalised, a_AS_in_normalised]).toarray(),
        columns=all_col_names,
        index=animal_row,
    )

    if save_things:
        all_norm_vec_df.to_parquet(f'{fw_processed_dir}/{label}_vectors.parquet')

In [64]:


# bpt_AS_coo, animal_row, finest_type_col = bipartite_cooarray(df=meta_fine_df_2_filtered.sort_values(['node', 'type']), row_col=['node', 'type'], weight=False, row_order=all_animals)
# a_AA_coo, _ = adjacency_cooarray(df=fw_df, row_col=['prey', 'predator'], id_order=all_animals, weight=False, directed=True)

# a_AS_out = (a_AA_coo @ bpt_AS_coo).tocsr() # out matrix
# a_AS_in = (a_AA_coo.T @ bpt_AS_coo).tocsr() # in matrix

# # out matrix normalisation:
# a_AS_out_normalised = csr_row_norm(a_AS_out)

# # in matrix normalisation:
# a_AS_in_normalised = csr_row_norm(a_AS_in)

# # put together into one dataframe:
# all_col_names = np.concatenate([finest_type_col + '_out', finest_type_col + '_in'])
# all_norm_vec_df = pd.DataFrame(sp.sparse.hstack([a_AS_out_normalised, a_AS_in_normalised]).toarray(), columns=all_col_names, index=animal_row)

# all_norm_vec_df.to_parquet('../data/foodweb/processed/finest_vectors.parquet')

# _Drosophila_ VNC connectome

In [65]:
from module.data_prep_drosophila import drosophila_data


In [66]:
# Drosophila VNC data: the data directory is keyed by the date of the data pull.
datadate = '20241119'  # date of drosophila VNC data pull
dm_datadir = f'../../data/{datadate}_dm_data'

# Only touch the filesystem when saving is enabled.
if save_things:
    dm_processed_dir = f'{dm_datadir}/processed'
    # exist_ok=True turns this into a no-op when the directory is already present
    os.makedirs(dm_processed_dir, exist_ok=True)

# NOTE: this rebinds `meta_df`, which held the recipe metadata in the recipe section.
df, meta_df = drosophila_data(datadate=datadate)

In [67]:
# Metadata column used as the neuron type label.
h_cat_string = 'hemilineage'
# hemilineage_fileprefix = f'{dm_datadir}/processed/hemilineage'

# Two-column (id, type) view of the neuron metadata.
h_cat_meta = meta_df.loc[:, ['bodyId', h_cat_string]].rename(
    columns={'bodyId': 'id', h_cat_string: 'type'}
)

# All neuron ids - taken BEFORE filtering so that untyped neurons stay in the id order.
n_ids = h_cat_meta['id'].sort_values().unique()

# Discard rows with missing values, then rows of the placeholder type 'TBD'.
remove = ['TBD']
h_cat_meta = h_cat_meta.dropna()
h_cat_meta = h_cat_meta[~h_cat_meta['type'].isin(remove)]

In [68]:
# Weighted, directed neuron x neuron adjacency over all neuron ids (author's timing: ~24 s).
a_NN_coo, _ = adjacency_cooarray(
    df=df,
    row_col=['bodyId_pre', 'bodyId_post'],
    id_order=n_ids,
    weight=True,
    directed=True,
)

In [69]:
# Unweighted neuron x hemilineage annotation matrix, rows in the same order as the adjacency.
bpt_NH_coo, neuron_row, type_col = bipartite_cooarray(
    df=h_cat_meta,
    row_col=['id', 'type'],
    weight=False,
    row_order=n_ids,
)


In [70]:
# Project the adjacency (out) and its transpose (in) onto the type annotations.
a_NH_out, a_NH_in = (
    (adjacency @ bpt_NH_coo).tocsr()
    for adjacency in (a_NN_coo, a_NN_coo.T)  # out matrix, then in matrix
)


In [71]:
# out matrix normalisation:
a_NH_out_normalised = csr_row_norm(a_NH_out)

# in matrix normalisation:
a_NH_in_normalised = csr_row_norm(a_NH_in)


In [72]:
# Column labels: every type once with an '_out' and once with an '_in' suffix.
all_col_names = np.concatenate(
    [np.char.add(type_col, suffix) for suffix in ('_out', '_in')]
)

# Dense dataframe with the out block followed by the in block, one row per neuron.
all_norm_vec_df = pd.DataFrame(
    sp.sparse.hstack([a_NH_out_normalised, a_NH_in_normalised]).toarray(),
    index=neuron_row,
    columns=all_col_names,
)

In [73]:

# Write the Drosophila vectors to parquet when saving is enabled.
if save_things:
    all_norm_vec_df.to_parquet('{}/{}_vec.parquet'.format(dm_processed_dir, h_cat_string))

# _C. elegans_ connectome

In [74]:
from module.data_prep_celegans import celegans_data, syn_to_edge

In [75]:
# C. elegans connectome: raw synapse table and neuron metadata.
edge_df, ce_meta = celegans_data()

# Two edge lists derived from the same table: electrical=False first, then electrical=True.
chem_edges_df, all_edges_df = (
    syn_to_edge(edge_df, electrical=include_electrical)
    for include_electrical in (False, True)
)

  edge_df = pd.read_csv('../../data/celegans/white_1986_whole.csv', delim_whitespace=True)


In [76]:
# Only touch the filesystem when saving is enabled.
if save_things:
    ce_datadir = '../../data/celegans'
    ce_processed_dir = ce_datadir + '/processed'
    # Create the C. elegans output directory itself. This cell previously tested and
    # created `grn_processed_dir` (copy-paste slip from the GRN section), which left
    # `ce_processed_dir` missing. exist_ok=True makes the call a no-op when it exists.
    os.makedirs(ce_processed_dir, exist_ok=True)


In [77]:
# Metadata column name -> short name used in output directory and file names.
shorthand = {
    'Anatomical cell class (WW Barry)': 'cellclass',
    'Cook cell category': 'cook_ccat',
    'Cell Class': 'ncclass',
    'Final classification': 'fclass',
}

# Classification columns to vectorise, in the dict's insertion order.
strings_list = list(shorthand)

# Edge-list variants to vectorise, keyed by the tag used in the output file name.
edge_dict = dict(chem=chem_edges_df, all=all_edges_df)


In [78]:
# Vectorise the C. elegans connectome for every (classification column, edge type) pair.
for class_string in strings_list:
    # Two-column (id, type) view of the metadata for this classification column.
    class_meta = (
        ce_meta.reset_index()[['Neuron', class_string]]
        .rename(columns={'Neuron': 'id', class_string: 'type'})
        .copy(True)
    )
    # Indexing (not .get) so an unmapped column fails loudly instead of yielding None.
    class_name = shorthand[class_string]
    if save_things:
        todir = f'../../data/celegans/processed/{class_name}'
        # makedirs also creates missing parent directories; os.mkdir raised
        # FileNotFoundError whenever '.../processed' did not exist yet.
        os.makedirs(todir, exist_ok=True)
    n_ids = class_meta.id.sort_values().unique()

    # The annotation matrix and the column labels depend only on the classification
    # column, so they are built once per class rather than once per edge type.
    bpt_NT_coo, neuron_row, type_col = bipartite_cooarray(
        df=class_meta,
        row_col=['id', 'type'],
        weight=False,
        row_order=n_ids,
    )
    # np.char.add only accepts string arrays, hence the explicit cast.
    type_labels = np.asarray(type_col).astype(str)
    all_col_names = np.concatenate([
        np.char.add(type_labels, '_out'),
        np.char.add(type_labels, '_in'),
    ])

    for edgetype, typeedge_df in edge_dict.items():
        a_NN_coo, _ = adjacency_cooarray(
            df=typeedge_df,
            row_col=['pre', 'post'],
            id_order=n_ids,
            weight=True,
            directed=True,
        )

        a_NT_out = (a_NN_coo @ bpt_NT_coo).tocsr()  # out matrix
        a_NT_in = (a_NN_coo.T @ bpt_NT_coo).tocsr()  # in matrix
        # out matrix normalisation:
        a_NT_out_normalised = csr_row_norm(a_NT_out)
        # in matrix normalisation:
        a_NT_in_normalised = csr_row_norm(a_NT_in)

        all_norm_vec_df = pd.DataFrame(
            sp.sparse.hstack([a_NT_out_normalised, a_NT_in_normalised]).toarray(),
            columns=all_col_names,
            index=neuron_row,
        )
        if save_things:
            all_norm_vec_df.to_parquet(f'{todir}/V_{class_name}_{edgetype}_vec.parquet')
        # todir = f'../../data/celegans/processed/{class_name}'
        # save_dir = f'{todir}/V_{class_name}_{edgetype}_vec.parquet'
        # test_df = pd.read_parquet(save_dir)
        # print((test_df - all_norm_vec_df).fillna(0).sum().sum())

In [95]:
# class_string = 'Cook cell category'
# typeedge_df = all_edges_df
# class_meta = ce_meta.reset_index()[['Neuron', class_string]].rename(columns={'Neuron':'id', class_string:'type'}).copy(True)


In [79]:
# NOTE(review): `class_meta` is not assigned in this cell, and the cell above that would
# assign it is commented out. The value used here is whatever the last iteration of the
# class loop left behind, so this cell only works after that loop has run.
n_ids = class_meta.id.sort_values().unique()

In [80]:
# Weighted, directed adjacency for `typeedge_df`. That name is only bound by the
# vectorisation loop above (its assignment in the preceding cell is commented out).
a_NN_coo, _ = adjacency_cooarray(
    df=typeedge_df,
    row_col=['pre', 'post'],
    id_order=n_ids,
    weight=True,
    directed=True,
)


In [81]:
# Unweighted neuron x type annotation matrix, rows in the same order as the adjacency.
bpt_NT_coo, neuron_row, type_col = bipartite_cooarray(
    df=class_meta,
    row_col=['id', 'type'],
    weight=False,
    row_order=n_ids,
)


In [82]:
# Project the adjacency (out) and its transpose (in) onto the type annotations.
a_NT_out, a_NT_in = (
    (adjacency @ bpt_NT_coo).tocsr()
    for adjacency in (a_NN_coo, a_NN_coo.T)  # out matrix, then in matrix
)

# Normalise the out and in matrices independently with csr_row_norm.
a_NT_out_normalised, a_NT_in_normalised = (
    csr_row_norm(mat) for mat in (a_NT_out, a_NT_in)
)


In [84]:
# Column labels: every type once with an '_out' and once with an '_in' suffix.
all_col_names = np.concatenate(
    [np.char.add(type_col, suffix) for suffix in ('_out', '_in')]
)

# Dense dataframe with the out block followed by the in block, one row per neuron.
all_norm_vec_df = pd.DataFrame(
    sp.sparse.hstack([a_NT_out_normalised, a_NT_in_normalised]).toarray(),
    index=neuron_row,
    columns=all_col_names,
)

In [101]:
# if save_things:
#     all_norm_vec_df.to_parquet(f'{c}')