# Import

In [1]:
import numpy as np 
import scipy as sp
import pandas as pd
# import os
from pathlib import Path


In [2]:
from module.data_prep import prep_grn_data, prep_go_meta, prep_foodweb_data, prep_recipe_data
from module.fvec import bipartite_cooarray, adjacency_cooarray, csr_row_norm, csr_col_norm

In [None]:
# Notebook-wide settings used by the cells below.
cwd = Path.cwd()     # current working directory; the data directories are derived from it
save_things = False  # the parquet outputs are only written when this is True


# Load data:

In [4]:
# Arabidopsis gene regulatory network (edge table + gene/GO table) and associated metadata
g_reg_edges, GO_bipartite_df = prep_grn_data()
go_meta_df = prep_go_meta()

# dummy dataframe: the GO column is reused for both the 'node' and the 'type' column
GO_meta = pd.DataFrame(dict.fromkeys(('node', 'type'), GO_bipartite_df['GO']))

  data = pd.read_csv('../data/grn/AtRegNet.csv', index_col=0)


# Vectorisation:

## GRN vectorisation:

In [5]:

# GRN data directory, i.e. '../data/grn' relative to the working directory
grn_datadir = cwd.parent.joinpath('data', 'grn')

# the output directory is only set up (and created on disk) when saving is enabled
if save_things:
    grn_processed_dir = grn_datadir.joinpath('processed')
    grn_processed_dir.mkdir(parents=True, exist_ok=True)


In [6]:
# Node lists for the regulatory network and the GO annotation table.
tfs = g_reg_edges.pre.unique() # all transcription factors
regulated_genes = g_reg_edges.post.unique() # all genes that are regulated

# A set has no defined iteration order, and for string ids the order can differ between
# kernel sessions. `all_gene_ids` is passed as the row/id order of the sparse matrices
# built below and becomes the index of the resulting dataframe, so the lists are sorted
# to make the output reproducible. `key=str` keeps the sort well-defined should the id
# columns contain non-string entries (e.g. NaN) -- TODO confirm whether they can.
g_reg_ids = sorted(set(regulated_genes) | set(tfs), key=str) # all nodes in the regulatory network
gene_ids = sorted(set(GO_bipartite_df.gene), key=str) # genes that have GO labelling

# union of both: every gene that is in the network and/or has a GO label
all_gene_ids = sorted(set(g_reg_ids) | set(gene_ids), key=str)

In [7]:
# number of rows of the gene/GO table in which each GO term occurs
v = GO_bipartite_df['GO'].value_counts()
multi_GOs = v.index[(v > 1).to_numpy()].tolist() # GO terms that appear more than once
single_GOs = v.index[(v <= 1).to_numpy()].tolist() # GO terms that only appear once

In [8]:
# keep only the gene/GO rows whose GO term appears more than once, with a fresh 0..n-1 index
GO_bipartite_multigos_df = (
    GO_bipartite_df
    .loc[GO_bipartite_df['GO'].isin(multi_GOs)]
    .reset_index(drop=True)
)

In [9]:
# Build the sparse arrays from the edge lists; both share `all_gene_ids` as gene ordering.

# gene x GO bipartite array (unweighted)
bpt_gG_coo, gene_row, GO_col = bipartite_cooarray(
    df=GO_bipartite_multigos_df,
    row_col=['gene', 'GO'],
    row_order=all_gene_ids,
    weight=False,
)

# gene x gene adjacency array of the regulatory network (directed, unweighted)
a_gg_coo, _ = adjacency_cooarray(
    df=g_reg_edges,
    row_col=['pre', 'post'],
    id_order=all_gene_ids,
    directed=True,
    weight=False,
)

In [10]:
# Sparse products of the gene-gene adjacency with the gene-GO bipartite array:
a_gG_out = (a_gg_coo @ bpt_gG_coo).tocsr()   # out matrix
a_gG_in = (a_gg_coo.T @ bpt_gG_coo).tocsr()  # in matrix (uses the transposed adjacency)

# Normalise the out and the in matrix with csr_row_norm:
a_gG_out_normalised = csr_row_norm(a_gG_out)
a_gG_in_normalised = csr_row_norm(a_gG_in)

# ---- drop redundant (all-zero) columns and rows ----
# A GO column is kept per matrix if its column total is non-zero in that matrix.
out_col_sums = np.asarray(a_gG_out_normalised.sum(axis=0)).ravel()
in_col_sums = np.asarray(a_gG_in_normalised.sum(axis=0)).ravel()
out_col_keep = out_col_sums != 0
in_col_keep = in_col_sums != 0

# A gene row is kept if its row total is non-zero in the out OR in the in matrix,
# i.e. it is only dropped when it is empty in both.
out_row_sums = np.asarray(a_gG_out_normalised.sum(axis=1)).ravel()
in_row_sums = np.asarray(a_gG_in_normalised.sum(axis=1)).ravel()
out_row_keep = out_row_sums != 0
in_row_keep = in_row_sums != 0
row_keep = out_row_keep | in_row_keep

# Apply the masks; both matrices share the row mask so they can be stacked side by side.
a_gG_out_normalised = a_gG_out_normalised[row_keep, :][:, out_col_keep]
a_gG_in_normalised = a_gG_in_normalised[row_keep, :][:, in_col_keep]

# Matching labels: gene ids for the rows, suffixed GO terms for the columns.
all_row_names = np.array(all_gene_ids)[row_keep]
all_col_names = np.concatenate([
    GO_col[out_col_keep] + '_out',
    GO_col[in_col_keep] + '_in',
])
# ---- end of redundancy removal ----

# Dense dataframe with the out columns followed by the in columns:
all_norm_vec_df = pd.DataFrame(
    sp.sparse.hstack([a_gG_out_normalised, a_gG_in_normalised]).toarray(),
    index=all_row_names,
    columns=all_col_names,
)


In [11]:
# Optionally write the GRN vectors to disk
# (note: takes longer to save/load than to recalculate...)
if save_things:
    all_norm_vec_df.to_parquet(grn_processed_dir.joinpath('grn_GO_vector_multigos.parquet'))

## Recipe network vectorisation:

In [12]:
# recipe data directory, i.e. '../data/recipe' relative to the working directory
rn_datadir = cwd.parent.joinpath('data', 'recipe')

# the output directory is only set up (and created on disk) when saving is enabled
if save_things:
    rn_processed_dir = rn_datadir.joinpath('processed')
    rn_processed_dir.mkdir(parents=True, exist_ok=True)


In [13]:
# recipe/ingredient table and recipe/cuisine metadata table
rn_df, meta_df = prep_recipe_data()

# unique non-missing labels, in order of first appearance
cuisine_list = meta_df.cuisine.dropna().unique()
recipe_list = rn_df.r_id.dropna().unique()

# Unique non-missing, non-empty ingredient names. They go through a set, whose iteration
# order is arbitrary and can change between kernel sessions for strings. This list is
# later used as the column order of the ingredient matrix, so it is sorted to keep the
# output reproducible. `key=str` keeps the sort well-defined should the column hold
# non-string values -- TODO confirm whether it can.
ingredient_list = sorted(set(rn_df.ingredient.dropna().unique()) - {''}, key=str)

# drop the rows whose ingredient is missing or empty
rn_df = rn_df[rn_df['ingredient'].isin(ingredient_list)].reset_index(drop=True)

In [14]:
# recipe x cuisine bipartite array (unweighted), with explicit row and column order
a_RC_coo, recipe_row, cuisine_col = bipartite_cooarray(
    df=meta_df.sort_values(['r_id', 'cuisine']),
    row_col=['r_id', 'cuisine'],
    row_order=list(recipe_list),
    col_order=list(cuisine_list),
    weight=False,
)

In [15]:
# recipe x ingredient bipartite array (unweighted), using the same recipe row order as a_RC_coo
a_RI_coo, recipe_row, ingredient_col = bipartite_cooarray(
    df=rn_df.sort_values(['r_id', 'ingredient']),
    row_col=['r_id', 'ingredient'],
    row_order=list(recipe_list),
    col_order=list(ingredient_list),
    weight=False,
)

In [16]:
# cuisine x ingredient matrix: (recipe x cuisine)^T @ (recipe x ingredient), stored as CSR
a_CI_csr = (a_RC_coo.T @ a_RI_coo).tocsr()

# normalise with csr_row_norm, then wrap in a dataframe labelled by cuisine and ingredient
a_CI_csr_normalised = csr_row_norm(a_CI_csr)
norm_is_c_vec_df = pd.DataFrame(
    a_CI_csr_normalised.toarray(),
    index=cuisine_col,
    columns=ingredient_col,
)

In [17]:
# ingredient x cuisine matrix. Transposing a CSR matrix yields CSC storage in scipy, so it
# is converted back explicitly: the variable is named (and below used) as CSR.
a_IC_csr = a_CI_csr.T.tocsr()

# standardisation - correct for the number of recipes in each cuisine:
r_sums = np.array(a_RC_coo.tocsr().sum(axis=0)).flatten()  # column totals of the recipe x cuisine array
r_sums[r_sums == 0] = 1 # divide by 1 instead of zero
inv_r_sums = sp.sparse.diags(1 / r_sums)
# right-multiplying by the diagonal matrix scales each cuisine column by 1 / r_sums;
# .tocsr() guarantees CSR input for csr_row_norm
# NOTE(review): csr_row_norm presumably relies on CSR storage -- confirm in module.fvec
standardised_a_IC_csr = (a_IC_csr @ inv_r_sums).tocsr()

# normalisation:
a_IC_csr_s_normalised = csr_row_norm(standardised_a_IC_csr)

# dense dataframe: one row per ingredient, one column per cuisine
cnorm_cs_i_bpt_df = pd.DataFrame(a_IC_csr_s_normalised.toarray(), columns=cuisine_col, index=ingredient_col)

In [18]:
# standardised_a_IC_df = pd.DataFrame(standardised_a_IC_csr.toarray(), columns= cuisine_col, index=ingredient_col)

In [19]:
# optionally write the cuisine vectors (rows: cuisines, columns: ingredients) to disk
if save_things:
    norm_is_c_vec_df.to_parquet(rn_processed_dir.joinpath('is_c_vectors.parquet'))

In [20]:
# optionally write the ingredient vectors (rows: ingredients, columns: cuisines) to disk
if save_things:
    cnorm_cs_i_bpt_df.to_parquet(rn_processed_dir.joinpath('cs_i_vectors.parquet'))

In [21]:
# old_cs_i_vectors = pd.read_parquet('../data/recipe/old_processed/cs_i_vectors.parquet')

# Food web:

In [22]:
# food web data directory, i.e. '../data/foodweb' relative to the working directory
fw_datadir = cwd.parent.joinpath('data', 'foodweb')

# the output directory is only set up (and created on disk) when saving is enabled
if save_things:
    fw_processed_dir = fw_datadir.joinpath('processed')
    fw_processed_dir.mkdir(parents=True, exist_ok=True)


In [23]:
# def vectorisation(edge_df, meta_df, edge_row_cols, meta_row_cols, \
#                   edge_weight=True, meta_weight=True, id_order=None, \
#                   redundant_remove=False):

#     adj_AA_coo, _id_order = adjacency_cooarray(df=edge_df, 
#                                                row_col=edge_row_cols, 
#                                                id_order=id_order, 
#                                                weight=edge_weight, 
#                                                directed=True) 
    
#     bpt_AB_coo, row_order, col_order = bipartite_cooarray(df=meta_df, 
#                                                           row_col=meta_row_cols, 
#                                                           weight=meta_weight, 
#                                                           row_order=_id_order)

#     adj_AA_out = (adj_AA_coo @ bpt_AB_coo).tocsr() # out matrix
#     adj_AA_in = (adj_AA_coo.T @ bpt_AB_coo).tocsr() # in matrix

#     # out matrix normalisation:
#     adj_AA_out_normalised = csr_row_norm(adj_AA_out)

#     # in matrix normalisation:
#     adj_AA_in_normalised = csr_row_norm(adj_AA_in)

#     # # put together into one dataframe:
#     # all_col_names = np.concatenate([col_order + '_out', col_order + '_in'])
#     # all_norm_vec_df = pd.DataFrame(sp.sparse.hstack([adj_AA_out_normalised, adj_AA_in_normalised]).toarray(), \
#     #                             columns=all_col_names, index=row_order)

#     # by default, keep all:
#     out_col_keep, in_col_keep = [[True for _ in col_order] for i in range(2)]
#     row_keep = [True for _ in row_order]

#     if redundant_remove:
#         # remove: redundant columns and rows that are empty
#         out_col_sums = np.array(adj_AA_out_normalised.sum(axis=0)).flatten()  
#         out_col_keep = (out_col_sums != 0)
#         in_col_sums = np.array(adj_AA_in_normalised.sum(axis=0)).flatten()  
#         in_col_keep = (in_col_sums != 0)

#         out_row_sums = np.array(adj_AA_out_normalised.sum(axis=1)).flatten()  
#         in_row_sums = np.array(adj_AA_in_normalised.sum(axis=1)).flatten()  
#         out_row_keep = (out_row_sums != 0)
#         in_row_keep = (in_row_sums != 0)
#         row_keep = out_row_keep + in_row_keep # only false if both in/out are false...

#     # columns and rows to keep:
#     adj_AA_out_normalised = adj_AA_out_normalised[:, out_col_keep][row_keep,:]
#     adj_AA_in_normalised = adj_AA_in_normalised[:, in_col_keep][row_keep,:]

#     # column and row names to keep:
#     all_col_names = np.concatenate([col_order[out_col_keep] + '_out', 
#                                     col_order[in_col_keep] + '_in'])
#     all_row_names = np.array(row_order)[row_keep]

#     # construct a dataframe:
#     all_norm_vec_df = pd.DataFrame(sp.sparse.hstack([adj_AA_out_normalised, adj_AA_in_normalised]).toarray(), \
#                                    columns=all_col_names, index=all_row_names)
#     return all_norm_vec_df

In [24]:
# food web edge table plus three metadata tables (labelled 'f', 'ff' and 'fff' further down)
(
    fw_df,
    meta_df_filtered,
    meta_fine_df_filtered,
    meta_fine_df_2_filtered,
) = prep_foodweb_data()

In [25]:
# distinct nodes appearing in the prey column and in the predator column
preys = set(fw_df['prey'])
predators = set(fw_df['predator'])

# every node that is a prey and/or a predator, in sorted order
all_animals = sorted(preys.union(predators))

In [26]:
# Metadata tables paired with the label used in the output file name:
# f for fine
# ff for finer
# fff for finest

# Materialised as a list on purpose: a bare zip() is a one-shot iterator, so after the
# loop cell has consumed it once, re-running that cell would silently iterate over nothing.
prep = list(zip(
    (meta_df_filtered, meta_fine_df_filtered, meta_fine_df_2_filtered),
    ('f', 'ff', 'fff'),
))

In [27]:
# The prey -> predator adjacency only depends on fw_df and all_animals, not on the
# metadata table, so it is built once here instead of once per loop iteration.
a_AA_coo, _ = adjacency_cooarray(
    df=fw_df,
    row_col=['prey', 'predator'],
    id_order=all_animals,
    weight=False,
    directed=True)

# One set of vectors per (metadata table, label) pair in `prep`.
for meta, label in prep:
    # animal x type bipartite array (unweighted), rows ordered like the adjacency
    bpt_AS_coo, animal_row, type_col = bipartite_cooarray(
        df=meta.sort_values(['node', 'type']),
        row_col=['node', 'type'],
        weight=False,
        row_order=all_animals)

    a_AS_out = (a_AA_coo @ bpt_AS_coo).tocsr() # out matrix
    a_AS_in = (a_AA_coo.T @ bpt_AS_coo).tocsr() # in matrix

    # out matrix normalisation:
    a_AS_out_normalised = csr_row_norm(a_AS_out)

    # in matrix normalisation:
    a_AS_in_normalised = csr_row_norm(a_AS_in)

    # put together into one dataframe: out columns first, then in columns
    all_col_names = np.concatenate([type_col + '_out', type_col + '_in'])
    all_norm_vec_df = pd.DataFrame(
        sp.sparse.hstack([a_AS_out_normalised, a_AS_in_normalised]).toarray(),
        columns=all_col_names,
        index=animal_row)

    # one parquet file per label, written only when saving is enabled
    if save_things:
        all_norm_vec_df.to_parquet(fw_processed_dir / f'{label}_vectors.parquet')

In [28]:
# old_data = pd.read_parquet('../data/foodweb/processed/fff_vectors.parquet')