### Imports

In [1]:
import json

from pyod.models.ecod import ECOD
from tqdm import tqdm
import networkx as nx
import numpy as np
import pandas as pd
import pingouin as pg
import torch
import torch_geometric.transforms as T

from utils.utils_go import *
from dgi.utils_dgi import *
from vgae.utils_vgae import *

# os.environ["DGLBACKEND"] = "pytorch"
# %load_ext autotime

### Parameters

In [2]:
# Load the experiment selector, then the parameter file of that experiment.
# Both files are read inside `with` blocks so the handles are closed as soon
# as the JSON has been parsed (the handles used to stay open for the whole
# kernel session).
with open("exp.json") as file:
    experiment = json.load(file)
exp_num = experiment["exp"]

with open("output/{}/parameters.json".format(exp_num)) as file:
    params = json.load(file)

# Unpack every parameter into a notebook-level name and echo it, so the
# configuration of the run is recorded in the cell output.
exp = params["exp"]
print("Exp:\t\t", exp)

methods = params["methods"]
print("Methods:\t", methods)

data_variations = params["data_variations"]
print("Data variations:", data_variations)

dimension = params["dimension"]
print("Dimension:\t", dimension)

threshold_corr = params["threshold_corr"]
print("Threshold corr:\t", threshold_corr)

iterations = params["iterations"]
print("Iterations:\t", iterations)

groups_id = params["groups_id"]
print("Groups id:\t", groups_id)

# Trailing underscore: the loops further down work on per-run copies of this
# mapping (group id -> list of subgroup ids) and leave this one untouched.
subgroups_id_ = params["subgroups_id"]
print("Subgroups id:\t", subgroups_id_)

seeds = params["seeds"]
print("Seeds:\t\t", seeds)

Exp:		 exp3
Methods:	 ['vgae']
Data variations: ['none']
Dimension:	 3
Threshold corr:	 0.01
Iterations:	 2
Groups id:	 ['pck1', 'zwf1', 'WT']
Subgroups id:	 {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}
Seeds:		 [42, 43, 44, 45, 46]


---

In [None]:
# Toy 4-node weight matrix used to try out the edge-list extraction.
# Rows 1 and 3 are one value short; pandas pads them with NaN.
df_temp = pd.DataFrame(
    [
        [1.0, -0.6, 0.3, 0.5],
        [-0.6, 1.0, 0.8],
        [-0.6, 0.8, 1.0, 0.6],
        [0.3, 0.8, 1.0],
    ]
)
print(df_temp)

threshold = 0.6

# Keep the upper triangle plus the two diagonals directly below the main one.
keep_mask = np.triu(np.ones(df_temp.shape), k=-2).astype(bool)

# Long format: one row per surviving (row label, column label, value) cell.
df_weighted_edges = df_temp.where(keep_mask).stack().dropna().to_frame().reset_index()
df_weighted_edges.columns = ["source", "target", "weight"]

# Drop self-loops and weak links with a single boolean selection.
not_self_loop = df_weighted_edges["source"] != df_weighted_edges["target"]
strong_enough = df_weighted_edges["weight"].abs() >= threshold  # change
df_weighted_edges = df_weighted_edges[not_self_loop & strong_enough]
df_weighted_edges

In [9]:
import pandas as pd
import numpy as np
from random import choice
import matplotlib.pyplot as plt

# Build a random n x n correlation-like matrix to exercise the edge-list
# extraction in the next cell.
n = 10  # number of rows/columns of the matrix
a = 2   # scale of the random offset shared by all entries of one row

# A local, seeded generator makes the displayed matrix reproducible on every
# run and leaves NumPy's global random state untouched.
rng = np.random.default_rng(42)

# Each row is standard-normal noise plus one row-wide random offset.
samples = np.array([rng.standard_normal(n) + rng.standard_normal(1) * a for _ in range(n)])

# Plain ndarrays with the `@` operator replace np.matrix, which NumPy no
# longer recommends; the arithmetic is the same.
A = samples @ samples.T                 # Gram matrix of the rows
D_half = np.diag(np.diag(A) ** -0.5)    # diag(1 / sqrt(A_ii))
C = D_half @ A @ D_half                 # rescaled so the diagonal becomes 1
df_temp = pd.DataFrame(C)
df_temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.589276,0.933129,0.156881,0.838955,0.884754,-0.888291,-0.803634,0.729431,-0.782729
1,0.589276,1.0,0.512332,-0.295184,0.546986,0.465279,-0.483242,-0.440064,0.440113,-0.578044
2,0.933129,0.512332,1.0,-0.087187,0.893089,0.96914,-0.928936,-0.949386,0.822337,-0.911183
3,0.156881,-0.295184,-0.087187,1.0,-0.131256,-0.087343,0.0773,0.282348,-0.219869,0.344405
4,0.838955,0.546986,0.893089,-0.131256,1.0,0.862729,-0.937154,-0.861525,0.782075,-0.866768
5,0.884754,0.465279,0.96914,-0.087343,0.862729,1.0,-0.925264,-0.912972,0.824649,-0.912637
6,-0.888291,-0.483242,-0.928936,0.0773,-0.937154,-0.925264,1.0,0.908414,-0.89814,0.910514
7,-0.803634,-0.440064,-0.949386,0.282348,-0.861525,-0.912972,0.908414,1.0,-0.862807,0.942283
8,0.729431,0.440113,0.822337,-0.219869,0.782075,0.824649,-0.89814,-0.862807,1.0,-0.868981
9,-0.782729,-0.578044,-0.911183,0.344405,-0.866768,-0.912637,0.910514,0.942283,-0.868981,1.0


In [11]:
threshold = 0.6

# With k = -(number of columns) no cell of a square df_temp falls below the
# chosen diagonal, so nothing is masked out and both (i, j) and (j, i) stay.
full_mask = np.triu(np.ones(df_temp.shape), k=-df_temp.shape[1]).astype(bool)

# Long format: one row per (row label, column label, value) cell.
df_weighted_edges = df_temp.where(full_mask).stack().dropna().to_frame().reset_index()
df_weighted_edges.columns = ["source", "target", "weight"]

# Drop the diagonal (self-loops) and every link weaker than the threshold.
off_diagonal = df_weighted_edges["source"] != df_weighted_edges["target"]
above_threshold = df_weighted_edges["weight"].abs() >= threshold  # change
df_weighted_edges = df_weighted_edges[off_diagonal & above_threshold]
df_weighted_edges
# df_weighted_edges[df_weighted_edges["source"] == 0]

Unnamed: 0,source,target,weight
2,0,2,0.933129
4,0,4,0.838955
5,0,5,0.884754
6,0,6,-0.888291
7,0,7,-0.803634
8,0,8,0.729431
9,0,9,-0.782729
20,2,0,0.933129
24,2,4,0.893089
25,2,5,0.96914


In [49]:
# Inspect the preprocessed tables of the first subgroup of the first group.
first_group = groups_id[0]
first_subgroup = subgroups_id_[first_group][0]

# Node table without its first two columns.
nodes_data = pd.read_csv(
    "output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, first_group, first_subgroup)
).iloc[:, 2:]
edges_data = pd.read_csv(
    "output/{}/preprocessing/graphs_data/edges_data_{}_{}.csv".format(exp, first_group, first_subgroup)
)

# Show the edges that point *to* node 0.
edges_data[edges_data["target"] == 0]

Unnamed: 0,source,target,weight


In [7]:

# Wrap the node/edge tables in the DGI dataset and take its first graph.
dataset = CustomDatasetDGI("g1", nodes_data, edges_data)
graph = dataset[0]

print(graph)

# Convert to networkx and report whether the graph is directed.
# The messages are Spanish for "The graph is directed." / "The graph is not directed."
# NOTE(review): `dgl` is not imported explicitly in this notebook; presumably it
# arrives through one of the star imports. Confirm.
G = dgl.to_networkx(graph)
print("El grafo es dirigido." if G.is_directed() else "El grafo no es dirigido.")

Graph(num_nodes=120, num_edges=6937,
      ndata_schemes={'feat': Scheme(shape=(24,), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
El grafo es dirigido.


---

### Node embeddings

In [3]:
# Raw input table of the experiment: the first CSV column becomes the index
# and the first two remaining columns are dropped.
df_join_raw = pd.read_csv("input/{}_raw.csv".format(exp), index_col=0).iloc[:, 2:]

# log10 transform through the project helper.
df_join_raw_log = log10_global(df_join_raw)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# node-embeddings + edge-embeddings
#
# For every (method, data variation, iteration) this cell:
#   1. trains a node-embedding model on each (group, subgroup) graph,
#   2. builds edge embeddings and concatenates them per group,
#   3. labels outlier edges with ECOD and keeps the inliers,
#   4. maps node indices back to node ids,
#   5. writes the edges whose occurrence count equals the number of subgroups.
for method in methods: # change
    for data_variation in data_variations: # change
        for iteration in range(iterations):
            # ---
            # Node embeddings
            # ---
            subgroups_id = subgroups_id_.copy()

            # One seed per iteration, applied to both torch and numpy.
            torch.manual_seed(seeds[iteration])
            np.random.seed(seeds[iteration])

            # A data variation replaces the subgroup list of every group.
            if data_variation != "none":
                for group in groups_id:
                    subgroups_id[group] = [data_variation]
            print("Subgroups id:\t", subgroups_id)

            for group in tqdm(groups_id):
                for subgroup in tqdm(subgroups_id[group]):
                    nodes_data = pd.read_csv("output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, group, subgroup)).iloc[:, 2:]
                    edges_data = pd.read_csv("output/{}/preprocessing/graphs_data/edges_data_{}_{}.csv".format(exp, group, subgroup))

                    if method == "dgi":
                        data = CustomDatasetDGI("g_{}_{}".format(group, subgroup), nodes_data, edges_data)
                        graph = data[0]

                        # train
                        args_ = args_dgi(dimension)
                        train_dgi(exp, graph, args_, method, group, subgroup, iteration)
                    else:
                        transform = T.Compose([
                            T.NormalizeFeatures(),
                            T.ToDevice(device),
                            T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True, split_labels=True, add_negative_train_samples=False),
                        ])
                        dataset = CustomDataset(nodes_data, edges_data, transform=transform)
                        train_data, val_data, test_data = dataset[0]

                        # train
                        model = VGAE(Encoder(dataset.num_features, dimension)).to(device)
                        # Pass the current iteration instead of a hard-coded 0, matching
                        # the train_dgi call above and the per-iteration files read below.
                        # NOTE(review): assumes the last argument of train_vgae_tg is the
                        # iteration index, as in train_dgi - confirm against its definition.
                        train_vgae_tg(exp, model, train_data, test_data, method, group, subgroup, iteration)

            # ---
            # Edge embeddings
            # ---
            subgroups_id = subgroups_id_.copy()
            print(method, data_variation)

            if data_variation != "none":
                subgroups_id_op = {group: [data_variation] for group in groups_id}
            else:
                subgroups_id_op = subgroups_id
            print("Subgroups id op:", subgroups_id_op)

            edge_embeddings_global(exp, method, groups_id, subgroups_id_op, iteration)

            # Concatenate the per-subgroup edge embeddings of each group, tagging
            # every row with the position of its subgroup in the list.
            for group in tqdm(groups_id):
                frames = []
                for k, subgroup in enumerate(tqdm(subgroups_id_op[group])):
                    df_edge_embeddings = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_{}_{}_{}_{}.csv".format(exp, method, group, subgroup, iteration))
                    df_edge_embeddings["subgroup"] = k
                    frames.append(df_edge_embeddings)

                # Single concat instead of growing a frame inside the loop.
                df_edge_embeddings_concat = pd.concat(frames) if frames else pd.DataFrame()
                df_edge_embeddings_concat.to_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(exp, method, group, data_variation, iteration), index=False)

            # outlier detection (ECOD)
            dict_df_edge_embeddings_concat_filter = {}

            for group in tqdm(groups_id):
                df_edge_embeddings_concat = pd.read_csv("output/{}/edge_embeddings/edge-embeddings_concat_{}_{}_{}_{}.csv".format(exp, method, group, data_variation, iteration))

                # Fit on everything except the first two columns and the last one
                # (the last one is the "subgroup" tag added above; the first two
                # are presumably source/target - confirm against the CSV).
                X_train = df_edge_embeddings_concat.iloc[:, 2:-1]

                clf = ECOD()
                clf.fit(X_train)

                # binary labels (0: inliers, 1: outliers)
                df_edge_embeddings_concat_filter = df_edge_embeddings_concat.copy()
                df_edge_embeddings_concat_filter["labels"] = clf.labels_

                # save
                df_edge_embeddings_concat_filter.to_csv("output/{}/edge_embeddings/edge-embeddings_concat_outlier_{}_{}_{}_{}.csv".format(exp, method, group, data_variation, iteration), index=False)

                # Keep the inliers and drop the helper "labels" column again.
                df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["labels"] == 0].copy()
                df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, :-1]

                dict_df_edge_embeddings_concat_filter[group] = df_edge_embeddings_concat_filter

            # mapping idx with id
            for group in tqdm(groups_id):
                df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]
                mapped_frames = []
                for k, subgroup in enumerate(subgroups_id_op[group]):
                    df_nodes = pd.read_csv("output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, group, subgroup))
                    dict_id = dict(zip(df_nodes["idx"], df_nodes["id"]))

                    # Rows of this subgroup, with source/target translated idx -> id.
                    df_edge_embeddings_concat_filter_aux = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["subgroup"] == k].copy()
                    df_edge_embeddings_concat_filter_aux["source"] = df_edge_embeddings_concat_filter_aux["source"].map(dict_id)
                    df_edge_embeddings_concat_filter_aux["target"] = df_edge_embeddings_concat_filter_aux["target"].map(dict_id)
                    mapped_frames.append(df_edge_embeddings_concat_filter_aux)

                df_aux = pd.concat(mapped_frames) if mapped_frames else pd.DataFrame()
                dict_df_edge_embeddings_concat_filter[group] = df_aux

            # format id + filter by different edges (data variations only)
            if data_variation != "none":
                for group in tqdm(groups_id):
                    df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]

                    # Strip the first character of every id and cast the rest to int.
                    df_edge_embeddings_concat_filter["source"] = df_edge_embeddings_concat_filter["source"].map(lambda x: int(x[1:]))
                    df_edge_embeddings_concat_filter["target"] = df_edge_embeddings_concat_filter["target"].map(lambda x: int(x[1:]))

                    # Keep only edges whose endpoints differ after the stripping.
                    df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["source"] != df_edge_embeddings_concat_filter["target"]].copy()
                    dict_df_edge_embeddings_concat_filter[group] = df_edge_embeddings_concat_filter

            # count edges and filter by count
            dict_df_edges_filter = {}
            for group in tqdm(groups_id):
                # read
                df_edge_embeddings_concat_filter = dict_df_edge_embeddings_concat_filter[group]

                # sort edges (project helper, called for its effect on the frame)
                sort_df_edges(df_edge_embeddings_concat_filter)

                df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[["source", "target"]].value_counts().to_frame().reset_index()
                df_edge_embeddings_concat_filter.columns = ["source", "target", "count"]

                # filter: keep edges counted once per subgroup of the group.
                # NOTE(review): this compares against the full subgroup list
                # (subgroups_id), not subgroups_id_op - confirm that this is
                # intended when a data variation is active.
                df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter[df_edge_embeddings_concat_filter["count"] == len(subgroups_id[group])]
                df_edge_embeddings_concat_filter = df_edge_embeddings_concat_filter.iloc[:, [0, 1]].sort_values(["source", "target"], ascending=True)
                dict_df_edges_filter[group] = df_edge_embeddings_concat_filter

                df_edge_embeddings_concat_filter.to_csv("output/{}/common_edges/common_edges_{}_{}_{}_{}.csv".format(exp, method, group, data_variation, iteration), index=False)

Subgroups id:	 {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}


  0%|          | 0/3 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 100.53it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:03<00:00, 92.25it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 103.22it/s]
100%|██████████| 3/3 [00:09<00:00,  3.15s/it]
 33%|███▎      | 1/3 [00:09<00:18,  9.45s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 106.93it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 100.86it/s]
100%|██████████| 2/2 [00:05<00:00,  2.93s/it]
 67%|██████▋   | 2/3 [00:15<00:07, 

vgae none
Subgroups id op: {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}


100%|██████████| 3/3 [00:01<00:00,  2.18it/s]
100%|██████████| 2/2 [00:00<00:00,  2.38it/s]
100%|██████████| 5/5 [00:02<00:00,  1.84it/s]
100%|██████████| 3/3 [00:04<00:00,  1.65s/it]
100%|██████████| 3/3 [00:00<00:00, 89.00it/s]
100%|██████████| 2/2 [00:00<00:00, 178.77it/s]
100%|██████████| 5/5 [00:00<00:00, 83.95it/s]
100%|██████████| 3/3 [00:00<00:00,  7.68it/s]
100%|██████████| 3/3 [00:00<00:00,  5.26it/s]
100%|██████████| 3/3 [00:00<00:00, 74.40it/s]
100%|██████████| 3/3 [00:00<00:00, 34.98it/s]


Subgroups id:	 {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}


  0%|          | 0/3 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 108.35it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:03<00:00, 95.83it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 105.52it/s]
100%|██████████| 3/3 [00:08<00:00,  2.95s/it]
 33%|███▎      | 1/3 [00:08<00:17,  8.85s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 106.13it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 105.41it/s]
100%|██████████| 2/2 [00:05<00:00,  2.87s/it]
 67%|██████▋   | 2/3 [00:14<00:07,  7.02s/it]
[A


vgae none
Subgroups id op: {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}


100%|██████████| 3/3 [00:01<00:00,  2.13it/s]
100%|██████████| 2/2 [00:00<00:00,  2.11it/s]
100%|██████████| 5/5 [00:02<00:00,  2.19it/s]
100%|██████████| 3/3 [00:04<00:00,  1.55s/it]
100%|██████████| 3/3 [00:00<00:00, 120.62it/s]
100%|██████████| 2/2 [00:00<00:00, 178.19it/s]
100%|██████████| 5/5 [00:00<00:00, 93.23it/s]
100%|██████████| 3/3 [00:00<00:00,  7.71it/s]
100%|██████████| 3/3 [00:00<00:00,  7.82it/s]
100%|██████████| 3/3 [00:00<00:00, 59.83it/s]
100%|██████████| 3/3 [00:00<00:00, 34.63it/s]


In [4]:
# join
#
# For every (method, group, data variation): merge the common-edge graphs of
# all iterations, weight the merged edges with partial correlations computed
# from the log-transformed raw data, keep the edges whose |weight| reaches
# threshold_corr, save them, and record the size of the resulting graph.
list_details = []

for method in methods:
    for group in groups_id:
        dict_df_edges_filter = {}
        dict_df_corr = {}
        dict_df_edges_filter_weight = {}

        for data_variation in data_variations:
            # One graph per iteration, built from that iteration's common edges.
            list_common_subgraph = []
            for iteration in range(iterations):
                common_edges_path = "output/{}/common_edges/common_edges_{}_{}_{}_{}.csv".format(exp, method, group, data_variation, iteration)
                df_edges_filter_weight_filter = pd.read_csv(common_edges_path)
                G = nx.from_pandas_edgelist(df_edges_filter_weight_filter)
                list_common_subgraph.append(G)

            print("Union")
            # union of the per-iteration graphs
            U = nx.compose_all(list_common_subgraph)

            df_compose_subgraph = nx.to_pandas_edgelist(U)
            dict_df_edges_filter[group] = df_compose_subgraph.iloc[:, [0, 1]]

            # correlation: rows are the nodes of the union graph, columns are
            # the ones whose name contains the group id
            nodes = list(U.nodes())
            df_join_raw_filter = df_join_raw_log.loc[nodes, :].filter(like=group, axis=1)

            # Transposed so that each node becomes a column for pg.pcorr.
            df_join_raw_filter_t = df_join_raw_filter.T
            df_join_raw_filter_corr = pg.pcorr(df_join_raw_filter_t)
            dict_df_corr[group] = df_join_raw_filter_corr

            # get new correlation: look up the value of every (source, target) pair
            df_corr = df_join_raw_filter_corr
            df_edges_filter_weight = dict_df_edges_filter[group].copy()
            df_edges_filter_weight["weight"] = df_edges_filter_weight.apply(
                lambda edge: df_corr.loc[edge["source"], edge["target"]], axis=1
            )
            df_edges_filter_weight = df_edges_filter_weight.sort_values(["source", "target"], ascending=True)
            dict_df_edges_filter_weight[group] = df_edges_filter_weight

            print(method, group, data_variation)

            # filter by abs(weight) >= threshold
            is_strong = df_edges_filter_weight["weight"].abs() >= threshold_corr
            df_edges_filter_weight_filter = df_edges_filter_weight[is_strong]
            df_edges_filter_weight_filter.to_csv("output/{}/common_edges/common_edges_{}_{}_{}.csv".format(exp, method, group, data_variation), index=False)

            # Summary row for the filtered graph.
            G = nx.from_pandas_edgelist(df_edges_filter_weight_filter, "source", "target", edge_attr="weight")
            list_details.append([method, group, data_variation, G.number_of_nodes(), G.number_of_edges(), nx.density(G)])

summary_columns = ["Method", "Group", "Data var.", "Num. nodes", "Num. edges", "Density"]
df_details = pd.DataFrame(list_details, columns=summary_columns)
df_details.to_csv("output/{}/common_edges/summary.csv".format(exp), index=False)

Union
vgae pck1 none
Union
vgae zwf1 none
Union
vgae WT none
