### Imports

In [1]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

  @numba.jit()
  @numba.jit()
  @numba.jit()


time: 170 µs (started: 2023-05-23 09:24:21 -05:00)


  @numba.jit()


### Parameters

In [None]:
import json

# Load the run configuration. The context manager closes the file handle as
# soon as the JSON has been parsed (the previous bare open() never closed it).
with open("parameters.json") as file:
    params = json.load(file)

# Parent of the current working directory; every path in this notebook is
# built from it. NOTE: `dir` shadows the builtin of the same name, but the
# name is kept because the other cells reference it.
dir = os.path.dirname(os.getcwd())
print(dir)

# Entry of the configured "method" list selected by "method_idx".
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

# Entry of the configured "group" list selected by "group_idx".
# group[0] is used as a file-name prefix, group[1] as the number of subgroups.
group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# Subgroup labels are the strings "1" .. str(group[1]).
subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

# Values iterated over by the per-dimension loops.
dimensions = params["dimensions"]
print("Dimensions:\t", dimensions)

### Edge embeddings

In [None]:
# Build and store the edge embeddings for every (dimension, subgroup) pair.

for dimension in tqdm(dimensions):
    for i in tqdm(subgroups):
        # Inputs: node embeddings of this method/dimension and the edge list
        # of the subgroup.
        df_node_embeddings = pd.read_csv(
            "{}/output_{}/node_embeddings/{}_node-embeddings_{}_{}.csv".format(dir, method, group[0], dimension, i),
            index_col=0,
        )
        df_edges = pd.read_csv(
            "{}/output_preprocessing/graph_data/{}_edges_data_{}.csv".format(dir, group[0], i)
        )

        # edge2vecx returns two parallel lists: the edge-embedding frames and
        # the legend used as the last component of each output file name.
        list_df_edge_embeddings, list_edge_embeddings_legend = edge2vecx(
            [df_node_embeddings], [df_edges], [""]
        )
        for j, df_single_edge_embeddings in enumerate(list_df_edge_embeddings):
            df_single_edge_embeddings.to_csv(
                "{}/output_{}/edge_embeddings/{}_edge-embeddings_{}_{}_{}.csv".format(dir, method, group[0], dimension, i, list_edge_embeddings_legend[j]),
                index=True,
            )

In [97]:
# Inspect one stored result: dimension 3, subgroup 1, legend "L2".
df_edge_embeddings = pd.read_csv(
    "{}/output_{}/edge_embeddings/{}_edge-embeddings_{}_{}_{}.csv".format(dir, method, group[0], 3, 1, "L2"),
    index_col=0,
)
df_edge_embeddings

Unnamed: 0,0,1,2
"(0, 1)",1.922994,0.271591,0.603832
"(0, 2)",1.238694,0.015087,1.994198
"(0, 3)",0.738467,0.980709,4.727155
"(0, 4)",3.424663,0.013792,0.077268
"(0, 5)",3.254115,1.850905,0.312955
...,...,...,...
"(6202, 6212)",0.006001,0.014718,0.291796
"(6204, 6205)",0.091653,0.188629,0.090315
"(6206, 6208)",0.595720,0.175068,0.750320
"(6208, 6209)",0.374861,0.130178,0.202455


time: 1.62 s (started: 2023-05-21 14:16:54 -05:00)


### Join embeddings

In [None]:
# For each dimension, stack the "L2" edge embeddings of every subgroup into a
# single frame tagged with the subgroup label, and store the result.
for dimension in tqdm(dimensions):
    # Collect the per-subgroup frames and concatenate once at the end;
    # concatenating inside the loop re-copies all accumulated rows each time.
    list_df_subgroup_embeddings = []
    for i in tqdm(subgroups):
        # Read dataset
        df_edge_embeddings = pd.read_csv("{}/output_{}/edge_embeddings/{}_edge-embeddings_{}_{}_{}.csv".format(dir, method, group[0], dimension, i, "L2"), index_col=0)
        # Scalar assignment broadcasts the label to every row.
        df_edge_embeddings["subgroup"] = i
        list_df_subgroup_embeddings.append(df_edge_embeddings)

    # pd.concat rejects an empty list, so fall back to an empty frame when
    # there are no subgroups (this is what the accumulating version produced).
    if list_df_subgroup_embeddings:
        df_edge_embeddings_join = pd.concat(list_df_subgroup_embeddings)
    else:
        df_edge_embeddings_join = pd.DataFrame()
    df_edge_embeddings_join.to_csv("{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], dimension, "L2"), index=True)

100%|█████████████████████████████████████████████| 2/2 [00:05<00:00,  2.76s/it]
100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.28s/it]

time: 25.3 s (started: 2023-05-21 14:16:56 -05:00)





In [None]:
# Reload the joined embeddings stored for dimension 3 and display them.
df_edge_embeddings_join = pd.read_csv(
    "{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], 3, "L2"),
    index_col=0,
)
df_edge_embeddings_join

Unnamed: 0,0,1,2,subgroup
"(0, 1)",1.922994,0.271591,0.603832,1
"(0, 2)",1.238694,0.015087,1.994198,1
"(0, 3)",0.738467,0.980709,4.727155,1
"(0, 4)",3.424663,0.013792,0.077268,1
"(0, 5)",3.254115,1.850905,0.312955,1
...,...,...,...,...
"(6188, 6223)",0.007155,0.005748,0.146487,2
"(6195, 6196)",0.042215,0.274724,0.023031,2
"(6195, 6197)",0.046594,0.680936,0.023798,2
"(6196, 6197)",0.000108,0.090630,0.000006,2


time: 5.33 s (started: 2023-05-21 14:17:21 -05:00)


In [None]:
# 3D scatter of the joined embeddings, one PNG per dimension.

for dimension in tqdm(dimensions):
    df_edge_embeddings_join = pd.read_csv(
        "{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], dimension, "L2"),
        index_col=0,
    )

    # The first three columns are the plotted coordinates.
    x, y, z = (df_edge_embeddings_join.iloc[:, axis] for axis in range(3))

    # New figure with a single 3D axes.
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(projection="3d")

    # Point colour is taken from the last column of the frame.
    ax.scatter3D(x, y, z, c=df_edge_embeddings_join.iloc[:, -1], alpha=0.1)

    # Write the figure to disk and release it; nothing is shown inline.
    fig.savefig("{}/output_{}/plots/{}_plot_subgroups_{}_{}.png".format(dir, method, group[0], dimension, "L2"))
    plt.close(fig)

100%|████████████████████████████████████████████| 1/1 [02:45<00:00, 165.77s/it]

time: 2min 45s (started: 2023-05-21 14:17:27 -05:00)





### Clustering

In [101]:
# silhouette_score(X_train.iloc[:, :-1], X_train.iloc[:, -1])

time: 441 µs (started: 2023-05-21 14:20:13 -05:00)


In [None]:
# Fit HDBSCAN on the joined embeddings (outlier detection).
# NOTE: `clusterer`, `X_train`, `df_edge_embeddings_join` and `dimension` keep
# the values of the LAST iteration; the cells below read them.

for dimension in tqdm(dimensions):
    df_edge_embeddings_join = pd.read_csv(
        "{}/output_{}/edge_embeddings_join/{}_edge-embeddings_join_{}_{}.csv".format(dir, method, group[0], dimension, "L2"),
        index_col=0,
    )

    # Every column except the last one is used as a feature.
    X_train = df_edge_embeddings_join.iloc[:, :-1]
    clusterer = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=-1)
    clusterer.fit(X_train)

100%|████████████████████████████████████████████| 1/1 [10:12<00:00, 612.37s/it]

time: 10min 12s (started: 2023-05-21 14:20:13 -05:00)





In [None]:
# Rows whose outlier score lies above the 0.9 quantile are flagged as outliers.
threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
outliers = np.nonzero(clusterer.outlier_scores_ > threshold)[0]

print(len(outliers))
outliers

604687


array([      0,       1,       2, ..., 6046824, 6046825, 6046848])

time: 12 s (started: 2023-05-21 14:30:26 -05:00)


In [104]:
# Row positions of the joined frame that were NOT flagged as outliers.
outliers_no = np.setdiff1d(
    np.arange(len(df_edge_embeddings_join)),
    outliers,
)
print(len(outliers_no))
outliers_no

5442178


array([      3,       5,      11, ..., 6046862, 6046863, 6046864])

time: 169 ms (started: 2023-05-21 14:30:38 -05:00)


In [105]:
print("Total:", len(df_edge_embeddings_join))
fig = plt.figure(figsize = (10, 7))
ax = plt.axes(projection ="3d")

# Draw the flagged rows first (gray) and the remaining rows on top (red);
# each pass also prints how many rows it plotted.
for positions, color, label in (
    (outliers, "gray", "Num. of outliers:"),
    (outliers_no, "red", "Num. of no-outliers:"),
):
    temp = df_edge_embeddings_join.iloc[positions, :]
    x, y, z = temp.iloc[:, 0], temp.iloc[:, 1], temp.iloc[:, 2]
    ax.scatter3D(x, y, z, c=color, alpha=0.005)
    print(label, len(temp))

# The figure is only written to disk; `dimension` is the value left over from
# the clustering loop.
plt.savefig("{}/output_{}/plots/{}_plot_cluster_{}_{}.png".format(dir, method, group[0], dimension, "L2"))
plt.close()

Total: 6046865
Num. of outliers: 604687
Num. of no-outliers: 5442178
time: 2min 40s (started: 2023-05-21 14:30:38 -05:00)


###  Filter common edges

In [None]:
import ast

# Convert the textual edge index of the filtered-edges file (labels such as
# "(0, 4)") into a two-level index and write the file back in place.
edges_filter_path = "{}/output_{}/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2")
df_edge_embeddings_join_filter = pd.read_csv(edges_filter_path, index_col=0)

if all(isinstance(label, str) for label in df_edge_embeddings_join_filter.index):
    # ast.literal_eval only accepts Python literals, so a malformed or
    # malicious label raises instead of being executed as code (which is what
    # eval would do with file contents).
    edge_pairs = [ast.literal_eval(label) for label in df_edge_embeddings_join_filter.index]
    df_edge_embeddings_join_filter.set_index([pd.Index(edge_pairs)], inplace=True)
    df_edge_embeddings_join_filter.to_csv(edges_filter_path, index=True)
else:
    # The file was already rewritten by an earlier run of this cell, so its
    # first column no longer holds tuple text. Load it with the two-level
    # index instead of failing, which keeps the cell re-runnable.
    df_edge_embeddings_join_filter = pd.read_csv(edges_filter_path, index_col=[0, 1])
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,4,3.424663,0.013792,0.077268,1
0,6,2.585806,0.045589,0.290562,1
0,12,1.810337,0.394792,0.006603,1
0,14,0.674824,0.026581,1.662757,1
0,22,0.809482,0.053984,1.181507,1
...,...,...,...,...,...
6188,6223,0.007155,0.005748,0.146487,2
6195,6196,0.042215,0.274724,0.023031,2
6195,6197,0.046594,0.680936,0.023798,2
6196,6197,0.000108,0.090630,0.000006,2


time: 55.4 s (started: 2023-05-21 14:33:40 -05:00)


In [None]:
# Filtered edges of the current run, indexed by the first two columns.
df_edge_embeddings_join_filter = pd.read_csv(
    "{}/output_{}/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2"),
    index_col=[0, 1],
)
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,2,0.001284,2.294346e-06,0.002085,1
0,3,0.002055,3.728568e-06,0.003333,1
0,5,0.000054,1.357996e-07,0.000086,1
0,6,0.002394,4.164416e-06,0.003894,1
0,7,0.000155,7.287300e-08,0.000266,1
...,...,...,...,...,...
6149,6152,1.093219,1.186676e-02,0.002971,2
6169,6196,0.000892,6.112694e-03,0.056493,2
6172,6173,0.005480,1.983146e-02,0.155003,2
6195,6196,0.104158,9.271714e-03,0.134301,2


time: 2.54 s (started: 2023-05-22 09:46:49 -05:00)


---

In [12]:
# Same file, read from the "output_<method>_1" directory.
df_edge_embeddings_join_filter = pd.read_csv(
    "{}/output_{}_1/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2"),
    index_col=[0, 1],
)
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,2,0.001284,2.294346e-06,0.002085,1
0,3,0.002055,3.728568e-06,0.003333,1
0,5,0.000054,1.357996e-07,0.000086,1
0,6,0.002394,4.164416e-06,0.003894,1
0,7,0.000155,7.287300e-08,0.000266,1
...,...,...,...,...,...
6149,6152,1.093219,1.186676e-02,0.002971,2
6169,6196,0.000892,6.112694e-03,0.056493,2
6172,6173,0.005480,1.983146e-02,0.155003,2
6195,6196,0.104158,9.271714e-03,0.134301,2


time: 2.29 s (started: 2023-05-22 09:57:00 -05:00)


In [10]:
# Same file, read from the "output_<method>_2" directory.
df_edge_embeddings_join_filter = pd.read_csv(
    "{}/output_{}_2/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2"),
    index_col=[0, 1],
)
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,2,0.001284,2.294316e-06,0.002085,1
0,3,0.002055,3.728568e-06,0.003333,1
0,5,0.000054,1.358070e-07,0.000086,1
0,6,0.002394,4.164375e-06,0.003894,1
0,7,0.000155,7.287840e-08,0.000266,1
...,...,...,...,...,...
6149,6152,1.093246,1.186235e-02,0.002971,2
6169,6196,0.000892,6.111961e-03,0.056498,2
6172,6173,0.005478,1.982877e-02,0.155020,2
6195,6196,0.104158,9.271107e-03,0.134309,2


time: 2.73 s (started: 2023-05-22 09:47:57 -05:00)


In [11]:
# Same file, read from the "output_<method>_3" directory.
df_edge_embeddings_join_filter = pd.read_csv(
    "{}/output_{}_3/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], 3, "L2"),
    index_col=[0, 1],
)
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
0,2,0.001284,2.294316e-06,0.002085,1
0,3,0.002055,3.728568e-06,0.003333,1
0,5,0.000054,1.358070e-07,0.000086,1
0,6,0.002394,4.164456e-06,0.003894,1
0,7,0.000155,7.287840e-08,0.000266,1
...,...,...,...,...,...
6149,6152,1.093247,1.186097e-02,0.002970,2
6169,6196,0.000892,6.111613e-03,0.056496,2
6172,6173,0.005478,1.982743e-02,0.155012,2
6195,6196,0.104159,9.270705e-03,0.134306,2


time: 2.72 s (started: 2023-05-22 09:51:33 -05:00)


---
---

In [109]:
# Node table of subgroup 1, displayed for inspection.
df_nodes = pd.read_csv(
    "{}/output_preprocessing/graph_data/{}_nodes_data_{}.csv".format(dir, group[0], 1)
)
df_nodes

Unnamed: 0,idx,degree,ionMz
0,0,29,59.0049
1,1,114,99.9716
2,2,30,120.9310
3,3,106,172.0403
4,4,119,183.0464
...,...,...,...
6230,6230,2,901.7714
6231,6231,4,835.3027
6232,6232,5,835.3377
6233,6233,2,844.5313


time: 15.4 ms (started: 2023-05-21 14:34:38 -05:00)


In [None]:
# Mapping idx with ionMz
# Translate the (row position, row position) edge index of the filtered edges
# into pairs of values taken from the LAST column of each subgroup's node
# table (presumably ionMz, per the heading above -- confirm against the files).
dict_df_nodes = {}
for i in tqdm(subgroups):
    df_nodes = pd.read_csv("{}/output_preprocessing/graph_data/{}_nodes_data_{}.csv".format(dir, group[0], i))
    dict_df_nodes[i] = df_nodes

# Last column of every node table as a plain array, so each edge endpoint is
# resolved with a cheap positional lookup instead of a per-row DataFrame.iloc.
last_column_by_subgroup = {
    key: df_subgroup_nodes.iloc[:, -1].to_numpy()
    for key, df_subgroup_nodes in dict_df_nodes.items()
}

list_index = []
for dimension in tqdm(dimensions):
    # Read the file of the CURRENT dimension; the dimension was previously
    # hard-coded to 3, which made every iteration load the same file.
    df_edge_embeddings_join_filter = pd.read_csv("{}/output_{}/edges_filter/{}_edges-filter_{}_{}.csv".format(dir, method, group[0], dimension, "L2"), index_col=[0, 1])

    # Rebuild the list on every iteration so that it always has one entry per
    # row of the frame loaded above (it used to keep growing across
    # dimensions, which breaks the set_index call in the following cell).
    # The last column of the frame holds the subgroup; it is converted to str
    # because dict_df_nodes is keyed by the string subgroup labels.
    list_index = [
        (
            last_column_by_subgroup[str(subgroup)][source_position],
            last_column_by_subgroup[str(subgroup)][target_position],
        )
        for (source_position, target_position), subgroup in zip(
            df_edge_embeddings_join_filter.index,
            df_edge_embeddings_join_filter.iloc[:, -1],
        )
    ]

100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 193.31it/s]
100%|████████████████████████████████████████████| 1/1 [02:32<00:00, 152.19s/it]

time: 2min 34s (started: 2023-05-21 14:34:38 -05:00)





In [None]:
# Replace the positional edge index with the value pairs collected in list_index.
df_edge_embeddings_join_filter = df_edge_embeddings_join_filter.set_index(
    [pd.Index(list_index)]
)
df_edge_embeddings_join_filter

Unnamed: 0,Unnamed: 1,0,1,2,subgroup
59.0049,183.0464,3.424663,0.013792,0.077268,1
59.0049,210.0762,2.585806,0.045589,0.290562,1
59.0049,372.9437,1.810337,0.394792,0.006603,1
59.0049,373.0051,0.674824,0.026581,1.662757,1
59.0049,490.9350,0.809482,0.053984,1.181507,1
...,...,...,...,...,...
717.2421,717.2314,0.007155,0.005748,0.146487,2
585.1060,654.4398,0.042215,0.274724,0.023031,2
585.1060,655.3718,0.046594,0.680936,0.023798,2
654.4398,655.3718,0.000108,0.090630,0.000006,2


time: 2.55 s (started: 2023-05-21 14:37:12 -05:00)


In [None]:
# Keep only the index pairs that occur exactly len(subgroups) times.
temp = df_edge_embeddings_join_filter.index.value_counts().to_frame()
df_edge_embeddings_join_filter_count = temp[temp["count"] == len(subgroups)]

# `dimension` is the value left over from the last per-dimension loop.
df_edge_embeddings_join_filter_count.to_csv(
    "{}/output_{}/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], dimension, "L2"),
    index=True,
)
df_edge_embeddings_join_filter_count

Unnamed: 0,Unnamed: 1,count
389.2739,523.1860,2
227.2685,238.0154,2
334.0560,579.1397,2
483.0778,562.1602,2
483.0778,562.2176,2
...,...,...
173.0822,230.9876,2
59.0137,677.4234,2
424.1145,780.5817,2
807.4405,812.4817,2


time: 1.8 s (started: 2023-05-21 14:37:15 -05:00)


In [None]:
# Reload the counted edges as a flat table and give its columns explicit names.
df_edge_embeddings_join_filter_count = pd.read_csv(
    "{}/output_{}/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], 3, "L2")
)
df_edge_embeddings_join_filter_count.columns = ["source", "target", "count"]
df_edge_embeddings_join_filter_count

Unnamed: 0,source,target,count
0,389.2739,523.1860,2
1,227.2685,238.0154,2
2,334.0560,579.1397,2
3,483.0778,562.1602,2
4,483.0778,562.2176,2
...,...,...,...
485312,173.0822,230.9876,2
485313,59.0137,677.4234,2
485314,424.1145,780.5817,2
485315,807.4405,812.4817,2


time: 83.8 ms (started: 2023-05-21 14:37:17 -05:00)


In [None]:
# Number of distinct values appearing as either endpoint of an edge.
l1 = df_edge_embeddings_join_filter_count["source"].values
l2 = df_edge_embeddings_join_filter_count["target"].values

endpoints = np.concatenate((l1, l2), axis=0)
num_of_nodes = len(np.unique(endpoints))
num_of_nodes

---

In [157]:
# Counted edges stored under "output_<method>_1"; displayed sorted by endpoints.
df_edge_embeddings_join_filter_count1 = pd.read_csv(
    "{}/output_{}_1/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], 3, "L2")
)
df_edge_embeddings_join_filter_count1.columns = ["source", "target", "count"]
df_edge_embeddings_join_filter_count1.sort_values(by=["source", "target"])

Unnamed: 0,source,target,count
485012,59.0137,59.0370,2
483718,59.0137,74.0249,2
483933,59.0137,102.0562,2
483910,59.0137,109.0407,2
483818,59.0137,113.0246,2
...,...,...,...
480201,990.5673,988.5664,2
480983,994.9357,420.8285,2
480781,994.9357,538.9569,2
477566,996.7096,676.4897,2


time: 165 ms (started: 2023-05-22 10:51:13 -05:00)


In [158]:
# Counted edges stored under "output_<method>_2"; displayed sorted by endpoints.
df_edge_embeddings_join_filter_count2 = pd.read_csv(
    "{}/output_{}_2/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], 3, "L2")
)
df_edge_embeddings_join_filter_count2.columns = ["source", "target", "count"]
df_edge_embeddings_join_filter_count2.sort_values(by=["source", "target"])

Unnamed: 0,source,target,count
484508,59.0137,59.0370,2
484591,59.0137,74.0249,2
484344,59.0137,102.0562,2
484375,59.0137,109.0407,2
484443,59.0137,113.0246,2
...,...,...,...
482214,990.5673,988.5664,2
484516,994.9357,420.8285,2
484306,994.9357,538.9569,2
480311,996.7096,676.4897,2


time: 163 ms (started: 2023-05-22 10:51:13 -05:00)


In [159]:
# Counted edges stored under "output_<method>_3"; displayed sorted by endpoints.
df_edge_embeddings_join_filter_count3 = pd.read_csv(
    "{}/output_{}_3/edges_filter/{}_edges-filter-count_{}_{}.csv".format(dir, method, group[0], 3, "L2")
)
df_edge_embeddings_join_filter_count3.columns = ["source", "target", "count"]
df_edge_embeddings_join_filter_count3.sort_values(by=["source", "target"])

Unnamed: 0,source,target,count
479338,59.0137,59.0370,2
479249,59.0137,74.0249,2
479257,59.0137,102.0562,2
479284,59.0137,109.0407,2
479275,59.0137,113.0246,2
...,...,...,...
476396,990.5673,988.5664,2
477421,994.9357,420.8285,2
477353,994.9357,538.9569,2
480480,996.7096,676.4897,2


time: 156 ms (started: 2023-05-22 10:51:13 -05:00)


In [160]:
# Stack the three counted-edge tables into one frame with a fresh 0..n-1 index.
df_temp = pd.concat(
    [
        df_edge_embeddings_join_filter_count1,
        df_edge_embeddings_join_filter_count2,
        df_edge_embeddings_join_filter_count3,
    ],
    ignore_index=True,
)
df_temp

Unnamed: 0,source,target,count
0,479.3735,869.5574,2
1,483.0778,560.1392,2
2,334.0560,579.1397,2
3,227.2685,238.9223,2
4,483.0778,562.1602,2
...,...,...,...
1455946,173.0822,230.9876,2
1455947,59.0137,677.4234,2
1455948,424.1145,780.5817,2
1455949,807.4405,812.4817,2


time: 21.5 ms (started: 2023-05-22 10:51:14 -05:00)


In [161]:
# Text key "<source>-<target>" for every edge, built from the string form of
# both endpoint columns.
source_text = df_temp["source"].astype(str)
target_text = df_temp["target"].astype(str)
df_temp["idx"] = source_text + "-" + target_text
df_temp

Unnamed: 0,source,target,count,idx
0,479.3735,869.5574,2,479.3735-869.5574
1,483.0778,560.1392,2,483.0778-560.1392
2,334.0560,579.1397,2,334.056-579.1397
3,227.2685,238.9223,2,227.2685-238.9223
4,483.0778,562.1602,2,483.0778-562.1602
...,...,...,...,...
1455946,173.0822,230.9876,2,173.0822-230.9876
1455947,59.0137,677.4234,2,59.0137-677.4234
1455948,424.1145,780.5817,2,424.1145-780.5817
1455949,807.4405,812.4817,2,807.4405-812.4817


time: 1.34 s (started: 2023-05-22 10:51:14 -05:00)


In [162]:
# Use the "<source>-<target>" key as the row index of the stacked table.
list_index = df_temp["idx"].values
df_temp = df_temp.set_index([pd.Index(list_index)])
df_temp

Unnamed: 0,source,target,count,idx
479.3735-869.5574,479.3735,869.5574,2,479.3735-869.5574
483.0778-560.1392,483.0778,560.1392,2,483.0778-560.1392
334.056-579.1397,334.0560,579.1397,2,334.056-579.1397
227.2685-238.9223,227.2685,238.9223,2,227.2685-238.9223
483.0778-562.1602,483.0778,562.1602,2,483.0778-562.1602
...,...,...,...,...
173.0822-230.9876,173.0822,230.9876,2,173.0822-230.9876
59.0137-677.4234,59.0137,677.4234,2,59.0137-677.4234
424.1145-780.5817,424.1145,780.5817,2,424.1145-780.5817
807.4405-812.4817,807.4405,812.4817,2,807.4405-812.4817


time: 9.8 ms (started: 2023-05-22 10:51:15 -05:00)


In [163]:
import json

# Reload the run configuration. The context manager closes the file handle as
# soon as the JSON has been parsed (the previous bare open() never closed it).
with open("parameters.json") as file:
    params = json.load(file)

# Parent of the current working directory; every path in this notebook is
# built from it. NOTE: `dir` shadows the builtin of the same name, but the
# name is kept because the other cells reference it.
dir = os.path.dirname(os.getcwd())
print(dir)

# Entry of the configured "method" list selected by "method_idx".
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

# Entry of the configured "group" list selected by "group_idx".
# group[0] is used as a file-name prefix, group[1] as the number of subgroups.
group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# Subgroup labels are the strings "1" .. str(group[1]).
subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

# Values iterated over by the per-dimension loops.
dimensions = params["dimensions"]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 vgae
Group:		 ['pck1^', 2]
Subgroup:	 ['1', '2']
Dimensions:	 [3]
time: 3.54 ms (started: 2023-05-22 10:51:15 -05:00)


In [164]:
# Keep the edge keys that occur exactly 3 times in the stacked table.
# NOTE: this rebinds df_temp to the count table, so the cell cannot simply be
# re-run without rebuilding df_temp first.
temp = df_temp.index.value_counts().to_frame()
occurs_three_times = temp["count"] == 3
df_temp = temp[occurs_three_times]
df_temp

Unnamed: 0,count
479.3735-869.5574,3
255.2332-644.0301,3
255.2332-604.3977,3
255.2332-607.4168,3
255.2332-623.4881,3
...,...
745.5002-764.5393,3
216.9186-424.8639,3
745.5002-758.57,3
745.5002-766.5135,3


time: 536 ms (started: 2023-05-22 10:51:15 -05:00)


---
---

In [114]:
# Preprocessed edge list of subgroup 2, displayed for inspection.
df_edges = pd.read_csv(
    "{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], 2)
)
df_edges

Unnamed: 0,source,target,weight
0,59.0049,338.1873,0.507709
1,59.0049,357.2834,0.516811
2,59.0049,453.9675,0.507832
3,59.0049,587.1823,0.521553
4,59.0049,613.3933,0.516046
...,...,...,...
4417350,996.5509,997.5542,0.618624
4417351,996.5509,998.4845,0.678071
4417352,996.7096,997.5542,0.566913
4417353,997.5542,997.7131,0.589712


time: 1.34 s (started: 2023-05-21 14:37:17 -05:00)


In [None]:
# Get weight
# Attach, for every common edge, the value stored for that edge in each
# subgroup's preprocessed edge list (one "subgroup<i>" column per subgroup).
df_edge_embeddings_join_filter_count_weight = df_edge_embeddings_join_filter_count.copy()

# Orient every edge so that source <= target. Both new columns are computed
# before either one is written back, so the comparison sees the original data.
source_values = df_edge_embeddings_join_filter_count_weight["source"].to_numpy()
target_values = df_edge_embeddings_join_filter_count_weight["target"].to_numpy()
swap = source_values > target_values
oriented_source = np.where(swap, target_values, source_values)
oriented_target = np.where(swap, source_values, target_values)
df_edge_embeddings_join_filter_count_weight["source"] = oriented_source
df_edge_embeddings_join_filter_count_weight["target"] = oriented_target

df_edge_embeddings_join_filter_count_weight.sort_values(["source", "target"], ascending=True, inplace=True)
# Text key "<source>-<target>" used to look each edge up in the edge lists.
df_edge_embeddings_join_filter_count_weight["idx"] = df_edge_embeddings_join_filter_count_weight["source"].astype(str) + "-" + df_edge_embeddings_join_filter_count_weight["target"].astype(str)

for i in tqdm(subgroups):
    df_edges = pd.read_csv("{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], i))
    df_edges["idx"] = df_edges["source"].astype(str) + "-" + df_edges["target"].astype(str)

    # Only the rows of the edge list that correspond to a common edge matter.
    df_edges_needed = df_edges[df_edges["idx"].isin(df_edge_embeddings_join_filter_count_weight["idx"])]

    # Join by key rather than by row position. The positional assignment used
    # before relied on both tables having exactly the same keys in the same
    # order; these checks turn a violated assumption into a clear error.
    if df_edges_needed["idx"].duplicated().any():
        raise ValueError("Subgroup {}: the edge list contains repeated edges among the common edges".format(i))
    found = df_edge_embeddings_join_filter_count_weight["idx"].isin(df_edges_needed["idx"])
    if not found.all():
        raise ValueError("Subgroup {}: {} common edges are missing from the edge list".format(i, int((~found).sum())))

    # With "idx" appended, the second-to-last column of the edge list is the
    # one that was its last column on disk (the value copied by this cell).
    value_by_idx = pd.Series(df_edges_needed.iloc[:, -2].to_numpy(), index=df_edges_needed["idx"].to_numpy())
    df_edge_embeddings_join_filter_count_weight["subgroup{}".format(i)] = df_edge_embeddings_join_filter_count_weight["idx"].map(value_by_idx).to_numpy()
df_edge_embeddings_join_filter_count_weight


100%|█████████████████████████████████████████████| 2/2 [00:09<00:00,  4.97s/it]


Unnamed: 0,source,target,count,idx,subgroup1,subgroup2
479338,59.0137,59.0370,2,59.0137-59.037,0.653159,0.828448
479249,59.0137,74.0249,2,59.0137-74.0249,0.756185,0.617955
479257,59.0137,102.0562,2,59.0137-102.0562,0.618107,0.777138
479284,59.0137,109.0407,2,59.0137-109.0407,0.654224,0.796294
479275,59.0137,113.0246,2,59.0137-113.0246,0.514337,0.841330
...,...,...,...,...,...,...
471175,986.7752,989.4484,2,986.7752-989.4484,0.527028,0.668604
459957,987.5639,990.5673,2,987.5639-990.5673,0.765752,0.633906
464880,988.4452,988.5664,2,988.4452-988.5664,0.671791,0.761600
464853,988.4452,990.5673,2,988.4452-990.5673,0.594618,0.756580


time: 10.9 s (started: 2023-05-21 14:37:19 -05:00)


In [116]:
# df_edges = pd.read_csv("{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], 2))
# df_edges[(df_edges["source"] == 986.7752) & (df_edges["target"] == 989.4484)].iloc[0, -1]

time: 325 µs (started: 2023-05-21 14:37:30 -05:00)


In [None]:
# Dispersion: standard deviation of each edge across the per-subgroup columns,
# which are the last len(subgroups) columns of the copied frame.
df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight.copy()
subgroup_columns = df_edge_embeddings_join_filter_count_weight_std.iloc[:, -len(subgroups):]
df_edge_embeddings_join_filter_count_weight_std["std"] = np.std(subgroup_columns, axis=1)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,std
479338,59.0137,59.0370,2,59.0137-59.037,0.653159,0.828448,0.087645
479249,59.0137,74.0249,2,59.0137-74.0249,0.756185,0.617955,0.069115
479257,59.0137,102.0562,2,59.0137-102.0562,0.618107,0.777138,0.079515
479284,59.0137,109.0407,2,59.0137-109.0407,0.654224,0.796294,0.071035
479275,59.0137,113.0246,2,59.0137-113.0246,0.514337,0.841330,0.163496
...,...,...,...,...,...,...,...
471175,986.7752,989.4484,2,986.7752-989.4484,0.527028,0.668604,0.070788
459957,987.5639,990.5673,2,987.5639-990.5673,0.765752,0.633906,0.065923
464880,988.4452,988.5664,2,988.4452-988.5664,0.671791,0.761600,0.044904
464853,988.4452,990.5673,2,988.4452-990.5673,0.594618,0.756580,0.080981


time: 86.9 ms (started: 2023-05-21 14:37:30 -05:00)


In [None]:
# Keep the edges whose dispersion is at most 0.3 and store them without the index.
std_within_limit = df_edge_embeddings_join_filter_count_weight_std["std"] <= 0.3
df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight_std[std_within_limit]
df_edge_embeddings_join_filter_count_weight_std.to_csv(
    "{}/output_{}/edges_filter_weight_std/{}_edge-filter-weight-std_{}_{}.csv".format(dir, method, group[0], dimension, "L2"),
    index=False,
)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,std
479338,59.0137,59.0370,2,59.0137-59.037,0.653159,0.828448,0.087645
479249,59.0137,74.0249,2,59.0137-74.0249,0.756185,0.617955,0.069115
479257,59.0137,102.0562,2,59.0137-102.0562,0.618107,0.777138,0.079515
479284,59.0137,109.0407,2,59.0137-109.0407,0.654224,0.796294,0.071035
479275,59.0137,113.0246,2,59.0137-113.0246,0.514337,0.841330,0.163496
...,...,...,...,...,...,...,...
471175,986.7752,989.4484,2,986.7752-989.4484,0.527028,0.668604,0.070788
459957,987.5639,990.5673,2,987.5639-990.5673,0.765752,0.633906,0.065923
464880,988.4452,988.5664,2,988.4452-988.5664,0.671791,0.761600,0.044904
464853,988.4452,990.5673,2,988.4452-990.5673,0.594618,0.756580,0.080981


time: 2.28 s (started: 2023-05-21 14:37:30 -05:00)


In [None]:
# Reload the std-filtered edges from disk.
df_edge_embeddings_join_filter_count_weight_std = pd.read_csv(
    "{}/output_{}/edges_filter_weight_std/{}_edge-filter-weight-std_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,std
0,59.0137,59.0370,2,59.0137-59.037,0.653159,0.828448,0.087645
1,59.0137,74.0249,2,59.0137-74.0249,0.756185,0.617955,0.069115
2,59.0137,102.0562,2,59.0137-102.0562,0.618107,0.777138,0.079515
3,59.0137,109.0407,2,59.0137-109.0407,0.654224,0.796294,0.071035
4,59.0137,113.0246,2,59.0137-113.0246,0.514337,0.841330,0.163496
...,...,...,...,...,...,...,...
426349,986.7752,989.4484,2,986.7752-989.4484,0.527028,0.668604,0.070788
426350,987.5639,990.5673,2,987.5639-990.5673,0.765752,0.633906,0.065923
426351,988.4452,988.5664,2,988.4452-988.5664,0.671791,0.761600,0.044904
426352,988.4452,990.5673,2,988.4452-990.5673,0.594618,0.756580,0.080981


time: 377 ms (started: 2023-05-21 14:37:32 -05:00)


In [None]:
# Average weight: row-wise mean of the per-subgroup columns, i.e. the
# len(subgroups) columns located just before the last column of the copy.
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std.copy()
first_subgroup_column = -(len(subgroups) + 1)
subgroup_columns = df_edge_embeddings_join_filter_count_weight_std_avg.iloc[:, first_subgroup_column:-1]
df_edge_embeddings_join_filter_count_weight_std_avg["weight"] = subgroup_columns.mean(axis=1)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,count,idx,subgroup1,subgroup2,std,weight
0,59.0137,59.0370,2,59.0137-59.037,0.653159,0.828448,0.087645,0.740804
1,59.0137,74.0249,2,59.0137-74.0249,0.756185,0.617955,0.069115,0.687070
2,59.0137,102.0562,2,59.0137-102.0562,0.618107,0.777138,0.079515,0.697622
3,59.0137,109.0407,2,59.0137-109.0407,0.654224,0.796294,0.071035,0.725259
4,59.0137,113.0246,2,59.0137-113.0246,0.514337,0.841330,0.163496,0.677833
...,...,...,...,...,...,...,...,...
426349,986.7752,989.4484,2,986.7752-989.4484,0.527028,0.668604,0.070788,0.597816
426350,987.5639,990.5673,2,987.5639-990.5673,0.765752,0.633906,0.065923,0.699829
426351,988.4452,988.5664,2,988.4452-988.5664,0.671791,0.761600,0.044904,0.716696
426352,988.4452,990.5673,2,988.4452-990.5673,0.594618,0.756580,0.080981,0.675599


time: 57.1 ms (started: 2023-05-21 14:37:33 -05:00)


In [None]:
# Reduce the table to its first two columns plus the last one, then store it
# without the index.
kept_column_positions = [0, 1, -1]
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std_avg.iloc[:, kept_column_positions]
df_edge_embeddings_join_filter_count_weight_std_avg.to_csv(
    "{}/output_{}/edges_filter_weight_std_avg/{}_edge-filter-weight-std-avg_{}_{}.csv".format(dir, method, group[0], dimension, "L2"),
    index=False,
)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0370,0.740804
1,59.0137,74.0249,0.687070
2,59.0137,102.0562,0.697622
3,59.0137,109.0407,0.725259
4,59.0137,113.0246,0.677833
...,...,...,...
426349,986.7752,989.4484,0.597816
426350,987.5639,990.5673,0.699829
426351,988.4452,988.5664,0.716696
426352,988.4452,990.5673,0.675599


time: 1.12 s (started: 2023-05-21 14:37:33 -05:00)


In [None]:
# Reload the final edge list from disk.
df_edge_embeddings_join_filter_count_weight_std_avg = pd.read_csv(
    "{}/output_{}/edges_filter_weight_std_avg/{}_edge-filter-weight-std-avg_{}_{}.csv".format(dir, method, group[0], dimension, "L2")
)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0370,0.740804
1,59.0137,74.0249,0.687070
2,59.0137,102.0562,0.697622
3,59.0137,109.0407,0.725259
4,59.0137,113.0246,0.677833
...,...,...,...
426349,986.7752,989.4484,0.597816
426350,987.5639,990.5673,0.699829
426351,988.4452,988.5664,0.716696
426352,988.4452,990.5673,0.675599


time: 119 ms (started: 2023-05-21 14:37:34 -05:00)


In [123]:
# Build a graph from the final edge list, carrying "weight" as an edge
# attribute, and pass it to graph_detail for a summary.
G = nx.from_pandas_edgelist(
    df_edge_embeddings_join_filter_count_weight_std_avg,
    source="source",
    target="target",
    edge_attr=["weight"],
)
graph_detail(G)

Num. nodes: 5261
Num. edges: 426354

time: 752 ms (started: 2023-05-21 14:37:34 -05:00)
