### Imports

In [112]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.19 ms (started: 2023-05-26 17:29:38 -05:00)


### Parameters

In [145]:
import json

# Load the run configuration. The context manager guarantees the file handle
# is closed as soon as the JSON has been parsed (the bare open() leaked it).
with open("parameters.json") as file:
    params = json.load(file)

# Project root: the parent of the current working directory.
# NOTE: `dir` shadows the builtin, but the other cells of this notebook
# reference it by this name, so it is kept.
dir = os.path.dirname(os.getcwd())
print(dir)

# `method` and `group` are selected from their candidate lists by index.
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# group[1] is used as the number of subgroups; ids are the strings "1".."n".
subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

dimensions = params["dimensions"]
dimension = dimensions[0]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 dgi
Group:		 ['WT', 5]
Subgroup:	 ['1', '2', '3', '4', '5']
Dimensions:	 [3]
time: 3.93 ms (started: 2023-05-26 22:29:06 -05:00)


### Get common subgraphs

In [114]:
# Accumulates a snapshot of the graph after each filtering stage.
list_graphs = list()

time: 500 µs (started: 2023-05-26 17:29:38 -05:00)


In [115]:
# Build one weighted graph per subgroup from its preprocessed edge list.
edges_path_template = "{}/output_preprocessing/edges/{}_edges_{}.csv"

graphs = []
for item in tqdm(subgroups):
    weighted_edges = pd.read_csv(edges_path_template.format(dir, group[0], item))
    G = nx.from_pandas_edgelist(
        weighted_edges,
        source="source",
        target="target",
        edge_attr="weight",
    )
    graph_detail(G)
    graphs.append(G)

 50%|█████     | 1/2 [00:03<00:03,  3.57s/it]

Num. nodes: 6235
Num. edges: 1629510



100%|██████████| 2/2 [00:13<00:00,  6.61s/it]

Num. nodes: 6234
Num. edges: 4417355

time: 13.2 s (started: 2023-05-26 17:29:38 -05:00)





In [116]:
# Combine the per-subgroup graphs with the project helper `get_subgraphs`
# (presumably the sub-graph common to all of them, per the section title --
# TODO confirm against utils) and print its size.
H = get_subgraphs(graphs)
graph_detail(H)

# Store an independent copy as the first stage snapshot.
list_graphs.append(H.copy())

100%|██████████| 1/1 [00:00<00:00, 187.87it/s]
100%|██████████| 1/1 [00:12<00:00, 12.69s/it]


Num. nodes: 5710
Num. edges: 842720

time: 23.6 s (started: 2023-05-26 17:29:51 -05:00)


In [117]:
# Edge list of H as a two-column (source, target) table.
df_edge_embeddings_join_filter_count = pd.DataFrame(H.edges()).set_axis(
    ["source", "target"], axis=1
)
df_edge_embeddings_join_filter_count

Unnamed: 0,source,target
0,520.9177,870.5460
1,520.9177,378.7450
2,520.9177,397.2518
3,520.9177,193.0769
4,520.9177,147.0655
...,...,...
842715,455.0738,455.1677
842716,711.1266,711.1610
842717,687.0800,687.0919
842718,760.3625,760.3770


time: 505 ms (started: 2023-05-26 17:30:15 -05:00)


In [118]:
# Get weight
# Attach, for every edge of the common graph, the weight that edge has in each
# subgroup's edge file (one "subgroup<i>" column per subgroup).
df_edge_embeddings_join_filter_count_weight = df_edge_embeddings_join_filter_count.copy()

# Orient every edge so that source <= target. The graph may report an edge in
# either direction; a canonical orientation makes the "source-target" key
# below unambiguous. (Vectorised replacement for the row-by-row swap.)
endpoints = df_edge_embeddings_join_filter_count_weight[["source", "target"]].to_numpy()
df_edge_embeddings_join_filter_count_weight["source"] = endpoints.min(axis=1)
df_edge_embeddings_join_filter_count_weight["target"] = endpoints.max(axis=1)

df_edge_embeddings_join_filter_count_weight.sort_values(["source", "target"], ascending=True, inplace=True)
df_edge_embeddings_join_filter_count_weight["idx"] = df_edge_embeddings_join_filter_count_weight["source"].astype(str) + "-" + df_edge_embeddings_join_filter_count_weight["target"].astype(str)
list_aux = df_edge_embeddings_join_filter_count_weight["idx"].values

for i in tqdm(subgroups):
    df_edges = pd.read_csv("{}/output_preprocessing/edges/{}_edges_{}.csv".format(dir, group[0], i))
    df_edges["idx"] = df_edges["source"].astype(str) + "-" + df_edges["target"].astype(str)

    # Keep only the edges that belong to the common graph and look the weight
    # up by key and by column name, instead of relying on both tables having
    # the same length, the same sort order and "weight" as the column before
    # "idx".
    is_common = df_edges["idx"].isin(list_aux)
    weight_by_idx = df_edges.loc[is_common].set_index("idx")["weight"]

    missing = ~df_edge_embeddings_join_filter_count_weight["idx"].isin(weight_by_idx.index)
    if missing.any():
        # The positional assignment used previously also failed with a
        # ValueError (length mismatch) in this situation.
        raise ValueError(
            "subgroup {}: {} common edges have no weight in its edge file".format(i, int(missing.sum()))
        )

    df_edge_embeddings_join_filter_count_weight["subgroup{}".format(i)] = (
        df_edge_embeddings_join_filter_count_weight["idx"].map(weight_by_idx)
    )
df_edge_embeddings_join_filter_count_weight


100%|██████████| 2/2 [00:10<00:00,  5.04s/it]


Unnamed: 0,source,target,idx,subgroup1,subgroup2
608869,59.0137,59.0291,59.0137-59.0291,0.961159,0.988618
767402,59.0137,59.0370,59.0137-59.037,0.653159,0.828448
146184,59.0137,60.0171,59.0137-60.0171,0.704725,0.897362
767281,59.0137,61.9884,59.0137-61.9884,0.780303,0.588602
767342,59.0137,71.0139,59.0137-71.0139,0.701654,0.661068
...,...,...,...,...,...
349710,988.4452,990.5673,988.4452-990.5673,0.594618,0.756580
646004,988.5664,990.5673,988.5664-990.5673,0.603655,0.570618
839146,989.4484,989.5686,989.4484-989.5686,0.640050,0.625773
708391,990.4534,990.5673,990.4534-990.5673,0.667017,0.877431


time: 11.8 s (started: 2023-05-26 17:30:15 -05:00)


In [119]:
# Dispersion (std)
# Population standard deviation (ddof=0) of each edge's weight across the
# trailing len(subgroups) columns.
subgroup_weights = df_edge_embeddings_join_filter_count_weight.iloc[:, -len(subgroups):]
df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight.assign(
    std=subgroup_weights.std(axis=1, ddof=0)
)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,idx,subgroup1,subgroup2,std
608869,59.0137,59.0291,59.0137-59.0291,0.961159,0.988618,0.013730
767402,59.0137,59.0370,59.0137-59.037,0.653159,0.828448,0.087645
146184,59.0137,60.0171,59.0137-60.0171,0.704725,0.897362,0.096319
767281,59.0137,61.9884,59.0137-61.9884,0.780303,0.588602,0.095850
767342,59.0137,71.0139,59.0137-71.0139,0.701654,0.661068,0.020293
...,...,...,...,...,...,...
349710,988.4452,990.5673,988.4452-990.5673,0.594618,0.756580,0.080981
646004,988.5664,990.5673,988.5664-990.5673,0.603655,0.570618,0.016518
839146,989.4484,989.5686,989.4484-989.5686,0.640050,0.625773,0.007138
708391,990.4534,990.5673,990.4534-990.5673,0.667017,0.877431,0.105207


time: 128 ms (started: 2023-05-26 17:30:27 -05:00)


In [120]:
# Filter by std
# Keep the edges whose weight dispersion is at most 0.3, then persist them.
std_output_path = "{}/output_greedy/edges_filter_weight_std/greedy_{}_edge-filter-weight-std.csv".format(dir, group[0])
has_low_dispersion = df_edge_embeddings_join_filter_count_weight_std["std"] <= 0.3

df_edge_embeddings_join_filter_count_weight_std = df_edge_embeddings_join_filter_count_weight_std.loc[has_low_dispersion]
df_edge_embeddings_join_filter_count_weight_std.to_csv(std_output_path, index=False)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,idx,subgroup1,subgroup2,std
608869,59.0137,59.0291,59.0137-59.0291,0.961159,0.988618,0.013730
767402,59.0137,59.0370,59.0137-59.037,0.653159,0.828448,0.087645
146184,59.0137,60.0171,59.0137-60.0171,0.704725,0.897362,0.096319
767281,59.0137,61.9884,59.0137-61.9884,0.780303,0.588602,0.095850
767342,59.0137,71.0139,59.0137-71.0139,0.701654,0.661068,0.020293
...,...,...,...,...,...,...
349710,988.4452,990.5673,988.4452-990.5673,0.594618,0.756580,0.080981
646004,988.5664,990.5673,988.5664-990.5673,0.603655,0.570618,0.016518
839146,989.4484,989.5686,989.4484-989.5686,0.640050,0.625773,0.007138
708391,990.4534,990.5673,990.4534-990.5673,0.667017,0.877431,0.105207


time: 3.85 s (started: 2023-05-26 17:30:27 -05:00)


In [121]:
# Reload the std-filtered edges from disk (this also resets the row index).
std_input_path = "{}/output_greedy/edges_filter_weight_std/greedy_{}_edge-filter-weight-std.csv".format(dir, group[0])
df_edge_embeddings_join_filter_count_weight_std = pd.read_csv(std_input_path)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,idx,subgroup1,subgroup2,std
0,59.0137,59.0291,59.0137-59.0291,0.961159,0.988618,0.013730
1,59.0137,59.0370,59.0137-59.037,0.653159,0.828448,0.087645
2,59.0137,60.0171,59.0137-60.0171,0.704725,0.897362,0.096319
3,59.0137,61.9884,59.0137-61.9884,0.780303,0.588602,0.095850
4,59.0137,71.0139,59.0137-71.0139,0.701654,0.661068,0.020293
...,...,...,...,...,...,...
732143,988.4452,990.5673,988.4452-990.5673,0.594618,0.756580,0.080981
732144,988.5664,990.5673,988.5664-990.5673,0.603655,0.570618,0.016518
732145,989.4484,989.5686,989.4484-989.5686,0.640050,0.625773,0.007138
732146,990.4534,990.5673,990.4534-990.5673,0.667017,0.877431,0.105207


time: 681 ms (started: 2023-05-26 17:30:31 -05:00)


In [122]:
# Average weight
# The subgroup columns are the len(subgroups) columns that sit immediately
# before the trailing "std" column; their row-wise mean becomes "weight".
first_subgroup_col = -(len(subgroups) + 1)
per_subgroup_weights = df_edge_embeddings_join_filter_count_weight_std.iloc[:, first_subgroup_col:-1]
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std.assign(
    weight=per_subgroup_weights.mean(axis=1)
)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,idx,subgroup1,subgroup2,std,weight
0,59.0137,59.0291,59.0137-59.0291,0.961159,0.988618,0.013730,0.974889
1,59.0137,59.0370,59.0137-59.037,0.653159,0.828448,0.087645,0.740804
2,59.0137,60.0171,59.0137-60.0171,0.704725,0.897362,0.096319,0.801043
3,59.0137,61.9884,59.0137-61.9884,0.780303,0.588602,0.095850,0.684453
4,59.0137,71.0139,59.0137-71.0139,0.701654,0.661068,0.020293,0.681361
...,...,...,...,...,...,...,...
732143,988.4452,990.5673,988.4452-990.5673,0.594618,0.756580,0.080981,0.675599
732144,988.5664,990.5673,988.5664-990.5673,0.603655,0.570618,0.016518,0.587136
732145,989.4484,989.5686,989.4484-989.5686,0.640050,0.625773,0.007138,0.632912
732146,990.4534,990.5673,990.4534-990.5673,0.667017,0.877431,0.105207,0.772224


time: 78.6 ms (started: 2023-05-26 17:30:32 -05:00)


In [123]:
# Keep only the first two columns and the last one (source, target, weight)
# and write the averaged edge list to disk.
edge_list_columns = [0, 1, -1]
avg_output_path = "{}/output_greedy/edges_filter_weight_std_avg/greedy_{}_edge-filter-weight-std-avg.csv".format(dir, group[0])

df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std_avg.iloc[:, edge_list_columns]
df_edge_embeddings_join_filter_count_weight_std_avg.to_csv(avg_output_path, index=False)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.974889
1,59.0137,59.0370,0.740804
2,59.0137,60.0171,0.801043
3,59.0137,61.9884,0.684453
4,59.0137,71.0139,0.681361
...,...,...,...
732143,988.4452,990.5673,0.675599
732144,988.5664,990.5673,0.587136
732145,989.4484,989.5686,0.632912
732146,990.4534,990.5673,0.772224


time: 1.87 s (started: 2023-05-26 17:30:32 -05:00)


In [124]:
# Reload the averaged edge list that was just written to disk.
avg_input_path = "{}/output_greedy/edges_filter_weight_std_avg/greedy_{}_edge-filter-weight-std-avg.csv".format(dir, group[0])
df_edge_embeddings_join_filter_count_weight_std_avg = pd.read_csv(avg_input_path)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.974889
1,59.0137,59.0370,0.740804
2,59.0137,60.0171,0.801043
3,59.0137,61.9884,0.684453
4,59.0137,71.0139,0.681361
...,...,...,...
732143,988.4452,990.5673,0.675599
732144,988.5664,990.5673,0.587136
732145,989.4484,989.5686,0.632912
732146,990.4534,990.5673,0.772224


time: 173 ms (started: 2023-05-26 17:30:34 -05:00)


In [125]:
# Graph of the std-filtered edges, carrying the averaged weight on each edge.
G = nx.from_pandas_edgelist(
    df_edge_embeddings_join_filter_count_weight_std_avg,
    source="source",
    target="target",
    edge_attr=["weight"],
)
graph_detail(G)

# Second stage snapshot.
list_graphs.append(G.copy())

Num. nodes: 5677
Num. edges: 732148

time: 3.67 s (started: 2023-05-26 17:30:35 -05:00)


### ANOVA

In [126]:
# Load dataset Groups
# The four "|"-delimited intensity tables are read with one shared template.
intensity_path_template = "{}/input/Edwin_proyecto2/{}.csv"
df1, df2, df3, df4 = (
    pd.read_csv(intensity_path_template.format(dir, name), delimiter="|")
    for name in ("int1", "int2", "int3", "int4")
)
# df5_ = pd.read_csv("{}/inputs/Edwin_proyecto2/{}.csv".format(dir, "int5"), delimiter="|")

# Disabled alternative dataset (SecondDataset):
# df1 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities1"), delimiter="|")
# df2 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities2"), delimiter="|")
# df3 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities3"), delimiter="|")
# df4 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities4"), delimiter="|")
# df5_ = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities5"), delimiter="|")

for loaded in (df1, df2, df3, df4):
    print(loaded.shape)
# print(df5_.shape)

(6245, 102)
(6245, 72)
(6245, 97)
(6245, 137)
time: 259 ms (started: 2023-05-26 17:30:38 -05:00)


In [127]:
# concat
# The first table drops its first column; the others drop their first two
# columns before being placed side by side. The result is indexed by "ionMz".
# df_join_raw = pd.concat([df1.iloc[:,1:], df2.iloc[:, 2:], df3.iloc[:, 2:], df4.iloc[:, 2:], df5.iloc[:, 2:]], axis=1)
column_blocks = [df1.iloc[:, 1:]] + [frame.iloc[:, 2:] for frame in (df2, df3, df4)]
df_join_raw = pd.concat(column_blocks, axis=1).set_index(["ionMz"])

print(df_join_raw.shape)
df_join_raw

(6245, 400)


Unnamed: 0_level_0,0001 / zwf1^ 3.4,0002 / zwf1^ 3.4,0003 / zwf1^ 3.4,0004 / zwf1^ 3.4,0005 / zwf1^ 3.4,0006 / zwf1^ 3.4,0007 / zwf1^ 3.4,0008 / zwf1^ 3.4,0009 / zwf1^ 3.4,0010 / zwf1^ 3.4,...,0951 / WT 3.4,0952 / WT 3.4,0953 / WT 3.4,0954 / WT 3.4,0955 / WT 3.4,0956 / WT 3.4,0957 / WT 3.4,0958 / WT 3.4,0959 / WT 3.4,0960 / WT 3.4
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,1,47,508,360,675,232,59,345,346,504,...,45,817,32,307,410,716,228,187,361,952
59.0137,53182,57902,51861,62812,54291,54395,59054,52547,57544,63538,...,45747,39677,47205,38697,54320,62610,47283,49927,43632,48511
59.0291,2151,2102,2334,2864,2393,2246,2331,1954,2310,2528,...,1852,1717,1830,1660,2006,2539,2103,2276,1643,2522
59.0370,83,174,366,1134,693,464,235,479,443,691,...,225,349,185,429,186,278,364,368,115,887
59.0453,1,1,51,642,493,143,11,227,160,154,...,60,293,1,336,65,142,1,182,10,684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,2470,2435,2465,2727,2773,2157,2168,2623,2256,2636,...,1866,1834,1669,674,2533,1017,2983,1814,2160,1899
996.7096,3366,3241,2909,3469,3538,3010,3031,2872,2709,3601,...,1748,1419,1160,621,1758,1474,1973,1023,1769,1629
997.5542,2818,2335,1807,1679,2222,2070,2158,1835,2087,2784,...,1168,828,1162,1103,2253,1419,2306,1542,827,1433
997.7131,1901,1469,1572,1916,2180,2555,2101,1656,2543,2643,...,659,1370,714,390,1660,1513,1843,1448,691,1376


time: 29.8 ms (started: 2023-05-26 17:30:39 -05:00)


In [128]:
# Get group
# Select the sample columns whose name contains the group label.
# `like=` is a literal substring match, so labels containing regex
# metacharacters (e.g. the "^" in "zwf1^" / "pck1^") work without a
# hand-maintained table of stripped names. The later cells that split the
# column names use the same literal label.
df_raw_group = df_join_raw.filter(like=group[0], axis=1)
df_raw_group

Unnamed: 0_level_0,0011 / pck1^ 2.4,0012 / pck1^ 2.4,0013 / pck1^ 2.4,0014 / pck1^ 2.4,0015 / pck1^ 2.4,0016 / pck1^ 2.4,0017 / pck1^ 2.4,0018 / pck1^ 2.4,0019 / pck1^ 2.4,0020 / pck1^ 2.4,...,0881 / pck1^ 2.3,0882 / pck1^ 2.3,0883 / pck1^ 2.3,0884 / pck1^ 2.3,0885 / pck1^ 2.3,0886 / pck1^ 2.3,0887 / pck1^ 2.3,0888 / pck1^ 2.3,0889 / pck1^ 2.3,0890 / pck1^ 2.3
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,901,81,377,488,883,426,421,689,383,222,...,857,655,431,335,200,670,256,288,315,427
59.0137,33968,22565,36285,30275,34858,28558,36375,31684,25583,24293,...,59883,65860,65463,64715,52758,56120,60780,64502,62149,66849
59.0291,1947,945,2010,1776,1806,1549,2015,1494,1399,1029,...,2528,3118,2863,2896,2422,2483,2878,3035,2456,2964
59.0370,644,139,752,607,694,685,681,404,619,117,...,522,911,995,655,740,733,652,693,697,724
59.0453,139,78,352,414,373,388,376,353,345,134,...,181,459,429,155,321,257,342,135,315,415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,1169,923,815,1219,638,1161,1461,1710,1224,947,...,1580,1340,1940,1714,1266,735,828,1709,978,1051
996.7096,1122,1767,1914,1748,1575,1599,1738,1701,2425,1651,...,1330,817,1456,1113,816,520,161,1229,596,447
997.5542,1135,1849,1313,819,1883,1296,964,1236,1117,711,...,1111,1401,467,2230,1664,1296,391,587,883,762
997.7131,1211,2000,1554,1235,1640,754,1034,1345,1189,1109,...,743,1424,52,1342,541,757,641,290,578,1384


time: 17.9 ms (started: 2023-05-26 17:30:39 -05:00)


In [129]:
# Logarithm
# log10 of every strictly positive intensity; non-positive entries become 0.0.
#
# The output buffer must be pre-filled: when a ufunc is called with `where=`
# and no `out`, NumPy leaves the masked-out positions *uninitialized*, so the
# result could contain arbitrary values wherever the intensity was <= 0.
# The buffer is float so that it can hold the logarithms of integer columns.
raw_values = df_raw_group.to_numpy(dtype=float)
log_values = np.zeros_like(raw_values)
np.log10(raw_values, out=log_values, where=raw_values > 0)

df_raw_log = pd.DataFrame(log_values, index=df_raw_group.index, columns=df_raw_group.columns)
df_raw_log

Unnamed: 0_level_0,0011 / pck1^ 2.4,0012 / pck1^ 2.4,0013 / pck1^ 2.4,0014 / pck1^ 2.4,0015 / pck1^ 2.4,0016 / pck1^ 2.4,0017 / pck1^ 2.4,0018 / pck1^ 2.4,0019 / pck1^ 2.4,0020 / pck1^ 2.4,...,0881 / pck1^ 2.3,0882 / pck1^ 2.3,0883 / pck1^ 2.3,0884 / pck1^ 2.3,0885 / pck1^ 2.3,0886 / pck1^ 2.3,0887 / pck1^ 2.3,0888 / pck1^ 2.3,0889 / pck1^ 2.3,0890 / pck1^ 2.3
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,2.954725,1.908485,2.576341,2.688420,2.945961,2.629410,2.624282,2.838219,2.583199,2.346353,...,2.932981,2.816241,2.634477,2.525045,2.301030,2.826075,2.408240,2.459392,2.498311,2.630428
59.0137,4.531070,4.353435,4.559727,4.481084,4.542302,4.455728,4.560803,4.500840,4.407951,4.385481,...,4.777304,4.818622,4.815996,4.811005,4.722288,4.749118,4.783761,4.809573,4.793434,4.825095
59.0291,3.289366,2.975432,3.303196,3.249443,3.256718,3.190051,3.304275,3.174351,3.145818,3.012415,...,3.402777,3.493876,3.456821,3.461799,3.384174,3.394977,3.459091,3.482159,3.390228,3.471878
59.0370,2.808886,2.143015,2.876218,2.783189,2.841359,2.835691,2.833147,2.606381,2.791691,2.068186,...,2.717671,2.959518,2.997823,2.816241,2.869232,2.865104,2.814248,2.840733,2.843233,2.859739
59.0453,2.143015,1.892095,2.546543,2.617000,2.571709,2.588832,2.575188,2.547775,2.537819,2.127105,...,2.257679,2.661813,2.632457,2.190332,2.506505,2.409933,2.534026,2.130334,2.498311,2.618048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,3.067815,2.965202,2.911158,3.086004,2.804821,3.064832,3.164650,3.232996,3.087781,2.976350,...,3.198657,3.127105,3.287802,3.234011,3.102434,2.866287,2.918030,3.232742,2.990339,3.021603
996.7096,3.049993,3.247237,3.281942,3.242541,3.197281,3.203848,3.240050,3.230704,3.384712,3.217747,...,3.123852,2.912222,3.163161,3.046495,2.911690,2.716003,2.206826,3.089552,2.775246,2.650308
997.5542,3.054996,3.266937,3.118265,2.913284,3.274850,3.112605,2.984077,3.092018,3.048053,2.851870,...,3.045714,3.146438,2.669317,3.348305,3.221153,3.112605,2.592177,2.768638,2.945961,2.881955
997.7131,3.083144,3.301030,3.191451,3.091667,3.214844,2.877371,3.014521,3.128722,3.075182,3.044932,...,2.870989,3.153510,1.716003,3.127753,2.733197,2.879096,2.806858,2.462398,2.761928,3.141136


time: 60.4 ms (started: 2023-05-26 17:30:39 -05:00)


In [130]:
# Column names contain "<group> <subgroup>.<replicate>"; take the text between
# the group label and the first "." and keep the sorted distinct values.
group_prefix = "{} ".format(group[0])
subgroups = np.unique(
    [column.split(group_prefix)[1].split(".")[0] for column in df_raw_log.columns]
)
subgroups

array(['1', '2'], dtype='<U1')

time: 5.32 ms (started: 2023-05-26 17:30:39 -05:00)


In [131]:
# Split graph
# One frame per subgroup, holding the columns whose name contains
# "<group> <subgroup>.".
list_raw = [
    df_raw_log.filter(like="{} {}.".format(group[0], item))
    for item in subgroups
]

print(len(list_raw))
list_raw[0]

2


Unnamed: 0_level_0,0031 / pck1^ 1.2,0032 / pck1^ 1.2,0033 / pck1^ 1.2,0034 / pck1^ 1.2,0035 / pck1^ 1.2,0036 / pck1^ 1.2,0037 / pck1^ 1.2,0038 / pck1^ 1.2,0039 / pck1^ 1.2,0040 / pck1^ 1.2,...,0761 / pck1^ 1.4,0762 / pck1^ 1.4,0763 / pck1^ 1.4,0764 / pck1^ 1.4,0765 / pck1^ 1.4,0766 / pck1^ 1.4,0767 / pck1^ 1.4,0768 / pck1^ 1.4,0769 / pck1^ 1.4,0770 / pck1^ 1.4
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,1.431364,2.714330,2.816904,2.983175,2.883661,2.522444,2.567026,2.740363,1.255273,2.117271,...,2.380211,2.793092,2.437751,2.805501,2.209515e+00,2.781755e+00,2.667453,2.770115,2.665581,2.875640
59.0137,4.882285,5.211590,5.134623,4.878763,4.807887,4.708574,4.812947,4.655590,4.749257,4.803935,...,4.951716,5.058365,4.994445,5.073960,4.979175e+00,4.760287e+00,5.024321,4.936061,4.972966,4.963623
59.0291,3.424065,3.835944,3.729570,3.467460,3.447778,3.352954,3.410777,3.342817,3.364926,3.462997,...,3.526856,3.674402,3.605628,3.670431,3.622525e+00,3.413467e+00,3.617000,3.592621,3.638190,3.554004
59.0370,2.532754,3.143951,3.039414,2.536558,2.663701,2.531479,2.485721,2.613842,2.603144,2.931966,...,2.702431,2.945961,2.961421,2.982723,2.943000e+00,2.868644e+00,2.817565,2.927370,3.071514,2.972666
59.0453,0.000000,2.068186,2.451786,2.382017,2.214844,2.075547,0.000000,2.214844,1.255273,2.570543,...,2.363612,2.107210,2.382017,2.600973,2.686636e+00,2.618048e+00,2.113943,2.532754,2.832509,2.576341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,3.597037,3.485153,3.498448,3.436799,3.415641,3.426023,3.421275,3.341632,3.264109,3.473049,...,3.490239,3.517724,3.411956,3.400020,3.500511e+00,3.272538e+00,3.157457,3.398114,3.433130,3.349472
996.7096,3.311542,3.200577,3.312177,3.319522,3.219323,3.192010,3.172895,3.362482,2.975432,3.044932,...,3.289589,3.515211,3.264818,3.413300,3.303412e+00,3.027350e+00,3.103462,3.338656,3.374198,3.148294
997.5542,3.368845,3.277380,3.350442,3.549249,3.355452,3.373647,3.426186,3.210586,3.504607,3.536306,...,3.325721,3.221414,3.376394,3.353916,3.353916e+00,3.353916e+00,3.432649,3.400883,3.451633,3.360972
997.7131,2.927883,3.059185,3.161967,3.096910,3.030195,2.770115,3.157759,2.749736,3.150142,3.076640,...,3.239550,2.927883,3.069668,3.161068,3.161068e+00,3.161068e+00,3.179264,2.928396,3.147676,3.035430


time: 35.5 ms (started: 2023-05-26 17:30:39 -05:00)


In [132]:
# Relabel every column of the k-th subgroup frame with the k-th capital
# letter ("A", "B", ...).
# A slice copy (list_raw[:]) only duplicates the list: both lists would share
# the same DataFrame objects, so renaming the columns here would also rename
# them inside `list_raw`. Copy each frame so that `list_raw` keeps the
# original sample names.
list_raw_copy = [frame.copy() for frame in list_raw]

for k, item in enumerate(list_raw_copy):
    item.columns = [chr(65 + k)] * len(item.columns)

time: 1.55 ms (started: 2023-05-26 17:30:39 -05:00)


In [133]:
# Filter by graph and concat
# Restrict every subgroup frame to the rows of the graph's nodes (in node
# order) and place the frames side by side.
nodes = list(G.nodes())

df_raw_filter = list_raw_copy[0].loc[nodes, :]
other_groups = [list_raw_copy[k].loc[nodes, :] for k in range(1, len(subgroups))]
if other_groups:
    df_raw_filter = pd.concat([df_raw_filter, *other_groups], axis=1)

matrix_output_path = "{}/output_greedy/matrix/greedy_{}_matrix_copy.csv".format(dir, group[0])
df_raw_filter.to_csv(matrix_output_path, index=True)
df_raw_filter

Unnamed: 0_level_0,A,A,A,A,A,A,A,A,A,A,...,B,B,B,B,B,B,B,B,B,B
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0137,4.882285,5.211590,5.134623,4.878763,4.807887,4.708574,4.812947,4.655590,4.749257,4.803935,...,4.777304,4.818622,4.815996,4.811005,4.722288,4.749118,4.783761,4.809573,4.793434,4.825095
59.0291,3.424065,3.835944,3.729570,3.467460,3.447778,3.352954,3.410777,3.342817,3.364926,3.462997,...,3.402777,3.493876,3.456821,3.461799,3.384174,3.394977,3.459091,3.482159,3.390228,3.471878
59.0370,2.532754,3.143951,3.039414,2.536558,2.663701,2.531479,2.485721,2.613842,2.603144,2.931966,...,2.717671,2.959518,2.997823,2.816241,2.869232,2.865104,2.814248,2.840733,2.843233,2.859739
60.0171,3.107549,3.555094,3.593397,3.109241,3.407391,3.006038,3.329601,3.375298,3.173769,3.086360,...,3.006466,3.222716,3.177536,3.302547,3.288473,3.209783,3.190892,3.277380,3.050380,3.281715
61.9884,3.344196,3.639785,3.691965,3.319522,3.355643,3.159868,3.203305,3.239299,3.284882,3.396199,...,3.271842,3.386856,3.168203,3.373464,3.467608,3.275772,3.458336,3.389698,3.418467,3.316390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980.0986,2.919078,2.835056,3.094820,2.885926,2.885926,2.922725,2.673021,3.549126,2.912753,2.445604,...,3.021189,3.033021,2.992111,2.943000,2.945469,2.924796,3.286905,3.215638,3.372728,3.103119
980.1839,2.833147,2.232996,2.879669,2.103804,2.495544,2.406540,0.000000,2.793092,2.315970,2.429752,...,2.881955,2.950851,2.498311,2.453318,2.818226,2.652246,3.264346,3.101403,3.273001,2.814913
981.6836,2.778151,2.469822,2.385606,2.865696,2.732394,2.584331,2.594393,2.544068,2.712650,2.296665,...,2.730782,1.995635,2.565848,2.401401,2.874482,0.000000,0.000000,2.519828,2.702431,2.378398
984.3993,3.093772,2.976808,3.049218,2.902547,2.710963,2.907949,3.098990,2.864511,3.026125,3.050380,...,2.350248,2.401401,2.457882,0.000000,3.128722,1.113943,2.636488,2.269513,2.816904,2.318063


time: 531 ms (started: 2023-05-26 17:30:39 -05:00)


In [134]:
# df_raw_filter = pd.read_csv("{}/output_greedy/matrix/greedy_{}_matrix_copy.csv".format(dir, group[0]), index_col=0)
# df_raw_filter

time: 481 µs (started: 2023-05-26 17:30:40 -05:00)


---

In [143]:
# Side experiment: run the ANOVA helper on an external table.
# NOTE(review): this reads a hard-coded absolute path and overwrites
# `df_raw_filter`, which the "# ANOVA" cell further down also reads -- when the
# notebook is run top to bottom that cell will see this table. Confirm intent.
df_raw_filter = (
    pd.read_csv("/home/ealvarez/Project/GNN_Filter/GNN_unsupervised/luis.csv")
    .set_axis(["id", "A", "A", "B", "B"], axis=1)
    .set_index("id")
)
# df_raw_filter.drop("id", axis=1, inplace=True)

# The p-values are computed before the "p-value" column is added.
p_values = anova(df_raw_filter)
df_raw_filter["p-value"] = p_values
df_raw_filter



Unnamed: 0_level_0,A,A,B,B,p-value
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,-6.28570,-6.28570,-5.28570,-6.28570,0.422650
5,-4.58762,-5.58762,-5.58762,-5.58762,0.422650
6,-5.44801,-4.61151,-5.15530,-5.30908,0.680992
7,-4.12296,-4.61904,-4.24114,-4.10532,0.522278
8,-4.78666,-5.78666,-5.78666,-5.78666,0.422650
...,...,...,...,...,...
7114,-4.49322,-4.26143,-6.41806,-6.41806,0.003210
7115,-5.00350,-4.84296,-4.60468,-4.60448,0.057992
7116,-5.25342,-6.13739,-4.72901,-5.49534,0.423783
7118,-5.62146,-4.65306,-5.22602,-4.94384,0.926827


time: 3.71 s (started: 2023-05-26 18:13:43 -05:00)


In [144]:
# Rows whose p-value is below 0.001.
is_significant = df_raw_filter["p-value"] < 0.001
df_raw_filter_p = df_raw_filter.loc[is_significant]
df_raw_filter_p

Unnamed: 0_level_0,A,A,B,B,p-value
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
350,-4.56243,-4.59063,-6.25179,-6.25179,0.000071
436,-4.67013,-4.63096,-6.09377,-6.09377,0.000184
783,-3.57772,-3.56406,-4.26832,-4.28261,0.000197
845,-4.94197,-4.84079,-7.22938,-7.22938,0.000468
848,-3.61269,-3.62266,-4.31879,-4.32188,0.000055
...,...,...,...,...,...
7023,-5.69815,-5.69815,-3.79130,-3.75990,0.000067
7031,-6.97030,-6.97030,-4.31319,-4.44042,0.000601
7032,-6.98135,-6.98135,-5.06706,-4.98798,0.000409
7050,-4.36663,-4.37073,-5.84611,-5.88692,0.000187


time: 10.3 ms (started: 2023-05-26 18:23:50 -05:00)


---
---

In [136]:
# ANOVA
# Work on a copy so that `df_raw_filter` itself does not gain the extra column.
# NOTE(review): `df_raw_filter` is also reassigned by the luis.csv cell above;
# make sure the intended table is in scope when this cell runs.
df_raw_filter_anova = df_raw_filter.copy()
# `anova` is a project helper; its result is stored as one value per row
# (presumably one p-value per ion across the A/B column groups -- TODO confirm).
p_values = anova(df_raw_filter_anova)
df_raw_filter_anova["p-value"] = p_values
df_raw_filter_anova

Unnamed: 0_level_0,A,A,A,A,A,A,A,A,A,A,...,B,B,B,B,B,B,B,B,B,p-value
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0137,4.882285,5.211590,5.134623,4.878763,4.807887,4.708574,4.812947,4.655590,4.749257,4.803935,...,4.818622,4.815996,4.811005,4.722288,4.749118,4.783761,4.809573,4.793434,4.825095,0.521935
59.0291,3.424065,3.835944,3.729570,3.467460,3.447778,3.352954,3.410777,3.342817,3.364926,3.462997,...,3.493876,3.456821,3.461799,3.384174,3.394977,3.459091,3.482159,3.390228,3.471878,0.182671
59.0370,2.532754,3.143951,3.039414,2.536558,2.663701,2.531479,2.485721,2.613842,2.603144,2.931966,...,2.959518,2.997823,2.816241,2.869232,2.865104,2.814248,2.840733,2.843233,2.859739,0.012852
60.0171,3.107549,3.555094,3.593397,3.109241,3.407391,3.006038,3.329601,3.375298,3.173769,3.086360,...,3.222716,3.177536,3.302547,3.288473,3.209783,3.190892,3.277380,3.050380,3.281715,0.436624
61.9884,3.344196,3.639785,3.691965,3.319522,3.355643,3.159868,3.203305,3.239299,3.284882,3.396199,...,3.386856,3.168203,3.373464,3.467608,3.275772,3.458336,3.389698,3.418467,3.316390,0.007072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980.0986,2.919078,2.835056,3.094820,2.885926,2.885926,2.922725,2.673021,3.549126,2.912753,2.445604,...,3.033021,2.992111,2.943000,2.945469,2.924796,3.286905,3.215638,3.372728,3.103119,0.801823
980.1839,2.833147,2.232996,2.879669,2.103804,2.495544,2.406540,0.000000,2.793092,2.315970,2.429752,...,2.950851,2.498311,2.453318,2.818226,2.652246,3.264346,3.101403,3.273001,2.814913,0.526455
981.6836,2.778151,2.469822,2.385606,2.865696,2.732394,2.584331,2.594393,2.544068,2.712650,2.296665,...,1.995635,2.565848,2.401401,2.874482,0.000000,0.000000,2.519828,2.702431,2.378398,0.685374
984.3993,3.093772,2.976808,3.049218,2.902547,2.710963,2.907949,3.098990,2.864511,3.026125,3.050380,...,2.401401,2.457882,0.000000,3.128722,1.113943,2.636488,2.269513,2.816904,2.318063,0.150345


time: 10.3 s (started: 2023-05-26 17:30:40 -05:00)


In [137]:
# Filter by ANOVA (p < 0.001)
passes_anova = df_raw_filter_anova["p-value"] < 0.001
df_raw_filter_anova_p = df_raw_filter_anova.loc[passes_anova]

# The index labels of the surviving rows are the nodes to keep in the graph.
nodes = list(df_raw_filter_anova_p.index)
len(nodes)

986

time: 8.18 ms (started: 2023-05-26 17:30:51 -05:00)


#### Filter graph by ANOVA

In [138]:
# Restrict G to the nodes that passed the ANOVA filter and print its size.
H = G.subgraph(nodes)
graph_detail(H)

# Third stage snapshot (stored as a copy).
list_graphs.append(H.copy())

Num. nodes: 986
Num. edges: 48440

time: 399 ms (started: 2023-05-26 17:30:51 -05:00)


In [139]:
# Export the ANOVA-filtered graph as a (source, target, weight) edge list.
edges = list(H.edges())
edge_weights = [H.get_edge_data(u, v, default=0)["weight"] for u, v in edges]

anova_output_path = "{}/output_greedy/edges_filter_weight_std_avg_anova/greedy_{}_edge-filter-weight-std-avg-anova.csv".format(dir, group[0])
df_edge_embeddings_join_filter_count_weight_std_avg_anova = pd.DataFrame(
    edges, columns=["source", "target"]
).assign(weight=edge_weights)
df_edge_embeddings_join_filter_count_weight_std_avg_anova.to_csv(anova_output_path, index=False)
df_edge_embeddings_join_filter_count_weight_std_avg_anova

Unnamed: 0,source,target,weight
0,814.5429,907.4824,0.638985
1,816.6374,193.0858,0.629736
2,816.6374,265.0364,0.627481
3,816.6374,283.1190,0.652843
4,816.6374,284.1224,0.551715
...,...,...,...
48435,797.5127,802.5591,0.711252
48436,798.5243,882.5494,0.722606
48437,798.5814,812.5371,0.862177
48438,802.5591,803.5605,0.847592


time: 550 ms (started: 2023-05-26 17:30:51 -05:00)


In [140]:
# Reload the ANOVA-filtered edge list from disk.
anova_input_path = "{}/output_greedy/edges_filter_weight_std_avg_anova/greedy_{}_edge-filter-weight-std-avg-anova.csv".format(dir, group[0])
df_edge_embeddings_join_filter_count_weight_std_avg_anova = pd.read_csv(anova_input_path)
df_edge_embeddings_join_filter_count_weight_std_avg_anova

Unnamed: 0,source,target,weight
0,814.5429,907.4824,0.638985
1,816.6374,193.0858,0.629736
2,816.6374,265.0364,0.627481
3,816.6374,283.1190,0.652843
4,816.6374,284.1224,0.551715
...,...,...,...
48435,797.5127,802.5591,0.711252
48436,798.5243,882.5494,0.722606
48437,798.5814,812.5371,0.862177
48438,802.5591,803.5605,0.847592


time: 32.7 ms (started: 2023-05-26 17:30:52 -05:00)


In [141]:
# Rebuild the graph from the saved edge list. Nodes without any edge do not
# appear in an edge list, so this graph can have fewer nodes than the subgraph
# it was exported from.
# The "weight" column is attached as an edge attribute, as is done when G is
# built from its edge list; without `edge_attr` the weights would be dropped.
H = nx.from_pandas_edgelist(
    df_edge_embeddings_join_filter_count_weight_std_avg_anova,
    "source",
    "target",
    edge_attr=["weight"],
)
graph_detail(H)

# Fourth stage snapshot.
list_graphs.append(H.copy())

Num. nodes: 907
Num. edges: 48440

time: 143 ms (started: 2023-05-26 17:30:52 -05:00)


In [142]:
# Print the size of every stage snapshot, in the order they were appended,
# with a blank line after each one.
for graph in list_graphs:
    graph_detail(graph)
    print()

Num. nodes: 5710
Num. edges: 842720


Num. nodes: 5677
Num. edges: 732148


Num. nodes: 986
Num. edges: 48440


Num. nodes: 907
Num. edges: 48440


time: 16.1 ms (started: 2023-05-26 17:30:52 -05:00)
