### Imports

In [2]:
import json
import os
import sys

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

from utils.utils import *

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.41 ms (started: 2023-05-27 12:55:53 -05:00)


### Parameters

In [69]:
# Load the run configuration. The context manager closes the file handle
# as soon as the JSON has been parsed (or if parsing fails).
with open("parameters.json") as file:
    params = json.load(file)

# Parent directory of the current working directory; used as the root of
# every input/output path below.
# NOTE: the name shadows the builtin `dir`, but it is kept because the
# other cells of this notebook refer to it.
dir = os.path.dirname(os.getcwd())
print(dir)

# "method" and "group" are lists of candidates; the matching "*_idx" entry
# selects the one used for this run.
method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

# group[1] is used as the number of subgroups; ids are the strings "1".."n".
subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

dimensions = params["dimensions"]
dimension = dimensions[0]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 dgi
Group:		 ['pck1^', 2]
Subgroup:	 ['1', '2']
Dimensions:	 [3]
time: 3.26 ms (started: 2023-05-27 15:38:20 -05:00)


### Get common subgraphs

In [70]:
# Build one weighted graph per subgroup from its edge-list CSV and print
# the size of each graph as it is loaded.
edge_path_template = "{}/output_preprocessing/edges/{}_edges_{}.csv"
graphs = []
for subgroup_id in tqdm(subgroups):
    edge_table = pd.read_csv(edge_path_template.format(dir, group[0], subgroup_id))
    G = nx.from_pandas_edgelist(edge_table, "source", "target", edge_attr="weight")
    graph_detail(G)
    graphs.append(G)

 50%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 1/2 [00:03<00:03,  3.37s/it]

Num. nodes: 6235
Num. edges: 1629510



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.56s/it]

Num. nodes: 6234
Num. edges: 4417355

time: 13.1 s (started: 2023-05-27 15:38:31 -05:00)





In [72]:
# Combine the per-subgroup graphs with get_subgraphs and print the size of the result.
# NOTE(review): get_subgraphs is not defined in this notebook (likely the wildcard
# import); presumably it returns the subgraph shared by all inputs - confirm.
H1 = get_subgraphs(graphs)
graph_detail(H1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 263.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.56s/it]


Num. nodes: 5710
Num. edges: 842720

time: 20.7 s (started: 2023-05-27 15:41:18 -05:00)


In [74]:
# Tabulate the edges of H1: one row per edge, endpoints labelled "source" and "target".
df_edge_embeddings_join_filter_count = pd.DataFrame(H1.edges())
df_edge_embeddings_join_filter_count.columns = ["source", "target"]
df_edge_embeddings_join_filter_count

Unnamed: 0,source,target
0,520.9177,870.5460
1,520.9177,378.7450
2,520.9177,397.2518
3,520.9177,193.0769
4,520.9177,147.0655
...,...,...
842715,455.0738,455.1677
842716,711.1266,711.1610
842717,687.0800,687.0919
842718,760.3625,760.3770


time: 519 ms (started: 2023-05-27 15:42:09 -05:00)


### Calculate ANOVA (nodes)

In [96]:
# Compute the node-level ANOVA table with get_nodes_anova (helper not defined in this notebook).
# NOTE(review): presumably one p-value per node (ionMz) across subgroups - confirm against the helper.
df_nodes_anova = get_nodes_anova(H1, dir)
df_nodes_anova

(6245, 102)
(6245, 72)
(6245, 97)
(6245, 137)
(6245, 400)
2


Unnamed: 0_level_0,p-value
ionMz,Unnamed: 1_level_1
520.9177,0.484216
870.5460,0.000001
158.9565,0.010560
178.0601,0.849503
537.1691,0.012312
...,...
609.9771,0.295581
451.9710,0.152285
402.2219,0.060412
422.2040,0.085458


time: 10.4 s (started: 2023-05-27 16:08:35 -05:00)


In [97]:
# Filter by ANOVA (p > 0.001)
df_nodes_anova_filter = df_nodes_anova[df_nodes_anova["p-value"] > 0.001]
df_nodes_anova_filter
# nodes = list(df_nodes_anova_p.index)
# len(nodes)

Unnamed: 0_level_0,p-value
ionMz,Unnamed: 1_level_1
520.9177,0.484216
158.9565,0.010560
178.0601,0.849503
537.1691,0.012312
608.2665,0.872061
...,...
609.9771,0.295581
451.9710,0.152285
402.2219,0.060412
422.2040,0.085458


time: 13.3 ms (started: 2023-05-27 16:09:42 -05:00)


### Calculate STD (edges)

In [102]:
# Compute the edge table with get_edges_std (helper not defined in this notebook).
# NOTE(review): presumably "std" is the spread of each edge weight across subgroups - confirm.
df_edges_std = get_edges_std(H1)
df_edges_std

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.55s/it]


Unnamed: 0,source,target,weight,std
608869,59.0137,59.0291,0.974889,0.013730
767402,59.0137,59.0370,0.740804,0.087645
146184,59.0137,60.0171,0.801043,0.096319
767281,59.0137,61.9884,0.684453,0.095850
767342,59.0137,71.0139,0.681361,0.020293
...,...,...,...,...
349710,988.4452,990.5673,0.675599,0.080981
646004,988.5664,990.5673,0.587136,0.016518
839146,989.4484,989.5686,0.632912,0.007138
708391,990.4534,990.5673,0.772224,0.105207


time: 11.4 s (started: 2023-05-27 16:17:10 -05:00)


In [103]:
# Filter by std
df_edges_std_filter = df_edges_std[df_edges_std["std"] <= 0.3]
df_edges_std_filter

Unnamed: 0,source,target,weight,std
608869,59.0137,59.0291,0.974889,0.013730
767402,59.0137,59.0370,0.740804,0.087645
146184,59.0137,60.0171,0.801043,0.096319
767281,59.0137,61.9884,0.684453,0.095850
767342,59.0137,71.0139,0.681361,0.020293
...,...,...,...,...
349710,988.4452,990.5673,0.675599,0.080981
646004,988.5664,990.5673,0.587136,0.016518
839146,989.4484,989.5686,0.632912,0.007138
708391,990.4534,990.5673,0.772224,0.105207


time: 27.2 ms (started: 2023-05-27 16:17:33 -05:00)


---

#### Filter by edges (std) and nodes (ANOVA)

In [66]:
# Select four columns by position: the first two, then the last, then the second-to-last.
# NOTE(review): the source frame is not defined in any visible cell of this notebook;
# this cell only runs if it is still alive in the kernel - confirm where it comes from.
# NOTE(review): positional selection assumes the last two columns are (std, weight) - confirm.
df_edges_all = df_edge_embeddings_join_filter_count_weight_std_avg_all.iloc[:, [0,1, -1, -2]]
df_edges_all

Unnamed: 0,source,target,weight,std
0,59.0137,59.0291,0.859127,0.136282
1,59.0137,349.2758,0.701044,0.118268
2,59.0137,350.2793,0.706849,0.112406
3,59.0137,350.3033,0.736142,0.128241
4,59.0137,389.2739,0.773249,0.145148
...,...,...,...,...
135071,980.7146,981.7177,0.842014,0.070631
135072,981.5305,981.7177,0.673811,0.111292
135073,981.7177,996.7096,0.638545,0.099503
135074,985.5091,985.5490,0.724017,0.106148


time: 14.6 ms (started: 2023-05-27 14:45:21 -05:00)


In [68]:
# Filter by std
df_edges_all_std = df_edges_all[df_edges_all["std"] <= 0.3]
df_edges_all_std.to_csv("{}/output_greedy/edges_filter/greedy_{}_edge-filter-all-std.csv".format(dir, group[0]), index=False)
df_edges_all_std

Unnamed: 0,source,target,weight,std
0,59.0137,59.0291,0.859127,0.136282
1,59.0137,349.2758,0.701044,0.118268
2,59.0137,350.2793,0.706849,0.112406
3,59.0137,350.3033,0.736142,0.128241
4,59.0137,389.2739,0.773249,0.145148
...,...,...,...,...
135071,980.7146,981.7177,0.842014,0.070631
135072,981.5305,981.7177,0.673811,0.111292
135073,981.7177,996.7096,0.638545,0.099503
135074,985.5091,985.5490,0.724017,0.106148


time: 532 ms (started: 2023-05-27 14:48:45 -05:00)


In [None]:
# Build an unweighted graph from the source/target columns and print its size.
# NOTE(review): the input frame is not defined in any visible cell and this cell has
# no execution count - confirm whether it is still needed.
H = nx.from_pandas_edgelist(df_edge_embeddings_join_filter_count_weight_std_avg_anova, "source", "target")
graph_detail(H)

In [12]:
# Read back the edge table that was saved with the per-subgroup weights and their std
weight_std_csv = "{}/output_greedy/edges_filter_weight_std/greedy_{}_edge-filter-weight-std.csv".format(dir, group[0])
df_edge_embeddings_join_filter_count_weight_std = pd.read_csv(weight_std_csv)
df_edge_embeddings_join_filter_count_weight_std

Unnamed: 0,source,target,idx,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5,std
0,59.0137,59.0291,59.0137-59.0291,0.954735,0.928018,0.841093,0.601605,0.970185,0.136282
1,59.0137,349.2758,59.0137-349.2758,0.796859,0.788310,0.560861,0.551982,0.807207,0.118268
2,59.0137,350.2793,59.0137-350.2793,0.807013,0.779306,0.576143,0.563560,0.808223,0.112406
3,59.0137,350.3033,59.0137-350.3033,0.854806,0.751778,0.539172,0.650983,0.883973,0.128241
4,59.0137,389.2739,59.0137-389.2739,0.823682,0.886260,0.723124,0.515230,0.917952,0.145148
...,...,...,...,...,...,...,...,...,...
131535,980.7146,981.7177,980.7146-981.7177,0.743341,0.954130,0.878971,0.816034,0.817595,0.070631
131536,981.5305,981.7177,981.5305-981.7177,0.571209,0.850899,0.670445,0.544705,0.731794,0.111292
131537,981.7177,996.7096,981.7177-996.7096,0.557537,0.784976,0.563671,0.732291,0.554250,0.099503
131538,985.5091,985.5490,985.5091-985.549,0.798737,0.577365,0.723628,0.875572,0.644783,0.106148


time: 191 ms (started: 2023-05-27 13:01:48 -05:00)


Unnamed: 0,source,target,idx,subgroup1,subgroup2,subgroup3,subgroup4,subgroup5,std,weight
0,59.0137,59.0291,59.0137-59.0291,0.954735,0.928018,0.841093,0.601605,0.970185,0.136282,0.859127
1,59.0137,349.2758,59.0137-349.2758,0.796859,0.788310,0.560861,0.551982,0.807207,0.118268,0.701044
2,59.0137,350.2793,59.0137-350.2793,0.807013,0.779306,0.576143,0.563560,0.808223,0.112406,0.706849
3,59.0137,350.3033,59.0137-350.3033,0.854806,0.751778,0.539172,0.650983,0.883973,0.128241,0.736142
4,59.0137,389.2739,59.0137-389.2739,0.823682,0.886260,0.723124,0.515230,0.917952,0.145148,0.773249
...,...,...,...,...,...,...,...,...,...,...
131535,980.7146,981.7177,980.7146-981.7177,0.743341,0.954130,0.878971,0.816034,0.817595,0.070631,0.842014
131536,981.5305,981.7177,981.5305-981.7177,0.571209,0.850899,0.670445,0.544705,0.731794,0.111292,0.673811
131537,981.7177,996.7096,981.7177-996.7096,0.557537,0.784976,0.563671,0.732291,0.554250,0.099503,0.638545
131538,985.5091,985.5490,985.5091-985.549,0.798737,0.577365,0.723628,0.875572,0.644783,0.106148,0.724017


time: 30.7 ms (started: 2023-05-27 13:15:26 -05:00)


In [11]:
# Keep the first two columns and the last one (by position), then save the result as CSV.
# NOTE(review): this cell reads the very name it reassigns, and no visible cell defines
# it beforehand - on a fresh kernel this raises NameError; confirm where it is created.
# NOTE(review): the displayed output suggests the kept columns are source/target/weight - confirm.
df_edge_embeddings_join_filter_count_weight_std_avg = df_edge_embeddings_join_filter_count_weight_std_avg.iloc[:, [0, 1, -1]]
df_edge_embeddings_join_filter_count_weight_std_avg.to_csv("{}/output_greedy/edges_filter_weight_std_avg/greedy_{}_edge-filter-weight-std-avg.csv".format(dir, group[0]), index=False)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.859127
1,59.0137,349.2758,0.701044
2,59.0137,350.2793,0.706849
3,59.0137,350.3033,0.736142
4,59.0137,389.2739,0.773249
...,...,...,...
131535,980.7146,981.7177,0.842014
131536,981.5305,981.7177,0.673811
131537,981.7177,996.7096,0.638545
131538,985.5091,985.5490,0.724017


time: 343 ms (started: 2023-05-26 10:59:31 -05:00)


In [12]:
# Read the saved source/target/weight edge list back from disk
weight_std_avg_csv = "{}/output_greedy/edges_filter_weight_std_avg/greedy_{}_edge-filter-weight-std-avg.csv".format(dir, group[0])
df_edge_embeddings_join_filter_count_weight_std_avg = pd.read_csv(weight_std_avg_csv)
df_edge_embeddings_join_filter_count_weight_std_avg

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.859127
1,59.0137,349.2758,0.701044
2,59.0137,350.2793,0.706849
3,59.0137,350.3033,0.736142
4,59.0137,389.2739,0.773249
...,...,...,...
131535,980.7146,981.7177,0.842014
131536,981.5305,981.7177,0.673811
131537,981.7177,996.7096,0.638545
131538,985.5091,985.5490,0.724017


time: 49.2 ms (started: 2023-05-26 10:59:32 -05:00)


In [13]:
# Build a graph from the source/target columns, storing "weight" as an edge attribute,
# and print its size with graph_detail (helper not defined in this notebook).
G = nx.from_pandas_edgelist(df_edge_embeddings_join_filter_count_weight_std_avg, "source", "target", edge_attr=["weight"])
graph_detail(G)

Num. nodes: 4181
Num. edges: 131540

time: 245 ms (started: 2023-05-26 10:59:32 -05:00)


### Feature selection / ANOVA / PCA

In [14]:
# Load dataset Groups
df1 = pd.read_csv("{}/input/Edwin_proyecto2/{}.csv".format(dir, "int1"), delimiter="|")
df2 = pd.read_csv("{}/input/Edwin_proyecto2/{}.csv".format(dir, "int2"), delimiter="|")
df3 = pd.read_csv("{}/input/Edwin_proyecto2/{}.csv".format(dir, "int3"), delimiter="|")
df4 = pd.read_csv("{}/input/Edwin_proyecto2/{}.csv".format(dir, "int4"), delimiter="|")
# df5_ = pd.read_csv("{}/inputs/Edwin_proyecto2/{}.csv".format(dir, "int5"), delimiter="|")

""" df1 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities1"), delimiter="|")
df2 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities2"), delimiter="|")
df3 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities3"), delimiter="|")
df4 = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities4"), delimiter="|")
df5_ = pd.read_csv("{}/inputs/SecondDataset/{}.csv".format(dir, "intensities5"), delimiter="|") """

print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)
# print(df5_.shape)

(6245, 102)
(6245, 72)
(6245, 97)
(6245, 137)
time: 197 ms (started: 2023-05-26 10:59:35 -05:00)


In [15]:
# concat
# df_join_raw = pd.concat([df1.iloc[:,1:], df2.iloc[:, 2:], df3.iloc[:, 2:], df4.iloc[:, 2:], df5.iloc[:, 2:]], axis=1)
df_join_raw = pd.concat([df1.iloc[:, 1:], df2.iloc[:, 2:], df3.iloc[:, 2:], df4.iloc[:, 2:]], axis=1)
df_join_raw.set_index("ionMz", inplace=True)

print(df_join_raw.shape)
df_join_raw

(6245, 400)


Unnamed: 0_level_0,0001 / zwf1^ 3.4,0002 / zwf1^ 3.4,0003 / zwf1^ 3.4,0004 / zwf1^ 3.4,0005 / zwf1^ 3.4,0006 / zwf1^ 3.4,0007 / zwf1^ 3.4,0008 / zwf1^ 3.4,0009 / zwf1^ 3.4,0010 / zwf1^ 3.4,...,0951 / WT 3.4,0952 / WT 3.4,0953 / WT 3.4,0954 / WT 3.4,0955 / WT 3.4,0956 / WT 3.4,0957 / WT 3.4,0958 / WT 3.4,0959 / WT 3.4,0960 / WT 3.4
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,1,47,508,360,675,232,59,345,346,504,...,45,817,32,307,410,716,228,187,361,952
59.0137,53182,57902,51861,62812,54291,54395,59054,52547,57544,63538,...,45747,39677,47205,38697,54320,62610,47283,49927,43632,48511
59.0291,2151,2102,2334,2864,2393,2246,2331,1954,2310,2528,...,1852,1717,1830,1660,2006,2539,2103,2276,1643,2522
59.0370,83,174,366,1134,693,464,235,479,443,691,...,225,349,185,429,186,278,364,368,115,887
59.0453,1,1,51,642,493,143,11,227,160,154,...,60,293,1,336,65,142,1,182,10,684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,2470,2435,2465,2727,2773,2157,2168,2623,2256,2636,...,1866,1834,1669,674,2533,1017,2983,1814,2160,1899
996.7096,3366,3241,2909,3469,3538,3010,3031,2872,2709,3601,...,1748,1419,1160,621,1758,1474,1973,1023,1769,1629
997.5542,2818,2335,1807,1679,2222,2070,2158,1835,2087,2784,...,1168,828,1162,1103,2253,1419,2306,1542,827,1433
997.7131,1901,1469,1572,1916,2180,2555,2101,1656,2543,2643,...,659,1370,714,390,1660,1513,1843,1448,691,1376


time: 31.1 ms (started: 2023-05-26 10:59:37 -05:00)


In [16]:
# Get group: the column labels are matched with a regular expression, so the
# two group names that end in "^" are mapped to the same name without it;
# any other group name is used as the pattern unchanged.
pattern_by_group = {"zwf1^": "zwf1", "pck1^": "pck1"}
r = pattern_by_group.get(group[0], group[0])

df_raw_group = df_join_raw.filter(regex=r, axis=1)
df_raw_group

Unnamed: 0_level_0,0071 / WT 2.2,0072 / WT 2.2,0073 / WT 2.2,0074 / WT 2.2,0075 / WT 2.2,0076 / WT 2.2,0077 / WT 2.2,0078 / WT 2.2,0079 / WT 2.2,0080 / WT 2.2,...,0951 / WT 3.4,0952 / WT 3.4,0953 / WT 3.4,0954 / WT 3.4,0955 / WT 3.4,0956 / WT 3.4,0957 / WT 3.4,0958 / WT 3.4,0959 / WT 3.4,0960 / WT 3.4
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,528,1,378,438,530,250,662,431,305,334,...,45,817,32,307,410,716,228,187,361,952
59.0137,56523,24625,78586,76559,67449,74919,88152,99718,86001,93962,...,45747,39677,47205,38697,54320,62610,47283,49927,43632,48511
59.0291,2290,927,3149,3038,2774,3419,3911,3820,3374,3746,...,1852,1717,1830,1660,2006,2539,2103,2276,1643,2522
59.0370,320,195,1137,642,588,799,602,606,768,372,...,225,349,185,429,186,278,364,368,115,887
59.0453,196,59,555,405,234,413,46,148,95,25,...,60,293,1,336,65,142,1,182,10,684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,2050,1415,3851,2285,2038,3089,2737,2788,1819,2600,...,1866,1834,1669,674,2533,1017,2983,1814,2160,1899
996.7096,2681,1984,3570,2169,2479,2668,3033,3271,2264,2411,...,1748,1419,1160,621,1758,1474,1973,1023,1769,1629
997.5542,1516,1307,2299,2363,2877,1436,1771,2326,1629,2740,...,1168,828,1162,1103,2253,1419,2306,1542,827,1433
997.7131,1763,811,2219,1497,1396,1746,1518,1750,2290,1649,...,659,1370,714,390,1660,1513,1843,1448,691,1376


time: 31 ms (started: 2023-05-26 10:59:40 -05:00)


In [17]:
# Logarithm
#
# Base-10 logarithm of every intensity. Entries that are not strictly
# positive have no logarithm and are set to 0.0.
#
# Passing `where=` to a ufunc without `out=` leaves the masked positions
# uninitialized (arbitrary memory), so a zero-filled float buffer is supplied
# explicitly. The whole table is transformed in a single vectorized call
# instead of one column at a time.
raw_values = df_raw_group.to_numpy(dtype=float)
log_values = np.log10(raw_values, out=np.zeros_like(raw_values), where=raw_values > 0)

df_raw_log = pd.DataFrame(log_values, index=df_raw_group.index, columns=df_raw_group.columns)
df_raw_log

Unnamed: 0_level_0,0071 / WT 2.2,0072 / WT 2.2,0073 / WT 2.2,0074 / WT 2.2,0075 / WT 2.2,0076 / WT 2.2,0077 / WT 2.2,0078 / WT 2.2,0079 / WT 2.2,0080 / WT 2.2,...,0951 / WT 3.4,0952 / WT 3.4,0953 / WT 3.4,0954 / WT 3.4,0955 / WT 3.4,0956 / WT 3.4,0957 / WT 3.4,0958 / WT 3.4,0959 / WT 3.4,0960 / WT 3.4
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,2.722634,0.000000,2.577492,2.641474,2.724276,2.397940,2.820858,2.634477,2.484300,2.523746,...,1.653213,2.912222,1.505150,2.487138,2.612784e+00,2.854913e+00,2.357935,2.271842,2.557507,2.978637
59.0137,4.752225,4.391376,4.895345,4.883996,4.828976,4.874592,4.945232,4.998774,4.934504,4.972952,...,4.660363,4.598539,4.673988,4.587677,4.734960e+00,4.796644e+00,4.674705,4.698335,4.639805,4.685840
59.0291,3.359835,2.967080,3.498173,3.482588,3.443106,3.533899,3.592288,3.582063,3.528145,3.573568,...,3.267641,3.234770,3.262451,3.220108,3.302331e+00,3.404663e+00,3.322839,3.357172,3.215638,3.401745
59.0370,2.505150,2.290035,3.055760,2.807535,2.769377,2.902547,2.779596,2.782473,2.885361,2.570543,...,2.352183,2.542825,2.267172,2.632457,2.269513e+00,2.444045e+00,2.561101,2.565848,2.060698,2.947924
59.0453,2.292256,1.770852,2.744293,2.607455,2.369216,2.615950,1.662758,2.170262,1.977724,1.397940,...,1.778151,2.466868,0.000000,2.526339,1.812913e+00,2.152288e+00,0.000000,2.260071,1.000000,2.835056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,3.311754,3.150756,3.585574,3.358886,3.309204,3.489818,3.437275,3.445293,3.259833,3.414973,...,3.270912,3.263399,3.222456,2.828660,3.403635e+00,3.007321e+00,3.474653,3.258637,3.334454,3.278525
996.7096,3.428297,3.297542,3.552668,3.336260,3.394277,3.426186,3.481872,3.514681,3.354876,3.382197,...,3.242541,3.151982,3.064458,2.793092,3.245019e+00,3.168497e+00,3.295127,3.009876,3.247728,3.211921
997.5542,3.180699,3.116276,3.361539,3.373464,3.458940,3.157154,3.248219,3.366610,3.211921,3.437751,...,3.067443,2.918030,3.065206,3.042576,3.352761e+00,3.151982e+00,3.362859,3.188084,2.917506,3.156246
997.7131,3.246252,2.909021,3.346157,3.175222,3.144885,3.242044,3.181272,3.243038,3.359835,3.217221,...,2.818885,3.136721,2.853698,2.591065,3.220108e+00,3.179839e+00,3.265525,3.160769,2.839478,3.138618


time: 1.66 s (started: 2023-05-26 10:59:44 -05:00)


In [18]:
# Extract the subgroup id from every column label: the text that follows
# "<group> " up to the first ".", de-duplicated and sorted by np.unique.
label_prefix = "{} ".format(group[0])
subgroups = np.unique(
    [label.split(label_prefix)[1].split(".")[0] for label in df_raw_log.columns]
)
subgroups


array(['1', '2', '3', '4', '5'], dtype='<U1')

time: 4.68 ms (started: 2023-05-26 10:59:51 -05:00)


In [19]:
# Split graph

list_raw = []

for item in subgroups:
  list_raw.append(df_raw_log.filter(like="{} {}.".format(group[0], item)))

print(len(list_raw))
list_raw[0]

5


Unnamed: 0_level_0,0091 / WT 1.3,0092 / WT 1.3,0093 / WT 1.3,0094 / WT 1.3,0095 / WT 1.3,0096 / WT 1.3,0097 / WT 1.3,0098 / WT 1.3,0099 / WT 1.3,0100 / WT 1.3,...,0821 / WT 1.4,0822 / WT 1.4,0823 / WT 1.4,0824 / WT 1.4,0825 / WT 1.4,0826 / WT 1.4,0827 / WT 1.4,0828 / WT 1.4,0829 / WT 1.4,0830 / WT 1.4
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0049,2.515874,2.453318,2.832509,2.687529,2.515874,1.973128,2.873902,2.670246,2.795185,2.675778,...,2.686636,2.658011,2.689309e+00,2.902003e+00,2.872739,2.378398,2.842609,2.836957,3.044148,2.672098
59.0137,5.101637,5.073656,5.191384,5.053144,5.127630,5.014327,5.236834,5.009944,5.198135,5.278424,...,4.904088,4.937468,4.943069e+00,4.867391e+00,4.962649,4.961786,4.989499,4.914708,4.984023,4.986274
59.0291,3.683497,3.647872,3.772762,3.684486,3.743196,3.626853,3.847819,3.593618,3.759290,3.860038,...,3.530072,3.522053,3.543323e+00,3.539076e+00,3.593618,3.594614,3.639088,3.568671,3.661623,3.615529
59.0370,2.841985,2.990783,3.060320,3.089198,3.143951,2.880242,3.228657,2.829947,2.930949,3.070776,...,2.990339,3.022428,2.748963e+00,2.927883e+00,2.843233,2.973128,2.937518,3.072985,2.995196,2.896526
59.0453,0.000000,2.475671,2.632457,2.597695,2.862131,2.468347,2.568202,2.553883,2.403121,2.113943,...,2.499687,2.424882,0.000000e+00,2.588832e+00,2.459392,2.665581,2.596597,2.874482,2.693727,2.397940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996.5509,3.431203,3.400538,3.375481,3.477266,3.579669,3.106191,3.448397,3.398461,3.493179,3.498724,...,3.274620,3.240549,3.348500e+00,2.993436e+00,3.167022,3.196729,3.121560,3.302980,3.147985,3.387034
996.7096,2.877371,3.203305,3.030600,3.000000,3.068557,3.232996,2.970812,3.075547,3.157457,3.283075,...,2.978181,3.010724,3.106191e+00,2.926342e+00,2.813581,2.968016,2.629410,2.843855,3.015360,3.181272
997.5542,3.253822,3.159868,3.400538,3.302547,3.332842,3.306639,3.492201,3.340841,3.473633,3.412461,...,2.882525,3.237795,2.848189e+00,2.903090e+00,3.129045,2.842609,3.232742,3.332438,3.104828,2.862728
997.7131,3.077731,2.881955,2.924796,2.472756,2.747412,2.990783,3.084576,2.878522,3.174641,3.011570,...,2.600973,2.875061,0.000000e+00,2.621176e+00,2.767898,2.580925,2.900913,3.306639,2.775246,2.146128


time: 39.7 ms (started: 2023-05-26 10:59:54 -05:00)


In [20]:
# Show the column labels of every subgroup frame. Iterating over the list
# (instead of indexing positions 0..4) works for any number of subgroups.
for subgroup_frame in list_raw:
    print(subgroup_frame.columns)


Index(['0091 / WT 1.3', '0092 / WT 1.3', '0093 / WT 1.3', '0094 / WT 1.3',
       '0095 / WT 1.3', '0096 / WT 1.3', '0097 / WT 1.3', '0098 / WT 1.3',
       '0099 / WT 1.3', '0100 / WT 1.3', '0751 / WT 1.1', '0752 / WT 1.1',
       '0753 / WT 1.1', '0754 / WT 1.1', '0755 / WT 1.1', '0756 / WT 1.1',
       '0757 / WT 1.1', '0758 / WT 1.1', '0759 / WT 1.1', '0760 / WT 1.1',
       '0811 / WT 1.2', '0812 / WT 1.2', '0813 / WT 1.2', '0814 / WT 1.2',
       '0815 / WT 1.2', '0816 / WT 1.2', '0817 / WT 1.2', '0818 / WT 1.2',
       '0819 / WT 1.2', '0820 / WT 1.2', '0821 / WT 1.4', '0822 / WT 1.4',
       '0823 / WT 1.4', '0824 / WT 1.4', '0825 / WT 1.4', '0826 / WT 1.4',
       '0827 / WT 1.4', '0828 / WT 1.4', '0829 / WT 1.4', '0830 / WT 1.4'],
      dtype='object')
Index(['0071 / WT 2.2', '0072 / WT 2.2', '0073 / WT 2.2', '0074 / WT 2.2',
       '0075 / WT 2.2', '0076 / WT 2.2', '0077 / WT 2.2', '0078 / WT 2.2',
       '0079 / WT 2.2', '0080 / WT 2.2', '0271 / WT 2.3', '0272 / WT 2.3',
  

---

In [21]:
# Relabel every column of the k-th frame with the k-th capital letter
# ("A" for the first frame, "B" for the second, ...).
# NOTE: list(...) is a shallow copy, exactly like list.copy(): both lists hold
# the very same DataFrame objects, so the relabeling is also visible through
# list_raw.
list_raw_copy = list(list_raw)

for position, frame in enumerate(list_raw_copy):
    frame.columns = [chr(ord("A") + position)] * frame.shape[1]

time: 1.76 ms (started: 2023-05-26 11:00:00 -05:00)


In [34]:
# Show the number of columns and the column labels of every relabeled frame.
# Iterating over the list (instead of indexing positions 0..4) works for any
# number of subgroups.
for relabeled_frame in list_raw_copy:
    print(len(relabeled_frame.columns), relabeled_frame.columns)

40 Index(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'],
      dtype='object')
40 Index(['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'],
      dtype='object')
40 Index(['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
      dtype='object')
40 Index(['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
      dtype='object')
40 Index(['E

In [None]:
# Show the row index of every relabeled frame. Iterating over the list
# (instead of indexing positions 0..4) works for any number of subgroups.
for relabeled_frame in list_raw_copy:
    print(relabeled_frame.index)

In [67]:
# Filter by graph and concat 
nodes = list(G.nodes())
df_raw_filter = list_raw[0].loc[nodes, :]

for k in range(1, len(subgroups)):
    df_temp = list_raw[k].loc[nodes, :]
    # df_raw_filter = df_raw_filter.join(df_temp)
    df_raw_filter = pd.concat([df_raw_filter, df_temp], axis=1)

df_raw_filter.to_csv("{}/output_greedy/matrix/greedy_{}_matrix_copy.csv".format(dir, group[0]), index=True)
df_raw_filter

Unnamed: 0_level_0,A,A,A,A,A,A,A,A,A,A,...,E,E,E,E,E,E,E,E,E,E
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0137,5.101637,5.073656,5.191384,5.053144,5.127630,5.014327,5.236834,5.009944,5.198135,5.278424,...,4.675879,4.755280,4.687681,4.671071,4.633378,4.749458,4.704485,4.698370,4.683947,4.712448
59.0291,3.683497,3.647872,3.772762,3.684486,3.743196,3.626853,3.847819,3.593618,3.759290,3.860038,...,3.373831,3.409764,3.314710,3.294687,3.278754,3.404149,3.325516,3.282622,3.417804,3.319314
349.2758,5.257571,5.206707,5.276522,5.255923,5.214160,5.247069,5.299169,5.262506,5.377057,5.338753,...,4.657486,4.689522,4.707161,4.748095,4.676099,4.724030,4.739667,4.717254,4.714238,4.716429
350.2793,4.554683,4.540742,4.586981,4.581403,4.579544,4.554307,4.618613,4.589425,4.682118,4.677434,...,4.039969,4.066214,4.061867,4.025961,3.994625,4.081167,4.060131,4.048985,4.002986,4.008387
350.3033,3.862847,3.763727,3.853272,3.801129,3.848128,3.760799,3.868879,3.779957,3.918816,3.860098,...,3.320562,3.282849,3.381837,3.264109,3.362294,3.343802,3.442950,3.299943,3.292034,3.311966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980.0986,2.822822,2.903633,2.933487,3.033424,2.773786,2.898176,3.129690,2.778874,2.737987,2.861534,...,3.125156,2.392697,2.866878,3.100026,2.763428,2.744293,2.962369,2.707570,3.104146,3.174641
985.5091,3.099335,3.335658,3.382197,3.236789,3.277151,3.104146,3.169674,3.337260,3.256718,3.251395,...,3.197005,3.309204,3.135451,3.409933,3.204391,3.344392,3.358506,3.187803,3.273927,3.288026
985.5490,3.133858,3.294687,3.276692,3.133219,3.239299,3.212720,2.837588,3.193681,3.090611,3.307068,...,3.436957,3.440437,3.449015,3.569608,3.458336,3.533391,3.477555,3.477121,3.437592,3.498724
986.5138,3.063709,3.286905,3.429591,3.259594,3.377852,3.336460,3.475526,3.155943,3.407731,3.254548,...,3.251395,3.308137,3.205204,3.123525,3.321184,3.378216,3.298635,3.347525,3.198932,3.376029


time: 986 ms (started: 2023-05-26 11:40:28 -05:00)


In [86]:
# Move the index into a regular column and renumber the rows from 0.
df_raw_filter_1 = df_raw_filter.reset_index()
df_raw_filter_1

Unnamed: 0,ionMz,A,A.1,A.2,A.3,A.4,A.5,A.6,A.7,A.8,...,E,E.1,E.2,E.3,E.4,E.5,E.6,E.7,E.8,E.9
0,59.0137,5.101637,5.073656,5.191384,5.053144,5.127630,5.014327,5.236834,5.009944,5.198135,...,4.675879,4.755280,4.687681,4.671071,4.633378,4.749458,4.704485,4.698370,4.683947,4.712448
1,59.0291,3.683497,3.647872,3.772762,3.684486,3.743196,3.626853,3.847819,3.593618,3.759290,...,3.373831,3.409764,3.314710,3.294687,3.278754,3.404149,3.325516,3.282622,3.417804,3.319314
2,349.2758,5.257571,5.206707,5.276522,5.255923,5.214160,5.247069,5.299169,5.262506,5.377057,...,4.657486,4.689522,4.707161,4.748095,4.676099,4.724030,4.739667,4.717254,4.714238,4.716429
3,350.2793,4.554683,4.540742,4.586981,4.581403,4.579544,4.554307,4.618613,4.589425,4.682118,...,4.039969,4.066214,4.061867,4.025961,3.994625,4.081167,4.060131,4.048985,4.002986,4.008387
4,350.3033,3.862847,3.763727,3.853272,3.801129,3.848128,3.760799,3.868879,3.779957,3.918816,...,3.320562,3.282849,3.381837,3.264109,3.362294,3.343802,3.442950,3.299943,3.292034,3.311966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4176,980.0986,2.822822,2.903633,2.933487,3.033424,2.773786,2.898176,3.129690,2.778874,2.737987,...,3.125156,2.392697,2.866878,3.100026,2.763428,2.744293,2.962369,2.707570,3.104146,3.174641
4177,985.5091,3.099335,3.335658,3.382197,3.236789,3.277151,3.104146,3.169674,3.337260,3.256718,...,3.197005,3.309204,3.135451,3.409933,3.204391,3.344392,3.358506,3.187803,3.273927,3.288026
4178,985.5490,3.133858,3.294687,3.276692,3.133219,3.239299,3.212720,2.837588,3.193681,3.090611,...,3.436957,3.440437,3.449015,3.569608,3.458336,3.533391,3.477555,3.477121,3.437592,3.498724
4179,986.5138,3.063709,3.286905,3.429591,3.259594,3.377852,3.336460,3.475526,3.155943,3.407731,...,3.251395,3.308137,3.205204,3.123525,3.321184,3.378216,3.298635,3.347525,3.198932,3.376029


time: 37.2 ms (started: 2023-05-26 11:47:30 -05:00)


In [89]:
# Keep the "ionMz" column as a separate Series and display its values as an array.
cols = df_raw_filter_1["ionMz"]
cols.values

array([ 59.0137,  59.0291, 349.2758, ..., 985.549 , 986.5138, 986.55  ])

time: 4.31 ms (started: 2023-05-26 11:47:53 -05:00)


In [90]:
# Remove the "ionMz" column so that only the intensity columns remain
df_raw_filter_1 = df_raw_filter_1.drop(columns="ionMz")
df_raw_filter_1

Unnamed: 0,A,A.1,A.2,A.3,A.4,A.5,A.6,A.7,A.8,A.9,...,E,E.1,E.2,E.3,E.4,E.5,E.6,E.7,E.8,E.9
0,5.101637,5.073656,5.191384,5.053144,5.127630,5.014327,5.236834,5.009944,5.198135,5.278424,...,4.675879,4.755280,4.687681,4.671071,4.633378,4.749458,4.704485,4.698370,4.683947,4.712448
1,3.683497,3.647872,3.772762,3.684486,3.743196,3.626853,3.847819,3.593618,3.759290,3.860038,...,3.373831,3.409764,3.314710,3.294687,3.278754,3.404149,3.325516,3.282622,3.417804,3.319314
2,5.257571,5.206707,5.276522,5.255923,5.214160,5.247069,5.299169,5.262506,5.377057,5.338753,...,4.657486,4.689522,4.707161,4.748095,4.676099,4.724030,4.739667,4.717254,4.714238,4.716429
3,4.554683,4.540742,4.586981,4.581403,4.579544,4.554307,4.618613,4.589425,4.682118,4.677434,...,4.039969,4.066214,4.061867,4.025961,3.994625,4.081167,4.060131,4.048985,4.002986,4.008387
4,3.862847,3.763727,3.853272,3.801129,3.848128,3.760799,3.868879,3.779957,3.918816,3.860098,...,3.320562,3.282849,3.381837,3.264109,3.362294,3.343802,3.442950,3.299943,3.292034,3.311966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4176,2.822822,2.903633,2.933487,3.033424,2.773786,2.898176,3.129690,2.778874,2.737987,2.861534,...,3.125156,2.392697,2.866878,3.100026,2.763428,2.744293,2.962369,2.707570,3.104146,3.174641
4177,3.099335,3.335658,3.382197,3.236789,3.277151,3.104146,3.169674,3.337260,3.256718,3.251395,...,3.197005,3.309204,3.135451,3.409933,3.204391,3.344392,3.358506,3.187803,3.273927,3.288026
4178,3.133858,3.294687,3.276692,3.133219,3.239299,3.212720,2.837588,3.193681,3.090611,3.307068,...,3.436957,3.440437,3.449015,3.569608,3.458336,3.533391,3.477555,3.477121,3.437592,3.498724
4179,3.063709,3.286905,3.429591,3.259594,3.377852,3.336460,3.475526,3.155943,3.407731,3.254548,...,3.251395,3.308137,3.205204,3.123525,3.321184,3.378216,3.298635,3.347525,3.198932,3.376029


time: 23.4 ms (started: 2023-05-26 11:47:59 -05:00)


In [91]:
# Transpose so the former columns become rows, then label the new columns
# with the values stored in `cols`.
df_raw_filter_2 = df_raw_filter_1.transpose()
df_raw_filter_2.columns = cols.values
df_raw_filter_2

Unnamed: 0,59.0137,59.0291,349.2758,350.2793,350.3033,389.2739,389.2907,389.3284,390.2943,391.2964,...,978.6174,978.6600,979.6191,979.6627,980.0763,980.0986,985.5091,985.5490,986.5138,986.5500
A,5.101637,3.683497,5.257571,4.554683,3.862847,4.276737,5.469997,3.975064,4.831146,4.052848,...,3.148911,2.966611,2.964731,2.650308,3.167317,2.822822,3.099335,3.133858,3.063709,2.854913
A,5.073656,3.647872,5.206707,4.540742,3.763727,4.115244,5.360688,3.877314,4.718966,3.996074,...,3.259355,2.992995,2.908485,2.103804,3.285332,2.903633,3.335658,3.294687,3.286905,3.212454
A,5.191384,3.772762,5.276522,4.586981,3.853272,4.295721,5.488060,4.008770,4.831767,4.143546,...,3.214314,2.923762,3.143015,2.835056,3.348889,2.933487,3.382197,3.276692,3.429591,3.314710
A,5.053144,3.684486,5.255923,4.581403,3.801129,4.188169,5.340147,3.898122,4.715761,3.976029,...,2.919078,2.542825,2.988113,2.776701,3.437433,3.033424,3.236789,3.133219,3.259594,3.233504
A,5.127630,3.743196,5.214160,4.579544,3.848128,4.262000,5.465279,3.979138,4.852480,4.042851,...,3.410609,3.079543,3.019116,2.769377,3.302547,2.773786,3.277151,3.239299,3.377852,3.131939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E,4.749458,3.404149,4.724030,4.081167,3.343802,3.809290,4.436512,3.122544,3.975340,3.238046,...,3.224015,2.721811,3.024896,3.125156,3.279895,2.744293,3.344392,3.533391,3.378216,3.382377
E,4.704485,3.325516,4.739667,4.060131,3.442950,3.751664,4.504321,3.175802,3.858597,3.229682,...,3.230193,2.984977,3.085291,2.662758,3.242044,2.962369,3.358506,3.477555,3.298635,3.248709
E,4.698370,3.282622,4.717254,4.048985,3.299943,3.803116,4.394504,2.906874,3.868056,2.982271,...,3.180413,3.046885,3.209783,2.808211,3.121231,2.707570,3.187803,3.477121,3.347525,3.408410
E,4.683947,3.417804,4.714238,4.002986,3.292034,3.762453,4.417139,3.278754,3.797060,3.333246,...,3.095169,2.876218,2.989005,2.875640,3.372175,3.104146,3.273927,3.437592,3.198932,3.242541


time: 17.6 ms (started: 2023-05-26 11:48:07 -05:00)


In [93]:
# Turn the row labels into a regular "index" column so they can serve as
# id_vars when melting.
df_raw_filter_3 = df_raw_filter_2.reset_index(drop=False)
df_raw_filter_3

Unnamed: 0,index,59.0137,59.0291,349.2758,350.2793,350.3033,389.2739,389.2907,389.3284,390.2943,...,978.6174,978.66,979.6191,979.6627,980.0763,980.0986,985.5091,985.549,986.5138,986.55
0,A,5.101637,3.683497,5.257571,4.554683,3.862847,4.276737,5.469997,3.975064,4.831146,...,3.148911,2.966611,2.964731,2.650308,3.167317,2.822822,3.099335,3.133858,3.063709,2.854913
1,A,5.073656,3.647872,5.206707,4.540742,3.763727,4.115244,5.360688,3.877314,4.718966,...,3.259355,2.992995,2.908485,2.103804,3.285332,2.903633,3.335658,3.294687,3.286905,3.212454
2,A,5.191384,3.772762,5.276522,4.586981,3.853272,4.295721,5.488060,4.008770,4.831767,...,3.214314,2.923762,3.143015,2.835056,3.348889,2.933487,3.382197,3.276692,3.429591,3.314710
3,A,5.053144,3.684486,5.255923,4.581403,3.801129,4.188169,5.340147,3.898122,4.715761,...,2.919078,2.542825,2.988113,2.776701,3.437433,3.033424,3.236789,3.133219,3.259594,3.233504
4,A,5.127630,3.743196,5.214160,4.579544,3.848128,4.262000,5.465279,3.979138,4.852480,...,3.410609,3.079543,3.019116,2.769377,3.302547,2.773786,3.277151,3.239299,3.377852,3.131939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,E,4.749458,3.404149,4.724030,4.081167,3.343802,3.809290,4.436512,3.122544,3.975340,...,3.224015,2.721811,3.024896,3.125156,3.279895,2.744293,3.344392,3.533391,3.378216,3.382377
196,E,4.704485,3.325516,4.739667,4.060131,3.442950,3.751664,4.504321,3.175802,3.858597,...,3.230193,2.984977,3.085291,2.662758,3.242044,2.962369,3.358506,3.477555,3.298635,3.248709
197,E,4.698370,3.282622,4.717254,4.048985,3.299943,3.803116,4.394504,2.906874,3.868056,...,3.180413,3.046885,3.209783,2.808211,3.121231,2.707570,3.187803,3.477121,3.347525,3.408410
198,E,4.683947,3.417804,4.714238,4.002986,3.292034,3.762453,4.417139,3.278754,3.797060,...,3.095169,2.876218,2.989005,2.875640,3.372175,3.104146,3.273927,3.437592,3.198932,3.242541


time: 27.1 ms (started: 2023-05-26 11:49:15 -05:00)


In [94]:
# Reshape to long format (columns: index, variable, value), the layout
# expected by the statsmodels formula interface.
df_melt = pd.melt(
    frame=df_raw_filter_3,
    value_vars=cols.values,
    id_vars=["index"],
)
df_melt

Unnamed: 0,index,variable,value
0,A,59.0137,5.101637
1,A,59.0137,5.073656
2,A,59.0137,5.191384
3,A,59.0137,5.053144
4,A,59.0137,5.127630
...,...,...,...
836195,E,986.55,3.382377
836196,E,986.55,3.248709
836197,E,986.55,3.408410
836198,E,986.55,3.242541


time: 173 ms (started: 2023-05-26 11:49:31 -05:00)


In [None]:
# Inspect all observations of a single ion. After the melt the m/z values
# live in the "variable" column (df_melt has no "ionMz" column).
df_melt[df_melt["variable"] == 59.0137]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of `value` per level of `variable`, with the individual
# observations overlaid as a swarm on the same axes.
# NOTE(review): in the df_melt displayed in this notebook, `variable` holds the
# m/z values and `index` holds the group letter (A..E) -- confirm that
# x='variable' is the intended grouping before running on the full frame.
fig, ax = plt.subplots()
sns.boxplot(data=df_melt, x='variable', y='value', color='#99c2a2', ax=ax)
sns.swarmplot(data=df_melt, x="variable", y="value", color='#7d0013', ax=ax)
plt.show()

In [52]:
import scipy.stats as stats

# One-way ANOVA across groups A..E; f_oneway takes one sample per group and
# returns the F statistic(s) and the matching p-value(s).
group_labels = ["A", "B", "C", "D", "E"]
fvalue, pvalue = stats.f_oneway(*(df_raw_filter[label] for label in group_labels))

# Report the number of results, their mean, and the raw values.
print("fvalue:", len(fvalue), np.mean(fvalue))
print(fvalue, end="\n\n")
print("pvalue: ", len(pvalue), np.mean(pvalue))
print(pvalue)

fvalue: 40 159.02827798213866
[311.01392748 642.24672032 289.97358441 133.00195789 310.47623493
 128.76703054 302.65506905 346.08714279 336.45500928 322.80442417
  31.09600016  14.85140565 113.46167456  50.03191232  48.52486677
  38.1387538   33.47377893  43.99848425  16.90235029  41.61423983
 118.97991485 150.87709455 119.24995834  62.97234618 253.12347023
 131.76013297 114.07960825 175.20992717 106.85554439 224.3871796
  67.47579155 133.2303013  185.08471064 378.32239817  62.61530576
 112.16177537 107.56917898 107.01009241  97.87797235  96.71384882]

pvalue:  40 1.0247594014437299e-13
[2.29482609e-260 0.00000000e+000 4.10760810e-243 2.17496693e-112
 6.32206414e-260 8.17177342e-109 1.61173367e-253 5.52478193e-289
 3.83925204e-281 5.25398730e-270 7.38819602e-026 4.02269346e-012
 7.10772295e-096 5.62437419e-042 1.08099056e-040 7.55702993e-032
 7.03435679e-028 7.74792651e-037 7.63441413e-014 8.30932552e-035
 1.52415313e-100 1.88381635e-127 9.00862194e-101 5.34997050e-053
 9.63858947e-213

In [53]:
# ANOVA table as R like output
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Ordinary Least Squares (OLS) model
model = ols('value ~ C(variable)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(variable),1252.278967,4.0,1129.70595,0.0
Residual,231730.52481,836195.0,,


time: 1.66 s (started: 2023-05-26 11:21:43 -05:00)


In [54]:
# ANOVA table using bioinfokit v1.0.3 or later (it uses wrapper script for anova_lm)
from bioinfokit.analys import stat
from bioinfokit.analys import stat
res = stat()
res.anova_stat(df=df_melt, res_var='value', anova_model='value ~ C(variable)')
res.anova_summary


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(variable),4.0,1252.278967,313.069742,1129.70595,0.0
Residual,836195.0,231730.52481,0.277125,,


time: 1.64 s (started: 2023-05-26 11:21:53 -05:00)


---

In [188]:
# Filter by graph and concat:
# keep only the rows of each subgroup's raw matrix that correspond to nodes of
# G, then join all subgroups column-wise in a single concat (instead of growing
# the frame inside a loop, which re-copies the accumulated data every pass).
nodes = list(G.nodes())
df_raw_filter = pd.concat(
    [list_raw[k].loc[nodes, :] for k in range(len(subgroups))],
    axis=1,
)

# Persist the filtered matrix, keeping the row labels as the first CSV column.
df_raw_filter.to_csv("{}/output_greedy/matrix/greedy_{}_matrix.csv".format(dir, group[0]), index=True)
df_raw_filter

Unnamed: 0_level_0,0091 / WT 1.3,0092 / WT 1.3,0093 / WT 1.3,0094 / WT 1.3,0095 / WT 1.3,0096 / WT 1.3,0097 / WT 1.3,0098 / WT 1.3,0099 / WT 1.3,0100 / WT 1.3,...,0921 / WT 5.4,0922 / WT 5.4,0923 / WT 5.4,0924 / WT 5.4,0925 / WT 5.4,0926 / WT 5.4,0927 / WT 5.4,0928 / WT 5.4,0929 / WT 5.4,0930 / WT 5.4
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0137,5.101637,5.073656,5.191384,5.053144,5.127630,5.014327,5.236834,5.009944,5.198135,5.278424,...,4.675879,4.755280,4.687681,4.671071,4.633378,4.749458,4.704485,4.698370,4.683947,4.712448
59.0291,3.683497,3.647872,3.772762,3.684486,3.743196,3.626853,3.847819,3.593618,3.759290,3.860038,...,3.373831,3.409764,3.314710,3.294687,3.278754,3.404149,3.325516,3.282622,3.417804,3.319314
59.0370,2.841985,2.990783,3.060320,3.089198,3.143951,2.880242,3.228657,2.829947,2.930949,3.070776,...,2.839478,2.905796,2.620136,2.814248,2.836957,2.610660,2.742725,2.378398,2.945961,2.668386
60.0171,3.652826,3.371253,3.545925,3.510947,3.423246,3.492062,3.619928,3.243782,3.660771,3.669131,...,3.229426,3.138303,3.114944,2.963788,3.313023,3.326745,2.910091,3.062206,2.964731,2.915927
61.9884,3.593618,3.454387,3.722140,3.238046,3.599446,3.419460,3.699404,3.370698,3.474944,3.662096,...,3.455758,3.521269,3.328991,3.484300,3.339253,3.580811,3.475671,3.423082,3.421933,3.424882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980.0986,2.822822,2.903633,2.933487,3.033424,2.773786,2.898176,3.129690,2.778874,2.737987,2.861534,...,3.125156,2.392697,2.866878,3.100026,2.763428,2.744293,2.962369,2.707570,3.104146,3.174641
980.1839,3.512951,2.645422,2.849419,2.536558,2.434569,2.522444,2.519828,2.694605,2.403121,2.709270,...,2.764923,0.000000,2.875640,2.756636,2.332438,1.838849,2.290035,2.746634,2.626340,3.093422
981.6836,2.699838,2.454845,1.414973,2.394452,2.488551,0.000000,2.671173,0.000000,1.770852,2.439333,...,2.862728,3.153815,3.028164,2.365488,2.763428,2.807535,2.741939,2.053078,2.727541,2.617000
984.3993,2.987666,3.032216,3.103804,2.982271,2.673942,2.763428,3.014940,2.937016,1.863323,2.921686,...,2.681241,2.980912,2.895423,2.716838,2.873321,3.083144,2.889302,2.608526,2.883661,2.723456


time: 15.9 s (started: 2023-05-26 09:43:20 -05:00)


In [167]:
# Reload the filtered matrix written by the previous step; the first CSV
# column is used as the row index.
matrix_path = "{}/output_greedy/matrix/greedy_{}_matrix.csv".format(dir, group[0])
df_raw_filter = pd.read_csv(matrix_path, index_col=0)
df_raw_filter

Unnamed: 0_level_0,0031 / pck1^ 1.2,0032 / pck1^ 1.2,0033 / pck1^ 1.2,0034 / pck1^ 1.2,0035 / pck1^ 1.2,0036 / pck1^ 1.2,0037 / pck1^ 1.2,0038 / pck1^ 1.2,0039 / pck1^ 1.2,0040 / pck1^ 1.2,...,0881 / pck1^ 2.3,0882 / pck1^ 2.3,0883 / pck1^ 2.3,0884 / pck1^ 2.3,0885 / pck1^ 2.3,0886 / pck1^ 2.3,0887 / pck1^ 2.3,0888 / pck1^ 2.3,0889 / pck1^ 2.3,0890 / pck1^ 2.3
ionMz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59.0137,4.882285,5.211590,5.134623,4.878763,4.807887,4.708574,4.812947,4.655590,4.749257,4.803935,...,4.777304,4.818622,4.815996,4.811005,4.722288,4.749118,4.783761,4.809573,4.793434,4.825095
59.0291,3.424065,3.835944,3.729570,3.467460,3.447778,3.352954,3.410777,3.342817,3.364926,3.462997,...,3.402777,3.493876,3.456821,3.461799,3.384174,3.394977,3.459091,3.482159,3.390228,3.471878
59.0370,2.532754,3.143951,3.039414,2.536558,2.663701,2.531479,2.485721,2.613842,2.603144,2.931966,...,2.717671,2.959518,2.997823,2.816241,2.869232,2.865104,2.814248,2.840733,2.843233,2.859739
60.0171,3.107549,3.555094,3.593397,3.109241,3.407391,3.006038,3.329601,3.375298,3.173769,3.086360,...,3.006466,3.222716,3.177536,3.302547,3.288473,3.209783,3.190892,3.277380,3.050380,3.281715
61.9884,3.344196,3.639785,3.691965,3.319522,3.355643,3.159868,3.203305,3.239299,3.284882,3.396199,...,3.271842,3.386856,3.168203,3.373464,3.467608,3.275772,3.458336,3.389698,3.418467,3.316390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980.0986,2.919078,2.835056,3.094820,2.885926,2.885926,2.922725,2.673021,3.549126,2.912753,2.445604,...,3.021189,3.033021,2.992111,2.943000,2.945469,2.924796,3.286905,3.215638,3.372728,3.103119
980.1839,2.833147,2.232996,2.879669,2.103804,2.495544,2.406540,0.000000,2.793092,2.315970,2.429752,...,2.881955,2.950851,2.498311,2.453318,2.818226,2.652246,3.264346,3.101403,3.273001,2.814913
981.6836,2.778151,2.469822,2.385606,2.865696,2.732394,2.584331,2.594393,2.544068,2.712650,2.296665,...,2.730782,1.995635,2.565848,2.401401,2.874482,0.000000,0.000000,2.519828,2.702431,2.378398
984.3993,3.093772,2.976808,3.049218,2.902547,2.710963,2.907949,3.098990,2.864511,3.026125,3.050380,...,2.350248,2.401401,2.457882,0.000000,3.128722,1.113943,2.636488,2.269513,2.816904,2.318063


time: 99.3 ms (started: 2023-05-25 15:54:28 -05:00)


In [168]:
# Feature selection: transpose so that each row is one column of the loaded
# matrix and each column is one ionMz feature.
df_raw_filter_t = df_raw_filter.transpose()
df_raw_filter_t

ionMz,59.0137,59.0291,59.0370,60.0171,61.9884,71.0139,74.0249,78.9592,85.0296,87.0086,...,979.6627,979.8201,979.8546,979.9144,980.0763,980.0986,980.1839,981.6836,984.3993,986.5500
0031 / pck1^ 1.2,4.882285,3.424065,2.532754,3.107549,3.344196,3.148294,3.688242,2.974512,3.234517,3.549739,...,2.783904,2.625312,2.809560,2.681241,3.263873,2.919078,2.833147,2.778151,3.093772,3.059942
0032 / pck1^ 1.2,5.211590,3.835944,3.143951,3.555094,3.639785,3.652440,4.195983,3.487563,3.606919,3.749968,...,2.574031,0.000000,2.240549,2.781755,3.401745,2.835056,2.232996,2.469822,2.976808,3.272074
0033 / pck1^ 1.2,5.134623,3.729570,3.039414,3.593397,3.691965,3.644931,4.142327,3.535041,3.615634,3.641573,...,2.247973,2.594393,2.617000,2.951338,3.397245,3.094820,2.879669,2.385606,3.049218,3.225568
0034 / pck1^ 1.2,4.878763,3.467460,2.536558,3.109241,3.319522,3.445137,3.801541,3.310268,3.402261,3.528145,...,2.164353,2.506505,1.698970,2.664642,3.270912,2.885926,2.103804,2.865696,2.902547,3.265525
0035 / pck1^ 1.2,4.807887,3.447778,2.663701,3.407391,3.355643,3.230193,3.294466,3.192010,2.834421,3.369030,...,2.712650,2.515874,2.152288,0.000000,3.216694,2.885926,2.495544,2.732394,2.710963,3.021603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0886 / pck1^ 2.3,4.749118,3.394977,2.865104,3.209783,3.275772,3.342028,4.014689,3.268812,3.334454,3.554247,...,2.642465,2.041393,1.838849,2.627366,3.300813,2.924796,2.652246,0.000000,1.113943,2.797268
0887 / pck1^ 2.3,4.783761,3.459091,2.814248,3.190892,3.458336,3.272538,4.073425,3.487845,3.451633,3.552060,...,2.431364,2.751279,2.996949,3.096215,3.439017,3.286905,3.264346,0.000000,2.636488,3.089905
0888 / pck1^ 2.3,4.809573,3.482159,2.840733,3.277380,3.389698,3.395326,4.162863,3.411620,3.512151,3.602603,...,2.344392,2.670246,2.767156,2.996074,3.372912,3.215638,3.101403,2.519828,2.269513,3.189209
0889 / pck1^ 2.3,4.793434,3.390228,2.843233,3.050380,3.418467,3.445760,4.039374,3.351796,3.401573,3.318689,...,2.848189,3.108903,3.110590,3.246499,3.545307,3.372728,3.273001,2.702431,2.816904,3.200303


time: 19.5 ms (started: 2023-05-25 15:54:29 -05:00)


In [169]:
from sklearn.feature_selection import VarianceThreshold

# Drop low-variance features (columns of df_raw_filter_t).
# The threshold is .98 * (1 - .98) ~= 0.0196.
X = df_raw_filter_t.values
print(X.shape)
sel = VarianceThreshold(threshold=(.98 * (1 - .98)))
X_ = sel.fit_transform(X)
print(X_.shape)

features_name = sel.get_feature_names_out()
# Positions of the retained columns come straight from the selector's support
# mask. Parsing the generated "x<i>" names only works while the selector is
# fitted on a bare array, so it is avoided here.
features_name_index = sel.get_support(indices=True).tolist()

# Map the kept positions back to the original column labels (graph nodes).
nodes = list(df_raw_filter_t.columns[features_name_index])
len(nodes)

(80, 5677)
(80, 3972)


3972

time: 17.9 ms (started: 2023-05-25 15:54:29 -05:00)


In [170]:
# Number of distinct node labels; compare with len(nodes) to spot duplicates.
np.unique(nodes).size

3972

time: 4.01 ms (started: 2023-05-25 15:54:29 -05:00)


#### Filter by feature selection

In [171]:
# Restrict G to the nodes kept by the feature-selection step and report the
# size of the induced subgraph.
H = nx.subgraph(G, nodes)
graph_detail(H)

Num. nodes: 3972
Num. edges: 426211

time: 414 ms (started: 2023-05-25 15:54:29 -05:00)


In [172]:
# Export the edges of H together with their "weight" attribute.
# H.edges(data="weight") yields (source, target, weight) triples in the same
# order as H.edges(), which avoids a per-edge get_edge_data lookup (and the
# original `default=0`, which would have failed on `0["weight"]` if ever used).
weighted_edges = list(H.edges(data="weight"))
edges = [(source, target) for source, target, _ in weighted_edges]

df_edge_embeddings_join_filter_count_weight_std_avg_fs = pd.DataFrame(
    weighted_edges, columns=["source", "target", "weight"]
)
df_edge_embeddings_join_filter_count_weight_std_avg_fs.to_csv("{}/output_greedy/edges_filter_weight_std_avg_fs/greedy_{}_edge-filter-weight-std-avg-fs.csv".format(dir, group[0]), index=False)
df_edge_embeddings_join_filter_count_weight_std_avg_fs

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.974889
1,59.0137,59.0370,0.740804
2,59.0137,60.0171,0.801043
3,59.0137,61.9884,0.684453
4,59.0137,71.0139,0.681361
...,...,...,...
426206,940.1264,940.2058,0.534917
426207,978.3602,978.3779,0.682080
426208,978.4185,978.6600,0.754314
426209,979.8201,979.8546,0.664992


time: 2.92 s (started: 2023-05-25 15:54:29 -05:00)


In [173]:
# Reload the feature-selected edge list (source, target, weight) from disk.
edges_fs_path = "{}/output_greedy/edges_filter_weight_std_avg_fs/greedy_{}_edge-filter-weight-std-avg-fs.csv".format(dir, group[0])
df_edge_embeddings_join_filter_count_weight_std_avg_fs = pd.read_csv(edges_fs_path)
df_edge_embeddings_join_filter_count_weight_std_avg_fs

Unnamed: 0,source,target,weight
0,59.0137,59.0291,0.974889
1,59.0137,59.0370,0.740804
2,59.0137,60.0171,0.801043
3,59.0137,61.9884,0.684453
4,59.0137,71.0139,0.681361
...,...,...,...
426206,940.1264,940.2058,0.534917
426207,978.3602,978.3779,0.682080
426208,978.4185,978.6600,0.754314
426209,979.8201,979.8546,0.664992


time: 104 ms (started: 2023-05-25 15:54:32 -05:00)


In [174]:
# Rebuild H from the saved edge list, carrying the "weight" column over as an
# edge attribute (as the initial graph loading does) so that weight-based
# lookups on H keep working after the round trip through CSV.
# Nodes that have no edges are not present in an edge list, so this graph can
# have fewer nodes than the subgraph it was exported from.
H = nx.from_pandas_edgelist(
    df_edge_embeddings_join_filter_count_weight_std_avg_fs,
    "source",
    "target",
    edge_attr="weight",
)
graph_detail(H)

Num. nodes: 3881
Num. edges: 426211

time: 414 ms (started: 2023-05-25 15:54:33 -05:00)
