### Imports

In [25]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.35 ms (started: 2023-06-13 17:57:11 -05:00)


### Parameters

In [26]:
import json
  
# Opening JSON file
file = open("parameters.json")
params = json.load(file)

raw_data_folder =  params["raw_folder"]
print("Raw folder:\t", raw_data_folder)

exp = params["exp"]
print("Exp:\t\t", exp)

methods = ["vgae", "vgae", "vgae"] # change
print("Methods:\t", methods)

groups_id = ["WT", "zwf1^", "pck1^"] # change
print("Groups id:\t", groups_id)

options = ["dyn", "dyn", "dyn"] # change
print("Options:\t", options)

Raw folder:	 Edwin_proyecto2
Exp:		 exp1
Methods:	 ['vgae', 'vgae', 'vgae']
Groups id:	 ['WT', 'zwf1^', 'pck1^']
Options:	 ['dyn', 'dyn', 'dyn']
time: 15.6 ms (started: 2023-06-13 17:57:11 -05:00)


### Load data

In [61]:
# Load metadata

df_metadata = pd.read_excel("temp/{}/{}.xlsx".format(raw_data_folder, "Edwin_Set2-processed"), header=0, sheet_name=1)
df_metadata.dropna(subset=["Corrected (m/z)"], inplace=True)
df_metadata = df_metadata.astype({"Corrected (m/z)": str})
df_metadata.set_index("Corrected (m/z)", inplace=True)
print(df_metadata.shape)
df_metadata.head()

(85, 7)


Unnamed: 0_level_0,Metabolites - Approved by Nicola,Metabolites - Used by Florian,Theoretical - m/z (based on BioCyc),Chemical Formula (based on BioCyc),Adduct (negative mode),Error in mDa (oTOF),Error in PPM (Theoretical vs Corrected)
Corrected (m/z),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
59.0137,Acetic acid,*,59.013568,C2H4O2,H,,
71.0139,Acrylic acid,*,71.013568,C3H4O2,H,,
74.0249,Glycine,*,74.024467,C2H5NO2,H,,
85.0296,Methyl Acrylate,*,85.029218,C4H6O2,H,,
87.0086,Pyruvic acid,*,87.008482,C3H4O3,H,,


time: 54.4 ms (started: 2023-06-13 18:24:59 -05:00)


In [28]:
# load dataset groups
df_join_raw = pd.read_csv("input/{}_raw.csv".format(exp), index_col=0)        
df_join_raw.index = df_join_raw.index.astype("str")
df_join_raw.head()

Unnamed: 0,zwf1^3.4,zwf1^3.4.1,zwf1^3.4.2,zwf1^3.4.3,zwf1^3.4.4,zwf1^3.4.5,zwf1^3.4.6,zwf1^3.4.7,zwf1^3.4.8,zwf1^3.4.9,...,WT3.4,WT3.4.1,WT3.4.2,WT3.4.3,WT3.4.4,WT3.4.5,WT3.4.6,WT3.4.7,WT3.4.8,WT3.4.9
59.0049,1,47,508,360,675,232,59,345,346,504,...,45,817,32,307,410,716,228,187,361,952
59.0137,53182,57902,51861,62812,54291,54395,59054,52547,57544,63538,...,45747,39677,47205,38697,54320,62610,47283,49927,43632,48511
59.0291,2151,2102,2334,2864,2393,2246,2331,1954,2310,2528,...,1852,1717,1830,1660,2006,2539,2103,2276,1643,2522
59.037,83,174,366,1134,693,464,235,479,443,691,...,225,349,185,429,186,278,364,368,115,887
59.0453,1,1,51,642,493,143,11,227,160,154,...,60,293,1,336,65,142,1,182,10,684


time: 162 ms (started: 2023-06-13 17:57:11 -05:00)


### BioCyc

In [29]:
# get filter graphs

dict_graphs = {}
for method in methods:
    for group in groups_id:
        for option in options:
            df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}_{}.csv".format(exp, method, group, option),
                                             dtype={"source": "string", "target": "string"})
            G = nx.from_pandas_edgelist(df_common_edges, edge_attr=["weight"])
            dict_graphs[group] = G

time: 35.3 s (started: 2023-06-13 17:57:11 -05:00)


In [30]:
# get nodes

dict_nodes = {}
for group in groups_id:
    dict_nodes[group] = set(list(dict_graphs[group].nodes()))

time: 1.25 ms (started: 2023-06-13 17:57:46 -05:00)


In [31]:
# set operation

dict_set_operation = {}
for group in groups_id:
    dict_nodes_aux = dict_nodes.copy()
    nodes_aux = dict_nodes_aux.pop(group)
    unique_nodes = nodes_aux - set.union(*list(dict_nodes_aux.values()))

    dict_set_operation[group] = unique_nodes

dict_set_operation["intersection"] = set.intersection(*list(dict_nodes.values()))

print(dict_set_operation.keys())

dict_keys(['WT', 'zwf1^', 'pck1^', 'intersection'])
time: 23.6 ms (started: 2023-06-13 17:57:46 -05:00)


In [32]:
# print set size
for key, value in dict_set_operation.items():
    print(key, len(value))

WT 18
zwf1^ 253
pck1^ 353
intersection 3370
time: 12.5 ms (started: 2023-06-13 17:57:47 -05:00)


In [33]:
# delete nodes without metabollities name
for group in dict_set_operation:
    inter = dict_set_operation[group] & set(list(df_metadata.index.values))
    dict_set_operation[group] = list(inter)

time: 11.2 ms (started: 2023-06-13 17:57:47 -05:00)


In [34]:
# print set size
for key, value in dict_set_operation.items():
    print(key, len(value))

WT 0
zwf1^ 3
pck1^ 0
intersection 58
time: 1.58 ms (started: 2023-06-13 17:57:50 -05:00)


In [62]:
df_metadata

Unnamed: 0_level_0,Metabolites - Approved by Nicola,Metabolites - Used by Florian,Theoretical - m/z (based on BioCyc),Chemical Formula (based on BioCyc),Adduct (negative mode),Error in mDa (oTOF),Error in PPM (Theoretical vs Corrected)
Corrected (m/z),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
59.0137,Acetic acid,*,59.013568,C2H4O2,H,,
71.0139,Acrylic acid,*,71.013568,C3H4O2,H,,
74.0249,Glycine,*,74.024467,C2H5NO2,H,,
85.0296,Methyl Acrylate,*,85.029218,C4H6O2,H,,
87.0086,Pyruvic acid,*,87.008482,C3H4O3,H,,
...,...,...,...,...,...,...,...
664.117,Nicotinamide adenine dinucleotide,*,664.117210,C21H29N7O14P2,H,,
665.2139,C33H36N6O7 + K,*,665.214297,C24H42O21,H,,
766.1076,Coenzyme A,*,766.107647,C21H36N7O16P3S,H,,
784.1495,FAD,*,784.149573,C27H33N9O15P2,H,,


time: 17.3 ms (started: 2023-06-13 18:25:37 -05:00)


In [68]:
# mapping metabolite name with ratio

common_nodes = dict_set_operation["intersection"]

df_biocyc = pd.DataFrame()
df_biocyc["m/z"] = common_nodes

for group in groups_id:
    df_aux = df_join_raw.filter(like=group)
    df_aux = df_aux.loc[common_nodes]

    df_biocyc["mean-{}".format(group)] = df_aux.mean(axis=1).values
    df_biocyc["log-{}".format(group)] = np.log10(df_aux.mean(axis=1).values)

df_biocyc["metabolities"] = df_metadata.loc[common_nodes]["Metabolites - Approved by Nicola"].values

df_biocyc

Unnamed: 0,m/z,mean-WT,log-WT,mean-zwf1^,log-zwf1^,mean-pck1^,log-pck1^,metabolities
0,117.0195,3968.78,3.598657,6313.683333,3.800283,2967.3375,3.472367,Butanedioic acid
1,367.0185,4659.255,3.668316,4796.658333,3.680939,4309.8875,3.634466,OMP
2,135.0311,8224.695,3.91512,9650.733333,3.98456,8691.7125,3.939105,Adenine
3,218.1033,6119.885,3.786743,5018.658333,3.700588,4354.625,3.638951,Panthothenate
4,113.0246,13810.43,4.140207,7764.516667,3.890114,6768.075,3.830465,C5H6O3
5,766.1076,4375.815,3.641059,4908.75,3.690971,5925.375,3.772716,Coenzyme A
6,115.0038,4839.5,3.6848,5960.875,3.77531,3758.9875,3.575071,Fumaric acid / Maleic acid
7,347.0589,5054.37,3.703667,4960.65,3.695539,11291.7,4.052759,C20H12O6
8,173.0095,6925.91,3.840477,3029.383333,3.481354,6294.5625,3.798966,Aconitic acid
9,129.0192,5338.56,3.727424,3979.775,3.599859,4233.475,3.626697,Acetylpyruvate


time: 40.6 ms (started: 2023-06-13 18:28:18 -05:00)
