### Imports

In [49]:
from mpl_toolkits import mplot3d
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
from utils.utils import *

import hdbscan
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 993 µs (started: 2023-06-15 09:35:15 -05:00)


### Parameters

In [50]:
import json
  
# Opening JSON file
file = open("parameters.json")
params = json.load(file)

raw_data_folder =  params["raw_folder"]
print("Raw folder:\t", raw_data_folder)

exp = params["exp"]
print("Exp:\t\t", exp)

methods = ["vgae", "vgae"] # change
print("Methods:\t", methods)

groups_id = ["WT", "zwf1^"] # change
print("Groups id:\t", groups_id)

options = ["dyn", "dyn"] # change
print("Options:\t", options)

Raw folder:	 Edwin_proyecto2
Exp:		 exp1
Methods:	 ['vgae', 'vgae']
Groups id:	 ['WT', 'zwf1^']
Options:	 ['dyn', 'dyn']
time: 14.9 ms (started: 2023-06-15 09:35:15 -05:00)


### Load data

In [51]:
# Load metadata

df_metadata = pd.read_excel("temp/{}/{}.xlsx".format(raw_data_folder, "Edwin_Set2-processed"), header=0, sheet_name=1)
df_metadata.dropna(subset=["Corrected (m/z)"], inplace=True)
df_metadata = df_metadata.astype({"Corrected (m/z)": str})
df_metadata.set_index("Corrected (m/z)", inplace=True)
print(df_metadata.shape)
df_metadata.head()

(85, 7)


Unnamed: 0_level_0,Metabolites - Approved by Nicola,Metabolites - Used by Florian,Theoretical - m/z (based on BioCyc),Chemical Formula (based on BioCyc),Adduct (negative mode),Error in mDa (oTOF),Error in PPM (Theoretical vs Corrected)
Corrected (m/z),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
59.0137,Acetic acid,*,59.013568,C2H4O2,H,,
71.0139,Acrylic acid,*,71.013568,C3H4O2,H,,
74.0249,Glycine,*,74.024467,C2H5NO2,H,,
85.0296,Methyl Acrylate,*,85.029218,C4H6O2,H,,
87.0086,Pyruvic acid,*,87.008482,C3H4O3,H,,


time: 80.5 ms (started: 2023-06-15 09:35:15 -05:00)


In [52]:
df_metadata.index

Index(['59.0137', '71.0139', '74.0249', '85.0296', '87.0086', '88.0405',
       '89.0245', '101.0243', '102.0562', '109.0407', '112.0518', '113.0246',
       '115.0038', '117.0195', '118.051', '118.9807', '122.0247', '128.0355',
       '129.0192', '130.0506', '130.0875', '131.0355', '131.0824', '132.0303',
       '133.014', '135.0311', '145.0144', '145.0617', '145.0984', '146.0458',
       '147.0297', '147.0655', '151.0403', '152.0354', '153.0173', '154.062',
       '165.0554', '166.0506', '166.9758', '168.0663', '171.0302', '173.0095',
       '173.0454', '173.0711', '179.0348', '179.0561', '184.0021', '186.1139',
       '211.0603', '213.0145', '218.1033', '225.0991', '237.0282', '249.0552',
       '251.0776', '259.0227', '266.0886', '279.2331', '307.0311', '316.2857',
       '317.2143', '323.0277', '338.9889', '346.0558', '347.0589', '362.0509',
       '365.0531', '367.0185', '388.9441', '397.1319', '401.0173', '426.0221',
       '441.3372', '489.9937', '505.9889', '521.984', '545.399

time: 4.14 ms (started: 2023-06-15 09:35:15 -05:00)


In [53]:
#df_metadata.loc["131"]

time: 1.1 ms (started: 2023-06-15 09:35:15 -05:00)


In [54]:
# load dataset groups
df_join_raw = pd.read_csv("input/{}_raw.csv".format(exp), index_col=0)        
df_join_raw.index = df_join_raw.index.astype("str")
df_join_raw.head()

Unnamed: 0,zwf1^3.4,zwf1^3.4.1,zwf1^3.4.2,zwf1^3.4.3,zwf1^3.4.4,zwf1^3.4.5,zwf1^3.4.6,zwf1^3.4.7,zwf1^3.4.8,zwf1^3.4.9,...,WT3.4,WT3.4.1,WT3.4.2,WT3.4.3,WT3.4.4,WT3.4.5,WT3.4.6,WT3.4.7,WT3.4.8,WT3.4.9
59.0049,1,47,508,360,675,232,59,345,346,504,...,45,817,32,307,410,716,228,187,361,952
59.0137,53182,57902,51861,62812,54291,54395,59054,52547,57544,63538,...,45747,39677,47205,38697,54320,62610,47283,49927,43632,48511
59.0291,2151,2102,2334,2864,2393,2246,2331,1954,2310,2528,...,1852,1717,1830,1660,2006,2539,2103,2276,1643,2522
59.037,83,174,366,1134,693,464,235,479,443,691,...,225,349,185,429,186,278,364,368,115,887
59.0453,1,1,51,642,493,143,11,227,160,154,...,60,293,1,336,65,142,1,182,10,684


time: 215 ms (started: 2023-06-15 09:35:15 -05:00)


### BioCyc

In [55]:
# get filter graphs

dict_graphs = {}
for method in methods:
    for group in groups_id:
        for option in options:
            df_common_edges = pd.read_csv("output/{}/common_edges/common_edges_{}_{}_{}.csv".format(exp, method, group, option),
                                             dtype={"source": "string", "target": "string"})
            G = nx.from_pandas_edgelist(df_common_edges, edge_attr=["weight"])
            dict_graphs[group] = G

time: 9.07 s (started: 2023-06-15 09:35:15 -05:00)


In [56]:
# get nodes

dict_nodes = {}
for group in groups_id:
    dict_nodes[group] = set(list(dict_graphs[group].nodes()))

time: 825 µs (started: 2023-06-15 09:35:24 -05:00)


In [57]:
# set operation

dict_set_operation = {}
for group in groups_id:
    dict_nodes_aux = dict_nodes.copy()
    nodes_aux = dict_nodes_aux.pop(group)
    unique_nodes = nodes_aux - set.union(*list(dict_nodes_aux.values()))

    dict_set_operation[group] = unique_nodes

dict_set_operation["-".join(groups_id)] = set.intersection(*list(dict_nodes.values()))

print(dict_set_operation.keys())

dict_keys(['WT', 'zwf1^', 'WT-zwf1^'])
time: 16.1 ms (started: 2023-06-15 09:35:24 -05:00)


In [58]:
# print set size
for key, value in dict_set_operation.items():
    print(key, len(value))

WT 165
zwf1^ 1970
WT-zwf1^ 3453
time: 12 ms (started: 2023-06-15 09:35:24 -05:00)


In [59]:
# delete nodes without metabollities name
for group in dict_set_operation:
    inter = dict_set_operation[group] & set(list(df_metadata.index.values))
    dict_set_operation[group] = list(inter)

time: 9.85 ms (started: 2023-06-15 09:35:24 -05:00)


In [60]:
# print set size
for key, value in dict_set_operation.items():
    print(key, len(value))

WT 2
zwf1^ 24
WT-zwf1^ 59
time: 11.1 ms (started: 2023-06-15 09:35:24 -05:00)


In [61]:
df_metadata.head()

Unnamed: 0_level_0,Metabolites - Approved by Nicola,Metabolites - Used by Florian,Theoretical - m/z (based on BioCyc),Chemical Formula (based on BioCyc),Adduct (negative mode),Error in mDa (oTOF),Error in PPM (Theoretical vs Corrected)
Corrected (m/z),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
59.0137,Acetic acid,*,59.013568,C2H4O2,H,,
71.0139,Acrylic acid,*,71.013568,C3H4O2,H,,
74.0249,Glycine,*,74.024467,C2H5NO2,H,,
85.0296,Methyl Acrylate,*,85.029218,C4H6O2,H,,
87.0086,Pyruvic acid,*,87.008482,C3H4O3,H,,


time: 14.3 ms (started: 2023-06-15 09:35:24 -05:00)


In [62]:
# mapping metabolite name with ratio (2)
for key, value in dict_set_operation.items():
    nodes = dict_set_operation[key]

    df_biocyc = pd.DataFrame()
    df_biocyc["m/z"] = nodes

    list_data = []
    for group in groups_id:
        df_aux = df_join_raw.filter(like=group)
        df_aux = df_aux.loc[nodes]

        # df_biocyc["mean-{}".format(group)] = df_aux.mean(axis=1).values
        # df_biocyc["log-{}".format(group)] = np.log10(df_aux.mean(axis=1).values)
        list_data.append(df_aux.mean(axis=1).values)

    df_biocyc["before"] = np.log10(list_data[0])
    df_biocyc["after"] = np.log10(list_data[1])
    df_biocyc["ratio"] = np.log10(np.divide(list_data[0], list_data[1]))

    # df_biocyc["metabolities"] = df_metadata.loc[common_nodes]["Metabolites - Approved by Nicola"].values
    df_biocyc.insert(1, "metabolities", df_metadata.loc[nodes]["Metabolites - Approved by Nicola"].values)

    df_biocyc = df_biocyc.iloc[:, 1:]
    print(key, df_biocyc.shape)
    # save
    df_biocyc.to_csv("output/{}/biocyc/biocyc_{}_{}_{}.csv".format(exp, methods[0], key, options[0]), 
                    index=False, header=False, sep="\t")
    # df_biocyc.head()

WT (2, 4)
zwf1^ (24, 4)
WT-zwf1^ (59, 4)
time: 41.5 ms (started: 2023-06-15 09:35:24 -05:00)


In [63]:
# mapping metabolite name with ratio (3)
""" for key, value in dict_set_operation.items():
    nodes = dict_set_operation[key]

    df_biocyc = pd.DataFrame()
    df_biocyc["m/z"] = nodes

    for group in groups_id:
        df_aux = df_join_raw.filter(like=group)
        df_aux = df_aux.loc[nodes]

        # df_biocyc["mean-{}".format(group)] = df_aux.mean(axis=1).values
        df_biocyc["log-{}".format(group)] = np.log10(df_aux.mean(axis=1).values)

    # df_biocyc["metabolities"] = df_metadata.loc[common_nodes]["Metabolites - Approved by Nicola"].values
    df_biocyc.insert(1, "metabolities", df_metadata.loc[nodes]["Metabolites - Approved by Nicola"].values)

    df_biocyc = df_biocyc.iloc[:, 1:]
    print(key, df_biocyc.shape)
    # save
    df_biocyc.to_csv("output/{}/biocyc/biocyc_{}_{}_{}.csv".format(exp, methods[0], key, options[0]), 
                    index=False, header=False, sep="\t")
    # df_biocyc.head() """

' for key, value in dict_set_operation.items():\n    nodes = dict_set_operation[key]\n\n    df_biocyc = pd.DataFrame()\n    df_biocyc["m/z"] = nodes\n\n    for group in groups_id:\n        df_aux = df_join_raw.filter(like=group)\n        df_aux = df_aux.loc[nodes]\n\n        # df_biocyc["mean-{}".format(group)] = df_aux.mean(axis=1).values\n        df_biocyc["log-{}".format(group)] = np.log10(df_aux.mean(axis=1).values)\n\n    # df_biocyc["metabolities"] = df_metadata.loc[common_nodes]["Metabolites - Approved by Nicola"].values\n    df_biocyc.insert(1, "metabolities", df_metadata.loc[nodes]["Metabolites - Approved by Nicola"].values)\n\n    df_biocyc = df_biocyc.iloc[:, 1:]\n    print(key, df_biocyc.shape)\n    # save\n    df_biocyc.to_csv("output/{}/biocyc/biocyc_{}_{}_{}.csv".format(exp, methods[0], key, options[0]), \n                    index=False, header=False, sep="\t")\n    # df_biocyc.head() '

time: 4.5 ms (started: 2023-06-15 09:35:24 -05:00)


In [64]:
df_biocyc = pd.read_csv("output/{}/biocyc/biocyc_{}_{}_{}.csv".format(exp, methods[0], "-".join(groups_id), options[0]), sep="\t")
df_biocyc.head()

Unnamed: 0,C20H30O3,3.322116913253465,3.3623159381406196,-0.04019902488715463
0,FAD,3.524455,3.466497,0.057958
1,Acetyl coenzyme A,3.865341,3.720248,0.145093
2,Lactic acid / Glyceraldehyde,3.789973,3.524134,0.265839
3,Aminosalicylic acid,3.490355,3.476961,0.013394
4,C18H32O2,3.873093,3.930782,-0.057689


time: 23.8 ms (started: 2023-06-15 09:35:24 -05:00)
