In [1]:
cd /home/tvangraft/tudelft/thesis/metaengineering

/home/tvangraft/tudelft/thesis/metaengineering


In [2]:
from typing import DefaultDict, List, Hashable, Dict, Any

from src.utils.utils import get_generator, get_project_root, make_path_if_not_exists
from src.utils.metric_utils import adjusted_r2, mean_absolute_percentage_error, median_absolute_percentage_error, mean_absolute_error, mean_absolute_percentage_error , mean_squared_error

from src.pipeline.config import DataLoaderConfig, TaskLoaderConfig
from src.pipeline.taskloader import TaskLoader
from src.pipeline.dataloader import DataLoader

from src.orchestrator.trainer import Trainer
from src.utils.result_fetcher import ResultFetcher

from src.settings.tier import Tier
from src.settings.strategy import Strategy

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error


import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import itertools

from scipy.stats import ttest_ind, f_oneway

In [3]:
# Absolute project root used to build the results paths in this notebook.
# NOTE(review): machine-specific absolute path; consider deriving it from the
# project root helper instead -- confirm before changing.
PATH = "/home/tvangraft/tudelft/thesis/metaengineering"

# Metabolite ids the combined results table is restricted to.
SUITABLE_METABOLITES = [
    '3pg;2pg',
    'dhap',
    'f6p',
    'pep',
    'pyr',
    'r5p',
]

In [4]:
def get_metabolite_info():
    """Build the metabolite lookup table indexed by ``metabolite_id``.

    Reads two tab-separated files relative to the current working directory,
    inner-joins the prepared dataset (``official_name``) with the metabolite
    table (``met_name``), keeps the descriptive columns, removes duplicate
    rows and uses ``metabolite_id`` as the index.

    Returns:
        DataFrame with columns ``official_name``, ``kegg_id``, ``pathway``,
        ``method`` and ``Order``.
    """
    name_table = pd.read_csv('./data/training/metabolites.txt', delimiter='\t')
    dataset_table = pd.read_csv('./data/training/metabolites_dataset.data_prep.tsv', delimiter='\t')

    kept_columns = ['metabolite_id', 'official_name', 'kegg_id', 'pathway', 'method', 'Order']
    joined = dataset_table.merge(name_table, left_on='official_name', right_on='met_name')

    unique_rows = joined[kept_columns].drop_duplicates()
    return unique_rows.set_index('metabolite_id')

def gather_results(paths: List[str]):
    """Load several per-metabolite result CSVs into one frame.

    The metabolite id is derived from each file name by dropping the
    directory part, the ``.csv`` suffix and the
    ``Strategy.METABOLITE_CENTRIC_`` prefix, and is stored in a
    ``metabolite_id`` column on every row read from that file.

    Args:
        paths: CSV file paths using ``/`` as separator; a bare file name
            without any directory component is accepted as well.

    Returns:
        The row-wise concatenation of all files (original row indices kept).

    Raises:
        ValueError: from ``pd.concat`` when ``paths`` is empty.
    """
    # [-1] rather than [1]: rsplit returns a single element when the path has
    # no '/', and indexing [1] would raise IndexError in that case.
    metabolites_names = [
        path.rsplit('/', 1)[-1].removesuffix('.csv').removeprefix('Strategy.METABOLITE_CENTRIC_')
        for path in paths
    ]
    return pd.concat([
        pd.read_csv(path).assign(metabolite_id=metabolite_name)
        for path, metabolite_name in zip(paths, metabolites_names)
    ])

def combine_metabolite_info(df):
    """Attach the metabolite lookup columns to ``df``.

    The lookup table is rebuilt through ``get_metabolite_info()`` on every
    call and joined on ``df['metabolite_id']`` against the lookup index.
    """
    return df.merge(
        get_metabolite_info(),
        left_on='metabolite_id',
        right_index=True,
    )

# Metabolite lookup table, loaded once and reused by the cells below.
metabolite_info = get_metabolite_info()

# Project helper pointed at the results directory under PATH.
result_fetcher = ResultFetcher(root_dir=f"{PATH}/data/results", metabolite_info=metabolite_info)

# Tier / strategy combinations whose best-model predictions are collected.
tiers = [
    Tier.TIER0,
    Tier.TIER1,
]
strategies = [
    Strategy.ALL,
    Strategy.METABOLITE_CENTRIC,
    Strategy.ONE_VS_ALL,
]

def get_best_model_performance(tier, strategy):
    """Load the stored best-model predictions for one tier/strategy pair.

    Reads ``best_model_prediction_performance_sklearn_{strategy}.json`` from
    the ``{tier}`` sub-directory of the results folder, transposes it so each
    entry becomes a row with ``y_true``, ``y_pred``, ``architecture`` and
    ``metabolite_id`` columns, splits the ``<repeat>_<metabolite>`` key into
    ``repeat_id`` and ``metabolite_id``, and joins the module-level
    ``metabolite_info`` table.

    Args:
        tier: Tier whose string form names the results sub-directory.
        strategy: Strategy whose string form is part of the file name.

    Returns:
        DataFrame with the prediction columns, the metabolite annotations and
        constant ``strategy`` / ``experiment_id`` columns.
    """
    root_dir = f"{PATH}/data/results"
    raw = pd.read_json(
        f'{root_dir}/{tier}/best_model_prediction_performance_sklearn_{strategy}.json'
    )

    predictions = (
        raw.T
        .reset_index(drop=True)
        .set_axis(['y_true', 'y_pred', 'architecture', 'metabolite_id'], axis=1)
    )

    # n=1: split only on the first underscore so that a metabolite id which
    # itself contains '_' still yields exactly the two columns assigned here.
    predictions[['repeat_id', 'metabolite_id']] = (
        predictions['metabolite_id'].str.split("_", n=1, expand=True)
    )

    return (
        predictions
        .merge(metabolite_info, left_on='metabolite_id', right_index=True)
        .assign(strategy=strategy)
        .assign(experiment_id=tier)
    )

# Stack the best-model predictions of every (tier, strategy) pair,
# tiers varying slowest.
test_pred_df = pd.concat([
    get_best_model_performance(tier, strategy)
    for tier in tiers
    for strategy in strategies
])

# Per-row mean absolute error between the stored truth and prediction lists.
test_pred_df['error'] = test_pred_df.apply(
    lambda row: mean_absolute_error(row['y_true'], row['y_pred']),
    axis=1,
)

# Collapse the MLP hyper-parameter variants onto a single label.
# Series.map with a dict turns every label that is not listed into NaN.
test_pred_df['architecture'] = test_pred_df['architecture'].map({
    **{
        label: label
        for label in (
            'all',
            'SVR()',
            'RandomForestRegressor()',
            'ElasticNet()',
            'MLPRegressor()',
        )
    },
    **{
        variant: 'MLPRegressor()'
        for variant in (
            'MLPRegressor(batch_size=8, hidden_layer_sizes=[128, 32, 32])',
            'MLPRegressor(batch_size=16, hidden_layer_sizes=[128, 32, 32])',
            'MLPRegressor(batch_size=4, hidden_layer_sizes=[128, 32, 32])',
        )
    },
})

test_pred_df['architecture'].unique()


array(['all', 'SVR()', nan, 'ElasticNet()', 'MLPRegressor()',
       'RandomForestRegressor()'], dtype=object)

In [5]:
def transform_str_to_list(list_string: str):
    """Parse the string form of a numeric list into a list of floats.

    Quotes and square brackets are stripped, the remainder is split on
    commas, and every non-blank token is converted with ``float``.

    Args:
        list_string: Text such as ``"[0.1, 0.2]"`` or ``"['0.1', '0.2']"``.

    Returns:
        A list of floats (empty for ``"[]"``), or ``np.nan`` when the input
        is not a string (e.g. a missing CSV cell).

    Raises:
        ValueError: if a non-blank token cannot be parsed as a float.
    """
    # np.nan, not np.NAN: the upper-case alias was removed in NumPy 2.0.
    if not isinstance(list_string, str):
        return np.nan

    stripped = list_string.replace("'", "").replace("[", "").replace("]", "")
    # Split on ',' (float() ignores surrounding whitespace) and skip blank
    # tokens so "[]" and "1,2" parse instead of raising.
    return [float(token) for token in stripped.split(",") if token.strip()]

In [6]:
# Load the protein-only GNN evaluation runs and tag every row as tier 2.
tier_2_results = pd.read_csv("./data/validation/gnn_model_protein_only_evaluation.csv")
tier_2_results = tier_2_results.assign(experiment_id=Tier.TIER2)

# The architecture label is the run mode without the sweep prefix.
tier_2_results['architecture'] = tier_2_results['mode'].apply(
    lambda mode: mode.replace('metabolite_gnn_sweep_full_', '')
)
# Replace the stringified Strategy members by the short labels.
tier_2_results['strategy'] = tier_2_results['strategy'].replace({f'{Strategy.ONE_VS_ALL}': 'one_vs_all', f'{Strategy.ALL}': 'all', f'{Strategy.METABOLITE_CENTRIC}': 'metabolite'})
tier_2_results = tier_2_results.rename({'metrics.R2 score': 'correlation', 'metrics.Mean absolute error': 'error'}, axis=1)
# experiment_id was already set above, so only the dtype cast remains here.
tier_2_results = tier_2_results.astype('object')
tier_2_results['architecture'] = tier_2_results['architecture'].replace('all', 'full')

# Per (experiment, strategy, metabolite) keep the rows whose correlation
# equals the group maximum. The string 'max' replaces the builtin ``max``,
# which pandas deprecates as a groupby callable.
tier_2_results_opt = tier_2_results.loc[tier_2_results.groupby(['experiment_id', 'strategy', 'metabolite_id'])['correlation'].transform('max') == tier_2_results['correlation']]
tier_2_results_opt = tier_2_results_opt.assign(architecture='all')

# .copy() so the assignments below write to an independent frame rather
# than a slice of the previous one (avoids SettingWithCopyWarning).
tier_2_results = tier_2_results[['experiment_id', 'strategy', 'correlation', 'metabolite_id', 'error', 'architecture', 'y_true', 'y_pred']].copy()

# The prediction lists are stored as strings in the CSV; parse them.
tier_2_results['y_true'] = tier_2_results['y_true'].apply(transform_str_to_list)
tier_2_results['y_pred'] = tier_2_results['y_pred'].apply(transform_str_to_list)

tier_2_results = tier_2_results.merge(metabolite_info, left_on='metabolite_id', right_index=True)
tier_2_results

Unnamed: 0,experiment_id,strategy,correlation,metabolite_id,error,architecture,y_true,y_pred,official_name,kegg_id,pathway,method,Order
0,Tier.TIER2,all,,r5p,0.722256,unfiltered,"[-0.05111140012741089, 1.3853280544281006, 0.1...","[-0.058519646525382996, -0.058519646525382996,...",alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3
1,Tier.TIER2,all,0.240087,r5p,0.71978,unfiltered,"[-0.05111140012741089, 1.3853280544281006, 0.1...","[0.2501451373100281, 0.19814461469650269, 0.24...",alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3
2,Tier.TIER2,all,-0.063211,r5p,0.718658,unfiltered,"[-0.05111140012741089, 1.3853280544281006, 0.1...","[-0.052671339362859726, -0.052671339362859726,...",alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3
3,Tier.TIER2,all,,r5p,0.719443,unfiltered,"[-0.05111140012741089, 1.3853280544281006, 0.1...","[-0.05422510951757431, -0.05422510951757431, -...",alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3
4,Tier.TIER2,all,-0.078347,r5p,0.710062,unfiltered,"[-0.05111140012741089, 1.3853280544281006, 0.1...","[-0.03698136284947395, -0.03817760944366455, -...",alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,Tier.TIER2,one_vs_all,,dhap,0.744175,full,"[-1.2058722972869873, 0.34574317932128906, -0....","[0.004077630117535591, 0.004077630117535591, 0...",Dihydroxyacetone phosphate,C00111,Glycolysis,TCA,5
1324,Tier.TIER2,one_vs_all,,dhap,0.764754,full,"[-1.2058722972869873, 0.34574317932128906, -0....","[0.014124872162938118, 0.014124872162938118, 0...",Dihydroxyacetone phosphate,C00111,Glycolysis,TCA,5
1325,Tier.TIER2,one_vs_all,,dhap,0.76449,full,"[-1.2058722972869873, 0.34574317932128906, -0....","[0.00015460350550711155, 0.0008034512866288424...",Dihydroxyacetone phosphate,C00111,Glycolysis,TCA,5
1326,Tier.TIER2,one_vs_all,-0.044061,dhap,0.76815,full,"[-1.2058722972869873, 0.34574317932128906, -0....","[-0.03734079375863075, 0.02935566008090973, 0....",Dihydroxyacetone phosphate,C00111,Glycolysis,TCA,5


In [7]:
# Load the metabolite GNN evaluation runs and tag every row as tier 3.
tier_3_results = pd.read_csv("./data/validation/gnn_model_metabolite_evaluation.csv")
tier_3_results = tier_3_results.assign(experiment_id=Tier.TIER3)

# The architecture label is the run mode without the sweep prefix.
tier_3_results['architecture'] = tier_3_results['mode'].apply(
    lambda mode: mode.replace('metabolite_gnn_sweep_full_', '')
)
# Replace the stringified Strategy members by the short labels.
tier_3_results['strategy'] = tier_3_results['strategy'].replace({f'{Strategy.ONE_VS_ALL}': 'one_vs_all', f'{Strategy.ALL}': 'all', f'{Strategy.METABOLITE_CENTRIC}': 'metabolite'})
tier_3_results = tier_3_results.rename({'metrics.R2 score': 'correlation', 'metrics.Mean absolute error': 'error'}, axis=1)
# experiment_id was already set above, so only the dtype cast remains here.
tier_3_results = tier_3_results.astype('object')
tier_3_results['architecture'] = tier_3_results['architecture'].replace('all', 'full')

# Per (experiment, strategy, metabolite) keep the rows whose correlation
# equals the group maximum. The string 'max' replaces the builtin ``max``,
# which pandas deprecates as a groupby callable.
tier_3_results_opt = tier_3_results.loc[tier_3_results.groupby(['experiment_id', 'strategy', 'metabolite_id'])['correlation'].transform('max') == tier_3_results['correlation']]
tier_3_results_opt = tier_3_results_opt.assign(architecture='all')

# .copy() so the assignments below write to an independent frame rather
# than a slice of the previous one (avoids SettingWithCopyWarning).
tier_3_results = tier_3_results[['experiment_id', 'strategy', 'correlation', 'metabolite_id', 'error', 'architecture', 'y_true', 'y_pred']].copy()

# The prediction lists are stored as strings in the CSV; parse them.
tier_3_results['y_true'] = tier_3_results['y_true'].apply(transform_str_to_list)
tier_3_results['y_pred'] = tier_3_results['y_pred'].apply(transform_str_to_list)

tier_3_results = tier_3_results.merge(metabolite_info, left_on='metabolite_id', right_index=True)

tier_3_results

Unnamed: 0,experiment_id,strategy,correlation,metabolite_id,error,architecture,y_true,y_pred,official_name,kegg_id,pathway,method,Order
0,Tier.TIER3,all,,accoa,0.653635,unfiltered,"[0.04585527256131172, 0.4773847162723541, 0.53...","[0.0, 0.0, 0.0, 0.0, 0.0]",Acetyl-CoA,C00024,TCA,TCA,1
1,Tier.TIER3,all,,accoa,0.653635,unfiltered,"[0.04585527256131172, 0.4773847162723541, 0.53...","[0.0, 0.0, 0.0, 0.0, 0.0]",Acetyl-CoA,C00024,TCA,TCA,1
2,Tier.TIER3,all,,accoa,0.653635,unfiltered,"[0.04585527256131172, 0.4773847162723541, 0.53...","[0.0, 0.0, 0.0, 0.0, 0.0]",Acetyl-CoA,C00024,TCA,TCA,1
3,Tier.TIER3,all,-0.110965,accoa,0.631187,unfiltered,"[0.04585527256131172, 0.4773847162723541, 0.53...","[0.037443678826093674, 0.03738389536738396, 0....",Acetyl-CoA,C00024,TCA,TCA,1
4,Tier.TIER3,all,0.246237,accoa,0.682643,unfiltered,"[0.04585527256131172, 0.4773847162723541, 0.53...","[-0.1064114198088646, -0.04994292929768562, 0....",Acetyl-CoA,C00024,TCA,TCA,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211,Tier.TIER3,metabolite,-0.48332,g6p;g6p-B,1.412313,strict,"[0.00834512710571289, 0.4741881191730499, 0.42...","[-0.01386161521077156, -0.01332460530102253, -...",D-Glucose 6-phosphate;beta-D-glucose 6-phosphate,C00092;C01172,Glycolysis,TCA,9
1212,Tier.TIER3,metabolite,0.269743,g6p;g6p-B,0.755023,strict,"[0.00834512710571289, 0.4741881191730499, 0.42...","[-0.013879486359655857, -0.012832674197852612,...",D-Glucose 6-phosphate;beta-D-glucose 6-phosphate,C00092;C01172,Glycolysis,TCA,9
1213,Tier.TIER3,metabolite,-0.437258,g6p;g6p-B,0.783403,strict,"[0.00834512710571289, 0.4741881191730499, 0.42...","[-0.019078245386481285, -0.034617431461811066,...",D-Glucose 6-phosphate;beta-D-glucose 6-phosphate,C00092;C01172,Glycolysis,TCA,9
1214,Tier.TIER3,metabolite,0.132628,g6p;g6p-B,0.851223,strict,"[0.00834512710571289, 0.4741881191730499, 0.42...","[0.04256374016404152, 0.07341930270195007, 0.0...",D-Glucose 6-phosphate;beta-D-glucose 6-phosphate,C00092;C01172,Glycolysis,TCA,9


In [13]:
# Stack the tier 0/1 predictions with the tier 2 and tier 3 GNN results.
full_df = test_pred_df.sort_values(['pathway', 'metabolite_id'])
full_df = pd.concat([
    full_df,
    tier_2_results.reset_index(drop=True),
    tier_3_results.reset_index(drop=True),
], axis=0).reset_index(drop=True)

# Rows without a truth list cannot be scored. .copy() gives an independent
# frame so the column assignments below do not write to a slice
# (avoids SettingWithCopyWarning).
full_df = full_df[full_df['y_true'].notna()].copy()

# Per-row metrics computed from the stored truth / prediction lists;
# the two percentage errors are scaled by 100.
full_df['mse_error'] = full_df.apply(lambda row: mean_squared_error(row['y_true'], row['y_pred']), axis=1)
full_df['mape_error'] = full_df.apply(lambda row: mean_absolute_percentage_error(row['y_true'], row['y_pred']), axis=1) * 100
full_df['median_ape_error'] = full_df.apply(lambda row: median_absolute_percentage_error(row['y_true'], row['y_pred']), axis=1) * 100
full_df['adjusted_r2'] = full_df.apply(lambda row: adjusted_r2(row['y_true'], row['y_pred']), axis=1)

# Drop the 'unfiltered' architecture, then order rows by tier and metabolite.
full_df = full_df[full_df['architecture'] != 'unfiltered'].copy()
full_df['experiment_id_ordering'] = full_df['experiment_id'].apply(Tier.get_order)
full_df = full_df.sort_values(['experiment_id_ordering', 'metabolite_id'])
full_df = full_df.reset_index(drop=True)

# Restrict to the metabolites listed in SUITABLE_METABOLITES; copy again so
# later column assignments on full_df do not target a slice.
full_df = full_df[full_df['metabolite_id'].isin(SUITABLE_METABOLITES)].copy()

def mapper(x):
    """Translate a short strategy label into its ``Strategy`` member.

    ``'all'``, ``'one_vs_all'`` and ``'metabolite'`` are replaced by the
    matching ``Strategy`` member; any other value is returned unchanged.
    """
    return {
        'all': Strategy.ALL,
        'one_vs_all': Strategy.ONE_VS_ALL,
        'metabolite': Strategy.METABOLITE_CENTRIC,
    }.get(x, x)

# Normalise the strategy labels and drop rows without an architecture label.
# assign() builds a new frame instead of setting a column on a filtered slice
# (avoids SettingWithCopyWarning); subset is passed as a list, which every
# pandas version accepts.
full_df = (
    full_df
    .assign(strategy=full_df['strategy'].map(mapper))
    .dropna(axis=0, subset=['architecture'])
)

# Persist the combined table (including its index) next to the other results.
full_df.to_csv(f"{PATH}/data/results/all_collected_results.csv")

In [14]:
full_df

Unnamed: 0,y_true,y_pred,architecture,metabolite_id,repeat_id,official_name,kegg_id,pathway,method,Order,strategy,experiment_id,error,correlation,mse_error,mape_error,median_ape_error,adjusted_r2,experiment_id_ordering
0,"[-1.4913663173, -0.5617994757, 0.9582807563, 1...","[0.7547259209, 0.6264524176, 0.797377106, 0.91...",all,3pg;2pg,0,3-Phospho-D-glycerate;D-Glycerate 2-phosphate,C00197;C00631,Glycolysis,TCA,11,Strategy.ALL,Tier.TIER0,0.624351,,0.930960,60.603961,25.812028,0.089206,0
1,"[-1.4913663173, -0.5617994757, 0.9582807563, 1...","[0.9001374312, 0.6663880895000001, 0.923351585...",all,3pg;2pg,1,3-Phospho-D-glycerate;D-Glycerate 2-phosphate,C00197;C00631,Glycolysis,TCA,11,Strategy.ALL,Tier.TIER0,0.621525,,0.928443,57.390569,26.235976,0.091668,0
2,"[-1.4913663173, -0.5617994757, 0.9582807563, 1...","[0.89532301, 0.4019642524, 0.9173912677, 0.886...",all,3pg;2pg,2,3-Phospho-D-glycerate;D-Glycerate 2-phosphate,C00197;C00631,Glycolysis,TCA,11,Strategy.ALL,Tier.TIER0,0.580675,,0.897161,51.279072,21.886329,0.122273,0
3,"[-1.4913663173, -0.5617994757, 0.9582807563, 1...","[0.7737297082, 0.5229264014, 0.895840242800000...",all,3pg;2pg,3,3-Phospho-D-glycerate;D-Glycerate 2-phosphate,C00197;C00631,Glycolysis,TCA,11,Strategy.ALL,Tier.TIER0,0.608207,,0.939170,55.493432,23.572616,0.081174,0
4,"[-1.4913663173, -0.5617994757, 0.9582807563, 1...","[1.0214577545, 0.5144078273, 0.9561493747, 0.7...",all,3pg;2pg,4,3-Phospho-D-glycerate;D-Glycerate 2-phosphate,C00197;C00631,Glycolysis,TCA,11,Strategy.ALL,Tier.TIER0,0.60877,,0.953282,55.580075,26.023099,0.067368,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5889,"[0.6518658995628357, -1.3485695123672485, 0.17...","[-0.002068353584036231, -0.002068612026050687,...",strict,r5p,,alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3,Strategy.METABOLITE_CENTRIC,Tier.TIER3,1.109843,0.025546,0.658881,100.086477,100.240773,-0.037037,3
5890,"[0.6518658995628357, -1.3485695123672485, 0.17...","[-0.008101633749902248, -0.008102219551801682,...",strict,r5p,,alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3,Strategy.METABOLITE_CENTRIC,Tier.TIER3,0.880095,0.233839,0.659147,100.338734,100.943102,-0.037037,3
5891,"[0.6518658995628357, -1.3485695123672485, 0.17...","[-0.009108763188123703, -0.009088517166674137,...",strict,r5p,,alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3,Strategy.METABOLITE_CENTRIC,Tier.TIER3,0.878248,0.253064,0.659197,100.380847,101.059726,-0.037037,3
5892,"[0.6518658995628357, -1.3485695123672485, 0.17...","[0.001019663061015308, 0.003810988273471594, 0...",strict,r5p,,alpha-D-Ribose 5-phosphate,C00117,PPP,TCA,3,Strategy.METABOLITE_CENTRIC,Tier.TIER3,0.877988,-0.273758,0.658736,99.867432,99.920972,-0.037037,3
