# Imports

In [1]:
cd /home/tvangraft/tudelft/thesis/metaengineering

/home/tvangraft/tudelft/thesis/metaengineering


In [5]:
import os
import glob
import json
import ast
import re

from typing import Dict, List

from src.utils.parsers.cv_parser import fmt_cv_results
from src.utils.visualizers.prediction_figures import PredictionFigures
from src.utils.visualizers.test_figures import TestFigures
from src.utils.visualizers.cv_figures import CVFigures
from src.settings.strategy import Strategy

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [3]:
def get_metabolite_info():
    """Build a metabolite lookup table indexed by ``metabolite_id``.

    Reads two tab-separated files relative to the current working directory,
    inner-joins them where the prepared dataset's ``official_name`` equals the
    annotation file's ``met_name``, keeps a fixed set of descriptive columns
    and removes duplicate rows.

    Returns:
        DataFrame indexed by ``metabolite_id`` with the columns
        ``official_name``, ``kegg_id``, ``pathway``, ``method`` and ``Order``.
    """
    wanted_columns = ['metabolite_id', 'official_name', 'kegg_id', 'pathway', 'method', 'Order']

    annotations = pd.read_csv('./data/training/metabolites.txt', delimiter='\t')
    prepared = pd.read_csv('./data/training/metabolites_dataset.data_prep.tsv', delimiter='\t')

    joined = prepared.merge(annotations, left_on='official_name', right_on='met_name')
    # The join can repeat a metabolite many times; after projecting onto the
    # descriptive columns those repeats are identical rows and are dropped.
    unique_rows = joined[wanted_columns].drop_duplicates()
    return unique_rows.set_index('metabolite_id')

def gather_results(paths: List[str]):
    """Read several per-metabolite CSV files into one DataFrame.

    The metabolite id of each file is derived from its file name: the part
    after the last ``/``, with a trailing ``.csv`` and a leading
    ``Strategy.METABOLITE_CENTRIC_`` removed. That id is stored in a new
    ``metabolite_id`` column on every row read from the file.

    Args:
        paths: CSV file paths using ``/`` as separator. A bare file name
            without any directory component is accepted as well.

    Returns:
        The row-wise concatenation of all files. Each file keeps its own
        index, so index values may repeat across files.

    Raises:
        ValueError: raised by ``pd.concat`` when ``paths`` is empty.
    """
    frames = []
    for path in paths:
        # [-1] instead of [1]: a path without '/' yields a one-element list,
        # which previously raised IndexError.
        file_name = path.rsplit('/', 1)[-1]
        metabolite_name = file_name.removesuffix('.csv').removeprefix('Strategy.METABOLITE_CENTRIC_')
        frames.append(pd.read_csv(path).assign(metabolite_id=metabolite_name))
    return pd.concat(frames)

def combine_metabolite_info(df: pd.DataFrame) -> pd.DataFrame:
    """Attach metabolite descriptions to ``df``.

    Args:
        df: frame with a ``metabolite_id`` column.

    Returns:
        ``df`` inner-joined with the table from ``get_metabolite_info()``,
        matching ``df['metabolite_id']`` against that table's index. Rows
        whose id is absent from the table are therefore dropped.
    """
    lookup = get_metabolite_info()
    return df.merge(lookup, left_on='metabolite_id', right_index=True)

In [4]:
metabolite_info = get_metabolite_info()

In [None]:
class ResultFetcher:
    """Load per-strategy result CSVs and reshape them into long format.

    Each CSV is expected to have one column per ``<metabolite_id>_<architecture>``
    pair; the values end up in a column named ``r2``.
    """

    def __init__(self, root_dir: str, metabolite_info: pd.DataFrame) -> None:
        """Store the path prefix and the metabolite lookup table.

        Args:
            root_dir: prefix to which ``_experiment_<id>/`` is appended.
            metabolite_info: table indexed by metabolite id; merged onto every
                loaded frame.
        """
        self.root_dir = root_dir
        self.metabolite_info = metabolite_info

    def get_test_df_all(self, experiment_id: int):
        """Return the ``Strategy.ALL`` frame tagged ``strategy='all'``, or None if missing."""
        return self._labelled_frame(experiment_id, Strategy.ALL, 'all')

    def get_test_df_metabolite(self, experiment_id: int):
        """Return the ``Strategy.METABOLITE_CENTRIC`` frame tagged ``strategy='metabolite'``, or None if missing."""
        return self._labelled_frame(experiment_id, Strategy.METABOLITE_CENTRIC, 'metabolite')

    def get_test_df_one_vs_all(self, experiment_id: int):
        """Return the ``Strategy.ONE_VS_ALL`` frame tagged ``strategy='one_vs_all'``, or None if missing."""
        return self._labelled_frame(experiment_id, Strategy.ONE_VS_ALL, 'one_vs_all')

    def _labelled_frame(self, experiment_id: int, strategy, label: str):
        """Load one strategy's frame and add a constant ``strategy`` column."""
        frame = self.get_frame(experiment_id, strategy)
        if frame is None:
            # get_frame signals a missing file with None; pass that through
            # instead of failing on ``None.assign``.
            return None
        return frame.assign(strategy=label)

    def get_frame(self, experiment_id: int, strategy: "Strategy"):
        """Read and reshape the result file of one experiment/strategy pair.

        Args:
            experiment_id: number inserted into the directory name.
            strategy: value whose string form is the CSV file's base name.

        Returns:
            A long-format DataFrame with the columns ``r2``, ``metabolite_id``
            and ``architecture`` plus every column of ``self.metabolite_info``,
            or ``None`` when the file does not exist.
        """
        # NOTE(review): the file name relies on how ``strategy`` formats inside
        # an f-string - confirm it matches the names written by the experiments.
        path = f"{self.root_dir}_experiment_{experiment_id}/{strategy}.csv"
        if not self.file_exists(path):
            return None

        wide = pd.read_csv(path, index_col=0)
        # Wide -> long: one row per (original row, column) pair.
        long_frame = wide.stack().to_frame().reset_index(1).set_axis(['metabolite_arch', 'r2'], axis=1)
        # Column labels look like "<metabolite_id>_<architecture>".
        long_frame[['metabolite_id', 'architecture']] = long_frame['metabolite_arch'].str.split("_", expand=True)
        # Use the table handed to the constructor, not a notebook-level global.
        return long_frame.drop('metabolite_arch', axis=1).merge(
            self.metabolite_info, left_on='metabolite_id', right_index=True
        )

    def file_exists(self, path: str):
        """Return True when ``path`` exists on disk."""
        return os.path.exists(path)

In [None]:
# Directory holding the best-model performance CSVs of experiment 0.
# NOTE: the name shadows the builtin dir(); it is kept unchanged because other
# cells may refer to it.
dir = "/home/tvangraft/tudelft/thesis/metaengineering/data/results/experiment_0"


def _load_best_model_performance(results_dir, strategy_name, label):
    """Load one strategy's best-model performance file in long format.

    Args:
        results_dir: directory containing the CSV files.
        strategy_name: suffix after ``Strategy.`` in the file name, e.g. ``ALL``.
        label: value stored in the added ``strategy`` column.

    Returns:
        DataFrame with the columns ``r2``, ``metabolite_id``, ``architecture``,
        the columns of the global ``metabolite_info`` and ``strategy``;
        ``None`` when the file does not exist.
    """
    path = f'{results_dir}/best_model_performance_Strategy.{strategy_name}.csv'
    if not os.path.exists(path):
        return None

    frame = pd.read_csv(path, index_col=0)
    # Wide -> long: one row per (original row, column) pair.
    frame = frame.stack().to_frame().reset_index(1).set_axis(['metabolite_arch', 'r2'], axis=1)
    # Column labels look like "<metabolite_id>_<architecture>".
    frame[['metabolite_id', 'architecture']] = frame['metabolite_arch'].str.split("_", expand=True)
    return (
        frame.drop('metabolite_arch', axis=1)
        .merge(metabolite_info, left_on='metabolite_id', right_index=True)
        .assign(strategy=label)
    )


# Every frame is None when its file is missing. Previously test_df_one_vs_all
# was left undefined in that case, causing a NameError in the concat below.
test_df_all = _load_best_model_performance(dir, 'ALL', 'all')
test_df_metabolite = _load_best_model_performance(dir, 'METABOLITE_CENTRIC', 'metabolite')
test_df_one_vs_all = _load_best_model_performance(dir, 'ONE_VS_ALL', 'one_vs_all')

# pd.concat silently skips None entries and raises ValueError if all are None.
test_df = pd.concat([test_df_all, test_df_metabolite, test_df_one_vs_all], axis=0)
# Keep only the rows whose architecture part of the column label is 'all'.
test_df = test_df[test_df['architecture'] == 'all']
test_figures = TestFigures(test_df)