# Libraries

In [1]:
cd /home/tvangraft/tudelft/thesis/metaengineering

/home/tvangraft/tudelft/thesis/metaengineering


In [2]:
import os
import glob
import math
import gc
from typing import Any, Dict, List
from dataclasses import dataclass

from collections import defaultdict

from src.pipeline.dataloader import DataLoader
from src.pipeline.frame.filter import FrameFilters
from src.pipeline.frame.transformer import FrameTransformers
from src.pipeline.taskloader import TaskLoader, TaskFrame
from src.pipeline.config import DataLoaderConfig, TaskLoaderConfig

from src.utils.parsers.cv_parser import to_cv_params, parse_cv_result, fmt_cv_results, _fmt_regressor
from src.utils.utils import TestResultStore, build_model_pipeline, get_generator, build_config

from src.orchestrator.orchestrator import Orchestrator
from src.orchestrator.config import RunConfig, ExplanationConfig

from src.settings.strategy import Strategy
from src.settings.tier import Tier

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from lime import lime_tabular
import shap

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.model_selection import RepeatedKFold

from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet, GammaRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_dl_config_for_strategy(tier: Tier) -> Dict[str, List[str]]:
    """Return the extra DataLoader settings (by name) used for a given tier.

    Despite the function name, the lookup key is the tier, not the strategy.

    Args:
        tier: Tier whose configuration is requested. Only ``Tier.TIER0`` and
            ``Tier.TIER1`` are configured here.

    Returns:
        A freshly built dict of keyword arguments holding lists of names:
        ``additional_filters`` and ``additional_transforms`` for both tiers,
        plus ``additional_frames`` for ``Tier.TIER1``.

    Raises:
        ValueError: If ``tier`` has no entry. Returning ``None`` here would
            only fail later with an opaque ``TypeError`` when the result is
            unpacked with ``**``.
    """
    lookup = {
        Tier.TIER0: dict(
            additional_filters=["is_precursor",],
            additional_transforms=["log_fold_change_protein",]
        ),
        Tier.TIER1: dict(
            additional_frames=["interaction_frame",],
            additional_filters=[
                "is_precursor",
                "has_at_least_n_interaction",
            ],
            additional_transforms=[
                "log_fold_change_protein",
                "ppi_coo_matrix",
            ]
        )
    }
    if tier not in lookup:
        supported = ", ".join(str(key) for key in lookup)
        raise ValueError(
            f"No DataLoader configuration for tier {tier!r}; supported tiers: {supported}"
        )
    return lookup[tier]

In [4]:
# Hyper-parameter search space, keyed by architecture name.
# Each inner key is a pipeline parameter path; the PCA `n_components` grid
# ([0.1, 0.25, 0.5, 0.75, 0.99]) is currently disabled for all architectures.
params = dict(
    SVR=dict(
        regressor__regressor=SVR(),
        regressor__regressor__kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        regressor__preprocessor__num__scaler=[MinMaxScaler(), StandardScaler(), RobustScaler()],
    ),
    RandomForestRegressor=dict(
        regressor__regressor=RandomForestRegressor(),
        regressor__regressor__n_estimators=[10, 25, 50, 75, 100],
        regressor__regressor__criterion=['squared_error', 'friedman_mse'],
        regressor__regressor__max_depth=[5, 10, 20],
        regressor__preprocessor__num__scaler=[MinMaxScaler(), StandardScaler(), RobustScaler()],
    ),
    ElasticNet=dict(
        regressor__regressor=ElasticNet(),
        regressor__regressor__l1_ratio=[0.01, 0.25, 0.5, 0.75, 1],
        regressor__regressor__tol=[0.01],
        regressor__preprocessor__num__scaler=[MinMaxScaler(), StandardScaler(), RobustScaler()],
    ),
)

# All strategies and tiers known to this notebook.
strategies = [Strategy.ALL, Strategy.ONE_VS_ALL, Strategy.METABOLITE_CENTRIC]
tiers = [Tier.TIER0, Tier.TIER1]

In [5]:
# Point the DataLoader class at the training data folder.
DataLoader.DATA_FOLDER = './data/training/'

# Set up an orchestrator for Strategy.ALL on TIER0 with the search space above;
# the tier-specific DataLoader settings are merged into the build_config call.
orchestrator = Orchestrator()
orchestrator.prepare_orchestrator(*build_config(
    strategy=Strategy.ALL,
    tier=Tier.TIER0,
    params=params,
    forced_training=False,
    forced_testing=True,
    **get_dl_config_for_strategy(Tier.TIER0)
))

In [6]:
# Run the orchestrator configured in the previous cell (Strategy.ALL, TIER0).
orchestrator.run()

Result for Strategy.ALL_all already exists
     Unnamed: 0  mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0             0       0.092504      0.008441         0.026047        0.003230   
1             1       0.077147      0.010093         0.019912        0.002954   
2             2       0.064018      0.008031         0.028832        0.006011   
3             3       0.066155      0.004979         0.020286        0.002943   
4             4       0.064538      0.006565         0.017313        0.003264   
..          ...            ...           ...              ...             ...   
112         112       0.130593      0.013060         0.013295        0.002640   
113         113       0.142476      0.009388         0.014203        0.003045   
114         114       0.146542      0.008727         0.014829        0.002841   
115         115       0.150475      0.005567         0.015463        0.002335   
116         116       0.132754      0.017819         0.010757     

# Training

In [None]:
EXPERIMENT_ID = 'experiment_1'
PATH_PREFIX = './data/results'
EXPERIMENT_PATH = f'{PATH_PREFIX}/{EXPERIMENT_ID}'
TIER = Tier.TIER1

# exist_ok=True makes the cell idempotent without a separate exists() check.
os.makedirs(EXPERIMENT_PATH, exist_ok=True)

# NOTE(review): `dl` (and `tl` / `Trainer`, used in the cells below) are not
# defined or imported anywhere in this notebook; they must be created before
# this cell for a fresh-kernel run to succeed.
config = DataLoaderConfig(
    additional_frames=[
        dl.data_factory.loaders.interaction_frame,
    ],
    additional_filters=[
        dl.data_factory.filters.is_precursor,
        dl.data_factory.filters.has_at_least_n_interaction,
    ],
    additional_transforms=[
        dl.data_factory.transformer.log_fold_change_protein,
        dl.data_factory.transformer.ppi_coo_matrix,
    ]
)

# 10-fold cross-validation, one repeat, fixed seed for reproducible folds.
cv = RepeatedKFold(n_repeats=1, n_splits=10, random_state=0)

# Keyword arguments passed as `search_kwargs` to `trainer.do_grid_search`.
# The name is used by the grid-search cells but was never defined; the values
# mirror the kwargs echoed in the saved grid-search output — TODO confirm.
GRIDSEARCH_KWARGS = dict(
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    refit=True,
    verbose=1,
    error_score='raise',
)

## Strategy All

In [None]:
# Strategy.ALL: split settings request stratification on 'metabolite_id' with shuffling.
STRATEGY = Strategy.ALL
SPLIT_KWARGS = {'stratify': 'metabolite_id', 'shuffle': True}
trainer = Trainer()

In [None]:
# Take the first task frame produced for this strategy/tier and build a pipeline for it.
gen = get_generator(dl=dl, tl=tl, strategy=STRATEGY, tier=TIER, data_config=config)
tf: TaskFrame = next(gen)
model = build_model_pipeline(tf)

In [None]:
# Grid-search the pipeline on the current task frame.
search = trainer.do_grid_search(
    tf, model, params, cv,
    split_kwargs=SPLIT_KWARGS,
    search_kwargs=GRIDSEARCH_KWARGS,
)

# Persist the full cross-validation table, named after the task frame.
cv_results = pd.DataFrame(search.cv_results_)
cv_results.to_csv(f'{EXPERIMENT_PATH}/{tf.title}_{tf.frame_name}.csv')

Training: Strategy.ALL_all.csv
tf.x.shape=(74192, 50) (74192,)
{'scoring': 'neg_mean_absolute_error', 'n_jobs': -1, 'refit': True, 'verbose': 1, 'error_score': 'raise'}
Fitting 10 folds for each of 105 candidates, totalling 1050 fits


In [None]:
# Load the saved Strategy.ALL grid-search results and list the architectures in them.
results_df = _fmt_regressor(pd.read_csv(f'{EXPERIMENT_PATH}/Strategy.ALL_all.csv'))
architectures = ['all', *results_df['param_regressor__regressor'].unique().tolist()]
architectures

['all', 'ElasticNet()', 'RandomForestRegressor()']

In [None]:
# Re-load the Strategy.ALL search results; 'all' means "best over every architecture".
results_df = _fmt_regressor(pd.read_csv(f'{EXPERIMENT_PATH}/Strategy.ALL_all.csv'))
architectures = ['all', *results_df['param_regressor__regressor'].unique().tolist()]
testResultStore = TestResultStore(EXPERIMENT_PATH, STRATEGY)

for architecture in architectures:
    # Best-ranked candidate, optionally restricted to one architecture.
    if architecture == 'all':
        _result_df = results_df.copy()
    else:
        _result_df = results_df[results_df['param_regressor__regressor'] == architecture].copy()
    _result_df = _result_df.sort_values('rank_test_score').iloc[[0]]
    print(architecture)

    # Fresh pipeline -> apply the selected hyper-parameters -> retrain.
    model = trainer.do_retrain_model(
        tf,
        parse_cv_result(build_model_pipeline(tf), _result_df),
        split_kwargs=SPLIT_KWARGS,
    )

    # NOTE(review): the split is recomputed here after retraining; whether it
    # reproduces the split used inside do_retrain_model depends on Trainer — confirm.
    _, X_test, _, y_test = trainer.do_train_test_split(tf, **SPLIT_KWARGS)

    # Score on the whole test set ...
    testResultStore.update_results('all', model, architecture, X_test, y_test)

    # ... and per metabolite (X_test carries metabolite_id as a column,
    # y_test as an index level).
    for metabolite_id in X_test['metabolite_id'].unique():
        metabolite_rows = X_test['metabolite_id'] == metabolite_id
        testResultStore.update_results(
            metabolite_id,
            model,
            architecture,
            X_test[metabolite_rows],
            y_test.xs(metabolite_id, level='metabolite_id'),
        )

testResultStore.to_file()

all
RandomForest model
ElasticNet()
ElasticNet model
RandomForestRegressor()
RandomForest model


## Strategy Metabolite-centric

In [None]:
# Strategy.METABOLITE_CENTRIC: split settings disable shuffling and stratification.
STRATEGY = Strategy.METABOLITE_CENTRIC
SPLIT_KWARGS = {'shuffle': False, 'stratify': None}
trainer = Trainer()

In [None]:
# Take the first task frame produced for this strategy/tier and build a pipeline for it.
gen = get_generator(dl=dl, tl=tl, strategy=STRATEGY, tier=TIER, data_config=config)
tf: TaskFrame = next(gen)
model = build_model_pipeline(tf)

In [None]:
gen = get_generator(
    dl=dl,
    tl=tl,
    strategy=STRATEGY,
    tier=TIER,
    data_config=config
)

# Run one grid search per task frame and save each result table separately.
for tf in gen:
    # Build the pipeline from the frame being searched instead of reusing
    # whatever `model` is left in the kernel (built from the first frame only,
    # or overwritten by an evaluation cell). This matches the evaluation loops,
    # which also call build_model_pipeline(tf) per frame.
    model = build_model_pipeline(tf)

    search = trainer.do_grid_search(
        tf,
        model,
        params,
        cv,
        split_kwargs=SPLIT_KWARGS,
        search_kwargs=GRIDSEARCH_KWARGS,
    )

    pd.DataFrame(search.cv_results_).to_csv(f'{EXPERIMENT_PATH}/{tf.title}_{tf.frame_name}.csv')

In [None]:
# Only read the grid-search results of this strategy. A bare "*.csv" would also
# pick up the Strategy.ALL / ONE_VS_ALL result files written to the same
# experiment folder, whose names do not carry the prefix stripped below.
paths = glob.glob(EXPERIMENT_PATH + "/*METABOLITE_CENTRIC*.csv")
if not paths:
    raise FileNotFoundError(
        f'No METABOLITE_CENTRIC result files found in {EXPERIMENT_PATH}'
    )

# File names look like 'Strategy.METABOLITE_CENTRIC_<metabolite>.csv'.
metabolites_names = [
    os.path.basename(path).removesuffix('.csv').removeprefix('Strategy.METABOLITE_CENTRIC_')
    for path in paths
]
results_df = pd.concat([
    pd.read_csv(path).assign(metabolite_id=metabolite_name)
    for path, metabolite_name in zip(paths, metabolites_names)
])

# Keep every non-DecisionTree row; keep DecisionTree rows only when
# max_depth > 0. Then drop rows without a regressor.
is_decision_tree = results_df['param_regressor__regressor'].str.contains('DecisionTreeRegressor', na=False)
has_positive_depth = results_df['param_regressor__regressor__max_depth'] > 0
results_df = results_df[~is_decision_tree | has_positive_depth].dropna(
    subset=['param_regressor__regressor']
)
results_df = _fmt_regressor(results_df)

In [None]:
# 'all' selects the best candidate over every architecture; the other entries
# restrict the choice to a single architecture.
architectures = ['all'] + results_df['param_regressor__regressor'].unique().tolist()
testResultStore = TestResultStore(EXPERIMENT_PATH, STRATEGY)

for architecture in architectures:
    gen = get_generator(
        dl=dl,
        tl=tl,
        strategy=STRATEGY,
        tier=TIER,
        data_config=config
    )

    print(architecture)

    # The architecture filter does not depend on the task frame, so compute it
    # once per architecture rather than once per frame.
    if architecture == 'all':
        architecture_results = results_df
    else:
        architecture_results = results_df[results_df['param_regressor__regressor'] == architecture]

    for tf in gen:
        metabolite_id = tf.frame_name

        # Best-ranked candidate for this metabolite.
        _result_df = architecture_results[architecture_results['metabolite_id'] == metabolite_id]
        _result_df = _result_df.sort_values('rank_test_score').iloc[[0]]

        model = build_model_pipeline(tf)
        model = parse_cv_result(model, _result_df)
        # Pass the split settings by keyword, as the Strategy.ALL cell does, so
        # the call does not rely on the positional order of do_retrain_model.
        model = trainer.do_retrain_model(tf, model, split_kwargs=SPLIT_KWARGS)

        _, X_test, _, y_test = trainer.do_train_test_split(tf, **SPLIT_KWARGS)

        testResultStore.update_results(
            metabolite_id,
            model,
            architecture,
            X_test,
            y_test
        )
testResultStore.to_file()

all
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
ElasticNet model
RandomForest model


An input array is constant; the correlation coefficient is not defined.


RandomForest model
RandomForest model
RandomForest model
RandomForest model
ElasticNet()
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model


An input array is constant; the correlation coefficient is not defined.
An input array is constant; the correlation coefficient is not defined.
An input array is constant; the correlation coefficient is not defined.
An input array is constant; the correlation coefficient is not defined.
An input array is constant; the correlation coefficient is not defined.


ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
RandomForestRegressor()
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model


## Strategy One vs All

In [None]:
# Strategy.ONE_VS_ALL: split settings disable shuffling and stratification.
STRATEGY = Strategy.ONE_VS_ALL
SPLIT_KWARGS = {'shuffle': False, 'stratify': None}
trainer = Trainer()

In [None]:
# Take the first task frame produced for this strategy/tier and build a pipeline for it.
gen = get_generator(dl=dl, tl=tl, strategy=STRATEGY, tier=TIER, data_config=config)
tf: TaskFrame = next(gen)
model = build_model_pipeline(tf)

In [None]:
gen = get_generator(
    dl=dl,
    tl=tl,
    strategy=STRATEGY,
    tier=TIER,
    data_config=config
)

# Run one grid search per task frame and save each result table separately.
for tf in gen:
    # Build the pipeline from the frame being searched instead of reusing
    # whatever `model` is left in the kernel (built from the first frame only,
    # or overwritten by an evaluation cell). This matches the evaluation loops,
    # which also call build_model_pipeline(tf) per frame.
    model = build_model_pipeline(tf)

    search = trainer.do_grid_search(
        tf,
        model,
        params,
        cv,
        split_kwargs=SPLIT_KWARGS,
        search_kwargs=GRIDSEARCH_KWARGS,
    )

    pd.DataFrame(search.cv_results_).to_csv(f'{EXPERIMENT_PATH}/{tf.title}_{tf.frame_name}.csv')

In [None]:
# Collect the per-metabolite grid-search result files of the one-vs-all strategy.
paths = glob.glob(EXPERIMENT_PATH + "/*ONE_VS_ALL*.csv")

# Derive the metabolite name from each file name.
metabolites_names = []
for path in paths:
    file_name = path.rsplit('/', 1)[1]
    metabolites_names.append(
        file_name.removesuffix('.csv').removeprefix('Strategy.ONE_VS_ALL_')
    )

# One frame per file, tagged with its metabolite, stacked into a single table.
tagged_frames = [
    pd.read_csv(path).assign(metabolite_id=metabolite_name)
    for path, metabolite_name in zip(paths, metabolites_names)
]
results_df = pd.concat(tagged_frames)

# Keep every non-DecisionTree row; DecisionTree rows need max_depth > 0.
not_decision_tree = ~results_df['param_regressor__regressor'].str.contains('DecisionTreeRegressor', na=False)
positive_depth = results_df['param_regressor__regressor__max_depth'] > 0
results_df = results_df[not_decision_tree | positive_depth].dropna(subset='param_regressor__regressor')

results_df = _fmt_regressor(results_df)
results_df

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__preprocessor__num__scaler,param_regressor__regressor,param_regressor__regressor__l1_ratio,param_regressor__regressor__tol,param_regressor__regressor__criterion,...,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,metabolite_id
0,0,0.142945,0.030812,0.014631,0.002328,MinMaxScaler(),ElasticNet(),0.01,0.01,,...,-0.540166,-0.541760,-0.545391,-0.541757,-0.540964,-0.548708,-0.542787,0.002611,93,accoa
1,1,0.185340,0.013399,0.017018,0.001443,MinMaxScaler(),ElasticNet(),0.25,0.01,,...,-0.540166,-0.541760,-0.545391,-0.541757,-0.540964,-0.548708,-0.542787,0.002611,93,accoa
2,2,0.187205,0.010688,0.015719,0.001886,MinMaxScaler(),ElasticNet(),0.50,0.01,,...,-0.540166,-0.541760,-0.545391,-0.541757,-0.540964,-0.548708,-0.542787,0.002611,93,accoa
3,3,0.201018,0.032827,0.018559,0.003030,MinMaxScaler(),ElasticNet(),0.75,0.01,,...,-0.540166,-0.541760,-0.545391,-0.541757,-0.540964,-0.548708,-0.542787,0.002611,93,accoa
4,4,0.215802,0.022706,0.020663,0.004466,MinMaxScaler(),ElasticNet(),1.00,0.01,,...,-0.540166,-0.541760,-0.545391,-0.541757,-0.540964,-0.548708,-0.542787,0.002611,93,accoa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,100,6.539133,0.164711,0.015128,0.000908,RobustScaler(),RandomForestRegressor(),,,friedman_mse,...,-0.372072,-0.401027,-0.395004,-0.393105,-0.390337,-0.391611,-0.390320,0.008353,28,g6p;f6p;g6p-B
101,101,16.450289,0.245280,0.031455,0.002555,RobustScaler(),RandomForestRegressor(),,,friedman_mse,...,-0.377845,-0.387536,-0.383710,-0.380462,-0.385080,-0.383070,-0.384029,0.004503,8,g6p;f6p;g6p-B
102,102,33.248118,0.596361,0.058389,0.004440,RobustScaler(),RandomForestRegressor(),,,friedman_mse,...,-0.372060,-0.385928,-0.391107,-0.382685,-0.386699,-0.392826,-0.384745,0.005737,13,g6p;f6p;g6p-B
103,103,49.350143,1.752222,0.074544,0.008169,RobustScaler(),RandomForestRegressor(),,,friedman_mse,...,-0.380373,-0.383973,-0.388467,-0.378436,-0.391151,-0.391906,-0.385916,0.005625,23,g6p;f6p;g6p-B


In [None]:
# 'all' selects the best candidate over every architecture; the other entries
# restrict the choice to a single architecture.
architectures = ['all'] + results_df['param_regressor__regressor'].unique().tolist()
testResultStore = TestResultStore(EXPERIMENT_PATH, STRATEGY)

for architecture in architectures:
    gen = get_generator(
        dl=dl,
        tl=tl,
        strategy=STRATEGY,
        tier=TIER,
        data_config=config
    )

    # The architecture filter does not depend on the task frame, so compute it
    # once per architecture rather than once per frame.
    if architecture == 'all':
        architecture_results = results_df
    else:
        architecture_results = results_df[results_df['param_regressor__regressor'] == architecture]

    for tf in gen:
        metabolite_id = tf.frame_name

        # Best-ranked candidate for this metabolite.
        _result_df = architecture_results[architecture_results['metabolite_id'] == metabolite_id]
        _result_df = _result_df.sort_values('rank_test_score').iloc[[0]]

        model = build_model_pipeline(tf)
        model = parse_cv_result(model, _result_df)
        # Pass the split settings by keyword, as the Strategy.ALL cell does, so
        # the call does not rely on the positional order of do_retrain_model.
        model = trainer.do_retrain_model(tf, model, split_kwargs=SPLIT_KWARGS)

        _, X_test, _, y_test = trainer.do_train_test_split(tf, **SPLIT_KWARGS)

        testResultStore.update_results(
            metabolite_id,
            model,
            architecture,
            X_test,
            y_test
        )
testResultStore.to_file()

RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
ElasticNet model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
RandomForest model
