# Tune the best ML model for AKGDH

"Best" here means the model with the lowest average MAPE score from "models_filter".

Preprocessing and feature selection are performed after splitting the data, so only the samples in the training data are available for these steps.

In [1]:
#import libraries
from functools import partial #to pass parameters to function inside another function

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import linear_model
from sklearn import svm
from sklearn.base import clone
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
#import self-made functions by executing the helper notebook inside this kernel
#(presumably the source of load_gerosa, clean_gexp_g, CoVSelector, compare_models and
#get_models_preds, which are used below but not defined in the visible cells – TODO confirm)
#NOTE(review): absolute, user-specific path – the notebook will not run on another machine as-is
%run /Users/mariekececilia/Documents/master_thesis_code/methods.ipynb

In [3]:
#set a random seed to make reproducible results (used in mutual information):
#get_pipelines passes it as random_state to mutual_info_regression
seed = 0

## Load and clean data

In [4]:
#read the transcriptional (gexp) and fluxomic (flux) data sets
gexp, flux = load_gerosa()

#remove duplicates from the transcriptional data (the helper also returns `groups`)
gexp, groups = clean_gexp_g(gexp)

#the cleaned transcriptional data are the features
X = gexp

#pick out the flux of every target, keyed by the target's label
target_labels = ['R_AKGDH']
targets = {label: flux[label] for label in target_labels}

## Create models/pipelines

Choose selection parameters:

In [5]:
#values passed as p to CoVSelector in get_pipelines (one set of pipelines per value)
cov_filters = [0.2] #same as in the best model
#values passed as k to SelectKBest in get_pipelines (one pipeline per value)
k_numbers = [15, 20, 30] #same k as in the best model + 1 larger and 1 smaller k

Choose learning algorithms:

In [6]:
#plain linear regression first, then one lasso and one ridge model per alpha;
#the generated keys follow the pattern '<algorithm>_a<alpha>', e.g. 'lasso_a0.05'
algorithms = {
    'lr': linear_model.LinearRegression(),
    **{
        '%s_a%s' % (prefix, alpha): estimator(alpha = alpha)
        for prefix, estimator in (('lasso', linear_model.Lasso),
                                  ('ridge', linear_model.Ridge))
        for alpha in (1, 0.1, 0.05, 0.001)
    },
}

Make combinations:

In [7]:
def get_pipelines(algorithm, cov_filters, k_numbers):
    """Build one pipeline per (cov_filter, k) combination for a learning algorithm.

    Every pipeline chains four steps:
    'filter' (CoVSelector with p = cov_filter) -> 'selection' (SelectKBest scored
    by mutual_info_regression, seeded with the notebook-level `seed`) ->
    'standardize' (StandardScaler) -> 'm' (the learning algorithm).

    Parameters
    ----------
    algorithm : scikit-learn estimator
        Estimator used as the final step. It is cloned for every pipeline, so
        the object passed in is never fitted or modified through a pipeline.
    cov_filters : iterable of numbers
        Values passed as `p` to CoVSelector.
    k_numbers : iterable of numbers
        Values passed as `k` to SelectKBest.

    Returns
    -------
    dict
        Maps 'v<cov_filter*100>_mi<k>' (e.g. 'v20_mi15') to the Pipeline built
        for that combination.
    """
    models = dict()

    for cov_filter in cov_filters:

        for k in k_numbers:
            #selection by mi – same as in the best model (skipping the other scoring method)
            steps = [
                ('filter', CoVSelector(p = cov_filter)),
                ('selection', SelectKBest(
                    score_func = partial(mutual_info_regression, random_state = seed),
                    k = k)),
                ('standardize', StandardScaler()),
                #give each pipeline its own unfitted copy: sharing one estimator object
                #would let a fit of one pipeline overwrite the model inside the others
                ('m', clone(algorithm))
            ]
            models['v%.0f_mi%.0f' % (cov_filter*100, k)] = Pipeline(steps=steps)

    return models

In [8]:
#one dict of pipelines (all selection combinations) per learning algorithm
models = {
    name: get_pipelines(algorithm, cov_filters = cov_filters, k_numbers = k_numbers)
    for name, algorithm in algorithms.items()
}

## Learning

In [9]:
# evaluate every pipeline and keep the results in nested dicts:
# target -> learning algorithm -> scores/preds of all selection combinations
scores, preds = {}, {}
for y_name, y in targets.items():
    scores[y_name], preds[y_name] = {}, {}
    for model_name, pipelines in models.items():
        #scores first, then predictions, for each learning algorithm in turn
        scores[y_name][model_name] = compare_models(pipelines, X, y)
        preds[y_name][model_name] = get_models_preds(pipelines, X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

### Save data to use in other notebooks

Scores and predictions are visualized in the Notebook 'compare_models', which also merges the results with the results from all other models built.

In [10]:
scores_tune_akgdh = scores
preds_tune_akgdh= preds
%store scores_tune_akgdh
%store preds_tune_akgdh

Stored 'scores_tune_akgdh' (dict)
Stored 'preds_tune_akgdh' (dict)


### Scores and predictions

In [11]:
def _print_caption(target, algorithm, description):
    """Print the blank line and the three caption lines shown above each table."""
    print()
    print(target + '\t(target)')
    #always concatenate: print(algorithm, '...') would insert a stray space before the tab
    print(algorithm + '\t(learning algorithm)')
    print(description)


#for every target and learning algorithm, show three tables: the score summary,
#the individual cv-split scores, and the predictions next to the actual values
for y in targets.keys():
    for algorithm in algorithms.keys():
        #element [1] of the score results is the summary table; it is shown sorted
        #ascending on its 'average' column
        _print_caption(y, algorithm,
                       '–> summary of all cv split scores for each combination of selection methods:')
        #rename_axis returns a new frame, so the frames kept in `scores`/`preds`
        #are not modified by labelling them for display
        display(
            scores[y][algorithm][1]
            .sort_values(by = 'average')
            .rename_axis(index = 'Selection combo', columns = 'Score')
        )

        #element [0] of the score results holds the individual cv-split scores
        _print_caption(y, algorithm,
                       '–> all individual cv-split scores for each combination of selection methods:')
        display(scores[y][algorithm][0].rename_axis(columns = 'Selection combo'))

        _print_caption(y, algorithm,
                       '–> all predictions for each combination of selection methods (+ actual values):')
        display(
            preds[y][algorithm].rename_axis(index = 'Test set', columns = 'Selection combo')
        )


R_AKGDH	(target)
lr 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,0.897167,1.588921
v20_mi15,0.971282,1.877622
v20_mi30,1.066749,1.42775



R_AKGDH	(target)
lr	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.216796,0.000976,0.13315
Fructose,0.321971,0.311837,0.178014
Galactose,0.313537,0.516975,2.388713
Glucose,0.377105,0.461827,0.415111
Glycerol,0.008542,0.019984,0.039246
Gluconate,5.927318,5.073364,4.352453
Pyruvate,0.449941,0.448523,0.440749
Succinate,0.155049,0.343849,0.586556



R_AKGDH	(target)
lr	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,3.342057,4.271329,3.698992
Fructose,3.874895,5.122498,5.083231,4.564678
Galactose,0.236073,0.162055,0.358116,0.799982
Glucose,2.138073,1.331796,1.150654,1.250536
Glycerol,1.840459,1.856181,1.877238,1.768228
Gluconate,0.182358,1.263254,1.107528,0.976064
Pyruvate,7.415655,4.079051,4.089565,4.147209
Succinate,2.351983,1.987311,1.543257,0.972412



R_AKGDH	(target)
lasso_a1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi30,2.18687,3.555781
v20_mi20,2.187465,3.552109
v20_mi15,2.225683,3.631586



R_AKGDH	(target)
lasso_a1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.228526,0.228527,0.228527
Fructose,0.099727,0.099727,0.017231
Galactose,6.772659,6.772579,6.786211
Glucose,0.009522,0.006049,0.006131
Glycerol,0.099489,0.099484,0.099488
Gluconate,9.939246,9.637059,9.637391
Pyruvate,0.637679,0.637679,0.637679
Succinate,0.018619,0.018616,0.082301



R_AKGDH	(target)
lasso_a1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,3.292005,3.292003,3.291999
Fructose,3.874895,4.261327,4.261327,3.808125
Galactose,0.236073,1.834911,1.834892,1.83811
Glucose,2.138073,2.117714,2.12514,2.124965
Glycerol,1.840459,2.023563,2.023555,2.023562
Gluconate,0.182358,1.994862,1.939756,1.939816
Pyruvate,7.415655,2.68685,2.68685,2.68685
Succinate,2.351983,2.395774,2.395767,2.158412



R_AKGDH	(target)
lasso_a0.1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,0.671654,1.29611
v20_mi15,0.766337,1.582414
v20_mi30,1.213727,1.711929



R_AKGDH	(target)
lasso_a0.1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.02568,0.025756,0.000176
Fructose,0.370061,0.497126,0.282836
Galactose,0.043618,0.0493,4.26677
Glucose,0.065532,0.065517,0.065534
Glycerol,0.022136,0.022989,0.138439
Gluconate,4.930529,4.065216,4.064465
Pyruvate,0.493879,0.493385,0.493441
Succinate,0.179262,0.153942,0.39816



R_AKGDH	(target)
lasso_a0.1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,4.376745,4.37707,4.266411
Fructose,3.874895,5.30884,5.801205,4.970853
Galactose,0.236073,0.246369,0.247711,1.24334
Glucose,2.138073,1.997961,1.997993,1.997958
Glycerol,1.840459,1.8812,1.88277,1.585668
Gluconate,0.182358,1.081481,0.923684,0.923547
Pyruvate,7.415655,3.753221,3.75688,3.756465
Succinate,2.351983,1.930362,1.989913,1.415519



R_AKGDH	(target)
lasso_a0.05 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,0.738723,1.168857
v20_mi15,0.825745,1.515971
v20_mi30,1.216268,1.680586



R_AKGDH	(target)
lasso_a0.05	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.030522,0.037211,0.006635
Fructose,0.258869,0.470349,0.302949
Galactose,0.779834,0.920797,4.459927
Glucose,0.070207,0.069753,0.078437
Glycerol,0.001925,0.025528,0.24114
Gluconate,4.784008,3.736068,3.737991
Pyruvate,0.483183,0.483146,0.48313
Succinate,0.197417,0.16693,0.419932



R_AKGDH	(target)
lasso_a0.05	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,4.397403,4.425948,4.295474
Fructose,3.874895,4.877984,5.697448,5.048791
Galactose,0.236073,0.051975,0.018698,1.288939
Glucose,2.138073,1.987966,1.988936,1.97037
Glycerol,1.840459,1.836916,1.793475,1.39665
Gluconate,0.182358,1.054762,0.863661,0.864012
Pyruvate,7.415655,3.832533,3.832808,3.832928
Succinate,2.351983,1.887662,1.959367,1.364309



R_AKGDH	(target)
lasso_a0.001 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi30,0.716547,0.634791
v20_mi20,1.083335,1.908089
v20_mi15,1.146271,2.125325



R_AKGDH	(target)
lasso_a0.001	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.025796,0.047656,0.352769
Fructose,1.258142,0.536172,0.249041
Galactose,0.357104,1.051805,0.837426
Glucose,0.175513,0.090008,0.23753
Glycerol,0.039473,0.022835,0.174085
Gluconate,6.681651,6.06108,2.137064
Pyruvate,0.488881,0.486355,0.506103
Succinate,0.143609,0.370774,1.238357



R_AKGDH	(target)
lasso_a0.001	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,4.157087,4.470518,2.761839
Fructose,3.874895,8.750063,5.952504,4.839903
Galactose,0.236073,0.320375,0.484375,0.433766
Glucose,2.138073,2.513333,2.330516,1.630216
Glycerol,1.840459,1.767811,1.798432,1.520063
Gluconate,0.182358,1.400813,1.287646,0.57207
Pyruvate,7.415655,3.790285,3.809011,3.662572
Succinate,2.351983,2.014216,1.479929,-0.560613



R_AKGDH	(target)
ridge_a1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,1.154439,1.877199
v20_mi15,1.196066,2.086189
v20_mi30,1.209776,1.705138



R_AKGDH	(target)
ridge_a1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.145236,0.05476,0.153207
Fructose,0.218272,0.249367,0.188844
Galactose,1.830491,2.135736,3.066316
Glucose,0.127365,0.206276,0.306185
Glycerol,0.030224,0.030803,0.074614
Gluconate,6.524157,5.822086,5.001369
Pyruvate,0.448609,0.454607,0.441951
Succinate,0.244175,0.281878,0.445722



R_AKGDH	(target)
ridge_a1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,3.647416,4.033491,3.613404
Fructose,3.874895,4.720676,4.841167,4.606644
Galactose,0.236073,0.668201,0.740261,0.959945
Glucose,2.138073,1.865757,1.69704,1.483428
Glycerol,1.840459,1.784833,1.783767,1.703134
Gluconate,0.182358,1.372092,1.244064,1.094399
Pyruvate,7.415655,4.088923,4.044443,4.138297
Succinate,2.351983,1.777687,1.68901,1.303651



R_AKGDH	(target)
ridge_a0.1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi15,0.932329,1.924114
v20_mi20,0.942363,1.630785
v20_mi30,1.081909,1.463682



R_AKGDH	(target)
ridge_a0.1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.163349,0.008027,0.135873
Fructose,0.254041,0.288022,0.177788
Galactose,0.077855,0.876607,2.473695
Glucose,0.312405,0.383335,0.392539
Glycerol,0.004031,0.008237,0.044983
Gluconate,6.011791,5.203463,4.435858
Pyruvate,0.446752,0.44956,0.440736
Succinate,0.18841,0.321656,0.553798



R_AKGDH	(target)
ridge_a0.1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,3.570124,4.232909,3.687373
Fructose,3.874895,4.859277,4.990949,4.563804
Galactose,0.236073,0.254452,0.443015,0.820044
Glucose,2.138073,1.470129,1.318475,1.298796
Glycerol,1.840459,1.847878,1.855619,1.757669
Gluconate,0.182358,1.278658,1.131253,0.991274
Pyruvate,7.415655,4.102693,4.081875,4.147308
Succinate,2.351983,1.908846,1.595454,1.04946



R_AKGDH	(target)
ridge_a0.05 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,0.921037,1.609858
v20_mi15,0.938164,1.906376
v20_mi30,1.074234,1.446051



R_AKGDH	(target)
ridge_a0.05	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.181632,0.003692,0.13454
Fructose,0.279098,0.298349,0.177788
Galactose,0.10444,0.713835,2.431812
Glucose,0.340369,0.417146,0.40302
Glycerol,0.006863,0.013589,0.042226
Gluconate,5.970488,5.141554,4.394863
Pyruvate,0.447686,0.44907,0.440736
Succinate,0.174739,0.331062,0.568885



R_AKGDH	(target)
ridge_a0.05	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,3.492111,4.251409,3.693057
Fructose,3.874895,4.95637,5.030968,4.563806
Galactose,0.236073,0.211417,0.404589,0.810157
Glucose,2.138073,1.410339,1.246185,1.276386
Glycerol,1.840459,1.85309,1.865469,1.762744
Gluconate,0.182358,1.271126,1.119963,0.983798
Pyruvate,7.415655,4.095767,4.08551,4.14731
Succinate,2.351983,1.941001,1.573331,1.013974



R_AKGDH	(target)
ridge_a0.001 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,0.897657,1.589341
v20_mi15,0.970504,1.878221
v20_mi30,1.066896,1.428123



R_AKGDH	(target)
ridge_a0.001	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.215808,0.000879,0.133178
Fructose,0.320838,0.311529,0.178006
Galactose,0.309024,0.521333,2.389587
Glucose,0.376233,0.460792,0.414851
Glycerol,0.008533,0.019843,0.039308
Gluconate,5.928199,5.074797,4.353316
Pyruvate,0.449869,0.448534,0.440749
Succinate,0.155529,0.343549,0.586173



R_AKGDH	(target)
ridge_a0.001	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.267163,3.346276,4.270915,3.698871
Fructose,3.874895,5.11811,5.082036,4.564651
Galactose,0.236073,0.16312,0.359145,0.800188
Glucose,2.138073,1.333659,1.152867,1.251092
Glycerol,1.840459,1.856163,1.87698,1.768114
Gluconate,0.182358,1.263415,1.10779,0.976221
Pyruvate,7.415655,4.079583,4.089479,4.147212
Succinate,2.351983,1.986182,1.543961,0.973313
