# Tune the best ML model for ICDHyr

The best = the one with the lowest average MAPE score from "models_filter".

Preprocessing and feature selection is performed after splitting the data (therefore, only the samples in the training data are available for these steps). 

In [1]:
#import libraries
import pandas as pd
import numpy as np
from IPython.display import display

from functools import partial #to pass parameters to function inside another function
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn import svm

In [2]:
#import self-made functions
%run /Users/mariekececilia/Documents/master_thesis_code/methods.ipynb

In [3]:
#set a random seed to make reproducible results (used in mutual information)
seed = 0

## Load and clean data

In [4]:
#load transcriptional and fluxomic data
gexp, flux = load_gerosa()

#clean the transcriptional data (remove duplicates)
gexp, groups = clean_gexp_g(gexp)

#define features
X = gexp

#define targets and extract their fluxes
target_labels = ['R_ICDHyr']
targets = dict()
for target_label in target_labels:
    targets[target_label] = flux[target_label]

## Create models/pipelines

Choose selection parameters:

In [5]:
cov_filters = [0.2] #same as in the best model
k_numbers = [15, 20, 30] #same k as in the best model + 1 larger and 1 smaller k

Choose learning algorithms:

In [6]:
algorithms = dict({
    'lr': linear_model.LinearRegression(),
    'lasso_a1': linear_model.Lasso(alpha = 1),
    'lasso_a0.1': linear_model.Lasso(alpha = 0.1),
    'lasso_a0.05': linear_model.Lasso(alpha = 0.05), 
    'lasso_a0.001': linear_model.Lasso(alpha = 0.001),
    'ridge_a1': linear_model.Ridge(alpha = 1),
    'ridge_a0.1': linear_model.Ridge(alpha = 0.1),
    'ridge_a0.05': linear_model.Ridge(alpha = 0.05),
    'ridge_a0.001': linear_model.Ridge(alpha = 0.001),
    })

Make combinations:

In [7]:
def get_pipelines(algorithm, cov_filters, k_numbers):
    models = dict()

    for cov_filter in cov_filters:  
        
        for k in k_numbers:
            #selection by mi – same as in the best model (skipping the other scoring method)
            steps = [
                ('filter', CoVSelector(p = cov_filter)),
                ('selection', SelectKBest(score_func = partial(
                    mutual_info_regression, 
                    random_state=seed), 
                                          k = k)), 
                ('standardize', StandardScaler()),
                ('m', algorithm)
            ]
            models['v%.0f_mi%.0f' % (cov_filter*100, k)] = Pipeline(steps=steps)

    return models

In [8]:
models = dict()

for name, algorithm in algorithms.items():
     models[name]= get_pipelines(algorithm, cov_filters = cov_filters, k_numbers = k_numbers)

## Learning

In [9]:
# evaluate the models and store results in nested dicts: target -> learning algorithm -> scores/preds of all selection combinations
scores = dict()
preds = dict()
for y_name, y in targets.items():
    model_scores = dict()
    model_preds = dict()
    for model_name, pipelines in models.items():
        #get scores
        model_scores[model_name] = compare_models(pipelines, X, y)
        #get predictions
        model_preds[model_name] = get_models_preds(pipelines, X, y)
    scores[y_name] = model_scores
    preds[y_name] = model_preds

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

### Save data to use in other notebooks

Scores and predictions are visualized in the Notebook 'compare_models', which also merges the results with the results from all other models built.

In [10]:
scores_tune_icdhyr = scores
preds_tune_icdhyr = preds
%store scores_tune_icdhyr
%store preds_tune_icdhyr

Stored 'scores_tune_icdhyr' (dict)
Stored 'preds_tune_icdhyr' (dict)


### Scores and predictions

In [11]:
for y in targets.keys():
    for algorithm in algorithms.keys():
        print()
        print(y + '\t(target)') 
        print(algorithm, '\t(learning algorithm)') 
        print('–> summary of all cv split scores for each combination of selection methods:')
        df = scores[y][algorithm][1].sort_values(by = 'average')
        df.index.name = 'Selection combo'
        df.columns.name = 'Score'
        display(df)
        
        print()
        print(y + '\t(target)') 
        print(algorithm + '\t(learning algorithm)') 
        print('–> all individual cv-split scores for each combination of selection methods:')
        df = scores[y][algorithm][0]
        df.columns.name = 'Selection combo'
        display(df)
        
        print()
        print(y + '\t(target)') 
        print(algorithm + '\t(learning algorithm)') 
        print('–> all predictions for each combination of selection methods (+ actual values):')
        df = preds[y][algorithm]
        df.index.name = 'Test set'
        df.columns.name = 'Selection combo'
        display(df)


R_ICDHyr	(target)
lr 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi15,0.281726,0.361808
v20_mi30,0.289191,0.163812
v20_mi20,0.359214,0.309566



R_ICDHyr	(target)
lr	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.141819,0.07558,0.204127
Fructose,1.176498,0.557744,0.235388
Galactose,0.058176,1.069707,0.671772
Glucose,0.035424,0.193387,0.192835
Glycerol,0.066551,0.063733,0.143427
Gluconate,0.335965,0.358364,0.34093
Pyruvate,0.392126,0.360838,0.371025
Succinate,0.047244,0.194357,0.154021



R_ICDHyr	(target)
lr	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,4.030374,5.051373,5.655082
Fructose,4.566119,9.938149,7.112843,5.640929
Galactose,0.496177,0.467312,1.026942,0.829496
Glucose,2.977971,2.872478,2.40207,2.403714
Glycerol,2.464836,2.300798,2.307744,2.111313
Gluconate,1.154034,1.541748,1.567598,1.547478
Pyruvate,7.97932,4.850417,5.100077,5.018793
Succinate,3.038949,2.895377,2.448308,2.570888



R_ICDHyr	(target)
lasso_a1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi15,0.918934,1.468219
v20_mi30,0.920942,1.470491
v20_mi20,0.925596,1.468502



R_ICDHyr	(target)
lasso_a1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.255296,0.256057,0.256056
Fructose,0.487575,0.487577,0.460096
Galactose,4.693581,4.696789,4.69707
Glucose,0.090843,0.090867,0.104684
Glycerol,0.076737,0.076735,0.076738
Gluconate,1.149021,1.180513,1.180513
Pyruvate,0.577426,0.577426,0.577426
Succinate,0.020996,0.038801,0.014953



R_ICDHyr	(target)
lasso_a1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,3.49744,3.493866,3.493871
Fructose,4.566119,6.792446,6.792452,6.66697
Galactose,0.496177,2.825026,2.826618,2.826757
Glucose,2.977971,3.248499,3.24857,3.289717
Glycerol,2.464836,2.275693,2.275696,2.27569
Gluconate,1.154034,2.480043,2.516386,2.516386
Pyruvate,7.97932,3.371856,3.371857,3.371857
Succinate,3.038949,2.975144,2.921034,2.993508



R_ICDHyr	(target)
lasso_a0.1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,0.325475,0.219395
v20_mi15,0.348459,0.246365
v20_mi30,0.362169,0.275548



R_ICDHyr	(target)
lasso_a0.1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.13936,0.132125,0.132132
Fructose,0.693491,0.507482,0.825879
Galactose,0.787017,0.786992,0.787007
Glucose,0.069821,0.076571,0.120407
Glycerol,0.200752,0.168905,0.20749
Gluconate,0.301555,0.326244,0.32665
Pyruvate,0.406653,0.386599,0.406502
Succinate,0.189023,0.218886,0.091283



R_ICDHyr	(target)
lasso_a0.1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,4.041923,4.075903,4.075869
Fructose,4.566119,7.732683,6.88334,8.337179
Galactose,0.496177,0.886677,0.886665,0.886673
Glucose,2.977971,2.770047,2.749944,2.619403
Glycerol,2.464836,1.970015,2.048514,1.953408
Gluconate,1.154034,1.502038,1.53053,1.530998
Pyruvate,7.97932,4.734503,4.894522,4.735714
Succinate,3.038949,2.464519,2.373765,2.761545



R_ICDHyr	(target)
lasso_a0.05 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi20,0.279225,0.197944
v20_mi30,0.354237,0.291581
v20_mi15,0.355982,0.222794



R_ICDHyr	(target)
lasso_a0.05	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.136811,0.126885,0.126855
Fructose,0.705887,0.487795,0.895001
Galactose,0.63634,0.63297,0.736078
Glucose,0.051243,0.029795,0.036184
Glycerol,0.240486,0.060706,0.218166
Gluconate,0.478667,0.292001,0.315266
Pyruvate,0.400022,0.374137,0.409071
Succinate,0.198401,0.229508,0.09728



R_ICDHyr	(target)
lasso_a0.05	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,4.053897,4.100514,4.100654
Fructose,4.566119,7.789281,6.793449,8.6528
Galactose,0.496177,0.811915,0.810243,0.861403
Glucose,2.977971,2.825373,2.889243,2.870217
Glycerol,2.464836,1.872078,2.315205,1.927093
Gluconate,1.154034,1.706432,1.491012,1.517861
Pyruvate,7.97932,4.787418,4.993964,4.715213
Succinate,3.038949,2.436019,2.341487,2.743321



R_ICDHyr	(target)
lasso_a0.001 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi30,0.194782,0.082812
v20_mi20,0.226665,0.296443
v20_mi15,0.294899,0.237186



R_ICDHyr	(target)
lasso_a0.001	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.695461,0.152807,0.210293
Fructose,0.541611,0.019413,0.084701
Galactose,0.219067,0.96537,0.331699
Glucose,0.390062,0.227236,0.176418
Glycerol,0.039591,0.036907,0.157846
Gluconate,0.031047,0.063032,0.184027
Pyruvate,0.405924,0.316864,0.310703
Succinate,0.036428,0.031691,0.102567



R_ICDHyr	(target)
lasso_a0.001	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,1.430241,3.978772,3.708794
Fructose,4.566119,7.039178,4.654759,4.952875
Galactose,0.496177,0.604873,0.975172,0.660759
Glucose,2.977971,4.139565,3.654674,2.452604
Glycerol,2.464836,2.562421,2.555807,2.075771
Gluconate,1.154034,1.118204,1.226775,0.941661
Pyruvate,7.97932,4.740319,5.450964,5.500118
Succinate,3.038949,3.149651,2.942641,2.727252



R_ICDHyr	(target)
ridge_a1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi30,0.336413,0.312851
v20_mi15,0.44048,0.529843
v20_mi20,0.458988,0.601283



R_ICDHyr	(target)
ridge_a1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.053331,0.056541,0.049321
Fructose,0.428905,0.291489,0.223768
Galactose,1.770176,2.009099,1.098236
Glucose,0.086423,0.168998,0.189541
Glycerol,0.075552,0.066019,0.121581
Gluconate,0.507385,0.478307,0.450597
Pyruvate,0.414744,0.373119,0.379624
Succinate,0.187324,0.228331,0.178639



R_ICDHyr	(target)
ridge_a1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,4.445951,4.961959,4.92805
Fructose,4.566119,6.524549,5.897094,5.587869
Galactose,0.496177,1.374499,1.493047,1.041097
Glucose,2.977971,2.720607,2.474701,2.413524
Glycerol,2.464836,2.278614,2.302111,2.165159
Gluconate,1.154034,1.739573,1.706016,1.674038
Pyruvate,7.97932,4.669944,5.002087,4.950175
Succinate,3.038949,2.46968,2.345063,2.496074



R_ICDHyr	(target)
ridge_a0.1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi30,0.294312,0.183712
v20_mi15,0.310804,0.269614
v20_mi20,0.377683,0.366976



R_ICDHyr	(target)
ridge_a0.1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.113554,0.075676,0.173784
Fructose,0.877809,0.46706,0.233343
Galactose,0.50838,1.280324,0.729274
Glucose,0.053104,0.191627,0.191858
Glycerol,0.068254,0.064016,0.1401
Gluconate,0.367855,0.375519,0.355765
Pyruvate,0.400222,0.363344,0.371988
Succinate,0.097253,0.203894,0.15838



R_ICDHyr	(target)
ridge_a0.1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,4.163119,5.051823,5.512581
Fructose,4.566119,8.574299,6.69877,5.631589
Galactose,0.496177,0.748424,1.131445,0.858027
Glucose,2.977971,2.819828,2.407311,2.406625
Glycerol,2.464836,2.296602,2.307047,2.119513
Gluconate,1.154034,1.57855,1.587396,1.564598
Pyruvate,7.97932,4.785821,5.080081,5.011106
Succinate,3.038949,2.743402,2.419325,2.55764



R_ICDHyr	(target)
ridge_a0.05 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi30,0.291728,0.173907
v20_mi15,0.292896,0.295323
v20_mi20,0.368917,0.339555



R_ICDHyr	(target)
ridge_a0.05	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.125672,0.075926,0.188083
Fructose,0.997315,0.505623,0.234325
Galactose,0.281163,1.184377,0.701105
Glucose,0.045562,0.192682,0.192306
Glycerol,0.067412,0.063807,0.141701
Gluconate,0.352947,0.367167,0.3485
Pyruvate,0.396933,0.36218,0.371507
Succinate,0.076164,0.199578,0.156294



R_ICDHyr	(target)
ridge_a0.05	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,4.106211,5.052997,5.579736
Fructose,4.566119,9.119976,6.874851,5.636073
Galactose,0.496177,0.635684,1.083838,0.84405
Glucose,2.977971,2.842289,2.404169,2.405291
Glycerol,2.464836,2.298677,2.307563,2.115568
Gluconate,1.154034,1.561346,1.577756,1.556214
Pyruvate,7.97932,4.812068,5.089373,5.014946
Succinate,3.038949,2.807492,2.432441,2.56398



R_ICDHyr	(target)
ridge_a0.001 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v20_mi15,0.280201,0.361073
v20_mi30,0.289241,0.164017
v20_mi20,0.359417,0.310183



R_ICDHyr	(target)
ridge_a0.001	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.141444,0.075595,0.203787
Fructose,1.172038,0.556523,0.235366
Galactose,0.049399,1.072226,0.672371
Glucose,0.035661,0.193378,0.192824
Glycerol,0.066569,0.063733,0.143391
Gluconate,0.336329,0.358545,0.341085
Pyruvate,0.392245,0.360867,0.371035
Succinate,0.047927,0.194472,0.154068



R_ICDHyr	(target)
ridge_a0.001	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v20_mi15,v20_mi20,v20_mi30
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,4.696417,4.032136,5.051443,5.653485
Fructose,4.566119,9.917782,7.107268,5.640828
Galactose,0.496177,0.471667,1.028192,0.829793
Glucose,2.977971,2.871773,2.402097,2.403748
Glycerol,2.464836,2.300756,2.307746,2.111401
Gluconate,1.154034,1.542168,1.567807,1.547657
Pyruvate,7.97932,4.84947,5.099847,5.018716
Succinate,3.038949,2.893302,2.447959,2.570744
