# Tune the best ML model for PPC

Here, "best" means the model with the lowest average MAPE score from "models_filter".

Preprocessing and feature selection are performed after splitting the data, so only the samples in the training data are available for these steps.

In [1]:
#import libraries
import pandas as pd
import numpy as np
from IPython.display import display

from functools import partial #to pass parameters to function inside another function
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn import svm

In [2]:
#import self-made functions
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where
# the file exists at exactly this location.
# The helpers used below that are neither defined nor imported in this notebook
# (load_gerosa, clean_gexp_g, CoVSelector, compare_models, get_models_preds)
# presumably come from methods.ipynb -- TODO confirm.
%run /Users/mariekececilia/Documents/master_thesis_code/methods.ipynb

In [3]:
#set a random seed to make reproducible results (used in mutual information)
# NOTE(review): no cell shown in this notebook references `seed`; the pipelines built
# below score features with f_regression rather than mutual_info_regression.
seed = 0

## Load and clean data

In [4]:
# Read the transcriptional (gene expression) and fluxomic data sets.
gexp, flux = load_gerosa()

# Clean the transcriptional data (remove duplicates); the helper also returns `groups`.
gexp, groups = clean_gexp_g(gexp)

# Features: the cleaned transcriptional data.
X = gexp

# Targets: map each requested flux label to its fluxes taken from `flux`.
target_labels = ['R_PPC']
targets = {target_label: flux[target_label] for target_label in target_labels}

## Create models/pipelines

Choose selection parameters:

In [5]:
# Each value is passed as `p` to CoVSelector (pipeline step 'filter') in get_pipelines.
cov_filters = [0.1] #same as in the best model
# Each value is passed as `k` to SelectKBest (pipeline step 'selection') in get_pipelines.
k_numbers = [15, 10, 5] #same k as in the best model + 1 larger and 1 smaller k

Choose learning algorithms:

In [6]:
# Candidate learning algorithms, keyed by the label used in every result table.
# The insertion order of this dict is the order in which results are reported.
algorithms = {
    # ordinary least squares
    'lr': linear_model.LinearRegression(),

    # Lasso with decreasing regularisation strength
    'lasso_a1': linear_model.Lasso(alpha=1),          # this makes very bad predictions
    'lasso_a0.1': linear_model.Lasso(alpha=0.1),
    'lasso_a0.05': linear_model.Lasso(alpha=0.05),    # causes a warning: does not converge
    'lasso_a0.001': linear_model.Lasso(alpha=0.001),  # causes a warning: does not converge

    # Ridge with the same grid of regularisation strengths
    'ridge_a1': linear_model.Ridge(alpha=1),
    'ridge_a0.1': linear_model.Ridge(alpha=0.1),
    'ridge_a0.05': linear_model.Ridge(alpha=0.05),
    'ridge_a0.001': linear_model.Ridge(alpha=0.001),
}

Make combinations:

In [7]:
def get_pipelines(algorithm, cov_filters, k_numbers):
    """Build one pipeline per (cov_filter, k) combination for a learning algorithm.

    Every pipeline chains four steps:
    'filter' (CoVSelector(p=cov_filter)) -> 'selection' (SelectKBest with
    f_regression and k features) -> 'standardize' (StandardScaler) -> 'm'
    (an unfitted copy of `algorithm`).

    Parameters
    ----------
    algorithm : scikit-learn estimator
        Learning algorithm used as the final step. It is cloned for every
        pipeline, so the returned pipelines do not share one estimator
        instance with each other or with the caller.
    cov_filters : iterable of float
        Values passed as `p` to CoVSelector (defined outside this notebook).
    k_numbers : iterable of int
        Values passed as `k` to SelectKBest.

    Returns
    -------
    dict
        Maps the key 'v<cov_filter*100>_f<k>' (both formatted with '%.0f')
        to the corresponding Pipeline. Combinations that format to the same
        key overwrite each other; the last one wins.
    """
    # Local import so the function does not depend on an extra import cell.
    from sklearn.base import clone

    models = dict()

    for cov_filter in cov_filters:

        for k in k_numbers:
            #selection by f – same as in the best model (skipping the other scoring method)
            steps = [
                ('filter', CoVSelector(p = cov_filter)),
                ('selection', SelectKBest(score_func = f_regression,
                                          k = k)),
                ('standardize', StandardScaler()),
                # Pipeline does not copy its steps: without clone() every pipeline
                # would hold the very same estimator object, and fitting one
                # pipeline would overwrite the fitted state seen by the others.
                ('m', clone(algorithm))
            ]
            models['v%.0f_f%.0f' % (cov_filter*100, k)] = Pipeline(steps=steps)

    return models

In [8]:
# For every learning algorithm, build its pipelines (one per selection combination).
# Result is a nested dict: algorithm name -> pipeline key -> Pipeline.
models = {
    algo_name: get_pipelines(algorithm=estimator,
                             cov_filters=cov_filters,
                             k_numbers=k_numbers)
    for algo_name, estimator in algorithms.items()
}

## Learning

In [9]:
# Evaluate every set of pipelines and collect the results in two nested dicts:
# target -> learning algorithm -> scores / predictions of all selection combinations.
scores = {target_name: {} for target_name in targets}
preds = {target_name: {} for target_name in targets}

for target_name, target_values in targets.items():
    for algo_name, algo_pipelines in models.items():
        # scores first, then predictions, for the same set of pipelines
        scores[target_name][algo_name] = compare_models(algo_pipelines, X, target_values)
        preds[target_name][algo_name] = get_models_preds(algo_pipelines, X, target_values)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### Save data to use in other notebooks

Scores and predictions are visualized in the Notebook 'compare_models', which also merges the results with the results from all other models built.

In [10]:
# Bind the results to notebook-specific names before storing them.
# These are plain aliases (no copy): they refer to the same dict objects as `scores` and `preds`.
scores_tune_ppc = scores
preds_tune_ppc= preds
# %store persists the variables so that another notebook can restore them with `%store -r`.
%store scores_tune_ppc
%store preds_tune_ppc

Stored 'scores_tune_ppc' (dict)
Stored 'preds_tune_ppc' (dict)


### Scores and predictions

In [11]:
def _show_table(target, algorithm, caption, table, **axis_names):
    """Print the common report header, then display `table` with labelled axes.

    Parameters
    ----------
    target : str
        Target label printed on the '(target)' line.
    algorithm : str
        Algorithm label printed on the '(learning algorithm)' line.
    caption : str
        Line describing the table that follows.
    table : pandas.DataFrame
        Table to display.
    **axis_names
        Forwarded to DataFrame.rename_axis (`index=` and/or `columns=`).
        An axis that is not mentioned keeps its current name.
    """
    print()
    print(target + '\t(target)')
    # Always concatenate: print(algorithm, '...') would insert an extra space
    # before the tab and make the headers of the three tables inconsistent.
    print(algorithm + '\t(learning algorithm)')
    print(caption)
    # rename_axis returns a relabelled copy, so the frames kept in `scores`
    # and `preds` are not modified just because they are displayed.
    display(table.rename_axis(**axis_names))


for y in targets.keys():
    for algorithm in algorithms.keys():
        # scores[y][algorithm] is indexed as [0] = individual cv-split scores, [1] = summary
        _show_table(y, algorithm,
                    '–> summary of all cv split scores for each combination of selection methods:',
                    scores[y][algorithm][1].sort_values(by = 'average'),
                    index = 'Selection combo', columns = 'Score')

        _show_table(y, algorithm,
                    '–> all individual cv-split scores for each combination of selection methods:',
                    scores[y][algorithm][0],
                    columns = 'Selection combo')

        _show_table(y, algorithm,
                    '–> all predictions for each combination of selection methods (+ actual values):',
                    preds[y][algorithm],
                    index = 'Test set', columns = 'Selection combo')


R_PPC	(target)
lr 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f10,0.165893,0.061271
v10_f15,0.367378,0.500012
v10_f5,0.596608,0.728406



R_PPC	(target)
lr	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.306289,0.158488,0.182489
Fructose,0.250583,0.185369,0.89014
Galactose,1.672588,0.11532,2.281964
Glucose,0.149871,0.128289,0.018354
Glycerol,0.068355,0.209116,0.243763
Gluconate,0.146459,0.191467,0.020764
Pyruvate,0.266286,0.062126,0.988631
Succinate,0.078589,0.276967,0.146758



R_PPC	(target)
lr	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,2.317547,2.055326,2.097908
Fructose,3.545669,2.657186,2.888411,0.389527
Galactose,0.377386,1.008598,0.420906,1.238568
Glucose,2.453331,2.085649,2.138597,2.498359
Glycerol,1.376469,1.28238,1.088627,1.712001
Gluconate,1.94335,2.227971,1.571263,1.983703
Pyruvate,2.489449,3.152355,2.644108,0.028302
Succinate,2.01611,1.857666,1.457714,2.31199



R_PPC	(target)
lasso_a1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f15,0.817962,1.554687
v10_f10,0.817962,1.554687
v10_f5,0.817962,1.554687



R_PPC	(target)
lasso_a1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.14355,0.14355,0.14355
Fructose,0.499178,0.499178,0.499178
Galactose,4.90472,4.90472,4.90472
Glucose,0.212582,0.212582,0.212582
Glycerol,0.515207,0.515207,0.515207
Gluconate,0.031544,0.031544,0.031544
Pyruvate,0.226079,0.226079,0.226079
Succinate,0.010839,0.010839,0.010839



R_PPC	(target)
lasso_a1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,2.028823,2.028823,2.028823
Fructose,3.545669,1.775749,1.775749,1.775749
Galactose,0.377386,2.228361,2.228361,2.228361
Glucose,2.453331,1.931797,1.931797,1.931797
Glycerol,1.376469,2.085634,2.085634,2.085634
Gluconate,1.94335,2.004651,2.004651,2.004651
Pyruvate,2.489449,1.926637,1.926637,1.926637
Succinate,2.01611,1.994257,1.994257,1.994257



R_PPC	(target)
lasso_a0.1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f10,0.13422,0.144079
v10_f15,0.169369,0.149255
v10_f5,0.178675,0.245538



R_PPC	(target)
lasso_a0.1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.070801,0.009942,0.015692
Fructose,0.272787,0.272627,0.349687
Galactose,0.456183,0.456209,0.768259
Glucose,0.097294,0.097307,0.070376
Glycerol,0.061681,0.069853,0.075217
Gluconate,0.072991,0.07295,0.029847
Pyruvate,0.318676,0.089091,0.115928
Succinate,0.004536,0.005781,0.004393



R_PPC	(target)
lasso_a0.1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,1.899757,1.756507,1.801985
Fructose,3.545669,2.578455,2.579025,2.305794
Galactose,0.377386,0.549544,0.549553,0.667317
Glucose,2.453331,2.214637,2.214604,2.280676
Glycerol,1.376469,1.461371,1.472619,1.480002
Gluconate,1.94335,1.801504,1.801583,2.001354
Pyruvate,2.489449,3.282777,2.711236,2.778045
Succinate,2.01611,2.006965,2.027766,2.024968



R_PPC	(target)
lasso_a0.05 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f10,0.098784,0.094057
v10_f5,0.126232,0.154501
v10_f15,0.145903,0.113577



R_PPC	(target)
lasso_a0.05	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.107692,0.029418,0.004344
Fructose,0.260872,0.253305,0.34732
Galactose,0.245904,0.244483,0.427193
Glucose,0.127019,0.12701,0.067712
Glycerol,0.025947,0.025095,0.047785
Gluconate,0.051979,0.082404,0.020296
Pyruvate,0.337282,0.02054,0.089348
Succinate,0.01053,0.008013,0.00586



R_PPC	(target)
lasso_a0.05	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,1.965207,1.721953,1.781853
Fructose,3.545669,2.620704,2.647532,2.314187
Galactose,0.377386,0.470187,0.469651,0.538603
Glucose,2.453331,2.141713,2.141734,2.287212
Glycerol,1.376469,1.412183,1.341926,1.442243
Gluconate,1.94335,1.842336,1.783211,1.982792
Pyruvate,2.489449,3.329094,2.540582,2.711875
Succinate,2.01611,1.99488,2.032265,2.027924



R_PPC	(target)
lasso_a0.001 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f5,0.152417,0.190043
v10_f10,0.168699,0.069103
v10_f15,0.188587,0.16112



R_PPC	(target)
lasso_a0.001	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.151583,0.100439,0.127342
Fructose,0.196607,0.160631,0.58196
Galactose,0.560044,0.106661,0.333087
Glucose,0.286131,0.24329,0.065108
Glycerol,0.061107,0.219716,0.020902
Gluconate,0.091049,0.082646,0.020067
Pyruvate,0.151345,0.285837,0.061375
Succinate,0.010831,0.150369,0.009494



R_PPC	(target)
lasso_a0.001	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,2.043076,1.952338,2.000068
Fructose,3.545669,2.848565,2.976123,1.48223
Galactose,0.377386,0.588739,0.337134,0.503089
Glucose,2.453331,1.751357,1.856461,2.2936
Glycerol,1.376469,1.46058,1.074037,1.405239
Gluconate,1.94335,2.12029,1.782739,1.982347
Pyruvate,2.489449,2.866213,1.777872,2.642239
Succinate,2.01611,1.994273,1.712949,2.035251



R_PPC	(target)
ridge_a1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f5,0.250811,0.401991
v10_f10,0.272593,0.347887
v10_f15,0.343154,0.567265



R_PPC	(target)
ridge_a1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.067755,0.139396,0.196132
Fructose,0.254998,0.214233,0.291598
Galactose,1.812832,1.154718,1.286213
Glucose,0.119244,0.17058,0.1008
Glycerol,0.037059,0.006915,0.015468
Gluconate,0.049092,0.049569,0.023884
Pyruvate,0.376291,0.351501,0.084565
Succinate,0.02796,0.093831,0.007831



R_PPC	(target)
ridge_a1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,1.653939,1.526837,1.426179
Fructose,3.545669,2.641529,2.786069,2.511759
Galactose,0.377386,1.061524,0.813161,0.862786
Glucose,2.453331,2.160787,2.034842,2.206035
Glycerol,1.376469,1.427479,1.36695,1.355177
Gluconate,1.94335,1.847947,1.847021,1.989765
Pyruvate,2.489449,3.426205,3.364491,2.69997
Succinate,2.01611,2.07248,2.205284,2.000322



R_PPC	(target)
ridge_a0.1 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f5,0.145706,0.190453
v10_f10,0.179436,0.138874
v10_f15,0.33721,0.522737



R_PPC	(target)
ridge_a0.1	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.123695,0.003106,0.063732
Fructose,0.25112,0.202665,0.374657
Galactose,1.699565,0.484424,0.551049
Glucose,0.141877,0.177291,0.087407
Glycerol,0.041031,0.152887,0.007796
Gluconate,0.080193,0.05519,0.018456
Pyruvate,0.313735,0.262092,0.062388
Succinate,0.046464,0.097832,0.000159



R_PPC	(target)
ridge_a0.1	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,1.993599,1.768635,1.661076
Fructose,3.545669,2.655279,2.827087,2.217258
Galactose,0.377386,1.018779,0.560201,0.585345
Glucose,2.453331,2.105259,2.018378,2.238893
Glycerol,1.376469,1.319991,1.166025,1.3872
Gluconate,1.94335,2.099194,1.836098,1.979217
Pyruvate,2.489449,3.270475,3.141913,2.64476
Succinate,2.01611,1.922435,1.81887,2.01643



R_PPC	(target)
ridge_a0.05 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f5,0.138362,0.188817
v10_f10,0.171004,0.081144
v10_f15,0.348386,0.512229



R_PPC	(target)
ridge_a0.05	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.188005,0.047888,0.008324
Fructose,0.250855,0.19798,0.437671
Galactose,1.686931,0.335579,0.487211
Glucose,0.145268,0.172051,0.076548
Glycerol,0.053256,0.177793,0.018209
Gluconate,0.107846,0.076268,0.017239
Pyruvate,0.294597,0.196241,0.05293
Succinate,0.060326,0.164235,0.008762



R_PPC	(target)
ridge_a0.05	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,2.107694,1.859106,1.788913
Fructose,3.545669,2.65622,2.843696,1.993834
Galactose,0.377386,1.014011,0.504029,0.561253
Glucose,2.453331,2.09694,2.031233,2.265534
Glycerol,1.376469,1.303163,1.131742,1.401533
Gluconate,1.94335,2.152933,1.795134,1.976853
Pyruvate,2.489449,3.222833,2.97798,2.621216
Succinate,2.01611,1.894486,1.684995,2.033774



R_PPC	(target)
ridge_a0.001 	(learning algorithm)
–> summary of all cv split scores for each combination of selection methods:


Score,average,std
Selection combo,Unnamed: 1_level_1,Unnamed: 2_level_1
v10_f10,0.164848,0.060185
v10_f15,0.366857,0.50026
v10_f5,0.486524,0.596424



R_PPC	(target)
ridge_a0.001	(learning algorithm)
–> all individual cv-split scores for each combination of selection methods:


Selection combo,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acetate,0.302916,0.15462,0.176747
Fructose,0.250588,0.185825,0.849593
Galactose,1.672895,0.120797,1.876255
Glucose,0.149761,0.131029,0.010355
Glycerol,0.068018,0.20841,0.185302
Gluconate,0.145534,0.184873,0.020387
Pyruvate,0.266981,0.059273,0.643661
Succinate,0.078166,0.273954,0.12989



R_PPC	(target)
ridge_a0.001	(learning algorithm)
–> all predictions for each combination of selection methods (+ actual values):


Selection combo,actual,v10_f15,v10_f10,v10_f5
Test set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acetate,1.774145,2.311563,2.048464,2.087721
Fructose,3.545669,2.657166,2.886794,0.533295
Galactose,0.377386,1.008714,0.422974,1.085459
Glucose,2.453331,2.085918,2.131873,2.478735
Glycerol,1.376469,1.282845,1.089599,1.63153
Gluconate,1.94335,2.226174,1.584078,1.982969
Pyruvate,2.489449,3.154085,2.637005,0.887087
Succinate,2.01611,1.85852,1.463789,2.277983
