# Stability of CoV as a selection method

In [1]:
#import libraries
import pandas as pd
import numpy as np
from IPython.display import display

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model

In [2]:
#import self-made functions
#NOTE(review): absolute, machine-specific path - this cell fails on any other machine;
#consider a path relative to this notebook or a configurable base directory
#names used below without a visible definition (load_gerosa, clean_gexp_g, CoVSelector,
#get_loo_order, get_selected_features_intersection_and_union) presumably come from here - confirm
%run /Users/mariekececilia/Documents/master_thesis_code/methods.ipynb

In [3]:
#figure saving settings for this notebook
#(relative folder path, the trailing '/' is part of the value, and the image file format;
#not used in the cells shown here - presumably read by later figure-saving code)
folder_path_figures = 'Figures/'
figure_format = 'png'

## Load and clean data

In [4]:
#load transcriptional and fluxomic data
#(load_gerosa is not defined in this notebook; presumably provided by methods.ipynb - confirm)
gexp, flux = load_gerosa()

#clean the transcriptional data (remove duplicates)
#NOTE: gexp is rebound to the cleaned version, so the raw expression data is no longer
#reachable under this name; groups is the second value returned by clean_gexp_g
gexp, groups = clean_gexp_g(gexp)

#define features
#X is a second name for the same object as gexp (no copy is made)
X = gexp

## Create models/pipelines

Set selection parameters:

In [5]:
#CoV thresholds to compare; each value is passed as `p` to CoVSelector when the pipelines are built
cov_filters = [0.1, 0.2, 0.5, 0.9]

Make the different selection models (the target and the model do not affect the selection by CoV):

In [6]:
def get_pipelines_cov(cov_filters):
    """Build one selection-only pipeline per CoV threshold.

    Parameters
    ----------
    cov_filters : iterable
        Threshold values; each one is passed as ``p`` to ``CoVSelector``.

    Returns
    -------
    dict
        Maps the label ``'CoV>%s' % threshold`` to an unfitted ``Pipeline``
        whose only step is named ``'filter'``. Thresholds that format to
        the same label overwrite each other.

    Notes
    -----
    ``CoVSelector`` is defined outside this cell; the label suggests it keeps
    features whose coefficient of variation exceeds ``p`` -- TODO confirm.
    """
    pipelines = {}

    for threshold in cov_filters:
        #single step; the name 'filter' is what later code uses to look it up
        selector_step = ('filter', CoVSelector(p=threshold))
        selection_pipeline = Pipeline(steps=[selector_step])

        pipelines['CoV>%s' % threshold] = selection_pipeline

    return pipelines

In [7]:
#one unfitted selection pipeline per threshold, keyed by the label 'CoV>threshold'
models_cov = get_pipelines_cov(cov_filters = cov_filters)

## Fit and inspect

In [8]:
#Fit models on all LOOCV splits and get the selected features 
def get_loo_selected_feature_names(pipeline, X):
    """Record which features the pipeline selects in each leave-one-out split.

    For every sample label returned by ``get_loo_order(X)``, the pipeline is
    fitted on ``X`` with that sample dropped, and the output feature names of
    its ``'filter'`` step are stored.

    Parameters
    ----------
    pipeline : object
        Its ``fit(X)`` result must be indexable with ``'filter'`` (e.g. a
        sklearn ``Pipeline`` with a step of that name). The same object is
        passed to ``fit`` again for every split.
    X : object with a ``drop(label)`` method
        Presumably a DataFrame with one row per sample, indexed by sample
        label -- TODO confirm against ``get_loo_order``.

    Returns
    -------
    dict
        Maps each left-out sample to whatever ``get_feature_names_out()``
        returned for the fit without that sample.
    """
    features_by_left_out = {}

    #iterate in the order the samples are left out
    for left_out in get_loo_order(X):
        #refit on every sample except the one left out
        refitted = pipeline.fit(X.drop(left_out))

        #keep the names of the features that passed the filter
        features_by_left_out[left_out] = refitted['filter'].get_feature_names_out()

    return features_by_left_out

#print number of selected features and features filtered out in each CV split
def print_loo_n_selected_features(selected_dict, original_columns):
    """Print a per-split table of selected / filtered-out feature counts.

    Parameters
    ----------
    selected_dict : dict
        Maps each left-out sample to the sized collection of features
        selected when that sample was left out.
    original_columns : sized collection
        All features before selection; only its length is used.

    Prints one row per entry (left-out sample, number selected, number
    filtered out), followed by the smallest and largest number selected.
    Raises ``ValueError`` (from ``min``) when ``selected_dict`` is empty.
    """
    #table header
    print('Left out:', '\t', 'Selected:', '\t', 'Filtered out:')

    counts = []
    for left_out, kept in selected_dict.items():
        n_kept = len(kept)
        counts.append(n_kept)
        #filtered out = everything that was not kept
        print(left_out, '\t', n_kept, '\t\t', len(original_columns) - n_kept)

    #blank line between the table and the summary
    print()
    print('Min n selected:', min(counts))
    print('Max n selected:', max(counts))

In [9]:
#store results in a dict for each model (here threshold)
#selected_dict: model name -> {left-out sample -> feature names selected without that sample}
#stats_dict:    model name -> result of get_selected_features_intersection_and_union,
#               which is indexed below with 'intersection' and 'union'
selected_dict = dict()
stats_dict = dict()
for model_name, pipeline in models_cov.items():
    #fit model and get selected features
    selected = get_loo_selected_feature_names(pipeline, X)
    selected_dict[model_name] = selected
    
    #get intersection and union of selected features
    #(helper is not defined in this notebook; presumably the intersection/union is taken
    #over the leave-one-out splits - confirm)
    stats = get_selected_features_intersection_and_union(selected)
    stats_dict[model_name] = stats
    
    #print information
    print('--------------------------------------------------')
    print(model_name)
    print('--------------------------------------------------')
    print_loo_n_selected_features(selected, X.columns)
    #only the sizes of the intersection and union are reported here
    print('Intersection of selected features: \t', len(stats['intersection']))
    print('Union of selected features: \t\t', len(stats['union']))
    print()

--------------------------------------------------
CoV>0.1
--------------------------------------------------
Left out: 	 Selected: 	 Filtered out:
Acetate 	 4020 		 85
Fructose 	 4030 		 75
Galactose 	 4009 		 96
Glucose 	 4028 		 77
Glycerol 	 4048 		 57
Gluconate 	 4040 		 65
Pyruvate 	 4021 		 84
Succinate 	 4035 		 70

Min n selected: 4009
Max n selected: 4048
Intersection of selected features: 	 3843
Union of selected features: 		 4068

--------------------------------------------------
CoV>0.2
--------------------------------------------------
Left out: 	 Selected: 	 Filtered out:
Acetate 	 3050 		 1055
Fructose 	 3148 		 957
Galactose 	 2983 		 1122
Glucose 	 3169 		 936
Glycerol 	 3248 		 857
Gluconate 	 3207 		 898
Pyruvate 	 3060 		 1045
Succinate 	 3187 		 918

Min n selected: 2983
Max n selected: 3248
Intersection of selected features: 	 2412
Union of selected features: 		 3366

--------------------------------------------------
CoV>0.5
------------------------------------