In [1]:
import pandas as pd

datasets = {
    name: pd.read_csv(f'datasets/{name}.csv')
    for name in ["COAD"]
}
for name, df in datasets.items():
    print(f"{name}\t: {df.shape}")

COAD	: (191, 99521)


In [5]:
from typing import List, Set, Dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from utils.stability_measure import StabilityMeasure
from statistics import mean
from functools import reduce

estimators = {
    "LR_C025": LogisticRegression(C=0.25, penalty='l1', solver='liblinear', max_iter=100000),
    "LR_C05": LogisticRegression(C=0.5, penalty='l1', solver='liblinear', max_iter=100000),
    "SVM_C035": LinearSVC(C=0.35, penalty="l1", dual=False, max_iter=100000),
    "SVM_C05": LinearSVC(C=0.5, penalty="l1", dual=False, max_iter=100000)
}

selectors = {
    name: SelectFromModel( estimator = estimator, threshold = 1e-8, importance_getter = "auto")
    for name, estimator in estimators.items()
}

class CVResult:
    def __init__(self, n_features:int):
        self.n_features = n_features # number of all features
        self.metrics = {
            "accuracy": accuracy_score
        }
        self.results = {}
        self.features_sets = []
    
    def add_feature_set(self, feature_set:Set[int]):
        self.features_sets.append(feature_set)

    def evaluate(self, prefix:str, y_real:List[float], y_predict:List[float]):
        for name, fun in self.metrics.items():
            key = f"{prefix}_{name}"
            if key not in self.results:
                self.results[key] = []
            self.results[key].append(fun(y_real,y_predict))
    
    def summary(self)->Dict[str,float]:
        results = {key:mean(values) for key, values in self.results.items()}
        results["fss_nogueira"] = StabilityMeasure.Nogueira(self.features_sets, self.n_features)
        #results["fss_lustgarten"] = StabilityMeasure.Lustgarten(self.features_sets, self.n_features)
        results["avg_features"] = mean([len(s) for s in self.features_sets])
        results["all_features"] = self.n_features
        results["runs"] = len(self.features_sets)
        return results

In [7]:
data = {}

def append(data:Dict[str,List[float]], key:str, value:float):
    if key not in data:
        data[key] = []
    data[key].append(value)

def omics_str(omics):
    if type(omics)==str:
        omics = set([omics])
    return reduce(lambda a,b:a+"_"+b, omics)

def get_X(df, omics):
    if type(omics)==str:
        if omics == "__ALL__":
            return df.values[:,1:] #first column is class
        omics = set([omics])
    indexes = [
        i
        for i, name in enumerate(df.columns)
        if name.split("_")[-1] in omics
    ]
    print(f"selected {len(indexes)} for omics: {omics}")
    return df.values[:,indexes]

N_SPLITS = [5,10]
OMICS = ["clinical", "__ALL__", "mutation", "cnv", "mirna", "rna", ["rna","clinical"]]
i = 0
I =  len(datasets) * len(OMICS) * len(N_SPLITS)

for ds_name, df in datasets.items():    
    y = df.values[:,0]
    
    for omics in OMICS:
        X = get_X(df.iloc[:,1:], omics)

        for n_splits in N_SPLITS:
            skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
                
            results = { name: CVResult(X.shape[1]) for name in selectors }
            for train_index, test_index in skf.split(X, y):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]
                
                for name, selector in selectors.items():
                    # calculate results on this split for every estimator/selector
                    result = results[name]
                    selector.fit(X_train, y_train)
                    selected_features_indexes = selector.get_support(indices=True)
                    result.add_feature_set(set(selected_features_indexes))
                    result.evaluate("train", y_train, selector.estimator_.predict(X_train))
                    result.evaluate("test", y_test, selector.estimator_.predict(X_test))
            i+=1
            print(f"Results {i}/{I} for {ds_name} {omics} and {n_splits}-fold cross validation:")
            
            for name, result in results.items():
                r = result.summary()
                #print(f"{name:5} : ",r)
                append( data, "cv-folds", n_splits)
                append( data, "dataset", ds_name)
                append( data, "estimator", name)
                append( data, "omics", omics_str(omics))
                for key, value in r.items():
                    append( data, key, value)

df = pd.DataFrame(data)
df.to_csv("results/jk.csv")
df

selected 7 for omics: {'clinical'}
Results 1/14 for COAD clinical and 5-fold cross validation:
Results 2/14 for COAD clinical and 10-fold cross validation:
Results 3/14 for COAD __ALL__ and 5-fold cross validation:
Results 4/14 for COAD __ALL__ and 10-fold cross validation:
selected 18537 for omics: {'mutation'}
Results 5/14 for COAD mutation and 5-fold cross validation:
Results 6/14 for COAD mutation and 10-fold cross validation:
selected 57964 for omics: {'cnv'}
Results 7/14 for COAD cnv and 5-fold cross validation:
Results 8/14 for COAD cnv and 10-fold cross validation:
selected 802 for omics: {'mirna'}
Results 9/14 for COAD mirna and 5-fold cross validation:
Results 10/14 for COAD mirna and 10-fold cross validation:
selected 22210 for omics: {'rna'}
Results 11/14 for COAD rna and 5-fold cross validation:
Results 12/14 for COAD rna and 10-fold cross validation:
selected 22217 for omics: ['rna', 'clinical']
Results 13/14 for COAD ['rna', 'clinical'] and 5-fold cross validation:
Resul

Unnamed: 0,cv-folds,dataset,estimator,omics,train_accuracy,test_accuracy,fss_nogueira,avg_features,all_features,runs
0,5,COAD,LR_C025,clinical,0.56935,0.549663,0.694056,2.6,7,5
1,5,COAD,LR_C05,clinical,0.628259,0.62807,0.746377,4.6,7,5
2,5,COAD,SVM_C035,clinical,0.629567,0.5861,0.396552,5.8,7,5
3,5,COAD,SVM_C05,clinical,0.628251,0.580837,0.088542,6.4,7,5
4,10,COAD,LR_C025,clinical,0.583463,0.544474,0.62417,3.3,7,10
5,10,COAD,LR_C05,clinical,0.632347,0.612105,0.77904,4.8,7,10
6,10,COAD,SVM_C035,clinical,0.626516,0.586053,0.463602,5.8,7,10
7,10,COAD,SVM_C05,clinical,0.627683,0.585789,0.305809,6.1,7,10
8,5,COAD,LR_C025,__ALL__,1.0,0.758974,0.306299,107.8,99519,5
9,5,COAD,LR_C05,__ALL__,1.0,0.743185,0.322308,157.4,99519,5
