In [1]:
from functools import reduce
import pandas as pd

# Read each listed dataset from datasets/<name>.csv and index it by name.
datasets = dict(
    (name, pd.read_csv(f'datasets/{name}.csv'))
    for name in ["COAD"]
)

# Print the (rows, columns) shape of every loaded frame as a quick sanity check.
for name, df in datasets.items():
    print(f"{name}\t: {df.shape}")


def omics_str(omics):
    """Build a printable label for an omics selection.

    Args:
        omics: a single omics name (``str``) or an iterable of omics names.

    Returns:
        The name itself when a single string is given, otherwise the names
        joined with ``"_"`` in iteration order. An empty iterable yields ``""``.
    """
    # isinstance (rather than an exact type comparison) also recognises str
    # subclasses, which would otherwise be iterated character by character.
    if isinstance(omics, str):
        return omics
    return "_".join(omics)

def get_X(df, omics):
    """Return the values of the columns that belong to the requested omics.

    A column is assigned to an omics by the text that follows the last ``"_"``
    in its name; that suffix must equal one of the requested omics names.

    Args:
        df: DataFrame whose columns are the candidate features.
        omics: ``"__ALL__"`` to keep every column, a single omics name, or a
            collection of omics names.

    Returns:
        ``df.values`` unchanged for ``"__ALL__"``; otherwise ``df.values``
        restricted to the matching columns, kept in their original order.
        When nothing matches the result has zero columns.
    """
    # isinstance also accepts str subclasses; with an exact type comparison such
    # a value would fall through and turn the membership test below into a
    # substring test (e.g. "rna" found inside "mirna").
    if isinstance(omics, str):
        if omics == "__ALL__":
            return df.values
        omics = {omics}
    # Exact-match lookup that stays O(1) per column on very wide frames.
    wanted = frozenset(omics)
    indexes = [
        i
        for i, name in enumerate(df.columns)
        if name.rsplit("_", 1)[-1] in wanted
    ]
    return df.values[:, indexes]

COAD	: (191, 99521)


In [5]:
from typing import List, Set, Dict
from statistics import mean

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from utils import stability_measure

# L1-penalised linear models; each one later acts as the base estimator of a
# feature selector. The f100 / f25 suffix presumably names the targeted number
# of selected features — TODO confirm.
# NOTE(review): SVM_f100 uses C=0.2129, which is the value the recorded
# C-matching output lists for LR / __ALL__ / 100, not for SVM / __ALL__ / 100
# (0.0604); LR_f100's C=0.2657 does not appear in that output — confirm that
# both values are intended.
estimators = {}
estimators["LR_f100"] = LogisticRegression(
    C=0.2657, penalty='l1', solver='liblinear', max_iter=100000
)
estimators["LR_f25"] = LogisticRegression(
    C=0.0416, penalty='l1', solver='liblinear', max_iter=100000
)
estimators["SVM_f100"] = LinearSVC(
    C=0.2129, penalty="l1", dual=False, max_iter=100000
)
estimators["SVM_f25"] = LinearSVC(
    C=0.0096, penalty="l1", dual=False, max_iter=100000
)

# One SelectFromModel per estimator, stored under the same key as the estimator.
selectors = dict(
    (
        label,
        SelectFromModel(estimator=model, threshold=1e-8, importance_getter="auto"),
    )
    for label, model in estimators.items()
)

class CVResult:
    """Collects per-run scores and selected feature sets, then summarises them."""

    def __init__(self, n_features:int):
        # Total number of candidate features.
        self.n_features = n_features
        # Metric name -> callable invoked as metric(y_real, y_predict).
        self.metrics = {"accuracy": accuracy_score}
        # "<prefix>_<metric name>" -> list with one score per evaluate() call.
        self.results = {}
        # One set of selected feature indexes per add_feature_set() call.
        self.features_sets = []

    def add_feature_set(self, feature_set:Set[int]):
        """Record the feature set selected in one run."""
        self.features_sets.append(feature_set)

    def evaluate(self, prefix:str, y_real:List[float], y_predict:List[float]):
        """Score predictions with every metric and store each score under "<prefix>_<metric>"."""
        for metric_name, metric in self.metrics.items():
            scores = self.results.setdefault(f"{prefix}_{metric_name}", [])
            scores.append(metric(y_real, y_predict))

    def summary(self)->Dict[str,float]:
        """Return the mean of every recorded score plus feature-selection statistics.

        Besides one mean per results key, the dictionary holds "fss_nogueira"
        (the value returned by stability_measure.nogueira), "avg_features",
        "all_features" and "runs".
        """
        report = {}
        for key, values in self.results.items():
            report[key] = mean(values)
        report["fss_nogueira"] = stability_measure.nogueira(self.features_sets, self.n_features)
        #report["fss_lustgarten"] = stability_measure.lustgarten(self.features_sets, self.n_features)
        set_sizes = [len(selected) for selected in self.features_sets]
        report["avg_features"] = mean(set_sizes)
        report["all_features"] = self.n_features
        report["runs"] = len(self.features_sets)
        return report


def append(data:Dict[str, List[float]], key:str, value:float):
    """Append `value` to the list stored under `key`, creating the list on first use."""
    data.setdefault(key, []).append(value)

In [6]:
from datetime import datetime


import os

# Column name -> list of values; one entry is appended per
# (dataset, omics, n_splits, estimator) combination and becomes one result row.
data = {}

N_SPLITS = [5,]
OMICS = ["clinical", "__ALL__", "mutation", "cnv", "mirna", "rna", ["rna","clinical"]]
RANDOM_SEEDS = [0,42,21]
# Progress counter and the total number of (dataset, omics, n_splits) combinations.
i = 0
I =  len(datasets) * len(OMICS) * len(N_SPLITS)

# Create the output folder before the cross-validation starts, so a missing
# folder cannot make the final to_csv fail after all the fitting is done.
os.makedirs("results", exist_ok=True)

for ds_name, df in datasets.items():
    # The first column is used as the label vector, the remaining ones as features.
    y = df.values[:,0]

    for omics in OMICS:
        X = get_X(df.iloc[:,1:], omics)

        for n_splits in N_SPLITS:
            # One accumulator per selector, shared by all seeds and folds below.
            results = { name: CVResult(X.shape[1]) for name in selectors }

            for random_seed in RANDOM_SEEDS:
                skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

                for train_index, test_index in skf.split(X, y):
                    X_train, y_train = X[train_index], y[train_index]
                    X_test, y_test = X[test_index], y[test_index]

                    for name, selector in selectors.items():
                        # calculate results on this split for every estimator/selector
                        result = results[name]
                        # The selector is fitted on the training part only.
                        selector.fit(X_train, y_train)
                        selected_features_indexes = selector.get_support(indices=True)
                        result.add_feature_set(set(selected_features_indexes))
                        # Scores come from the estimator fitted inside the selector.
                        result.evaluate("train", y_train, selector.estimator_.predict(X_train))
                        result.evaluate("test", y_test, selector.estimator_.predict(X_test))
            i+=1
            print(f"Results {i}/{I} for {ds_name} {omics} and {n_splits}-fold cross validation:")

            for name, result in results.items():
                r = result.summary()
                append(data, "cv-folds", n_splits)
                append(data, "dataset", ds_name)
                append(data, "estimator", name)
                append(data, "omics", omics_str(omics))
                for key, value in r.items():
                    append(data, key, value)

# NOTE: `df` is rebound here from the last dataset frame to the results table.
df = pd.DataFrame(data)
# The file name carries a minute-resolution timestamp of the moment of saving.
df.to_csv(f"results/r_{datetime.today().strftime('%Y%m%d_%H%M')}.csv")
df

Results 1/7 for COAD clinical and 5-fold cross validation:
Results 2/7 for COAD __ALL__ and 5-fold cross validation:
Results 3/7 for COAD mutation and 5-fold cross validation:
Results 4/7 for COAD cnv and 5-fold cross validation:
Results 5/7 for COAD mirna and 5-fold cross validation:
Results 6/7 for COAD rna and 5-fold cross validation:
Results 7/7 for COAD ['rna', 'clinical'] and 5-fold cross validation:


Unnamed: 0,cv-folds,dataset,estimator,omics,train_accuracy,test_accuracy,fss_nogueira,avg_features,all_features,runs
0,5,COAD,LR_f100,clinical,0.586834,0.542825,0.554292,3.133333,7,15
1,5,COAD,LR_f25,clinical,0.5602,0.551417,1.0,2.0,7,15
2,5,COAD,SVM_f100,clinical,0.628291,0.574269,0.548823,4.933333,7,15
3,5,COAD,SVM_f25,clinical,0.5602,0.551417,0.952359,1.933333,7,15
4,5,COAD,LR_f100,__ALL__,1.0,0.767791,0.363602,105.466667,99520,15
5,5,COAD,LR_f25,__ALL__,0.869539,0.792353,0.46215,26.266667,99520,15
6,5,COAD,SVM_f100,__ALL__,1.0,0.738237,0.31775,105.533333,99520,15
7,5,COAD,SVM_f25,__ALL__,0.861254,0.788934,0.468871,24.733333,99520,15
8,5,COAD,LR_f100,mutation,0.753474,0.560099,0.317449,17.933333,18537,15
9,5,COAD,LR_f25,mutation,0.445029,0.445074,0.0,0.0,18537,15


In [4]:
import random
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from utils import estimator_helpers

# For each omics and each target feature count, print the C value returned by
# estimator_helpers.match_C_to_number_of_features for an L1 LinearSVC and an
# L1 LogisticRegression.
# NOTE(review): the helper presumably searches for the C at which the model
# keeps about `n_features` non-zero coefficients — confirm in utils.estimator_helpers.
df = datasets["COAD"]
# The first column is used as the label vector.
yy = df.values[:,0]

for omics in ("__ALL__", "mutation", "cnv", "mirna", "rna"):
    X = get_X(df.iloc[:,1:], omics)

    # Keep only the first index array of the first split (the training indices
    # in scikit-learn's split contract), so C is matched on a subset of the samples.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    selected_index, _ = next(skf.split(X, yy))
    # X is rebound to the subset; y is the matching subset of yy.
    X, y = X[selected_index], yy[selected_index]

    for n_features in (10, 25, 100):
        C = estimator_helpers.match_C_to_number_of_features(
            LinearSVC,
            n_features,
            X, y,
            estimator_params = {'penalty':'l1', 'dual':False, 'max_iter':100000}
        )
        print(f"SVM {omics:10} {n_features:4} {C}")

        C = estimator_helpers.match_C_to_number_of_features(
            LogisticRegression,
            n_features,
            X, y,
            estimator_params = {'penalty':'l1', 'solver':'liblinear', 'max_iter':100000}
        )
        print(f"LR  {omics:10} {n_features:4} {C}")

SVM __ALL__      10 0.006690478515624999
LR  __ALL__      10 0.028414648437499995
SVM __ALL__      25 0.009619580078125
LR  __ALL__      25 0.04159560546874999
SVM __ALL__     100 0.06039067382812499
LR  __ALL__     100 0.21294804687499996
SVM mutation     10 0.046965625
LR  mutation     10 0.1875625
SVM mutation     25 0.08309121093749999
LR  mutation     25 0.32815937500000003
SVM mutation    100 0.28519921875
LR  mutation    100 0.9999
SVM cnv          10 0.03622558593749999
LR  cnv          10 0.087973046875
SVM cnv          25 0.053312011718749996
LR  cnv          25 0.15631875
SVM cnv         100 0.17307193326950077
LR  cnv         100 0.59373125
SVM mirna        10 0.013769140624999999
LR  mirna        10 0.054776562499999994
SVM mirna        25 0.0245091796875
LR  mirna        25 0.10359492187499998
SVM mirna       100 0.96865625
LR  mirna       100 0.9999
SVM rna          10 0.006690478515624999
LR  rna          10 0.027438281249999998
SVM rna          25 0.010840039062500001
