In [2]:
import pandas as pd

datasets = {
    name: pd.read_csv(f'datasets/{name}.csv')
    for name in ["COAD"]
}
for name, df in datasets.items():
    print(f"{name}\t: {df.shape}")

COAD	: (191, 99521)


In [7]:
from typing import List, Set, Dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from utils.stability_measure import StabilityMeasure
from statistics import mean
from functools import reduce

estimators = {
    "LR": LogisticRegression(C=0.25, penalty='l1', solver='liblinear', max_iter=100000),
    "SVM": LinearSVC(C=0.35, penalty="l1", dual=False, max_iter=100000)
}

selectors = {
    name: SelectFromModel( estimator = estimator, threshold = 1e-8, importance_getter = "auto")
    for name, estimator in estimators.items()
}

class CVResult:
    def __init__(self, n_features:int):
        self.n_features = n_features # number of all features
        self.metrics = {
            "accuracy": accuracy_score
        }
        self.results = {}
        self.features_sets = []
    
    def add_feature_set(self, feature_set:Set[int]):
        self.features_sets.append(feature_set)

    def evaluate(self, prefix:str, y_real:List[float], y_predict:List[float]):
        for name, fun in self.metrics.items():
            key = f"{prefix}_{name}"
            if key not in self.results:
                self.results[key] = []
            self.results[key].append(fun(y_real,y_predict))
    
    def summary(self)->Dict[str,float]:
        results = {key:mean(values) for key, values in self.results.items()}
        results["fss_nogueira"] = StabilityMeasure.Nogueira(self.features_sets, self.n_features)
        results["fss_lustgarten"] = StabilityMeasure.Lustgarten(self.features_sets, self.n_features)
        results["avg_features"] = mean([len(s) for s in self.features_sets])
        results["all_features"] = self.n_features
        results["runs"] = len(self.features_sets)
        return results

In [15]:
data = {}
def append(data:Dict[str,List[float]], key:str, value:float):
    if key not in data:
        data[key] = []
    data[key].append(value)

def omics_str(omics):
    if type(omics)==str:
        omics = set([omics])
    return reduce(lambda a,b:a+"_"+b, omics)

def get_X(df, omics):
    if type(omics)==str:
        if omics == "__ALL__":
            return df.values[:,1:] #first column is class
        omics = set([omics])
    indexes = [
        i
        for i, name in enumerate(df.columns[1:])
        if name.split("_")[-1] in omics
    ]
    print(f"selected {len(indexes)} for omics: {omics}")
    return df.values[:,indexes]

N_SPLITS = [5,10]
OMICS = ["clinical", "__ALL__"] #, "mutation", "cnv", "mirna", "rna", ["rna","clinical"]]
i = 0
I =  len(datasets) * len(OMICS) * len(N_SPLITS)

for ds_name, df in datasets.items():    
    y = df.values[:,0]
    
    for omics in OMICS:
        X = get_X(df, omics)

        for n_splits in N_SPLITS:
            skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
                
            results = { name: CVResult(X.shape[1]) for name in selectors }
            for train_index, test_index in skf.split(X, y):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]
                
                for name, selector in selectors.items():
                    # calculate results on this split for every estimator/selector
                    result = results[name]
                    selector.fit(X_train, y_train)
                    selected_features_indexes = selector.get_support(indices=True)
                    result.add_feature_set(set(selected_features_indexes))
                    result.evaluate("train", y_train, selector.estimator_.predict(X_train))
                    result.evaluate("test", y_test, selector.estimator_.predict(X_test))
            i+=1
            print(f"Results {i}/{I} for {ds_name} {omics} and {n_splits}-fold cross validation:")
            
            for name, result in results.items():
                r = result.summary()
                print(f"{name:5} : ",r)
                append( data, "cv-folds", n_splits)
                append( data, "dataset", ds_name)
                append( data, "estimator", name)
                append( data, "omics", omics_str(omics))
                for key, value in r.items():
                    append( data, key, value)

df = pd.DataFrame(data)
df.to_csv("results/jk.csv")
df

selected 7 for omics: {'clinical'}
Results 1/4 for COAD clinical and 5-fold cross validation:
LR    :  {'train_accuracy': 1.0, 'test_accuracy': 1.0, 'fss_nogueira': 0.8674242424242424, 'fss_lustgarten': 0.6571428571428571, 'avg_features': 2.2, 'all_features': 7, 'runs': 5}
SVM   :  {'train_accuracy': 1.0, 'test_accuracy': 1.0, 'fss_nogueira': 0.8809523809523809, 'fss_lustgarten': 0.5714285714285714, 'avg_features': 2.8, 'all_features': 7, 'runs': 5}
Results 2/4 for COAD clinical and 10-fold cross validation:
LR    :  {'train_accuracy': 1.0, 'test_accuracy': 1.0, 'fss_nogueira': 0.9319727891156462, 'fss_lustgarten': 0.6857142857142857, 'avg_features': 2.1, 'all_features': 7, 'runs': 10}
SVM   :  {'train_accuracy': 1.0, 'test_accuracy': 1.0, 'fss_nogueira': 0.9411269974768713, 'fss_lustgarten': 0.5714285714285714, 'avg_features': 2.9, 'all_features': 7, 'runs': 10}
Results 3/4 for COAD __ALL__ and 5-fold cross validation:
LR    :  {'train_accuracy': 1.0, 'test_accuracy': 0.76950067476383

Unnamed: 0,cv-folds,dataset,estimator,omics,train_accuracy,test_accuracy,fss_nogueira,fss_lustgarten,avg_features,all_features,runs
0,5,COAD,LR,clinical,1.0,1.0,0.867424,0.657143,2.2,7,5
1,5,COAD,SVM,clinical,1.0,1.0,0.880952,0.571429,2.8,7,5
2,10,COAD,LR,clinical,1.0,1.0,0.931973,0.685714,2.1,7,10
3,10,COAD,SVM,clinical,1.0,1.0,0.941127,0.571429,2.9,7,10
4,5,COAD,LR,__ALL__,1.0,0.769501,0.311508,0.324527,110.8,99520,5
5,5,COAD,SVM,__ALL__,1.0,0.70135,0.268428,0.275483,109.2,99520,5
6,10,COAD,LR,__ALL__,1.0,0.764211,0.494181,0.534261,112.2,99520,10
7,10,COAD,SVM,__ALL__,1.0,0.696053,0.433383,0.441258,119.9,99520,10


In [None]:
from functools import reduce