In [2]:
import pandas as pd

datasets = {
    name: pd.read_csv(f'datasets/{name}.csv')
    for name in ["COAD"]
}
for name, df in datasets.items():
    print(f"{name}\t: {df.shape}")

COAD	: (191, 99521)


In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from utils.stability_measure import StabilityMeasure
from statistics import mean

estimators = {
    "LR": LogisticRegression(C=0.25, penalty='l1', solver='liblinear', max_iter=100000),
    "SVM": LinearSVC(C=0.35, penalty="l1", dual=False, max_iter=100000)
}

selectors = {
    name: SelectFromModel( estimator = estimator, threshold = 1e-8, importance_getter = "auto")
    for name, estimator in estimators.items()
}

class CVResult:
    def __init__(self, n_features):
        self.n_features = n_features # number of all features
        self.metrics = {
            "accuracy": accuracy_score
        }
        self.results = {}
        self.features_sets = []
    
    def add_feature_set(self, feature_set):
        self.features_sets.append(feature_set)

    def evaluate(self, prefix, y_real, y_predict):
        for name, fun in self.metrics.items():
            key = f"{prefix}_{name}"
            if key not in self.results:
                self.results[key] = []
            self.results[key].append(fun(y_real,y_predict))
    
    def summary(self):
        results = self.results.copy()
        results["fss_nogueira"] = StabilityMeasure.Nogueira(self.features_sets, self.n_features)
        results["fss_lustgarten"] = StabilityMeasure.Lustgarten(self.features_sets, self.n_features)
        results["avg_features"] = mean(self.features_sets)
        results["all_features"] = mean(self.n_features)
        results["runs"] = len(self.features_sets)
        return results

In [7]:
for n_splits in [5,10]:
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    
    for ds_name, df in datasets.items():
        X = df.values[:,1:]
        y = df.values[:,0]
        
        results = { name: CVResult(len(X)) for name in selectors }
        for train_index, test_index in skf.split(X, y):
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]
            
            for name, selector in selectors.items():
                # calculate results on this split for every estimator/selector
                result = results[name]
                selector.fit(X_train, y_train)
                selected_features_indexes = selector.get_support(indices=True)
                result.add_feature_set(set(selected_features_indexes))
                result.evaluate("train", y_train, selector.estimator_.predict(X_train))
                result.evaluate("test", y_test, selector.estimator_.predict(X_test))
        
        print(f"Results for {ds_name} and {n_splits}-fold cross validation:")
        for name, result in results.items():
            print(result)

Results for COAD and 5-fold cross validation:
<__main__.CVResult object at 0x1297ee810>
<__main__.CVResult object at 0x1297eeed0>
