In [None]:
import pandas as pd

# Load the COAD table; the path is relative, so it is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='datasets/COAD.csv')

# Show the first five rows as this cell's output.
df.head(n=5)

In [None]:
# Use the whole dataset: column 0 is taken as the target, every remaining
# column as a feature.
#
# Slice the frame first and convert each part separately. `df.values` builds a
# single array for the entire table (and was evaluated twice here); when the
# columns have different dtypes that array is upcast to a common dtype, which
# can be `object`. Converting the slices lets X and y keep their own dtypes.

X = df.iloc[:, 1:].to_numpy()
y = df.iloc[:, 0].to_numpy()

# Sanity check: X should be (n_samples, n_features), y should be (n_samples,).
X.shape, y.shape

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from utils.stability_measure import StabilityMeasure


# Embedded feature selection with an L1-penalised linear model, evaluated with
# stratified 5-fold cross-validation. For each fold the selected feature indexes
# are recorded and train/test accuracy is printed; afterwards the agreement
# between the per-fold feature sets is scored.

# random_state is pinned because the liblinear solver shuffles the data; left
# unset, the selected features (and therefore the stability scores) can change
# from one run to the next.
# NOTE: the results recorded at the bottom of this cell were produced before the
# seed was fixed, so a fresh run may differ slightly from them.
estimator = LogisticRegression(
    C=0.25, penalty='l1', solver='liblinear', max_iter=100000, random_state=0
)
# Alternative estimator (swap in to compare against the logistic regression):
#estimator = LinearSVC(C=0.35, penalty="l1", dual=False, max_iter=100000)

# Keep every feature whose importance reaches the threshold; with an L1 model
# this is effectively the set of features with a non-zero coefficient.
selector = SelectFromModel(
    estimator = estimator,
    threshold = 1e-8,
    importance_getter = "auto"
)

# Fixed seed so the fold assignment is the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

selected_features_sets = []
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Fitting the selector fits the underlying estimator on this fold's
    # training data only; the fitted model is exposed as `selector.estimator_`.
    selector.fit(X_train, y_train)
    selected_features_indexes = selector.get_support(indices=True)
    selected_features_sets.append(set(selected_features_indexes))

    # Accuracy is that of the fitted L1 model itself, which receives all
    # columns of X (not a model refitted on the selected subset).
    acc_train = accuracy_score(y_train, selector.estimator_.predict(X_train))
    acc_test = accuracy_score(y_test, selector.estimator_.predict(X_test))

    # Per-fold line: train accuracy, test accuracy, number of selected features.
    print(f"{acc_train:.3}, {acc_test:.3}, {len(selected_features_indexes)}")

# Stability of the selection across folds, computed from the per-fold index
# sets and the total number of features. Presumably the Nogueira and Lustgarten
# stability measures -- see utils.stability_measure to confirm their definitions.
nogueira_fss = StabilityMeasure.Nogueira(selected_features_sets, X.shape[1])
lustgarten_fss = StabilityMeasure.Lustgarten(selected_features_sets, X.shape[1])
print(f"Feature selection stability: {nogueira_fss:.3}, {lustgarten_fss:.3}")

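# SVM (LinearSVC) -- recorded output; per fold: train accuracy, test accuracy, number of selected features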
# 1.0, 0.744, 110
# 1.0, 0.684, 109
# 1.0, 0.711, 100
# 1.0, 0.711, 106
# 1.0, 0.816, 109
# Feature selection stability: 0.262, 0.268

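# LR (LogisticRegression) -- recorded output; per fold: train accuracy, test accuracy, number of selected features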
# 1.0, 0.744, 84
# 1.0, 0.684, 110
# 1.0, 0.737, 68
# 1.0, 0.763, 128
# 1.0, 0.789, 110
# Feature selection stability: 0.329, 0.384