In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector, RFE, mutual_info_classif, SelectKBest, f_classif, chi2
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, ElasticNet
from feature_selection_package.evaluation import performance_score, single_evaluation, full_evaluation
from feature_selection_package.feature_selectors import CorrelationSelector, MutualInformationSelector, RandomForestSelector, EnsembleSelector
from boruta import BorutaPy


# Load dataset

In [5]:
def _read_arcene_features(path):
    """Read a space-separated, header-less feature file and drop the column at position 10000."""
    features = pd.read_csv(path, sep=' ', header=None)
    # presumably an empty column produced by a trailing separator - TODO confirm
    return features.drop(columns=features.columns[10000])


def _read_arcene_labels(path):
    """Read a header-less label file and return the labels as a Series with -1 replaced by 0."""
    raw_labels = pd.read_csv(path, sep=' ', header=None).values.ravel()
    return pd.Series(np.where(raw_labels == -1, 0, raw_labels))


# Training split: features and labels
train_data = _read_arcene_features('Arcene-dataset/arcene_train.data')
train_labels = _read_arcene_labels('Arcene-dataset/arcene_train.labels')

# Validation split: features and labels
valid_data = _read_arcene_features('Arcene-dataset/arcene_valid.data')
valid_labels = _read_arcene_labels('Arcene-dataset/arcene_valid.labels')


In [11]:
# Report the number of samples per class label in each split.
train_class_counts = train_labels.value_counts()
valid_class_counts = valid_labels.value_counts()
print("Class count in training part: \n" + str(train_class_counts))
print("Class count in validation part \n" + str(valid_class_counts))

Class count in training part: 
0    56
1    44
dtype: int64
Class count in validation part 
0    56
1    44
dtype: int64


# Estimators

In [20]:
# Classifiers passed to every evaluation below; each one is seeded with random_state=0.

# Support vector machines: RBF kernel and linear kernel, both with C=1.
svm1 = SVC(
    C=1,
    kernel='rbf',
    random_state=0,
)
svm2 = SVC(
    C=1,
    kernel='linear',
    random_state=0,
)

# Tree-based models: a single entropy tree, gradient boosting, and a shallow random forest.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    random_state=0,
)
xgboost = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=0,
)
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=0,
)

# L2-penalised logistic regression.
logreg = LogisticRegression(
    C=1,
    penalty='l2',
    random_state=0,
)

# Order matters only for the order in which the classifiers are handed to the evaluation helper.
classifiers = np.array([svm1, svm2, tree, xgboost, rfc, logreg])

# Feature selection methods

## Dimensionality reduction methods

### PCA

In [7]:
# Evaluate PCA projections of several sizes with every classifier and save the results.
n_features = [10, 20, 50, 80]
pca_frames = []
for n in n_features:
    # Seed PCA like the other estimators in this notebook: its solver may be
    # randomized for inputs of this size, which would make runs non-repeatable.
    selector = [PCA(n_components=n, random_state=0)]
    pca_df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    pca_frames.append(pca_df)
# Concatenate once instead of growing a DataFrame inside the loop.
pca_results = pd.concat(pca_frames)
pca_results.to_csv('data/pca.csv', index=False)

## Wrapper methods

### RFE

In [None]:
# Recursive feature elimination driven by a shallow random forest.
n_features = [100, 1000, 7000]
# NOTE(review): only the largest target (7000) is evaluated; loop over
# n_features to sweep the smaller targets as well if runtime allows.
# The forest is seeded so that the elimination order is repeatable between runs.
rfe_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
selector = [RFE(estimator=rfe_forest, n_features_to_select=n_features[-1], step=1, verbose=0)]
rfe_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
rfe_results.to_csv('data/RFE.csv', index=False)

### SFS

In [None]:
# Sequential forward selection of 100 features, scored by a shallow random forest.
# The forest is seeded so that the selected subset is repeatable between runs.
sfs_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
selector = [SequentialFeatureSelector(estimator=sfs_forest, n_features_to_select=100, direction='forward')]
sfs_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
sfs_results.to_csv('data/SFS.csv', index=False)

### SBS

In [None]:
# Sequential backward selection down to 100 features, scored by a shallow random forest.
# The forest is seeded so that the selected subset is repeatable between runs.
sbs_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
selector = [SequentialFeatureSelector(estimator=sbs_forest, n_features_to_select=100, direction='backward')]
sbs_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
sbs_results.to_csv('data/SBS.csv', index=False)

## Embedded methods

### Lasso

In [24]:
# L1-penalised logistic regression used as an embedded selector: for each
# regularisation strength C, record how many coefficients stay non-zero and
# the score on the validation split.
lasso_columns = ['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score']
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
lasso_rows = []
for C in Cs:
    lasso = LogisticRegression(penalty='l1', C=C, solver='liblinear', random_state=0)
    lasso.fit(train_data, train_labels)
    # Count the features kept by the L1 penalty (vectorised instead of a Python-level sum).
    n_features = int(np.count_nonzero(lasso.coef_[0]))
    score = lasso.score(valid_data, valid_labels)
    perf_score = performance_score(score, n_features)
    lasso_rows.append({
        'Selector': 'Lasso',
        'Classifier': 'Lasso',
        'Number_of_Features': n_features,
        'Accuracy': score,
        'Performance_score': perf_score,
    })

# Build the frame once from the collected rows instead of concatenating onto
# an empty DataFrame on every iteration.
lasso_df = pd.DataFrame(lasso_rows, columns=lasso_columns)
lasso_df.to_csv('data/lasso.csv', index=False)

### Elastic net

In [None]:
# Elastic-net sweep over regularisation strength (alpha) and L1/L2 mix (l1_ratio):
# for each setting, record how many coefficients stay non-zero and the
# classification accuracy on the validation split.
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
l1_ratio = [0.9, 0.95, 0.98]
elastic_columns = ['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score', 'alpha', 'l1_ratio']
elastic_rows = []
for ratio in l1_ratio:
    for alpha in alphas:
        elastic = ElasticNet(alpha=alpha, l1_ratio=ratio, random_state=0, max_iter=10000)
        elastic.fit(train_data, train_labels)
        n_features = int(np.count_nonzero(elastic.coef_))
        # ElasticNet is a regressor, so elastic.score() would report R^2 rather
        # than accuracy. The labels are 0/1, so threshold the continuous
        # prediction at 0.5 and score the resulting class labels.
        predicted_labels = (elastic.predict(valid_data) >= 0.5).astype(int)
        score = accuracy_score(valid_labels, predicted_labels)
        perf_score = performance_score(score, n_features)
        elastic_rows.append({
            # Label the rows with the method that actually produced them.
            'Selector': 'ElasticNet',
            'Classifier': 'ElasticNet',
            'Number_of_Features': n_features,
            'Accuracy': score,
            'Performance_score': perf_score,
            'alpha': alpha,
            'l1_ratio': ratio,
        })

# Build the frame once from the collected rows instead of concatenating onto
# an empty DataFrame on every iteration.
elastic_df = pd.DataFrame(elastic_rows, columns=elastic_columns)
elastic_df.to_csv('data/elasticNet.csv', index=False)

In [29]:
# Set the Selector and Classifier columns of the saved elastic-net results to
# 'ElasticNet' and write the file back in place.
elastic_net_csv = 'data/elasticNet.csv'
elasticNet = pd.read_csv(elastic_net_csv).assign(
    Selector='ElasticNet',
    Classifier='ElasticNet',
)
elasticNet.to_csv(elastic_net_csv, index=False)

### Random forest

In [None]:
# Random-forest-based selection: evaluate every classifier for each number of kept features.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
forest_frames = []
for n in n_features:
    selector = [RandomForestSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    forest_frames.append(df)
# Concatenate once instead of growing a DataFrame inside the loop.
forest_results = pd.concat(forest_frames)
forest_results.to_csv('data/forest.csv', index=False)

## Filter methods

### Correlation coefficient

In [None]:
# Correlation-based filter: evaluate every classifier for each number of kept features.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
corr_frames = []
for n in n_features:
    selector = [CorrelationSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    corr_frames.append(df)
# Concatenate once instead of growing a DataFrame inside the loop.
corr_results = pd.concat(corr_frames)
corr_results.to_csv('data/corr.csv', index=False)

### Mutual information

In [None]:
# Mutual-information filter: evaluate every classifier for each number of kept features.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
mutual_frames = []
for n in n_features:
    selector = [MutualInformationSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    mutual_frames.append(df)
# Concatenate once instead of growing a DataFrame inside the loop.
mutual_results = pd.concat(mutual_frames)
mutual_results.to_csv('data/mutual.csv', index=False)

### Select K - Best

#### ANOVA

In [None]:
# SelectKBest with the ANOVA F-test (f_classif): evaluate every classifier for each k.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
anova_frames = []
for n in n_features:
    selector = [SelectKBest(f_classif, k=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    anova_frames.append(df)
# Concatenate once instead of growing a DataFrame inside the loop.
anova_results = pd.concat(anova_frames)
anova_results.to_csv('data/anova.csv', index=False)

#### chi2

In [None]:
# SelectKBest with the chi-squared statistic: evaluate every classifier for each k.
# NOTE(review): this sweep ends at 8000 while the other sweeps in this notebook
# end at 7000 - confirm that the difference is intentional.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 8000]
chi2_frames = []
for n in n_features:
    selector = [SelectKBest(chi2, k=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    chi2_frames.append(df)
# Concatenate once instead of growing a DataFrame inside the loop.
chi2_results = pd.concat(chi2_frames)
chi2_results.to_csv('data/chi2.csv', index=False)

## Hybrid + wrapper

### Boruta algorithm

In [None]:
# Boruta selection wrapped around a shallow random forest; results go to data/boruta.csv.
boruta_forest = RandomForestClassifier(max_depth=3, n_estimators=100)
boruta = BorutaPy(
    estimator=boruta_forest,
    n_estimators='auto',
    random_state=0,
    verbose=1,
)
selector = [boruta]
boruta_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
boruta_results.to_csv('data/boruta.csv', index=False)

## Stacking

In [None]:
# Stacked selection: two selectors passed together as one nested entry -
# a random-forest selector keeping 500 features and an RFE targeting 100.
selector1 = RandomForestSelector(n_features=500)
# Seed the forest inside RFE so that the elimination order is repeatable between runs.
stack_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
selector2 = RFE(estimator=stack_forest, n_features_to_select=100, step=1, verbose=1)
selectors = [[selector1, selector2]]
stack_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selectors, classifiers)
stack_results.to_csv('data/stack.csv', index=False)

## Ensemble

In [None]:
# Ensemble of five selectors, each asked for the same number of features n;
# evaluate every classifier for each n.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
ensemble_frames = []
for n in n_features:
    selectors = [
        RandomForestSelector(n_features=n),
        SelectKBest(f_classif, k=n),
        SelectKBest(chi2, k=n),
        CorrelationSelector(n_features=n),
        MutualInformationSelector(n_features=n),
    ]
    ensemble = [EnsembleSelector(selectors=selectors)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, ensemble, classifiers)
    ensemble_frames.append(df)

# Concatenate once instead of growing a DataFrame inside the loop.
ensemble_results = pd.concat(ensemble_frames)
ensemble_results.to_csv('data/ensemble.csv', index=False)