In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector, RFE, mutual_info_classif, SelectKBest, f_classif, chi2
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, ElasticNet
from evaluation import performance_score, single_evaluation, full_evaluation
from feature_selectors import CorrelationSelector, MutualInformationSelector, RandomForestSelector
from boruta import BorutaPy


# Load dataset

In [2]:
# --- Training split -------------------------------------------------------
# Space-separated, header-less feature file. The column at position 10000 is
# removed right after loading (presumably an empty artifact column produced by
# a trailing separator on each row -- TODO confirm against the raw file).
train_data = pd.read_csv('Arcene-dataset/arcene_train.data', sep=' ', header=None)
train_data = train_data.drop(columns=train_data.columns[10000])

# Labels are flattened to 1-D and every -1 is remapped to 0; all other values
# are kept unchanged.
train_labels = pd.read_csv('Arcene-dataset/arcene_train.labels', sep=' ', header=None).to_numpy().ravel()
train_labels = pd.Series(np.where(train_labels == -1, 0, train_labels))

# --- Validation split -----------------------------------------------------
# Same layout and same clean-up steps as the training split.
valid_data = pd.read_csv('Arcene-dataset/arcene_valid.data', sep=' ', header=None)
valid_data = valid_data.drop(columns=valid_data.columns[10000])

valid_labels = pd.read_csv('Arcene-dataset/arcene_valid.labels', sep=' ', header=None).to_numpy().ravel()
valid_labels = pd.Series(np.where(valid_labels == -1, 0, valid_labels))


# Estimators

In [3]:
# Downstream classifiers used to score every feature-selection method.
# All of them are seeded with random_state=0.

# Support vector machines: one RBF kernel, one linear kernel, both with C=1.
svm1 = SVC(
    kernel='rbf',
    C=1,
    random_state=0,
)
svm2 = SVC(
    kernel='linear',
    C=1,
    random_state=0,
)

# Tree-based models: a single shallow tree, gradient boosting and a shallow forest.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
)
xgboost = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    random_state=0,
)
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=0,
)

# Collected in a fixed order so that result tables line up across experiments.
classifiers = np.array([svm1, svm2, tree, xgboost, rfc])

# Feature selection methods

## Dimensionality reduction methods

### PCA

In [None]:
# Run the shared evaluation helper with PCA as the only "selector", for every
# classifier in `classifiers`. The trailing list is presumably the set of
# target dimensionalities to try (other cells pass it as `n_features`) --
# TODO confirm against evaluation.full_evaluation.
pca_df = full_evaluation(
    train_data,
    train_labels,
    valid_data,
    valid_labels,
    [PCA()],
    classifiers,
    [10, 20, 50, 80],
)

# Persist the result table for later analysis.
pca_df.to_csv('data/pca2.csv', index=False)

## Wrapper methods

### RFE

In [None]:
# Recursive feature elimination (RFE) driven by a shallow random forest.
# Features are removed one at a time (step=1) until 7000 remain.
# The inner forest is seeded like every other estimator in this notebook so
# that the cell gives the same result on every run.
selector = RFE(
    estimator=RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0),
    n_features_to_select=7000,
    step=1,
    verbose=0,
)
# Only the random-forest classifier is evaluated for this wrapper method.
classifier = rfc

accuracy, perf_score, n_features = single_evaluation(
    train_data, train_labels, valid_data, valid_labels, selector, classifier
)

# Build the one-row result table directly instead of concatenating onto an
# empty frame. The selector is labelled 'RFE' (it was previously mislabelled
# 'Boruta' by copy-paste).
rfe_df = pd.DataFrame(
    {
        'Selector': ['RFE'],
        'Classifier': [classifier.__class__.__name__],
        'Number_of_Features': [n_features],
        'Accuracy': [accuracy],
        'Performance_score': [perf_score],
    },
    columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score'],
)

rfe_df.to_csv('data/rfe.csv', index=False)

### SFS

In [None]:
# Sequential forward selection (SFS) driven by a shallow random forest,
# growing the feature set until 100 features are selected.
# The inner forest is seeded so the cell is reproducible.
sfs = SequentialFeatureSelector(
    estimator=RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0),
    n_features_to_select=100,
    direction='forward',
)
# Only the random-forest classifier is evaluated for this wrapper method.
classifier = rfc

# Evaluate the selector built in THIS cell. The previous code passed the
# global `selector`, i.e. whatever an earlier cell happened to leave behind,
# so the forward selector was never actually used.
accuracy, perf_score, n_features = single_evaluation(
    train_data, train_labels, valid_data, valid_labels, sfs, classifier
)

# One-row result table, labelled 'SFS' (previously mislabelled 'Boruta').
sfs_df = pd.DataFrame(
    {
        'Selector': ['SFS'],
        'Classifier': [classifier.__class__.__name__],
        'Number_of_Features': [n_features],
        'Accuracy': [accuracy],
        'Performance_score': [perf_score],
    },
    columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score'],
)

sfs_df.to_csv('data/sfs.csv', index=False)


### SBS

In [None]:
# Sequential backward selection (SBS) driven by a shallow random forest,
# shrinking the feature set until 100 features remain.
# The inner forest is seeded so the cell is reproducible.
sbs = SequentialFeatureSelector(
    estimator=RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0),
    n_features_to_select=100,
    direction='backward',
)
# Only the random-forest classifier is evaluated for this wrapper method.
classifier = rfc

# Evaluate the selector built in THIS cell. The previous code passed the
# global `selector`, i.e. whatever an earlier cell happened to leave behind,
# so the backward selector was never actually used.
accuracy, perf_score, n_features = single_evaluation(
    train_data, train_labels, valid_data, valid_labels, sbs, classifier
)

# One-row result table, labelled 'SBS' (previously mislabelled 'Boruta').
sbs_df = pd.DataFrame(
    {
        'Selector': ['SBS'],
        'Classifier': [classifier.__class__.__name__],
        'Number_of_Features': [n_features],
        'Accuracy': [accuracy],
        'Performance_score': [perf_score],
    },
    columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score'],
)

sbs_df.to_csv('data/sbs.csv', index=False)


## Embedded methods

### Lasso

In [17]:
# L1-penalised logistic regression ("Lasso") used as an embedded selector:
# for each inverse regularisation strength C, fit on the training split,
# count the non-zero coefficients and score on the validation split.
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

# Rows are collected in a list and turned into a DataFrame once, instead of
# repeatedly concatenating onto an (initially empty) frame inside the loop.
lasso_rows = []
for C in Cs:
    lasso = LogisticRegression(penalty='l1', C=C, solver='liblinear', random_state=0)
    lasso.fit(train_data, train_labels)

    # Number of features the model actually uses (non-zero weights).
    n_features = int(np.count_nonzero(lasso.coef_[0]))
    score = lasso.score(valid_data, valid_labels)
    perf_score = performance_score(score, n_features)

    lasso_rows.append({
        'Selector': 'Lasso',
        'Classifier': 'Lasso',
        'Number_of_Features': n_features,
        'Accuracy': score,
        'Performance_score': perf_score,
    })

lasso_df = pd.DataFrame(
    lasso_rows,
    columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score'],
)

lasso_df.to_csv('data/lasso.csv', index=False)

### Elastic net

In [None]:
# Elastic-net regression used as an embedded selector: sweep the overall
# penalty strength (alpha) and the L1/L2 mix (l1_ratio), count the non-zero
# coefficients and measure validation accuracy for every combination.
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
l1_ratio = [0.5, 0.7, 0.9]

# Rows are collected in a list and turned into a DataFrame once, instead of
# repeatedly concatenating onto an (initially empty) frame inside the loop.
elastic_rows = []
for ratio in l1_ratio:
    for alpha in alphas:
        elastic = ElasticNet(alpha=alpha, l1_ratio=ratio, random_state=0, max_iter=2000)
        elastic.fit(train_data, train_labels)

        # Number of features the model actually uses (non-zero weights).
        n_features = int(np.count_nonzero(elastic.coef_))

        # ElasticNet is a regressor, so `elastic.score` would return R^2 rather
        # than accuracy. The labels are encoded as 0/1, so the continuous
        # prediction is thresholded at 0.5 to obtain class predictions.
        predicted = (elastic.predict(valid_data) >= 0.5).astype(int)
        score = accuracy_score(valid_labels, predicted)
        perf_score = performance_score(score, n_features)

        # Labelled 'ElasticNet' (previously mislabelled 'Lasso' by copy-paste).
        elastic_rows.append({
            'Selector': 'ElasticNet',
            'Classifier': 'ElasticNet',
            'Number_of_Features': n_features,
            'Accuracy': score,
            'Performance_score': perf_score,
            'alpha': alpha,
            'l1_ratio': ratio,
        })

elastic_df = pd.DataFrame(
    elastic_rows,
    columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score', 'alpha', 'l1_ratio'],
)

elastic_df.to_csv('data/elasticNet.csv', index=False)

### Random forest

In [None]:
# Evaluate the project's random-forest based selector with every classifier
# in `classifiers`, once per requested feature count.
rfs_df = full_evaluation(
    train_data,
    train_labels,
    valid_data,
    valid_labels,
    [RandomForestSelector()],
    classifiers,
    n_features=[10, 50, 100, 200, 500, 1000, 2000, 5000, 10000],
)

# Persist the result table for later analysis.
rfs_df.to_csv('data/rfs2.csv', index=False)

## Filter methods

### Correlation coefficient

In [None]:
# Evaluate the project's correlation-based selector with every classifier
# in `classifiers`, once per requested feature count.
corr_df = full_evaluation(
    train_data,
    train_labels,
    valid_data,
    valid_labels,
    [CorrelationSelector()],
    classifiers,
    n_features=[10, 50, 100, 200, 500, 1000, 2000, 5000, 10000],
)

# Persist the result table for later analysis.
corr_df.to_csv('data/corr2.csv', index=False)

### Mutual information

In [None]:
# Evaluate the project's mutual-information selector with every classifier
# in `classifiers`, once per requested feature count.
mutual_df = full_evaluation(
    train_data,
    train_labels,
    valid_data,
    valid_labels,
    [MutualInformationSelector()],
    classifiers,
    n_features=[10, 50, 100, 200, 500, 1000, 2000, 5000, 10000],
)

# Persist the result table for later analysis.
mutual_df.to_csv('data/mutual2.csv', index=False)

### Select K - Best

#### ANOVA

In [None]:
# Univariate filter: SelectKBest scored with the ANOVA F-statistic (f_classif),
# evaluated with every classifier in `classifiers` for each feature count.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000]
selector = SelectKBest(score_func=f_classif)

anova_df = full_evaluation(
    train_data,
    train_labels,
    valid_data,
    valid_labels,
    [selector],
    classifiers,
    n_features,
)

# Persist the result table for later analysis.
anova_df.to_csv('data/anova.csv', index=False)

#### chi2

In [8]:
# Univariate filter: SelectKBest scored with the chi-squared statistic (chi2),
# evaluated with every classifier in `classifiers` for each feature count.
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000]
selector = SelectKBest(score_func=chi2)

chi2_df = full_evaluation(
    train_data,
    train_labels,
    valid_data,
    valid_labels,
    [selector],
    classifiers,
    n_features,
)

# Persist the result table for later analysis.
chi2_df.to_csv('data/chi2.csv', index=False)

## Hybrid + wrapper

### Boruta algorithm

In [None]:
# Boruta feature selection wrapped around a shallow random forest; the same
# selector object is handed to `single_evaluation` once per classifier.
# NOTE(review): whether `single_evaluation` refits the selector on every call
# is not visible here -- if it does, Boruta is re-run for each classifier.
selector = BorutaPy(
    estimator=RandomForestClassifier(n_estimators=100, max_depth=3),
    n_estimators='auto',
    verbose=1,
    random_state=0,
)

# Rows are collected in a list and turned into a DataFrame once, instead of
# repeatedly concatenating onto an (initially empty) frame inside the loop.
boruta_rows = []
for classifier in classifiers:
    accuracy, perf_score, n_features = single_evaluation(
        train_data, train_labels, valid_data, valid_labels, selector, classifier
    )
    boruta_rows.append({
        'Selector': 'Boruta',
        'Classifier': classifier.__class__.__name__,
        'Number_of_Features': n_features,
        'Accuracy': accuracy,
        'Performance_score': perf_score,
    })

boruta_df = pd.DataFrame(
    boruta_rows,
    columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score'],
)

boruta_df.to_csv('data/boruta2.csv', index=False)