In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector, RFE, mutual_info_classif, SelectKBest, f_classif, chi2
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, ElasticNet
from feature_selection_package.evaluation import performance_score, single_evaluation, full_evaluation
from feature_selection_package.feature_selectors import CorrelationSelector, MutualInformationSelector, RandomForestSelector, EnsembleSelector
from boruta import BorutaPy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import balanced_accuracy_score

# Load dataset

In [2]:
# Load data artificial
artificial_train_data = pd.read_csv('data/artificial_train.data',header=None,sep=' ').dropna(axis=1)
artificial_train_labels = pd.read_csv('data/artificial_train.labels',header=None,sep=' ').dropna(axis=1)
artificial_valid_data = pd.read_csv('data/artificial_valid.data',header=None,sep=' ').dropna(axis=1)


In [9]:
artificial_train_labels.value_counts()

-1    1000
 1    1000
dtype: int64

In [3]:
train_data = artificial_train_data.loc[:np.round(artificial_train_data.shape[0]*0.8)]
train_labels = artificial_train_labels.loc[:np.round(artificial_train_labels.shape[0]*0.8)].replace(-1,0).values.ravel()
valid_data = artificial_train_data.loc[np.round(artificial_train_data.shape[0]*0.8):]
valid_labels = artificial_train_labels.loc[np.round(artificial_train_labels.shape[0]*0.8):].replace(-1,0).values.ravel()

# Check if train and valid data is balanced

In [5]:
print('number of observations in each class in train set:', np.unique(train_labels, return_counts=True))
print('number of observations in each class in validation set:', np.unique(valid_labels, return_counts=True))

number of observations in each class in train set: (array([0, 1], dtype=int64), array([793, 808], dtype=int64))
number of observations in each class in validation set: (array([0, 1], dtype=int64), array([208, 192], dtype=int64))


# Estimators

In [4]:
svm1 = SVC(kernel='rbf', C=1, random_state=0)
svm2 = SVC(kernel='linear', C=1, random_state=0)

tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
xgboost = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, random_state=0)
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)

logreg = LogisticRegression(penalty='l2', C=1, random_state=0, max_iter=1000)

classifiers = np.array([svm1, svm2, tree, xgboost, rfc, logreg])

# Feature selection methods

## Dimensionality reduction methods

### PCA

In [6]:
n_features = [10, 20, 30, 50, 75]
pca_results = pd.DataFrame()
for n in n_features:
    selector = [PCA(n_components=n)]
    pca_df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    pca_results = pd.concat([pca_results, pca_df])
pca_results.to_csv('data2/pca.csv', index=False)

## Wrapper methods

### RFE

In [21]:
n_features = [10, 20, 50]
rfe_results = pd.DataFrame()
for n in n_features:
    selector = [RFE(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_features_to_select=n, step=1, verbose=1)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    rfe_results = pd.concat([rfe_results, df])
rfe_results.to_csv('data2/RFE.csv', index=False)

KeyboardInterrupt: 

## Embeded methods

### Lasso

In [None]:
lasso_df = pd.DataFrame(columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score', 'Supported_Features'])
Cs = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 10]
for C in Cs:
    lasso = LogisticRegression(penalty='l1', C=C, solver='liblinear', random_state=0)
    lasso.fit(train_data, train_labels)
    n_features = sum(lasso.coef_[0] != 0)
    supported_features = (lasso.coef_[0] != 0)
    score = balanced_accuracy_score(valid_labels, lasso.predict(valid_data))
    perf_score = performance_score(score, n_features, dataset_type='sms')
    lasso_df = pd.concat([lasso_df, pd.DataFrame({'Selector': ['Lasso'], 'Classifier': ['Lasso'], 'Number_of_Features': [n_features], 'Accuracy': [score], 'Performance_score': [perf_score], 'Supported_Features': [supported_features]})], ignore_index=True)

lasso_df.to_csv('data2/lasso.csv', index=False)

### Elastic net

In [None]:
alphas = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 10]
l1_ratio = [0.5, 0.7, 0.9]
elastic_df = pd.DataFrame(columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score', 'alpha', 'l1_ratio', 'Supported_Features'])
for ratio in l1_ratio:
    for alpha in alphas:
        elastic = ElasticNet(alpha=alpha, l1_ratio=ratio, random_state=0, max_iter = 10000)
        elastic.fit(train_data, train_labels)
        n_features = sum(elastic.coef_!= 0)
        supported_features = (elastic.coef_[0] != 0)
        y_pred = np.where(elastic.predict(valid_data) > 0.5, 1, 0)
        score = balanced_accuracy_score(valid_labels, y_pred)
        perf_score = performance_score(score, n_features, dataset_type='sms')
        elastic_df = pd.concat([elastic_df, pd.DataFrame({'Selector': ['ElasticNet'], 'Classifier': ['ElasticNet'], 'Number_of_Features': [n_features], 'Accuracy': [score], 'Performance_score': [perf_score], "alpha": [alpha], "l1_ratio": [ratio], 'Supported_Features': [supported_features]})], ignore_index=True)

elastic_df.to_csv('data2/elasticNet.csv', index=False)

### Random forest

In [9]:
n_features = [10, 20, 30, 50, 75, 100]
forest_results = pd.DataFrame()
for n in n_features:
    selector = [RandomForestSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    forest_results = pd.concat([forest_results, df])
forest_results.to_csv('data2/forest.csv', index=False)

## Filter methods

### Correlation coefficient

In [10]:
n_features = [10, 20, 30, 50, 75, 100]
corr_results = pd.DataFrame()
for n in n_features:
    selector = [CorrelationSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    corr_results = pd.concat([corr_results, df])
corr_results.to_csv('data2/corr.csv', index=False)

### Mutual information

In [11]:
n_features = [10, 20, 30, 50, 75, 100]
mutual_results = pd.DataFrame()
for n in n_features:
    selector = [MutualInformationSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    mutual_results = pd.concat([mutual_results, df])
mutual_results.to_csv('data2/mutual.csv', index=False)

### Select K - Best

#### ANOVA

In [12]:
n_features = [10, 20, 30, 50, 75, 100]
anova_results = pd.DataFrame()
for n in n_features:
    selector = [SelectKBest(f_classif, k=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    anova_results = pd.concat([anova_results, df])
anova_results.to_csv('data2/anova.csv', index=False)

### chi2

In [13]:
n_features = [10, 20, 30, 50, 75, 100]
chi2_results = pd.DataFrame()
for n in n_features:
    selector = [SelectKBest(chi2, k=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    chi2_results = pd.concat([chi2_results, df])
chi2_results.to_csv('data2/chi2.csv', index=False)

## Hybrid + wrapper

### Boruta algorithm

In [None]:
selector = [BorutaPy(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_estimators='auto', verbose=1, random_state=0)]
boruta_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
boruta_results.to_csv('data2/boruta.csv', index=False)

## Stacking

In [17]:
n_features_rfs = [100, 200]
n_features_rfe = [10, 25, 50]

In [18]:
stack_results = pd.DataFrame()
for n in n_features_rfs:
    for m in n_features_rfe:
        selector1 = RandomForestSelector(n_features=n)
        selector2 = RFE(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_features_to_select=m, step=1, verbose=1)
        selectors = [[selector1, selector2]]
        df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selectors, classifiers, dataset_type='artificial')
        stack_results = pd.concat([stack_results, df])
        stack_results.to_csv('data/stack.csv', index=False)

stack_results.to_csv('data2/stack.csv', index=False)

Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator with 83 features.
Fitting estimator with 82 features.
Fitting estimator with 81 features.
Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 f

## Ensemble

In [15]:
n_features = [10, 20, 30, 50, 75, 100]
ensemble_results = pd.DataFrame()
for n in n_features:
    selectors = [RandomForestSelector(n_features=n), SelectKBest(f_classif, k=n), SelectKBest(chi2, k=n), CorrelationSelector(n_features=n), MutualInformationSelector(n_features=n)]
    ensemble = [EnsembleSelector(selectors=selectors)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, ensemble, classifiers, dataset_type='artificial')
    ensemble_results = pd.concat([ensemble_results, df])
    
ensemble_results.to_csv('data2/ensemble.csv', index=False)