In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector, RFE, mutual_info_classif, SelectKBest, f_classif, chi2
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, ElasticNet
from feature_selection_package.evaluation import performance_score, single_evaluation, full_evaluation
from feature_selection_package.feature_selectors import CorrelationSelector, MutualInformationSelector, RandomForestSelector, EnsembleSelector
from boruta import BorutaPy
from sklearn.feature_extraction.text import CountVectorizer

def get_word_counts_train_test(train, test):
    vectorizer = CountVectorizer()
    word_counts_train = vectorizer.fit_transform(train['message'])
    word_counts_test = vectorizer.transform(test['message'])
    feature_names = vectorizer.get_feature_names_out()
    counts_train_df = pd.DataFrame(word_counts_train.toarray(), columns=feature_names)
    counts_test_df = pd.DataFrame(word_counts_test.toarray(), columns=feature_names)
    result_train_df = pd.concat([train['label'], counts_train_df], axis=1)
    result_test_df = pd.concat([test['label'], counts_test_df], axis=1)
    return result_train_df, result_test_df



# Load dataset

In [2]:
# Load data artificial
artificial_train_data = pd.read_csv('data/artificial_train.data',header=None,sep=' ').dropna(axis=1)
artificial_train_labels = pd.read_csv('data/artificial_train.labels',header=None,sep=' ').dropna(axis=1)
artificial_valid_data = pd.read_csv('data/artificial_valid.data',header=None,sep=' ').dropna(axis=1)


In [3]:
train_data = artificial_train_data.loc[:np.round(artificial_train_data.shape[0]*0.8)]
train_labels = artificial_train_labels.loc[:np.round(artificial_train_labels.shape[0]*0.8)].replace(-1,0).values.ravel()
valid_data = artificial_train_data.loc[np.round(artificial_train_data.shape[0]*0.8):]
valid_labels = artificial_train_labels.loc[np.round(artificial_train_labels.shape[0]*0.8):].replace(-1,0).values.ravel()

In [9]:
print('number of observations in each class in train set:', np.unique(train_labels, return_counts=True))
print('number of observations in each class in validation set:', np.unique(valid_labels, return_counts=True))


number of observations in each class in train set: (array([0, 1], dtype=int64), array([793, 808], dtype=int64))
number of observations in each class in validation set: (array([0, 1], dtype=int64), array([208, 192], dtype=int64))


In [17]:

# Load data sms
# sms_train = pd.read_csv('data/sms_train.csv')
# sms_train_data, sms_train_labels = sms_train.iloc[:, 1], sms_train.iloc[:, 0]

# sms_test_data = pd.read_csv('data/sms_test.csv')
# sms_test = sms_test_data.copy()
# sms_test['label'] = np.nan


In [60]:
# sms_test_data

Unnamed: 0,message
0,"Yo, you at jp and hungry like a mofo?"
1,It's é only $140 ard...É rest all ard $180 at ...
2,"&lt;#&gt; , that's all? Guess that's easy enough"
3,Y?WHERE U AT DOGBREATH? ITS JUST SOUNDING LIKE...
4,Good afternoon sexy buns! How goes the job sea...
...,...
995,Tell your friends what you plan to do on Valen...
996,No. Yes please. Been swimming?
997,Thank you. I like you as well...
998,Stupid.its not possible


In [15]:
# there is no use for that dataset
# sms = pd.read_csv('data/sms.tsv', header=None, sep='\t')
# sms.columns = ['label', 'message']

In [18]:
# preprocessed_sms_train , preprocessed_sms_test = get_word_counts_train_test(sms_train, sms_test)


In [19]:
# train_data = preprocessed_sms_train.iloc[:np.round(len(preprocessed_sms_train)*0.8).astype(int), 1:]
# train_labels = preprocessed_sms_train.iloc[:np.round(len(preprocessed_sms_train)*0.8).astype(int), 0]
# valid_data = preprocessed_sms_train.iloc[np.round(len(preprocessed_sms_train)*0.8).astype(int):, 1:]
# valid_labels = preprocessed_sms_train.iloc[np.round(len(preprocessed_sms_train)*0.8).astype(int):, 0]

# Check if train and valid data is balanced

In [20]:
print('number of observations in each class in train set:', np.unique(train_labels, return_counts=True))
print('number of observations in each class in validation set:', np.unique(valid_labels, return_counts=True))

number of observations in each class in train set: (array([0, 1], dtype=int64), array([3170,  488], dtype=int64))
number of observations in each class in validation set: (array([0, 1], dtype=int64), array([789, 125], dtype=int64))


# Estimators

In [15]:
svm1 = SVC(kernel='rbf', C=1, random_state=0)
svm2 = SVC(kernel='linear', C=1, random_state=0)

tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
xgboost = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, random_state=0)
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)

logreg = LogisticRegression(penalty='l2', C=1, random_state=0, max_iter=1000)

classifiers = np.array([svm1, svm2, tree, xgboost, rfc, logreg])

# Feature selection methods

## Dimensionality reduction methods

### PCA

In [7]:
n_features = [10, 20, 50, 80]
pca_results = pd.DataFrame()
for n in n_features:
    selector = [PCA(n_components=n)]
    pca_df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    pca_results = pd.concat([pca_results, pca_df])
pca_results.to_csv('data2/pca.csv', index=False)

## Wrapper methods

### RFE

In [None]:
n_features = [100, 1000, 7000]
selector = [RFE(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_features_to_select=7000, step=1, verbose=0)]
rfe_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
rfe_results.to_csv('data2/RFE.csv', index=False)

### SFS

In [None]:
selector = [SequentialFeatureSelector(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_features_to_select=100, direction='forward')]
sfs_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
sfs_results.to_csv('data2/SFS.csv', index=False)

### SBS

In [None]:
selector = [SequentialFeatureSelector(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_features_to_select=100, direction='backward')]
sbs_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
sbs_results.to_csv('data2/SBS.csv', index=False)

## Embeded methods

### Lasso

In [9]:
lasso_df = pd.DataFrame(columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score'])
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
for C in Cs:
    lasso = LogisticRegression(penalty='l1', C=C, solver='liblinear', random_state=0)
    lasso.fit(train_data, train_labels)
    n_features = sum(lasso.coef_[0] != 0)
    score = lasso.score(valid_data, valid_labels)
    perf_score = performance_score(score, n_features)
    lasso_df = pd.concat([lasso_df, pd.DataFrame({'Selector': ['Lasso'], 'Classifier': ['Lasso'], 'Number_of_Features': [n_features], 'Accuracy': [score], 'Performance_score': [perf_score]})], ignore_index=True)

lasso_df.to_csv('data2/lasso.csv', index=False)

### Elastic net

In [71]:
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
l1_ratio = [0.9, 0.95, 0.98]
elastic_df = pd.DataFrame(columns=['Selector', 'Classifier', 'Number_of_Features', 'Accuracy', 'Performance_score', 'alpha', 'l1_ratio'])
for ratio in l1_ratio:
    for alpha in alphas:
        elastic = ElasticNet(alpha=alpha, l1_ratio=ratio, random_state=0, max_iter = 10000)
        elastic.fit(train_data, train_labels)
        n_features = sum(elastic.coef_!= 0)
        score = elastic.score(valid_data, valid_labels)
        perf_score = performance_score(score, n_features)
        elastic_df = pd.concat([elastic_df, pd.DataFrame({'Selector': ['Lasso'], 'Classifier': ['Lasso'], 'Number_of_Features': [n_features], 'Accuracy': [score], 'Performance_score': [perf_score], "alpha": [alpha], "l1_ratio": [ratio]})], ignore_index=True)

elastic_df.to_csv('data2/elasticNet.csv', index=False)

In [72]:
elasticNet = pd.read_csv('data2/elasticNet.csv')
elasticNet['Selector'] = 'ElasticNet'
elasticNet['Classifier'] = 'ElasticNet'
elasticNet.to_csv('data2/elasticNet.csv', index=False)

### Random forest

In [None]:
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
forest_results = pd.DataFrame()
for n in n_features:
    selector = [RandomForestSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    forest_results = pd.concat([forest_results, df])
forest_results.to_csv('data2/forest.csv', index=False)

## Filter methods

### Correlation coefficient

In [16]:
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
corr_results = pd.DataFrame()
for n in n_features:
    selector = [CorrelationSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    corr_results = pd.concat([corr_results, df])
corr_results.to_csv('data2/corr.csv', index=False)

### Mutual information

In [None]:
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
mutual_results = pd.DataFrame()
for n in n_features:
    selector = [MutualInformationSelector(n_features=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers, dataset_type='artificial')
    mutual_results = pd.concat([mutual_results, df])
mutual_results.to_csv('data2/mutual.csv', index=False)

### Select K - Best

#### ANOVA

In [None]:
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
anova_results = pd.DataFrame()
for n in n_features:
    selector = [SelectKBest(f_classif, k=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    anova_results = pd.concat([anova_results, df])
anova_results.to_csv('data2/anova.csv', index=False)

### chi2

In [None]:
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
chi2_results = pd.DataFrame()
for n in n_features:
    selector = [SelectKBest(chi2, k=n)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
    chi2_results = pd.concat([chi2_results, df])
chi2_results.to_csv('data2/chi2.csv', index=False)

## Hybrid + wrapper

### Boruta algorithm

In [None]:
selector = [BorutaPy(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_estimators='auto', verbose=1, random_state=0)]
boruta_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selector, classifiers)
boruta_results.to_csv('data/boruta.csv', index=False)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

## Stacking

In [None]:
selector1 = RandomForestSelector(n_features=500)
selector2 = RFE(estimator=RandomForestClassifier(n_estimators=100, max_depth=3), n_features_to_select=100, step=1, verbose=1)
selectors = [[selector1, selector2]]
stack_results = full_evaluation(train_data, train_labels, valid_data, valid_labels, selectors, classifiers)
stack_results.to_csv('data/stack.csv', index=False)

Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
F

## Ensemble

In [None]:
n_features = [10, 50, 100, 200, 500, 1000, 2000, 5000, 7000]
ensemble_results = pd.DataFrame()
for n in n_features:
    selectors = [RandomForestSelector(n_features=n), SelectKBest(f_classif, k=n), SelectKBest(chi2, k=n), CorrelationSelector(n_features=n), MutualInformationSelector(n_features=n)]
    ensemble = [EnsembleSelector(selectors=selectors)]
    df = full_evaluation(train_data, train_labels, valid_data, valid_labels, ensemble, classifiers)
    ensemble_results = pd.concat([ensemble_results, df])
    
ensemble_results.to_csv('data2/ensemble.csv', index=False)

   91  105  113  119  128  144  165  167  173  178  183  187  189  199
  203  205  206  216  221  222  226  238  245  248  251  259  265  270
  274  280  292  331  333  335  359  360  371  380  384  411  417  420
  423  429  433  460  483  542  544  554  555  567  569  578  592  594
  600  601  606  626  627  646  656  666  672  708  714  718  738  746
  749  751  769  772  773  778  782  789  791  796  806  814  822  824
  826  830  831  836  843  853  857  862  870  871  882  885  899  914
  917  923  926  939  949  950  966  971  975  977  985  987  988  998
  999 1012 1020 1026 1029 1030 1032 1040 1043 1044 1058 1065 1080 1086
 1095 1104 1126 1129 1143 1166 1169 1176 1195 1206 1209 1216 1217 1219
 1225 1226 1240 1241 1245 1248 1253 1255 1270 1279 1283 1306 1308 1338
 1343 1349 1366 1397 1398 1420 1439 1445 1446 1478 1481 1489 1491 1493
 1499 1512 1514 1515 1524 1527 1530 1532 1538 1551 1560 1563 1569 1579
 1582 1591 1595 1621 1623 1633 1643 1656 1667 1679 1687 1688 1690 1702
 1714 