In [2]:
import pandas as pd  
import numpy as np  
from sklearn.model_selection import train_test_split  
from sklearn.feature_selection import VarianceThreshold

In [3]:
paribas_data = pd.read_csv('train.csv', nrows=20000)  
paribas_data.shape

(20000, 133)

In [4]:
num_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']  
numerical_columns = list(paribas_data.select_dtypes(include=num_colums).columns)  
paribas_data = paribas_data[numerical_columns]  
paribas_data.shape  

(20000, 114)

In [5]:
train_features, test_features, train_labels, test_labels = train_test_split(  
    paribas_data.drop(labels=['target', 'ID'], axis=1),
    paribas_data['target'],
    test_size=0.2,
    random_state=41)


In [6]:
correlated_features = set()  
correlation_matrix = paribas_data.corr()  


In [7]:
for i in range(len(correlation_matrix .columns)):  
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)


train_features.drop(labels=correlated_features, axis=1, inplace=True)  
test_features.drop(labels=correlated_features, axis=1, inplace=True)

train_features.shape, test_features.shape

((16000, 57), (4000, 57))

## Feature Selection ##

### Wrapper###

The __k_features__ specifies the number of features to select. You can set any number of features here. The __forward__ parameter, if set to True, performs step forward feature selection. The __verbose__ parameter is used for logging the progress of the feature selector, the __scoring__ (https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5)parameter defines the performance evaluation criteria and finally, __cv__ refers to cross-validation folds.

In [8]:
#Implementing Step Forward Feature Selection in Python
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier  
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector

feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_jobs=-1),  
           k_features=15,
           forward=True, # for backward feature Selection set as False
           verbose=2,
           scoring='roc_auc',
           cv=4)

In [9]:
features = feature_selector.fit(np.array(train_features.fillna(0)), train_labels)  

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s












[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:   46.1s finished

[2019-05-26 20:10:53] Features: 1/15 -- score: 0.6118576347827033[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s












[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:   37.1s finished

[2019-05-26 20:11:30] Features: 2/15 -- score: 0.6289308104943303[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s












[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:   38.2s finished

[2019-05-26 20:12:09] Features: 3/15 -- score: 0.6573040438821603[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s












[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:   43.0s finished

[2019-05-26 20:12:52] Features: 4/15 -- score: 0.6353557363903706[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s












[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:   39.9s finished

[2019-05-26 20:13:32] Features: 5/15 -- score: 0.6436895522085306[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:   38.1s finished

[2019-05-26 20:14:10] Features: 6/15 -- score: 0.647603693142484[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:   38.2s finished

[2019-05-26 20:14:48] Features: 7/15 -- score: 0.6454937547180324[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s












[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   37.5s finished

[2019-05-26 20:15:26] Features: 8/15 -- score: 0.6452926045885841[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:   41.4s finished

[2019-05-26 20:16:07] Features: 9/15 -- score: 0.6489264157955874[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   40.6s finished

[2019-05-26 20:16:48] Features: 10/15 -- score: 0.6558382369588293[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  47 out of  47 | elapsed:   39.0s finished

[2019-05-26 20:17:27] Features: 11/15 -- score: 0.6522126368126894[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  46 out of  46 | elapsed:   38.3s finished

[2019-05-26 20:18:06] Features: 12/15 -- score: 0.6539201301355875[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   37.3s finished

[2019-05-26 20:18:43] Features: 13/15 -- score: 0.6557679958761226[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s










[Parallel(n_jobs=1)]: Done  44 out of  44 | elapsed:   37.2s finished

[2019-05-26 20:19:20] Features: 14/15 -- score: 0.6539678697393018[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s








[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   36.5s finished

[2019-05-26 20:19:57] Features: 15/15 -- score: 0.6532875112021457

In [10]:
filtered_features= train_features.columns[list(features.k_feature_idx_)]  

clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)  
clf.fit(train_features[filtered_features].fillna(0), train_labels)

train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))  
print('Accuracy on training set: {}'.format(roc_auc_score(train_labels, train_pred[:,1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))  
print('Accuracy on test set: {}'.format(roc_auc_score(test_labels, test_pred [:,1])))  

Accuracy on training set: 0.7084535377684098
Accuracy on test set: 0.713569628990509


In [11]:
# from mlxtend.feature_selection import ExhaustiveFeatureSelector  

In [15]:
test_pred

array([[0.3366072 , 0.6633928 ],
       [0.30307468, 0.69692532],
       [0.30289291, 0.69710709],
       ...,
       [0.22834289, 0.77165711],
       [0.30669955, 0.69330045],
       [0.1607965 , 0.8392035 ]])

The class has __min_features__ and __max_features__ attributes which can be used to specify the minimum and the maximum number of features in the combination.

In [12]:
# feature_selector = ExhaustiveFeatureSelector(RandomForestClassifier(n_jobs=-1),  
#            min_features=2,
#            max_features=4,
#            scoring='roc_auc',
#            print_progress=True,
#            cv=2)
# features = feature_selector.fit(np.array(train_features.fillna(0)), train_labels)  

# filtered_features= train_features.columns[list(features.k_feature_idx_)]  

# clf_Exf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)  
# clf_Exf.fit(train_features[filtered_features].fillna(0), train_labels)

# train_pred = clf_Exf.predict_proba(train_features[filtered_features].fillna(0))  
# print('Accuracy on training set: {}'.format(roc_auc_score(train_labels, train_pred[:,1])))

# test_pred = clf_Exf.predict_proba(test_features[filtered_features].fillna(0))  
# print('Accuracy on test set: {}'.format(roc_auc_score(test_labels, test_pred [:,1])))  