In [None]:
import pandas as pd  
import numpy as np  
from sklearn.model_selection import train_test_split  
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load only the first 20000 rows of the training file, then display the
# (rows, columns) shape of the loaded frame.
paribas_data = pd.read_csv(filepath_or_buffer='train.csv', nrows=20000)
paribas_data.shape

In [None]:
# dtype names that count as "numerical" for the rest of the notebook.
num_colums = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
# Names of the columns whose dtype is one of the above.
numerical_columns = paribas_data.select_dtypes(include=num_colums).columns.tolist()
# Keep the numerical columns only (this rebinds paribas_data).
paribas_data = paribas_data.loc[:, numerical_columns]
paribas_data.shape

In [None]:
# Hold out 20% of the rows. The feature matrix is every remaining column
# except 'target' and 'ID'; the label vector is the 'target' column.
# random_state is fixed so the split is the same on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    paribas_data.drop(columns=['target', 'ID']),
    paribas_data['target'],
    test_size=0.2,
    random_state=41,
)


In [None]:
# Names of the features judged redundant; filled in by the next cell.
correlated_features = set()
# Correlate the training features only. paribas_data still contains 'ID' and
# 'target' (they are dropped only for the split), so correlating the whole
# frame would (a) flag features that are strongly correlated with the target
# as "redundant", (b) possibly flag 'target' itself, which does not exist in
# the feature frames, and (c) let the held-out rows influence the selection.
correlation_matrix = train_features.corr()


In [None]:
# Absolute correlation above which a column is considered redundant.
CORRELATION_THRESHOLD = 0.8

# Flag every column whose absolute correlation with any column that precedes
# it in the matrix exceeds the threshold. Only the strictly lower triangle is
# inspected (k=-1), so each pair is looked at once and the diagonal is skipped.
# NaN correlations compare as False and are therefore never flagged.
strong_pairs = np.tril(correlation_matrix.abs().values, k=-1) > CORRELATION_THRESHOLD
correlated_features.update(correlation_matrix.columns[strong_pairs.any(axis=1)])

# Rebind instead of dropping in place, and ignore names that are not present:
# this keeps the cell re-runnable (a second run would otherwise raise KeyError
# because the columns are already gone) and tolerates flagged names that were
# never part of the feature frames.
train_features = train_features.drop(columns=list(correlated_features), errors='ignore')
test_features = test_features.drop(columns=list(correlated_features), errors='ignore')

train_features.shape, test_features.shape

## Feature Selection ##

### Wrapper ###

The __k_features__ parameter specifies the number of features to select; you can set any number of features here. The __forward__ parameter, if set to True, performs step forward feature selection. The __verbose__ parameter controls logging of the feature selector's progress, the __scoring__ parameter defines the performance evaluation criterion (see https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5 for an explanation of ROC AUC), and finally, __cv__ sets the number of cross-validation folds.

In [None]:
#Implementing Step Forward Feature Selection in Python
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier  
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector

# Step-forward wrapper selection of 15 features, scored by 4-fold
# cross-validated ROC AUC. The wrapped random forest is seeded with the same
# random_state used elsewhere in this notebook so that the selected feature
# set is reproducible from one run to the next.
feature_selector = SequentialFeatureSelector(
    RandomForestClassifier(n_jobs=-1, random_state=41),
    k_features=15,
    forward=True,  # set to False for step-backward selection
    verbose=2,
    scoring='roc_auc',
    cv=4)

In [None]:
# Run the selector on the training features as a plain array, with missing
# values replaced by 0; the return value of fit() is kept as `features`.
features = feature_selector.fit(
    np.asarray(train_features.fillna(0)),
    train_labels,
)

In [None]:
# Column labels of the features picked by the selector fitted above.
filtered_features = train_features.columns[list(features.k_feature_idx_)]

# Fit a shallow, seeded random forest on the selected columns only.
# `model` holds the return value of clf.fit and is used by a later cell.
clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
model = clf.fit(train_features[filtered_features].fillna(0), train_labels)

# The metric below is ROC AUC computed from column 1 of predict_proba, not
# accuracy, so the printed labels say so.
train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector  

The `ExhaustiveFeatureSelector` class has __min_features__ and __max_features__ parameters, which specify the minimum and the maximum number of features in the combinations it evaluates.

In [None]:
# Exhaustive wrapper selection: every combination of 2 to 4 features is
# scored by 2-fold cross-validated ROC AUC, so the run time grows quickly with
# the number of columns. The wrapped forest is seeded so the chosen subset is
# reproducible.
feature_selector = ExhaustiveFeatureSelector(
    RandomForestClassifier(n_jobs=-1, random_state=41),
    min_features=2,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2)
features = feature_selector.fit(np.array(train_features.fillna(0)), train_labels)

# ExhaustiveFeatureSelector exposes the winning subset as `best_idx_`;
# `k_feature_idx_` exists only on SequentialFeatureSelector.
filtered_features = train_features.columns[list(features.best_idx_)]

# Fit a shallow, seeded random forest on the selected columns only.
clf_Exf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf_Exf.fit(train_features[filtered_features].fillna(0), train_labels)

# The metric below is ROC AUC computed from column 1 of predict_proba, not
# accuracy, so the printed labels say so.
train_pred = clf_Exf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf_Exf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))

In [None]:
# Load the full file that the predictions below are made for.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# `filtered_features` was last reassigned from the exhaustive selector, and
# clf_Exf is the classifier fitted on exactly those columns. `model` was
# fitted on the earlier step-forward selection, so feeding it these columns
# would mismatch the features it was trained on.
Y_predict = clf_Exf.predict_proba(test[filtered_features].fillna(0))

In [None]:
# Display column 0 of the predicted probabilities.
# NOTE(review): presumably this is the probability of the first class in the
# classifier's classes_ (target == 0) — confirm.
Y_predict[:,0]  

In [None]:
# Lay the probability matrix out as a single row holding twice as many
# entries as Y_predict has rows, then display the resulting shape.
X = Y_predict.reshape((1, 2 * len(Y_predict)))

X.shape

In [None]:
# Two-column frame: the test IDs next to column 1 of the predicted
# probabilities, written to disk without the index and previewed.
submission = test[['ID']].assign(PredictedProb=Y_predict[:, 1])
submission.to_csv("submission.csv", index=False)
submission.head()


In [None]:
# Rebinds `submission` to the test IDs next to column 0 of the predicted
# probabilities.
# NOTE(review): the previous cell saves column 1 under the same
# 'PredictedProb' name; column 0 is presumably the probability of the other
# class — confirm which one is intended before using this frame.
submission = test[['ID']].assign(PredictedProb=Y_predict[:, 0])