In [35]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score

from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
    SmartCorrelatedSelection,
)

### Preprocess data

In [36]:
# Load the pre-cleaned dataset; the bare last expression displays
# its (n_rows, n_columns) shape.
data = pd.read_csv(
    'Feature_Selection/precleaned-datasets/dataset_1.csv'
)
data.shape

(50000, 301)

In [37]:
# Hold out 30% of the rows as a test set. 'target' is the label column;
# every remaining column is used as a predictor.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [38]:
# Snapshot the untouched train/test features so that models trained on
# the full feature set can be compared with the reduced feature sets
# later in the notebook.
X_train_original, X_test_original = X_train.copy(), X_test.copy()

### Feature selection

In [39]:
# Random forest handed to SmartCorrelatedSelection as its estimator;
# it is used to choose which feature to keep from each correlated group.
rf = RandomForestClassifier(n_estimators=10, random_state=20, n_jobs=-1)

In [40]:
# Chain all the feature-selection steps into a single pipeline.
#
# Alternatives for the correlation step that are currently disabled:
#   * DropCorrelatedFeatures(method='pearson', threshold=0.8,
#                            missing_values='ignore', variables=None)
#   * SmartCorrelatedSelection(selection_method='variance')
pipe = Pipeline(steps=[
    ('constant', DropConstantFeatures(tol=0.998)),
    ('variance', VarianceThreshold(threshold=0.01)),
    ('duplicated', DropDuplicateFeatures()),
    (
        'smart correlation',
        SmartCorrelatedSelection(
            selection_method='model_performance',
            estimator=rf,  # the small random forest defined above
            scoring='roc_auc',
            cv=3,
        ),
    ),
])

# The selectors are fitted on the training split only.
pipe.fit(X_train, y_train)

In [41]:
# Apply the fitted selection pipeline to both splits (train first, then
# test) and display the resulting shapes.
X_train_p1, X_test_p1 = map(pipe.transform, (X_train, X_test))

X_train_p1.shape, X_test_p1.shape

((35000, 76), (15000, 76))

### Evaluate the difference in performance of the original and simplified datasets

In [42]:
# define the functions to run logistic regression and random forest

def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and print its train and test ROC-AUC.

    The model is built with ``random_state=44`` and ``max_iter=500`` and is
    fitted on the training split only. For each split, the second column
    of ``predict_proba`` is scored against the labels with
    ``roc_auc_score``.

    Parameters
    ----------
    X_train, X_test : features of the training and test splits.
    y_train, y_test : labels of the training and test splits.

    Returns
    -------
    None. The scores are printed, not returned.
    """
    logit = LogisticRegression(random_state=44, max_iter=500)
    logit.fit(X_train, y_train)

    # Same reporting for both splits, training split first.
    for split_name, features, labels in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(split_name)
        second_column = logit.predict_proba(features)[:, 1]
        print('Logistic Regression roc-auc: {}'.format(
            roc_auc_score(labels, second_column)))
    

def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest and print its train and test ROC-AUC.

    The forest is built with ``n_estimators=200``, ``random_state=39`` and
    ``max_depth=4`` and is fitted on the training split only. For each
    split, the second column of ``predict_proba`` is scored against the
    labels with ``roc_auc_score``.

    Parameters
    ----------
    X_train, X_test : features of the training and test splits.
    y_train, y_test : labels of the training and test splits.

    Returns
    -------
    None. The scores are printed, not returned.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    forest.fit(X_train, y_train)

    # Same reporting for both splits, training split first.
    for split_name, features, labels in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(split_name)
        second_column = forest.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(
            roc_auc_score(labels, second_column)))

In [43]:
# Baseline: random forest trained on the full, original feature set.
run_randomForests(X_train_original, X_test_original, y_train, y_test)

Train set
Random Forests roc-auc: 0.807612232524249
Test set
Random Forests roc-auc: 0.7868832427636059


In [44]:
# Random forest trained on the features kept by the first selection
# pipeline (constant / variance / duplicate filters + smart correlation).
run_randomForests(X_train_p1, X_test_p1, y_train, y_test)

Train set
Random Forests roc-auc: 0.8152664218595246
Test set
Random Forests roc-auc: 0.795064472646812


In [45]:
# Logistic regression: full original feature set vs. the features kept
# by the first selection pipeline.
print("use the original dataset")
run_logistic(X_train_original, X_test_original, y_train, y_test)

print("use the feature selected dataset")
run_logistic(X_train_p1, X_test_p1, y_train, y_test)

use the original dataset
Train set
Logistic Regression roc-auc: 0.6035712266362897
Test set
Logistic Regression roc-auc: 0.5884884604841877
use the feature selected dataset
Train set
Logistic Regression roc-auc: 0.6004018361054397
Test set
Logistic Regression roc-auc: 0.5863775782700476


### Use embedded tree importance for feature selection

In [46]:
# Second selection pipeline: the same constant / variance / duplicate
# filters as before, followed by Pearson-correlation dropping, selection
# by random-forest importance (SelectFromModel) and standard scaling.
# Note: this rebinds `pipe`, replacing the pipeline fitted earlier.
pipe = Pipeline(steps=[
    ('constant', DropConstantFeatures(tol=0.998)),
    ('variance', VarianceThreshold(threshold=0.01)),
    ('duplicated', DropDuplicateFeatures()),
    (
        'drop by correlation',
        DropCorrelatedFeatures(
            method='pearson',
            threshold=0.8,
            missing_values='ignore',
            variables=None,
        ),
    ),
    (
        'embedded features',
        SelectFromModel(
            RandomForestClassifier(n_estimators=50, random_state=10, n_jobs=-1)
        ),
    ),
    ('scale', StandardScaler()),
])

# The selectors and the scaler are fitted on the training split only.
pipe.fit(X_train, y_train)

In [48]:
# Apply the second pipeline (selection + scaling) to both splits.
X_train_p2, X_test_p2 = map(pipe.transform, (X_train, X_test))
print(f"X_train shape = {X_train_p2.shape}")

# NOTE(review): the *_p2 data has passed through the pipeline's final
# StandardScaler step, while the *_original data is unscaled. The
# logistic-regression comparison below therefore reflects scaling as well
# as feature selection - consider scaling the original data too before
# attributing the difference to selection alone.

print("randomforest for original dataset")
run_randomForests(X_train_original, X_test_original, y_train, y_test)

print("randomforest for feature selected dataset")
run_randomForests(X_train_p2, X_test_p2, y_train, y_test)

print("logistic regression for original dataset")
run_logistic(X_train_original, X_test_original, y_train, y_test)

print("logistic regression for feature selected dataset")
run_logistic(X_train_p2, X_test_p2, y_train, y_test)

X_train shape = (35000, 12)
randomforest for original dataset
Train set
Random Forests roc-auc: 0.807612232524249
Test set
Random Forests roc-auc: 0.7868832427636059
randomforest for feature selected dataset
Train set
Random Forests roc-auc: 0.8225264717823371
Test set
Random Forests roc-auc: 0.7987537266404989
logistic regression for original dataset
Train set
Logistic Regression roc-auc: 0.6035712266362897
Test set
Logistic Regression roc-auc: 0.5884884604841877
logistic regression for feature selected dataset
Train set
Logistic Regression roc-auc: 0.7564491630078737
Test set
Logistic Regression roc-auc: 0.7494123564801862
