## Basic methods plus correlation pipeline with Feature-engine

We will apply basic methods to remove constant, quasi-constant and duplicated features, and then remove correlated features, all in a single step, using Feature-engine and the Scikit-learn Pipeline.

In [1]:
!pip install feature_engine -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/375.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/375.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.0/375.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropConstantFeatures,DropDuplicateFeatures,SmartCorrelatedSelection

In [3]:
# Location of the CSV that is loaded with pd.read_csv in the next cell (remote copy).
path = "https://frenzy86.s3.eu-west-2.amazonaws.com/python/data/dataset_1.csv"
# Local alternative: uncomment to read the file from disk instead of the URL.
# path = '../dataset_1.csv'

In [4]:
# Read the whole dataset (feature columns plus the target column) into a DataFrame.
data = pd.read_csv(path)
# Last expression of the cell: shows (number of rows, number of columns).
data.shape

(50000, 301)

In [5]:
# Separate the label column from the predictors.
TARGET = "target"
y = data[TARGET]
X = data.drop(columns=[TARGET])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=667
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [7]:
# Feature-selection stages; the Pipeline runs them in the order listed.
selection_steps = [
    # tol=0.998 extends the constant check to quasi-constant features
    ("constant", DropConstantFeatures(tol=0.998)),
    ("duplicated", DropDuplicateFeatures()),
    ("correlation", SmartCorrelatedSelection(selection_method="variance")),
]
pipe = Pipeline(steps=selection_steps)

# Learn which features to drop from the training split only.
pipe.fit(X_train)

In [8]:
# Apply the selection pipeline (fitted on the training split) to both splits.
# NOTE: X_train / X_test are rebound to the reduced frames, so this cell is
# meant to be run once after each fit of `pipe`.
X_train, X_test = pipe.transform(X_train), pipe.transform(X_test)

X_train.shape, X_test.shape

((35000, 78), (15000, 78))

In [9]:
def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training data and print its ROC-AUC.

    The model is fitted on (X_train, y_train). The ROC-AUC is then printed for
    the training split followed by the test split, computed from the second
    column of `predict_proba`. Nothing is returned.
    """
    model = LogisticRegression(random_state=44, max_iter=500)
    model.fit(X_train, y_train)

    # Same report for each split: heading line first, then the score line.
    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        positive_proba = model.predict_proba(features)[:, 1]
        print('Logistic Regression roc-auc: {}'.format(roc_auc_score(labels, positive_proba)))

In [10]:
# filter methods - correlation
scaler = StandardScaler().fit(X_train)

run_logistic(scaler.transform(X_train),
             scaler.transform(X_test),
                  y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7992898650938297
Test set
Logistic Regression roc-auc: 0.7848426398135036


### Production example: complete end-to-end pipeline
(Titanic example)

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection

# Categorical columns to one-hot encode; every other column is passed through as-is.
columns_to_ohe = ['Sex', 'Embarked']

# End-to-end pipeline: feature selection -> encoding -> scaling -> classifier.
complete_pipeline = Pipeline([
    # Feature selection steps
    ('constant_features_removal', DropConstantFeatures(tol=0.998)),
    ('duplicate_features_removal', DropDuplicateFeatures()),
    ('correlation_removal', SmartCorrelatedSelection(selection_method='variance')),
    # Preprocessing steps
    ('preprocessing', ColumnTransformer(
        [('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), columns_to_ohe)],
        remainder='passthrough',
        # Always return a dense array. With the default threshold (0.3) the
        # output becomes sparse when the encoded data is sparse enough, and the
        # mean-centering StandardScaler below rejects sparse input.
        sparse_threshold=0,
    )),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(
        bootstrap=True,
        max_depth=80,
        max_features=2,
        min_samples_leaf=3,
        min_samples_split=8,
        n_estimators=100,
        # Fixed seed so repeated fits give the same forest, in line with the
        # seeded split and logistic regression earlier in the notebook.
        random_state=44,
    )),
])

# Last expression of the cell: displays the pipeline.
complete_pipeline