## Duplicated features with Feature-engine

In this notebook, we will identify and remove duplicated features with Feature-engine.

In [2]:
!pip install feature_engine -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/375.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/375.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.0/375.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures

In [4]:
# remote location of the dataset (CSV file)
path = "https://frenzy86.s3.eu-west-2.amazonaws.com/python/data/dataset_1.csv"
# local alternative (assumes a copy of the file exists one folder up):
# path = '../dataset_1.csv'

In [5]:
# load the CSV into a DataFrame and check its dimensions (rows, columns)
data = pd.read_csv(path)
data.shape

(50000, 301)

In [6]:
# preview the loaded DataFrame (the notebook renders a truncated view)
data

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_292,var_293,var_294,var_295,var_296,var_297,var_298,var_299,var_300,target
0,0,0,0.0,0.00,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0
1,0,0,0.0,3.00,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0
2,0,0,0.0,5.88,0.0,0,0,0,0,0,...,0.00,0,0,3,0,0,0,0.0,67772.7216,0
3,0,0,0.0,14.10,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0
4,0,0,0.0,5.76,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0,0.0,2.85,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0
49996,0,0,0.0,2.91,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0
49997,0,0,0.0,8.46,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0
49998,0,0,0.0,2.76,0.0,0,0,0,0,0,...,0.00,0,0,0,0,0,0,0.0,0.0000,0


**Important**

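In all feature selection procedures, it is good practice to select the features by examining only the training set. This prevents information from the test set from leaking into the selection, which would lead to overfitting and an overly optimistic evaluation.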

In [23]:
# column that holds the label we want to predict
TARGET = 'target'

# predictors: every column except the target
X = data.drop(columns=[TARGET])
# response: the target column on its own
y = data[TARGET]

# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=667,
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

## Remove constant and quasi-constant features

In [24]:
# Step 1: flag constant and quasi-constant features with Feature-engine.
# tol is the quasi-constant threshold; per the Feature-engine docs,
# variables=None evaluates every variable and missing_values='raise'
# errors out on missing data instead of ignoring it.
sel = DropConstantFeatures(
    variables=None,
    tol=0.998,
    missing_values='raise',
)

# learn which features to drop from the training set only
sel.fit(X_train)

In [25]:
# apply the fitted selector to both partitions; the names are rebound to the
# reduced frames, so re-create the split before running this cell again
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 158), (15000, 158))

## Remove duplicated features

In [26]:
# Step 2: look for duplicated features. This rebinds `sel`, replacing the
# constant-feature selector fitted earlier.
sel = DropDuplicateFeatures(missing_values='raise', variables=None)

# finding the duplicates might take a while
sel.fit(X_train)

In [27]:
# groups of duplicated features found in the training set;
# each set lists mutual duplicates and may hold more than two features
sel.duplicated_feature_sets_

[{'var_148', 'var_37'},
 {'var_199', 'var_84'},
 {'var_143', 'var_296'},
 {'var_177', 'var_198', 'var_250'},
 {'var_226', 'var_232'},
 {'var_229', 'var_269'}]

In [28]:
# features that the transformer will drop: every set above keeps a single
# member, so a set of three duplicates contributes two dropped features

sel.features_to_drop_

{'var_148', 'var_198', 'var_199', 'var_232', 'var_250', 'var_269', 'var_296'}

In [29]:
# count how many duplicated features will be dropped
len(sel.features_to_drop_)

7

In [30]:
# drop the duplicated features from both partitions; the names are rebound
# to the reduced frames
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 151), (15000, 151))

## Stack Feature selection in a Pipeline

We can perform both steps together by setting up the transformers within a pipeline.

In [31]:
# start again from a fresh split: the step-by-step cells above rebound
# X_train and X_test to their reduced versions
TARGET = 'target'

y = data[TARGET]                 # response
X = data.drop(columns=TARGET)    # predictors

# same proportions and seed as the first split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=667
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [33]:
# chain both selectors so they are fitted and applied as a single unit.
# The arguments are spelled out to mirror the step-by-step cells above:
# Feature-engine's DropDuplicateFeatures otherwise defaults to
# missing_values='ignore', which would make the two approaches differ
# on data containing missing values.
pipe = Pipeline([
    ('constant', DropConstantFeatures(tol=0.998, variables=None, missing_values='raise')),
    ('duplicated', DropDuplicateFeatures(variables=None, missing_values='raise')),
])

# fit on the training set only
pipe.fit(X_train)

In [34]:
# run both partitions through the fitted pipeline; the names are rebound
# to the reduced frames
X_train, X_test = pipe.transform(X_train), pipe.transform(X_test)

X_train.shape, X_test.shape

((35000, 151), (15000, 151))

In [35]:
# individual steps stay accessible by name after fitting; here we count the
# constant / quasi-constant features flagged by the 'constant' step
len(pipe.named_steps['constant'].features_to_drop_)

142

In [36]:
# duplicated features flagged by the 'duplicated' step of the pipeline
pipe.named_steps['duplicated'].features_to_drop_

{'var_148', 'var_198', 'var_199', 'var_232', 'var_250', 'var_269', 'var_296'}