In [110]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns

## Loading the Santander Dataset

In [122]:
# Read the training split, then separate the label column from the features.
df_train = pd.read_csv('dataset/train.csv')
# pop() returns the TARGET Series and removes that column from df_train in one step.
y_train = df_train.pop('TARGET')

# The test split is not loaded here; uncomment when it is needed.
#df_test = pd.read_csv('dataset/test.csv')

In [123]:
# Report (n_rows, n_columns) of the feature frame; TARGET was split off when the data was loaded.
print(f'data shape {df_train.shape}')

data shape (76020, 370)


## Feature Elimination Techniques
We are going to cover the following methods:
1. Removing Features with Low Variance
2. Univariate Feature Selection
3. Recursive Feature Elimination
4. Feature Selection Using SelectFromModel
5. Feature Selection Using a Pipeline

### 1) Removing Features with Low Variance
VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.

As an example, suppose that we have a dataset with boolean features, and we want to remove all features that are either one or zero (on or off) in more than 80% of the samples. Boolean features are Bernoulli random variables, and the variance of such variables is given by

$$\mathrm{Var}[X] = p(1 - p)$$

so we can select using the threshold `.8 * (1 - .8)`:

    1.1 VarianceThreshold

In [126]:
from sklearn.feature_selection import VarianceThreshold

# Toy example: six samples, three boolean features.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]
# Cut-off is the variance of a Bernoulli feature with p = 0.8, i.e. p * (1 - p);
# features whose variance does not exceed it are removed.
sel = VarianceThreshold(threshold=.8 * (1 - .8))
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

In [133]:
# Show the per-feature variances measured by the fitted selector next to the cut-off it applies.
print(f'Variance value per columng {sel.variances_}, threshold {sel.threshold : 2.2f}')

Variance value per columng [0.13888889 0.22222222 0.25      ], threshold  0.16


In [140]:
# Filter the training features: keep only columns whose variance exceeds 0.01.
sel = VarianceThreshold(threshold=0.01)
# fit_transform hands back the selected values without their labels, so attach the
# row index and the names of the surviving columns right where the frame is built
# instead of leaving `result` with positional integer column labels.
result = pd.DataFrame(
    sel.fit_transform(df_train),
    index=df_train.index,
    columns=df_train.columns[sel.get_support()],
)

In [141]:
# Variance of every column of df_train as measured during fit (same order as df_train.columns).
sel.variances_

array([1.91683370e+09, 1.52359114e+09, 1.67868316e+02, 2.60740688e+06,
       1.15133719e+05, 2.98402939e+05, 8.67787936e+03, 2.36347744e+04,
       9.36645365e+02, 1.33321911e+03, 9.07591137e+03, 1.02146342e+05,
       2.62299112e+05, 2.82911673e+05, 9.02652290e+05, 4.86796463e+05,
       2.86728365e+05, 9.09300231e+05, 5.08030455e+05, 2.16586072e+05,
       1.13262366e-02, 3.74801394e-03, 0.00000000e+00, 0.00000000e+00,
       4.02138284e-02, 2.23182814e-01, 1.05224390e-04, 2.63081739e-05,
       3.17554285e-02, 2.77799069e-02, 6.29625242e-02, 4.33949526e-02,
       4.95194074e-02, 4.10925634e-02, 3.97556748e-02, 1.00649809e-02,
       9.89742172e-03, 2.63081739e-05, 2.63081739e-05, 4.82688032e-02,
       2.30922691e-02, 5.27313341e-03, 1.79890956e-03, 1.44489386e-03,
       2.63081739e-05, 2.63081739e-05, 4.17865551e-03, 3.61744209e-03,
       2.68938681e-03, 4.05751756e-02, 3.64495116e-02, 2.57288561e-02,
       2.40312096e-02, 2.67990643e-02, 2.40312096e-02, 2.30797367e-02,
      

In [142]:
# Positions of the columns that passed the variance filter. get_support() is the
# selector's public API for this; with indices=True it returns a flat integer
# array that can index df_train.columns directly.
remain_features_id = sel.get_support(indices=True)

In [143]:
# Label the columns of `result` with the names of the features that survived the variance filter.
result.columns = df_train.columns[remain_features_id]

In [144]:
# Inspect the reduced feature frame (pandas abbreviates the rendering for large frames).
result

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000
1,3.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000
2,4.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000
3,8.0,2.0,37.0,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000
4,10.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,151829.0,2.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60926.490000
76016,151830.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118634.520000
76017,151835.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74028.150000
76018,151836.0,2.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84278.160000


### Univariate Feature Selection
    2.1. GenericUnivariateSelect
    2.2. SelectKBest
    2.3. SelectPercentile
    2.4. SelectFpr
    2.5. SelectFdr
    2.6. SelectFwe
    2.7. chi2
    2.8. f_classif
    2.9. f_regression
    2.10. mutual_info_classif
    2.11. mutual_info_regression


2.2 SelectKBest using chi-squared ($\chi^2$)

In [119]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

# Load the iris features and class labels as plain arrays.
X, y = load_iris(return_X_y=True)

# Keep the two features with the highest chi-squared score against the class label
# and display the shape of the reduced matrix.
kbest_chi2 = SelectKBest(score_func=chi2, k=2)
X_new = kbest_chi2.fit_transform(X, y)
X_new.shape

(150, 2)

In [120]:
# k='all' keeps every column, so nothing is filtered out here; no score function
# is passed, so SelectKBest falls back to its default scorer.
select_all = SelectKBest(k='all')
X_new = select_all.fit_transform(df_train, y_train)

In [121]:
# Shape of the SelectKBest output computed in the previous cell.
X_new.shape

(76020, 370)

### Recursive Feature Elimination
    3.1. RFECV


### Feature Selection using SelectFromModel
    4.1. SelectFromModel
        4.1.1 L1-based feature selection
        4.1.2 Tree-based feature selection

### Feature Selection Using Pipeline
    * example