### Feature Selection - Dropping Constant Features

In this step we remove constant features, i.e. columns that hold the same value in every row. Because such features never vary, they carry no information that helps solve the problem statement.

In [4]:
import pandas as pd

# Toy frame: A and B vary from row to row, while C and D repeat one value.
data = pd.DataFrame(
    {
        "A": [0, 1, 2, 3],
        "B": [1, 2, 2, 3],
        "C": [0] * 4,
        "D": [10] * 4,
    }
)
data

Unnamed: 0,A,B,C,D
0,0,1,0,10
1,1,2,0,10
2,2,2,0,10
3,3,3,0,10


### Variance Threshold

Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), 
not the desired outputs (y), and can thus be used for unsupervised learning.

In [9]:
from sklearn.feature_selection import VarianceThreshold

# Build the selector with an explicit variance threshold of 0.
var_th = VarianceThreshold(threshold=0)
# Fit on the toy frame only (no target is passed); the call is the cell's
# last expression, so its return value is what the cell displays.
var_th.fit(data)

VarianceThreshold(threshold=0)

In [13]:
# Number of features the fitted selector keeps: count the True entries in its mask.
int(var_th.get_support().sum())

2

In [22]:
# Columns rejected by the fitted selector (the constant features).
# The support mask is fetched once and inverted, instead of re-querying the
# selector and rebuilding the kept-column index for every column. Selection is
# positional, so it also stays correct if column labels are duplicated.
constant_columns = data.columns[~var_th.get_support()].tolist()
constant_columns

['C', 'D']

In [25]:
# New frame without the constant columns; `data` itself is left untouched.
final_data = data.drop(columns=constant_columns)
final_data

Unnamed: 0,A,B
0,0,1
1,1,2
2,2,2
3,3,3


### Santander Dataset

In [29]:
# Read only the first 1000 rows of the training file.
sd_df = pd.read_csv(
    filepath_or_buffer='train.csv',
    nrows=1000,
)
sd_df.head(5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,117310.979016,0


In [33]:
# Feature matrix: every column except the label column.
X = sd_df.drop(columns=['TARGET'])

In [34]:
# Label vector taken from the TARGET column.
y = sd_df.loc[:, 'TARGET']

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [37]:
# Sanity check: shapes of the training features and training labels.
tuple(split.shape for split in (X_train, y_train))

((800, 370), (800,))

### Applying the Variance threshold

In [38]:
# Same zero-threshold selector as in the toy example, now for the Santander data.
var_thres = VarianceThreshold(threshold=0)
# Fit on the training split only, so the test split plays no part in deciding
# which columns are kept. The call's return value is the cell's displayed output.
var_thres.fit(X_train)

VarianceThreshold(threshold=0)

In [44]:
# Boolean mask over the columns of X_train: True = kept, False = rejected.
var_thres.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
        True,  True,  True,  True, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,