## Feature selection - Dropping constant features
In this step we remove constant features, i.e. features that hold the same value in every row. Such features carry no information and are not useful for solving the problem statement.

In [2]:
## Import pandas to create the DataFrame
import pandas as pd

## Build a small toy DataFrame: columns A and B vary, C and D are constant
data = pd.DataFrame(
    {
        "A": [1, 2, 4, 1, 2, 4],
        "B": [4, 5, 6, 7, 8, 9],
        "C": [0] * 6,  # constant column (all zeros)
        "D": [1] * 6,  # constant column (all ones)
    }
)

In [3]:
# Display the toy DataFrame to inspect its four columns.
data

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1
5,4,9,0,1


## Variance Threshold

Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), not the desired output (Y), and can thus be used for unsupervised learning.

In [5]:
## With threshold=0 the selector removes only zero-variance features,
## i.e. columns that hold a single repeated value.
from sklearn.feature_selection import VarianceThreshold

var_thres = VarianceThreshold(threshold=0)
# fit() only learns the per-column variances from `data`; it does not modify it.
var_thres.fit(data)

VarianceThreshold(threshold=0)

In [11]:
# Boolean mask with one entry per column of `data`: True = feature kept, False = feature removed.
var_thres.get_support()

array([ True,  True, False, False])

In [9]:
# The constant columns are exactly the ones the fitted selector did NOT keep.
# Inverting the boolean support mask and indexing the column labels with it
# selects them positionally, which avoids a per-column membership test against
# the Index and stays correct even if column labels are duplicated.
constant_columns = data.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

2


In [10]:
# Show the names of the columns flagged as constant.
constant_columns

['C', 'D']

In [13]:
# Print each constant column name on its own line.
for feature in constant_columns:
    print(feature)
    

C
D


In [15]:
# drop() returns a new DataFrame without the constant columns; `data` itself is left unchanged.
data.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


In [1]:
### Let's practice on a bigger dataset

In [3]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold



In [6]:
# Read only the first 1000 rows of the CSV to keep the example fast.
df = pd.read_csv("dataset1.csv", nrows=1000)

In [7]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,117310.979016,0


In [8]:
# (number of rows, number of columns) of the loaded sample.
df.shape

(1000, 371)

In [9]:
# Separate the label column from the feature columns.
y = df["TARGET"]
x = df.drop(columns=["TARGET"])

In [13]:
from sklearn.model_selection import train_test_split

## Separate the dataset into train and test:
## 30% of the rows are held out for testing, and the fixed random_state
## makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["TARGET"]),
    df["TARGET"],
    test_size=0.3,
    random_state=0,
)

In [14]:
# Sanity check: row/column counts of the train and test feature frames.
(X_train.shape, X_test.shape)

((700, 370), (300, 370))

In [15]:
## With threshold=0 the selector removes only zero-variance features.
## It is fitted on the training split only, so the decision about which
## columns are constant is not influenced by the test rows.
from sklearn.feature_selection import VarianceThreshold

var_thres = VarianceThreshold(threshold=0)
var_thres.fit(X_train)

VarianceThreshold(threshold=0)

In [16]:
### Count the non-constant features: each True in the support mask is a kept column
var_thres.get_support().sum()

233

In [17]:
# Boolean mask with one entry per column of X_train: True = feature kept, False = feature removed.
var_thres.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
        True,  True,  True,  True, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [18]:
# Number of column labels selected by the support mask (the retained features).
X_train.columns[var_thres.get_support()].size

233

In [21]:
# The constant columns are exactly the ones the fitted selector did NOT keep.
# Inverting the boolean support mask and indexing the column labels with it
# selects them positionally in one step, instead of testing every column name
# for membership in the Index of kept columns; it also stays correct if
# column labels are duplicated.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

137


In [23]:
# Print each constant column name on its own line.
for column in constant_columns:
    print(column)

imp_op_var40_efect_ult1
ind_var2_0
ind_var2
ind_var6_0
ind_var6
ind_var13_medio_0
ind_var13_medio
ind_var14
ind_var18_0
ind_var18
ind_var27_0
ind_var28_0
ind_var28
ind_var27
ind_var29_0
ind_var29
ind_var32_cte
ind_var32_0
ind_var32
ind_var33_0
ind_var33
ind_var34_0
ind_var34
ind_var41
ind_var46_0
ind_var46
num_var6_0
num_var6
num_var13_medio_0
num_var13_medio
num_var14
num_var18_0
num_var18
num_var27_0
num_var28_0
num_var28
num_var27
num_var29_0
num_var29
num_var32_0
num_var32
num_var33_0
num_var33
num_var34_0
num_var34
num_var41
num_var46_0
num_var46
saldo_var6
saldo_var13_medio
saldo_var14
saldo_var18
saldo_var28
saldo_var27
saldo_var29
saldo_var32
saldo_var33
saldo_var34
saldo_var41
saldo_var46
delta_imp_amort_var18_1y3
delta_imp_amort_var34_1y3
delta_imp_aport_var33_1y3
delta_imp_reemb_var17_1y3
delta_imp_reemb_var33_1y3
delta_imp_trasp_var17_out_1y3
delta_imp_trasp_var33_in_1y3
delta_imp_trasp_var33_out_1y3
delta_num_aport_var33_1y3
delta_num_reemb_var17_1y3
delta_num_reemb_var33_

In [24]:
# DataFrame.drop returns a new frame rather than modifying X_train, so the
# result is stored instead of being discarded. The same column list (derived
# from the training split only) is applied to the test split so that both
# splits end up with identical feature columns.
X_train_reduced = X_train.drop(columns=constant_columns)
X_test_reduced = X_test.drop(columns=constant_columns)

# Display the reduced training frame.
X_train_reduced

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,saldo_medio_var17_hace2,saldo_medio_var17_ult1,saldo_medio_var17_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
105,195,2,28,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93861.78
68,144,2,23,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,165258.42
479,965,2,24,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61653.75
399,798,2,34,0.0,1200.45,1698.42,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,119707.20
434,864,2,28,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109607.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,1660,2,38,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45189.33
192,378,2,27,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145599.06
629,1254,2,31,0.0,36.90,36.90,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99613.92
559,1117,2,23,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48869.88
