#### Univariate Feature Selection

In [1]:
# Imports for the feature-selection examples below: univariate chi-squared tests, RFE, and decision-tree importances
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import  DecisionTreeClassifier

In [5]:
# Read the Pima Indians diabetes CSV. Column names are supplied explicitly
# through `names`; 'class' is the last of the nine columns.
filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
df = pd.read_csv(filename, names=names)
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [26]:
# Separate predictors from the target: X holds every column except 'class'
# (the first eight columns), Y is the 'class' column itself.
X = df.drop(columns='class')
Y = df['class']

In [27]:
# Univariate selector: score every feature against the target with the
# chi-squared statistic and keep the 4 highest-scoring features.
test = SelectKBest(chi2, k=4)

In [28]:
# Compute the chi-squared scores on the full data set. fit() returns the
# selector itself, so `fit` refers to the same object as `test`.
fit = test.fit(X, y=Y)
fit

SelectKBest(k=4, score_func=<function chi2 at 0x000001BACDD03670>)

In [29]:
# Print the chi-squared score of each feature (same order as X's columns),
# limiting NumPy's printed precision to three decimals.
np.set_printoptions(precision=3)
print(test.scores_)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [30]:
# Reduce X to the k=4 columns chosen by the fitted selector
features = test.transform(X)
features

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

In [31]:
# Other score functions that can be passed to SelectKBest:
#   regression:     f_regression, mutual_info_regression
#   classification: chi2, f_classif, mutual_info_classif

#### Recursive Feature Elimination

In [32]:
# Recursive Feature Elimination (RFE): repeatedly fit the logistic regression
# and discard the weakest feature until only 3 remain.
# max_iter is raised above the default, presumably so the solver converges
# on this unscaled data.
model = LogisticRegression(max_iter=400)
# n_features_to_select must be given by keyword: passing it positionally was
# deprecated in scikit-learn 0.24 and raises a TypeError from 1.0 onwards.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)



In [33]:
# Number of features kept by RFE (`rfe` is the object `fit` refers to)
rfe.n_features_

3

In [34]:
# Boolean mask over X's columns: True marks a feature that RFE kept
rfe.support_

array([ True, False, False, False, False,  True,  True, False])

In [35]:
# Rank of each feature in X's column order: 1 means selected, larger
# numbers mean the feature was eliminated earlier
rfe.ranking_

array([1, 2, 4, 6, 5, 1, 1, 3])

#### Feature Importance using Decision Tree

In [36]:
# Feature importance from a single decision tree (not an Extra Trees ensemble).
# random_state is fixed because the tree permutes features at each split, so
# without a seed the importances can differ from one run to the next.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, Y)

DecisionTreeClassifier()

In [37]:
# Importance of each feature as learned by the fitted tree, in the same
# order as X's columns
model.feature_importances_

array([0.062, 0.317, 0.08 , 0.023, 0.032, 0.252, 0.124, 0.111])