## Feature Selection (Statistics)

http://d0evi1.com/sklearn/feature_selection/
    
http://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
### Load the UCI wine dataset

# The raw file ships without a header row, so the column names are
# attached by hand after reading it.
wine_url = ('https://archive.ics.uci.edu/'
            'ml/machine-learning-databases/wine/wine.data')
wine_columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium', 'Total phenols',
    'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
    'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
    'Proline',
]

df_wine = pd.read_csv(wine_url, header=None)
df_wine.columns = wine_columns

print('Class labels', np.unique(df_wine['Class label']))

### Split into train and test sets

# Target stays a one-column DataFrame; features are every other column.
y = df_wine[['Class label']]
X = df_wine.drop(columns=['Class label'])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
X_train.head()


Class labels [1 2 3]


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
161,13.69,3.26,2.54,20.0,107,1.83,0.56,0.5,0.8,5.88,0.96,1.82,680
92,12.69,1.53,2.26,20.7,80,1.38,1.46,0.58,1.62,3.05,0.96,2.06,495
94,11.62,1.99,2.28,18.0,98,3.02,2.26,0.17,1.35,3.25,1.16,2.96,345
174,13.4,3.91,2.48,23.0,102,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750
24,13.5,1.81,2.61,20.0,96,2.53,2.61,0.28,1.66,3.52,1.12,3.82,845


## Remove features whose values are either one or zero in more than 80% of the samples

- VarianceThreshold(threshold = (p*(1-p)))  >> p = fraction of samples sharing the same value (e.g. 0.8)

In [2]:
from sklearn.feature_selection import VarianceThreshold
# Variance cut-off p * (1 - p) evaluated at p = 0.8; columns whose training
# variance does not exceed it are dropped.
variance_cutoff = 0.8*(1-.8)

print("before = ",X_train.shape)

# Learn which columns to keep from the training split only, then apply the
# same column mask to both splits.
sel = VarianceThreshold(threshold=variance_cutoff)
sel.fit(X_train)
X_lowVal = sel.transform(X_train)
print("after =",X_lowVal.shape)

X_lowTest = sel.transform(X_test)

before =  (142, 13)
after = (142, 10)


# Univariate Feature Selection


### Regression: f_regression
### Classify: chi2 or f_classif

- Use Statistical tests
- SelectKBest(score_func, k = top k)
- SelectPercentile
#### Score function:
http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest


In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Univariate selection: score each feature against the class label with the
# chi-squared test (which requires non-negative feature values) and keep the
# 8 best-scoring columns.
k_best = SelectKBest(chi2,k = 8)

# Fit on the training split only and KEEP the reduced training matrix, so a
# model can later be trained on the same 8 columns the test split is reduced to.
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects.
X_train_new = k_best.fit_transform(X_train, np.ravel(y_train))
X_test_new = k_best.transform(X_test)

# Sanity check: both splits must expose the same number of selected features.
assert X_train_new.shape[1] == X_test_new.shape[1]

X_test_new.shape

(36, 8)

##  RFE

- Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. 

### RFE(estimator,n_features_to_select = None, step = 1)

- n_features_to_select: number of features to select. (number to keep)
- step: number of features to remove at each iteration.


In [6]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Synthetic regression problem with 50 samples and 10 features.
# NOTE: this rebinds X and y, which previously held the wine features/labels.
X,y = make_friedman1(n_samples = 50,n_features = 10,random_state = 0)

# RFE ranks features by the estimator's weights; a linear-kernel SVR
# exposes them as coef_.
estimator = SVR(kernel = 'linear')

# Keep 8 features, eliminating one per iteration. n_features_to_select is
# passed by keyword because current scikit-learn releases reject it as a
# positional argument.
selector = RFE(estimator, n_features_to_select=8, step=1)
selector = selector.fit(X,y)

# Boolean mask over the 10 input features: True marks a selected feature.
selector.support_

#selector.ranking_

array([ True,  True,  True,  True,  True, False,  True,  True,  True,
       False])