In [20]:
from sklearn.datasets import load_iris
# Load the iris dataset once; the feature-selection cells below read
# iris.data (feature matrix) and iris.target (class labels).
iris = load_iris()

## 方差选择法

使用方差选择法，先要计算各个特征的方差，然后根据阈值，选择方差大于阈值的特征。使用feature_selection库的VarianceThreshold类来选择特征的代码如下：

In [23]:
from sklearn.feature_selection import VarianceThreshold

# Variance-threshold selection: `threshold` is the variance cut-off used to
# decide which feature columns are kept. The transformed data (selected
# columns only) is the cell's displayed result.
variance_selector = VarianceThreshold(threshold=0.2)
variance_selector.fit_transform(iris.data)

array([[ 5.1,  1.4,  0.2],
       [ 4.9,  1.4,  0.2],
       [ 4.7,  1.3,  0.2],
       [ 4.6,  1.5,  0.2],
       [ 5. ,  1.4,  0.2],
       [ 5.4,  1.7,  0.4],
       [ 4.6,  1.4,  0.3],
       [ 5. ,  1.5,  0.2],
       [ 4.4,  1.4,  0.2],
       [ 4.9,  1.5,  0.1],
       [ 5.4,  1.5,  0.2],
       [ 4.8,  1.6,  0.2],
       [ 4.8,  1.4,  0.1],
       [ 4.3,  1.1,  0.1],
       [ 5.8,  1.2,  0.2],
       [ 5.7,  1.5,  0.4],
       [ 5.4,  1.3,  0.4],
       [ 5.1,  1.4,  0.3],
       [ 5.7,  1.7,  0.3],
       [ 5.1,  1.5,  0.3],
       [ 5.4,  1.7,  0.2],
       [ 5.1,  1.5,  0.4],
       [ 4.6,  1. ,  0.2],
       [ 5.1,  1.7,  0.5],
       [ 4.8,  1.9,  0.2],
       [ 5. ,  1.6,  0.2],
       [ 5. ,  1.6,  0.4],
       [ 5.2,  1.5,  0.2],
       [ 5.2,  1.4,  0.2],
       [ 4.7,  1.6,  0.2],
       [ 4.8,  1.6,  0.2],
       [ 5.4,  1.5,  0.4],
       [ 5.2,  1.5,  0.1],
       [ 5.5,  1.4,  0.2],
       [ 4.9,  1.5,  0.1],
       [ 5. ,  1.2,  0.2],
       [ 5.5,  1.3,  0.2],
 

## 卡方检验

经典的卡方检验是检验定性自变量对定性因变量的相关性。

In [24]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score every feature against the class labels with the chi-squared statistic
# and keep the k=3 best-scoring columns; the reduced matrix is the cell output.
chi2_selector = SelectKBest(chi2, k=3)
chi2_selector.fit_transform(iris.data, iris.target)

array([[ 5.1,  1.4,  0.2],
       [ 4.9,  1.4,  0.2],
       [ 4.7,  1.3,  0.2],
       [ 4.6,  1.5,  0.2],
       [ 5. ,  1.4,  0.2],
       [ 5.4,  1.7,  0.4],
       [ 4.6,  1.4,  0.3],
       [ 5. ,  1.5,  0.2],
       [ 4.4,  1.4,  0.2],
       [ 4.9,  1.5,  0.1],
       [ 5.4,  1.5,  0.2],
       [ 4.8,  1.6,  0.2],
       [ 4.8,  1.4,  0.1],
       [ 4.3,  1.1,  0.1],
       [ 5.8,  1.2,  0.2],
       [ 5.7,  1.5,  0.4],
       [ 5.4,  1.3,  0.4],
       [ 5.1,  1.4,  0.3],
       [ 5.7,  1.7,  0.3],
       [ 5.1,  1.5,  0.3],
       [ 5.4,  1.7,  0.2],
       [ 5.1,  1.5,  0.4],
       [ 4.6,  1. ,  0.2],
       [ 5.1,  1.7,  0.5],
       [ 4.8,  1.9,  0.2],
       [ 5. ,  1.6,  0.2],
       [ 5. ,  1.6,  0.4],
       [ 5.2,  1.5,  0.2],
       [ 5.2,  1.4,  0.2],
       [ 4.7,  1.6,  0.2],
       [ 4.8,  1.6,  0.2],
       [ 5.4,  1.5,  0.4],
       [ 5.2,  1.5,  0.1],
       [ 5.5,  1.4,  0.2],
       [ 4.9,  1.5,  0.1],
       [ 5. ,  1.2,  0.2],
       [ 5.5,  1.3,  0.2],
 

## 递归特征消除

In [1]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Base classifier used by RFE to evaluate subsets of attributes.
model = LogisticRegression()
# Build the RFE model and ask it to keep 3 attributes. n_features_to_select
# is passed by keyword because current scikit-learn releases no longer accept
# it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
# Fit on the iris dataset loaded in the first cell of the notebook.
rfe = rfe.fit(iris.data, iris.target)
# Summarise the selection: support_ is the boolean mask of kept attributes,
# ranking_ assigns rank 1 to every kept attribute.
print(rfe.support_)
print(rfe.ranking_)

[False  True  True  True]
[2 1 1 1]


## 特征重要性

In [11]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
# Fit an Extra Trees model to the iris dataset loaded in the first cell.
# random_state is fixed so the importances printed below are reproducible
# across kernel restarts (the trees are built with randomised splits).
model = ExtraTreesClassifier(random_state=0)
model.fit(iris.data, iris.target)
# Display the relative importance of each attribute (one value per feature).
print(model.feature_importances_)

[ 0.17237871  0.05230192  0.46008119  0.31523818]


## 基于惩罚项的特征选择法

使用带惩罚项的基模型，除了筛选出特征外，同时也进行了降维。使用feature_selection库的SelectFromModel类结合带L1惩罚项的逻辑回归模型来选择特征的代码如下：

In [18]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Feature selection with an L1-penalised logistic regression as the base model.
# The solver is set explicitly: the default lbfgs solver does not support the
# L1 penalty, whereas liblinear does.
SelectFromModel(
    LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
).fit_transform(iris.data, iris.target)

array([[ 5.1,  3.5,  1.4],
       [ 4.9,  3. ,  1.4],
       [ 4.7,  3.2,  1.3],
       [ 4.6,  3.1,  1.5],
       [ 5. ,  3.6,  1.4],
       [ 5.4,  3.9,  1.7],
       [ 4.6,  3.4,  1.4],
       [ 5. ,  3.4,  1.5],
       [ 4.4,  2.9,  1.4],
       [ 4.9,  3.1,  1.5],
       [ 5.4,  3.7,  1.5],
       [ 4.8,  3.4,  1.6],
       [ 4.8,  3. ,  1.4],
       [ 4.3,  3. ,  1.1],
       [ 5.8,  4. ,  1.2],
       [ 5.7,  4.4,  1.5],
       [ 5.4,  3.9,  1.3],
       [ 5.1,  3.5,  1.4],
       [ 5.7,  3.8,  1.7],
       [ 5.1,  3.8,  1.5],
       [ 5.4,  3.4,  1.7],
       [ 5.1,  3.7,  1.5],
       [ 4.6,  3.6,  1. ],
       [ 5.1,  3.3,  1.7],
       [ 4.8,  3.4,  1.9],
       [ 5. ,  3. ,  1.6],
       [ 5. ,  3.4,  1.6],
       [ 5.2,  3.5,  1.5],
       [ 5.2,  3.4,  1.4],
       [ 4.7,  3.2,  1.6],
       [ 4.8,  3.1,  1.6],
       [ 5.4,  3.4,  1.5],
       [ 5.2,  4.1,  1.5],
       [ 5.5,  4.2,  1.4],
       [ 4.9,  3.1,  1.5],
       [ 5. ,  3.2,  1.2],
       [ 5.5,  3.5,  1.3],
 

L1惩罚项降维的原理在于保留多个对目标值具有同等相关性的特征中的一个，所以没选到的特征不代表不重要。因此可结合L2惩罚项来优化。

## 基于树模型的特征选择法

In [19]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

# Feature selection with a gradient-boosted tree ensemble (GBDT) as the base
# model; the selected columns are the cell's displayed result.
gbdt_estimator = GradientBoostingClassifier()
gbdt_selector = SelectFromModel(gbdt_estimator)
gbdt_selector.fit_transform(iris.data, iris.target)

array([[ 1.4,  0.2],
       [ 1.4,  0.2],
       [ 1.3,  0.2],
       [ 1.5,  0.2],
       [ 1.4,  0.2],
       [ 1.7,  0.4],
       [ 1.4,  0.3],
       [ 1.5,  0.2],
       [ 1.4,  0.2],
       [ 1.5,  0.1],
       [ 1.5,  0.2],
       [ 1.6,  0.2],
       [ 1.4,  0.1],
       [ 1.1,  0.1],
       [ 1.2,  0.2],
       [ 1.5,  0.4],
       [ 1.3,  0.4],
       [ 1.4,  0.3],
       [ 1.7,  0.3],
       [ 1.5,  0.3],
       [ 1.7,  0.2],
       [ 1.5,  0.4],
       [ 1. ,  0.2],
       [ 1.7,  0.5],
       [ 1.9,  0.2],
       [ 1.6,  0.2],
       [ 1.6,  0.4],
       [ 1.5,  0.2],
       [ 1.4,  0.2],
       [ 1.6,  0.2],
       [ 1.6,  0.2],
       [ 1.5,  0.4],
       [ 1.5,  0.1],
       [ 1.4,  0.2],
       [ 1.5,  0.1],
       [ 1.2,  0.2],
       [ 1.3,  0.2],
       [ 1.5,  0.1],
       [ 1.3,  0.2],
       [ 1.5,  0.2],
       [ 1.3,  0.3],
       [ 1.3,  0.3],
       [ 1.3,  0.2],
       [ 1.6,  0.6],
       [ 1.9,  0.4],
       [ 1.4,  0.3],
       [ 1.6,  0.2],
       [ 1.4,

http://adataanalyst.com/machine-learning/comprehensive-guide-feature-engineering/