In [1]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

In [10]:
# Variance thresholding for numerical features.
#
# Rationale: a feature with low variance carries less information than a
# feature with high variance, so features whose variance falls below the
# threshold are filtered out.
#
# Tip: this is ineffective on data that has already been standardized,
# because every standardized feature has variance 1 and the features can no
# longer be told apart.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Build the selector, fit it, and keep only the high-variance columns.
threshold = VarianceThreshold(threshold=0.5)
features_high_variance = threshold.fit(features).transform(features)

# Preview the first three rows of the reduced matrix.
features_high_variance[:3]

array([[ 5.1,  1.4,  0.2],
       [ 4.9,  1.4,  0.2],
       [ 4.7,  1.3,  0.2]])

In [12]:
# Show the per-feature variances computed by the selector. print() is
# required here: a notebook cell only auto-displays its last expression, and
# this statement is not the last one, so a bare expression would be discarded.
print(threshold.fit(features).variances_)

# After standardization every feature has unit variance, so a variance
# threshold can no longer distinguish between features.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Default threshold; fitted only to inspect the variances of the scaled data.
selector = VarianceThreshold()
selector.fit(features_std).variances_

array([ 1.,  1.,  1.,  1.])

In [13]:
# Variance thresholding for binary features.
#
# For a 0/1 (Bernoulli) feature the variance is
#     var(x) = p * (1 - p)
# where p is the proportion of observations equal to 1. Choosing p therefore
# removes features in which the vast majority of observations share the same
# value.
#
# Feature summary:
#   feature 0: 80% of observations are 0
#   feature 1: 80% of observations are 1
#   feature 2: 60% of observations are 0, 40% are 1
features = [[0, 1, 0],
            [0, 1, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0]]

# Drop every feature whose variance is below that of a 75% / 25% split.
threshold = VarianceThreshold(threshold=(.75 * (1 - .75)))
# Kept as the last expression of the cell so the filtered matrix is displayed.
threshold.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [11]:
# Handling highly correlated features: when two features are almost perfectly
# correlated, drop one of them.

import numpy as np
import pandas as pd

# Columns 0 and 1 are nearly identical; column 2 alternates between 1 and 0.
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])
dataframe = pd.DataFrame(features)

# Absolute pairwise correlation between the features.
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is considered once; every other entry becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Labels of columns whose correlation with an earlier column exceeds 0.95
# (NaN entries compare as False and are ignored).
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by column label. `to_drop` holds labels, not positions, so indexing
# `dataframe.columns` with it would only work when the two happen to coincide.
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [10]:
# Column labels that the correlation filter selected for removal.
to_drop

[1]

In [13]:
# Remove features that are irrelevant to the classification task.
#
# For categorical features, compute the chi-squared statistic between each
# feature and the target vector.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

iris = datasets.load_iris()
target = iris.target

# Cast the measurements to integers so they can be treated as categorical data.
features = iris.data.astype(int)

# Keep the 2 features with the largest chi-squared statistics.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit(features, target).transform(features)

# Report the number of columns before and after selection.
print('original:', features.shape[1])
print('reduce:', features_kbest.shape[1])

original: 4
reduce: 2


In [14]:
# For numerical features, compute the ANOVA F-value between each feature and
# the target vector, and keep the 2 features with the highest F-values.
#
# The selector is fitted on the raw measurements in `iris.data`. The
# `features` variable left over from the chi-squared cell has been truncated
# to integers, which discards the precision a test for numerical features
# should see.
fvalue_selector = SelectKBest(f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(iris.data, target)

# Report the number of columns before and after selection.
print('original:', iris.data.shape[1])
print('reduce:', features_kbest.shape[1])

original: 4
reduce: 2


In [15]:
# SelectPercentile keeps the top n% of features instead of a fixed number.
from sklearn.feature_selection import SelectPercentile

# Keep the top 75% of features ranked by ANOVA F-value.
# The selector is fitted on the raw measurements in `iris.data`. The
# `features` variable left over from the chi-squared cell has been truncated
# to integers, which is not what an F-test on numerical features should score.
fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(iris.data, target)

# Report the number of columns before and after selection.
print('original:', iris.data.shape[1])
print('reduce:', features_kbest.shape[1])

original: 4
reduce: 3


In [20]:
# Recursive feature elimination with cross-validation (RFECV).
import warnings
from sklearn.feature_selection import RFECV
from sklearn import linear_model

# Silence warnings raised from scipy whose message starts with "internal gelsd".
warnings.filterwarnings('ignore', message='^internal gelsd', module='scipy')

# Generate a regression problem: 10000 samples and 100 features, of which
# only 2 are informative. The seed is fixed for reproducibility.
features, target = datasets.make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Plain linear regression is the estimator whose features are pruned.
ols = linear_model.LinearRegression()

# Eliminate one feature per iteration, scoring each candidate subset by
# negative mean squared error, then reduce the matrix to the chosen features.
rfecv = RFECV(estimator=ols, step=1, scoring='neg_mean_squared_error')
rfecv.fit(features, target).transform(features)

array([[ 0.00850799, -0.7599597 ,  0.7031277 ],
       [-1.07500204, -0.03209062,  2.56148527],
       [ 1.37940721,  1.01718553, -1.77039484],
       ..., 
       [-0.80331656,  0.01438775, -1.60648007],
       [ 0.39508844,  0.6361969 , -1.34564911],
       [-0.55383035,  1.77013672,  0.82880112]])

In [21]:
# Number of features the fitted RFECV selector decided to keep.
rfecv.n_features_

3

In [22]:
# Boolean mask over the feature columns; True marks a feature that was kept.
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False, False], dtype=bool)

In [23]:
# Feature ranking from best to worst: 1 marks the best (selected) features,
# and larger numbers mark progressively worse ones.
rfecv.ranking_

array([ 2, 77, 69, 38, 39,  1, 26, 67,  7, 50, 70,  8, 89, 64,  4, 80, 43,
       83, 41, 56, 19, 23, 49,  1, 46, 20, 93, 53, 36, 25, 32, 27, 24, 75,
       15, 12, 34, 72, 14,  1, 66, 68, 97, 90, 18, 81, 60, 71, 54, 87, 92,
       82,  3, 40, 35, 16, 59, 47, 51, 79, 28, 84, 73, 86, 98,  9, 88, 63,
       65, 62, 42,  6, 17, 58, 10, 22, 78, 37, 91, 31, 44, 33,  5, 52, 48,
       95, 21, 45, 30, 29, 55, 11, 57, 61, 94, 85, 74, 13, 76, 96])