In [2]:
import numpy as np
import warnings

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# Toy data set shared by the filter/wrapper examples below:
# X holds 3 samples with 4 features each (float32), Y holds one target per sample.
X = np.asarray(
    [[0.0, 2.0, 0.0, 3.0],
     [0.0, 1.0, 4.0, 3.0],
     [0.1, 1.0, 1.0, 3.0]],
    dtype="float32",
)
Y = np.asarray([1, 2, 1])

In [4]:
# Variance-threshold filter: drop features whose variance does not exceed 0.1.
variance = VarianceThreshold(threshold=0.1)
# Show the (unfitted) selector's parameters, then a divider line.
print(variance, '-----------------', sep='\n')
# fit() returns the selector itself, so the transform is chained onto it.
print(variance.fit(X).transform(X))

VarianceThreshold(threshold=0.1)
-----------------
[[2. 0.]
 [1. 4.]
 [1. 1.]]


In [5]:
# Correlation-based filter: score every feature with the univariate
# F-regression test and keep the 2 best-scoring ones.
# NOTE: the last column of X is constant, so its score is undefined; the
# recorded run shows it as nan together with NumPy RuntimeWarnings.
sk1 = SelectKBest(score_func=f_regression, k=2).fit(X, Y)
# Selector parameters, per-feature scores and the reduced matrix,
# each separated by a divider line.
print(sk1, '------------', sk1.scores_, '------------', sk1.transform(X), sep='\n')

SelectKBest(k=2, score_func=<function f_regression at 0x0000025012893488>)
------------
[ 0.33333333  0.33333333 16.33333333         nan]
------------
[[0.  0. ]
 [0.  4. ]
 [0.1 1. ]]


  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [6]:
# Chi-squared filter.
# chi2 requires every feature value to be non-negative.
sk2 = SelectKBest(score_func=chi2, k=2).fit(X, Y)
# Selector parameters, per-feature chi2 scores, then the 2 retained columns.
print(sk2, sk2.scores_, sk2.transform(X), sep='\n')

SelectKBest(k=2, score_func=<function chi2 at 0x0000025012893400>)
[0.05  0.125 4.9   0.   ]
[[2. 0.]
 [1. 4.]
 [1. 1.]]


In [7]:
# Wrapper method: recursive feature elimination (RFE).
# A linear-kernel SVR supplies the per-feature coefficients RFE ranks by.
estimator = SVR(kernel='linear')
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept it only as a keyword argument, and the keyword form also works on
# older releases. step=1 eliminates one feature per iteration.
selector = RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(X, Y)
print(selector)
# Boolean mask over the input features: True marks a retained feature.
print(selector.support_)
# Number of features that were retained.
print(selector.n_features_)
# Elimination ranking: 1 marks a retained feature, larger values were dropped earlier.
print(selector.ranking_)
print(selector.transform(X))

RFE(estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
  n_features_to_select=2, step=1, verbose=0)
[False  True  True False]
2
[2 1 1 3]
[[2. 0.]
 [1. 4.]
 [1. 1.]]


In [8]:
# Penalty-based (embedded) feature selection: an L1-regularised logistic
# regression shrinks uninformative coefficients towards zero, and
# SelectFromModel keeps only the features with non-negligible coefficients.
X2 = np.array([
    [5.1, 3.5, 1.4, 0.2],
    [4.9, 3. , 1.4, 0.2],
    [-6.2, 0.4, 5.4, 2.3],
    [-5.9, 0. , 5.1, 1.8]],
    dtype=np.float64)
Y2 = np.array([0, 0, 2, 2])
# The solver is pinned to liblinear because it supports the L1 penalty.
# Relying on the default breaks on scikit-learn releases whose default solver
# is lbfgs, which rejects penalty='l1'.
estimator = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
sfm = SelectFromModel(estimator)
sfm.fit(X2, Y2)
print(sfm)
print(sfm.transform(X2))

SelectFromModel(estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=None)
[[ 5.1]
 [ 4.9]
 [-6.2]
 [-5.9]]


In [9]:
# Tree-based (embedded) feature selection: SelectFromModel keeps the features
# whose importance in the fitted gradient-boosting model is high enough.
# Reuses X2 / Y2 as defined in the penalty-based example cell.
# The seed is fixed because the trees permute the features at every split;
# on this tiny data set several features split equally well, so without a
# seed the selected columns can differ from run to run.
estimator = GradientBoostingClassifier(random_state=0)
sfm = SelectFromModel(estimator)
sfm.fit(X2, Y2)
print(sfm)
print(sfm.transform(X2))

SelectFromModel(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
        norm_order=1, prefit=False, threshold=None)
[[ 5.1  1.4]
 [ 4.9  1.4]
 [-6.2  5.4]
 [-5.9  5.1]]


In [10]:
from sklearn.decomposition import PCA

# NOTE: this rebinds X2 to a 4-sample x 6-feature matrix; the feature-selection
# cells that use the earlier 4-column X2 will see this one if re-run afterwards.
X2 = np.array([
    [ 5.1,  3.5,  1.4,  0.2, 1, 23],
    [ 4.9,  3. ,  1.4,  0.2, 2.3, 2.1],
    [ -6.2,  0.4,  5.4,  2.3, 2, 23],
    [ -5.9,  0. ,  5.1,  1.8, 2, 3]],
    dtype=np.float64)
# n_components: the target dimensionality. It must not exceed
# min(n_samples, n_features). Older scikit-learn releases silently clipped a
# larger value to that bound, but current releases raise ValueError, so the
# bound is computed from the data (4 here) instead of hard-coding 6.
pca = PCA(n_components=min(X2.shape))
pca.fit(X2)
# Per-feature mean that PCA subtracts before projecting.
print(pca.mean_)
# Principal axes, one row per retained component.
print(pca.components_)
# The samples projected onto the principal axes.
print(pca.transform(X2))

[-0.525  1.725  3.325  1.125  1.825 12.775]
[[ 0.02038178 -0.01698103 -0.01350052 -0.0149724   0.03184796 -0.99893718]
 [ 0.9024592   0.25030511 -0.31422084 -0.15092666 -0.03185873  0.01965141]
 [-0.08872116 -0.06952185 -0.06858116 -0.3074396  -0.94204108 -0.02512755]
 [ 0.08903791  0.59899454  0.78582815 -0.09643168 -0.07779466 -0.02002095]]
[[-1.01160631e+01  6.49232600e+00  3.14197238e-01  2.22044605e-16]
 [ 1.08075405e+01  5.73455069e+00 -3.32785235e-01  2.22044605e-16]
 [-1.03473322e+01 -6.08709685e+00 -3.29724759e-01  4.44089210e-16]
 [ 9.65585479e+00 -6.13977984e+00  3.48312756e-01 -1.11022302e-16]]


In [11]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Six samples with four integer features each. Note that this rebinds X,
# replacing the float32 matrix used by the feature-selection examples.
X = np.asarray([
    [-1, -1, 3, 1],
    [-2, -1, 2, 4],
    [-3, -2, 4, 5],
    [1, 1, 5, 4],
    [2, 1, 6, -5],
    [3, 2, 1, 5],
])
# Class label per sample; three distinct classes (0, 1, 2).
y = np.asarray([1, 1, 2, 2, 0, 1])
# n_components: the target dimensionality. It depends on the number of distinct
# labels in y and may not exceed n_classes - 1 (2 for the three classes here).
clf = LinearDiscriminantAnalysis(n_components=2).fit(X, y)
print(clf.transform(X))

[[-3.2688434  -0.38911349]
 [-1.25507558 -1.78088569]
 [ 5.26064254 -0.49688862]
 [ 6.34385833  1.16134391]
 [-4.05800618  3.58297801]
 [-3.02257571 -2.07743411]]


