In [1]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# Feature matrix (four numeric measurements per sample) and class labels (three species).
X, y = iris.data, iris.target

# Show the feature names, the first three samples and the class names, one item per line.
print(iris.feature_names, X[:3], iris.target_names, sep='\n')

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]
['setosa' 'versicolor' 'virginica']


### 移除變異度過低的特徵

In [2]:
from sklearn.feature_selection import VarianceThreshold

# Build a selector that keeps only the features whose variance exceeds the 0.5 threshold,
# fit it on X and apply it in a single step, then preview the first three reduced rows.
selector = VarianceThreshold(threshold=0.5)
X_sel = selector.fit_transform(X)
X_sel[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [3]:
# Report the variance that the fitted selector computed for each feature.
for name, variance in zip(iris.feature_names, selector.variances_):
    print('{}:{}'.format(name, variance))

sepal length (cm):0.6811222222222222
sepal width (cm):0.1887128888888887
petal length (cm):3.0955026666666674
petal width (cm):0.5771328888888888


In [4]:
from sklearn.feature_selection import f_classif, SelectPercentile

# For numeric features, score each one with the ANOVA F-value against the class labels
# and keep the top 75 percent of them.
# NOTE(review): this rebinds `selector`, replacing the VarianceThreshold instance; the
# variance-inspection cell reads `selector.variances_`, so re-running it after this
# cell will not find that attribute. Consider a distinct name if nothing later relies on it.
selector = SelectPercentile(score_func=f_classif, percentile=75)
selector.fit(X, y)
X_pbest = selector.transform(X)
X_pbest[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

### 移除不相關特徵

In [5]:
# Cast the numeric features to integers so they can be treated as categorical data.
# astype(int) drops the fractional part (e.g. 4.9 becomes 4); it does not round.
X_cat = X.astype(int)
print(X_cat[:10])

[[5 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [5 3 1 0]
 [4 3 1 0]
 [5 3 1 0]
 [4 2 1 0]
 [4 3 1 0]]


In [6]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the two features with the highest chi-square statistic against the class labels.
kbest = SelectKBest(chi2, k=2)
kbest.fit(X_cat, y)

# scores_ holds the chi-square statistics themselves; the p-values are printed
# separately on the next line, so this label must not call the scores "p-value".
print('卡方統計量：', kbest.scores_)
print('p值：', kbest.pvalues_)

# Boolean mask over the input columns: True where the feature was selected.
kbest.get_support()

卡方統計量(p-value)： [ 10.28712871   5.02267003 133.06854839  74.27906977]
p值： [5.83684799e-03 8.11598175e-02 1.27213107e-29 7.42172639e-17]


array([False, False,  True,  True])

In [7]:
# Reduce X_cat to the columns chosen by the fitted chi-square selector, then preview five rows.
X_kbest = kbest.transform(X_cat)
X_kbest[:5]

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0]])