## 特徴量選択
- 単変量統計　個々の特徴量とターゲットの関係を統計量で評価して選択する  
   （例：分散分析のF値、カイ二乗値、相互情報量）
- モデルベース特徴量選択　教師あり学習モデルによる特徴量の重要性により選択
- 反復特徴量選択　RFE：再帰的特徴量削減　すべての特徴量からモデルを作り重要性の低いものを削除する過程を、指定の数になるまで繰り返す

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset once; the later cells reuse X (features) and y (targets).
iris = load_iris()
X, y = iris.data, iris.target

分散による閾値処理

In [29]:
from sklearn.feature_selection import VarianceThreshold

# Keep only the features whose variance over X exceeds 0.5.
thresholder = VarianceThreshold(threshold=0.5)
thresholder.fit(X)
features_high_variance = thresholder.transform(X)

# Preview the first three rows of the reduced feature matrix.
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [30]:
# Refit on X, then display the per-feature variances that the threshold is compared against.
thresholder.fit(X)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [3]:
from sklearn.feature_selection import SelectPercentile
import numpy as np

# Build 50 columns of seeded Gaussian noise, one row per sample, so the
# selectors below have uninformative features to discard.
rng = np.random.RandomState(42)
noise = rng.normal(size=(X.shape[0], 50))

# X is already the feature ndarray; `X.data` would be the array's raw buffer
# (a memoryview), which only happened to work when stacked.
X_w_noise = np.hstack([X, noise])

# Seed the split as well: the noise is seeded, so without this the cells that
# reuse X_train / y_train would still change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, y, test_size=.5, random_state=42)

# Keep the best-scoring 50% of the columns (using the selector's default score function).
select_perc = SelectPercentile(percentile=50)
select_perc.fit(X_train, y_train)

X_train_selected_perc = select_perc.transform(X_train)

print("shape(SelectPercentile), {}".format(X_train_selected_perc.shape))

shape(SelectPercentile), (75, 27)


In [6]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif  # for regression: mutual_info_regression

y = y.astype(int)

# One selector per univariate score function, each keeping the 2 best features.
chi2_selector = SelectKBest(chi2, k=2)
fvalue_selector = SelectKBest(f_classif, k=2)
mi_selector = SelectKBest(mutual_info_classif, k=2)

# Fit each selector on the original (noise-free) X and report the reduced shape.
for message, selector in (
    ("shape(chi2), {}", chi2_selector),
    ("shape(f_classif), {}", fvalue_selector),
    ("shape(mutual_info_classif), {}", mi_selector),
):
    features_kbest = selector.fit_transform(X, y)
    print(message.format(features_kbest.shape))

shape(chi2), (150, 2)
shape(f_classif), (150, 2)
shape(mutual_info_classif), (150, 2)


In [8]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Model-based selection: a 100-tree random forest supplies the feature
# importances, and "median" is used as the importance cut-off.
select = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold="median")

# Fit on the noisy training split, then reduce that same split.
X_train_l1 = select.fit(X_train, y_train).transform(X_train)
print("shape: {}".format(X_train_l1.shape))

shape: (75, 27)


In [10]:
from sklearn.feature_selection import RFE, RFECV

# RFE: recursively eliminate features until exactly 40 remain.
rfe = RFE(RandomForestClassifier(n_estimators=100),
          n_features_to_select=40)

# RFECV: drop one feature per step and let cross-validation pick how many to keep.
# The estimator is a classifier, so score with accuracy; a regression metric
# such as "neg_mean_squared_error" would depend on the arbitrary numeric
# values of the class labels.
rfecv = RFECV(RandomForestClassifier(n_estimators=100),
              step=1, scoring="accuracy")


rfe.fit(X_train, y_train)
rfecv.fit(X_train, y_train)

X_train_l1 = rfe.transform(X_train)
X_train_l2 = rfecv.transform(X_train)

print("shape(RFE): {}".format(X_train_l1.shape))
print("shape(RFECV): {}".format(X_train_l2.shape))

shape(RFE): (75, 40)
shape(RFECV): (75, 2)


In [11]:
# Display how many features the fitted RFECV selected.
rfecv.n_features_

2

In [12]:
# Display the boolean mask over the columns of X_train; True marks a feature RFECV kept.
rfecv.support_

array([False, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])