## 変数選択

### RFE(Recursive Feature Elimination; 再帰的特徴量削減)

In [1]:
%load_ext lab_black

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE

In [3]:
# Load the Boston housing dataset via scikit-learn's loader.
# NOTE(review): load_boston is deprecated since scikit-learn 1.0 and removed in
# 1.2 — confirm the environment pins a version that still provides it.
boston = load_boston()

# Build a single DataFrame: the explanatory variables as named columns,
# followed by the target appended as the "MEDV" column.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    MEDV=boston.target
)

In [4]:
# Recursive feature elimination with a GBDT as the estimator; keep 5 features.
X = df.iloc[:, 0:13]  # first 13 columns are used as explanatory variables
y = df.iloc[:, 13]  # column at position 13 is used as the target

gbdt = GradientBoostingRegressor(n_estimators=100, random_state=10)
selector = RFE(gbdt, n_features_to_select=5)
selector.fit(X, y)

# Boolean mask telling, per feature, whether it was selected.
mask = selector.get_support()
print(boston.feature_names)
print(mask)

# Keep only the selected feature columns and compare shapes before/after.
X_selected = selector.transform(X)
print("X.shape={}, X_selected.shape={}".format(X.shape, X_selected.shape))

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
[False False False False  True  True False  True False False  True False
  True]
X.shape=(506, 13), X_selected.shape=(506, 5)


### モデルベース特徴量選択
`SelectFromModel`: 学習したモデルの特徴量重要度がしきい値以上の特徴量を選択

In [5]:
from sklearn.feature_selection import SelectFromModel

# Model-based selection with a GBDT as the estimator; threshold="median" is
# passed so the cut-off is derived from the median of the feature importances.
X = df.iloc[:, 0:13]  # first 13 columns are used as explanatory variables
y = df.iloc[:, 13]  # column at position 13 is used as the target

gbdt = GradientBoostingRegressor(n_estimators=100, random_state=10)
selector = SelectFromModel(gbdt, threshold="median")
selector.fit(X, y)

# Boolean mask telling, per feature, whether it was selected.
mask = selector.get_support()
print(boston.feature_names)
print(mask)

# Keep only the selected feature columns and compare shapes before/after.
X_selected = selector.transform(X)
print("X.shape={}, X_selected.shape={}".format(X.shape, X_selected.shape))

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
[ True False False False  True  True False  True False False  True  True
  True]
X.shape=(506, 13), X_selected.shape=(506, 7)


### SelectKBest
スコア関数（ここでは `f_regression`）で評価したスコアが上位 k 個の説明変数を選択

In [6]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate selection: keep the 5 features scored by f_regression.
X = df.iloc[:, 0:13]  # first 13 columns are used as explanatory variables
y = df.iloc[:, 13]  # column at position 13 is used as the target

selector = SelectKBest(score_func=f_regression, k=5)
selector.fit(X, y)

# Boolean mask telling, per feature, whether it was selected.
mask = selector.get_support()
print(boston.feature_names)
print(mask)

# Keep only the selected feature columns and compare shapes before/after.
X_selected = selector.transform(X)
print("X.shape={}, X_selected.shape={}".format(X.shape, X_selected.shape))

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
[False False  True False False  True False False False  True  True False
  True]
X.shape=(506, 13), X_selected.shape=(506, 5)


### SelectPercentile
スコア関数（ここでは `f_regression`）で評価したスコアが上位 p%（`percentile` で指定）の説明変数を選択

In [7]:
from sklearn.feature_selection import SelectPercentile, f_regression

# Univariate selection: keep 50% of the features, scored by f_regression.
X = df.iloc[:, 0:13]  # first 13 columns are used as explanatory variables
y = df.iloc[:, 13]  # column at position 13 is used as the target

selector = SelectPercentile(score_func=f_regression, percentile=50)
selector.fit(X, y)

# Boolean mask telling, per feature, whether it was selected.
mask = selector.get_support()
print(boston.feature_names)
print(mask)

# Keep only the selected feature columns and compare shapes before/after.
X_selected = selector.transform(X)
print("X.shape={}, X_selected.shape={}".format(X.shape, X_selected.shape))

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
[False False  True False  True  True False False False  True  True False
  True]
X.shape=(506, 13), X_selected.shape=(506, 6)
