In [4]:
# Install BorutaPy (published on PyPI under the name "Boruta")
# pip install Boruta

from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the diabetes dataset and hold out 20% of the rows as a test split.
data = load_diabetes()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base random-forest regressor handed to Boruta as its importance estimator.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Run Boruta on the training split only to decide which features matter.
# NOTE(review): with n_estimators='auto' Boruta presumably chooses the tree
# count itself rather than using the 100 set above -- confirm in BorutaPy docs.
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', random_state=42)
boruta_selector.fit(X_train, y_train)

# support_ is a boolean mask aligned with the dataset's feature order;
# keep the names whose mask entry is truthy.
selected_features = boruta_selector.support_
all_features = data.feature_names
selected_feature_names = [
    name
    for name, is_kept in zip(all_features, selected_features)
    if is_kept
]

print("Selected important features:", selected_feature_names, sep="\n")

# Tabulate every feature next to its selected / not-selected flag.
feature_selection_results = pd.DataFrame(
    {'Feature': all_features, 'Selected': selected_features}
)

print()
print("Feature selection results:")
print(feature_selection_results)


Selected important features:
['bmi', 'bp', 's2', 's5', 's6']

Feature selection results:
  Feature  Selected
0     age     False
1     sex     False
2     bmi      True
3      bp      True
4      s1     False
5      s2      True
6      s3     False
7      s4     False
8      s5      True
9      s6      True
