In [1]:

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


# Load the breast cancer dataset once. The Bunch object is kept because the
# feature names are read from init_data['feature_names'] further down.
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train the baseline RandomForestClassifier on all features.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)

# Score the held-out set once and reuse the value for both reported metrics.
y_pred = forest.predict(X_test)
baseline_acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % baseline_acc)
# "Accuracy per feature" = test accuracy divided by the number of input columns.
print('Accuracy per feature: %.2f' % (baseline_acc / X.shape[1]))

Accuracy: 0.98
Accuracy per feature: 0.03


In [3]:
# Cell 2: compute and show feature importances

# Wrap the feature matrix in a DataFrame so each column carries its feature name.
X_df = pd.DataFrame(X, columns=init_data['feature_names'])
# Extract the feature importances from the baseline random forest.
importances = forest.feature_importances_

# Feature indices sorted by importance, from largest to smallest
# (argsort is ascending, so the result is reversed).
indices = np.argsort(importances)[::-1]

print("Feature ranking (from most to least important):")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank:2d}) {X_df.columns[idx]:30s} importance = {importances[idx]:.4f}")

Feature ranking (from most to least important):
 1) worst concave points           importance = 0.1418
 2) mean concave points            importance = 0.1177
 3) worst radius                   importance = 0.1109
 4) worst perimeter                importance = 0.1052
 5) mean concavity                 importance = 0.0772
 6) worst area                     importance = 0.0734
 7) area error                     importance = 0.0418
 8) mean perimeter                 importance = 0.0369
 9) mean radius                    importance = 0.0356
10) worst concavity                importance = 0.0352
11) mean area                      importance = 0.0329
12) worst texture                  importance = 0.0227
13) mean texture                   importance = 0.0186
14) worst compactness              importance = 0.0168
15) radius error                   importance = 0.0154
16) worst symmetry                 importance = 0.0148
17) worst smoothness               importance = 0.0142
18) mean compactn

In [8]:
# Number of top-ranked features to keep.
k = 2
# Guard against values that would yield an empty feature matrix (and a
# division by zero below) or that exceed the number of ranked features.
if not 1 <= k <= len(indices):
    raise ValueError(f"k must be between 1 and {len(indices)}, got {k}")

# Select the k most important features (indices is sorted most-important first).
selected_idx = indices[:k]
selected_features = X_df.columns[selected_idx]

print(f"Use top {k} features:")
for name in selected_features:
    print(" -", name)

# Build new training / testing matrices that keep only these k columns.
X_train_sel = X_train[:, selected_idx]
X_test_sel = X_test[:, selected_idx]

# Retrain a random forest on the reduced feature set. The hyperparameters are
# copied from the baseline forest so the two models cannot drift apart.
forest_sel = RandomForestClassifier(**forest.get_params())
forest_sel.fit(X_train_sel, y_train)

y_pred_sel = forest_sel.predict(X_test_sel)
acc_sel = accuracy_score(y_test, y_pred_sel)
# Accuracy divided by the number of features used.
ratio_sel = acc_sel / k

print(f"\nAccuracy with {k} features: {acc_sel:.2f}")
print(f"Accuracy per feature with {k} features: {ratio_sel:.2f}")


Use top 2 features:
 - worst concave points
 - mean concave points

Accuracy with 2 features: 0.89
Accuracy per feature with 2 features: 0.44
