# 构建机器学习流水线

In [1]:
from sklearn.datasets import samples_generator
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt



In [10]:
#生成20维的向量特征
X,Y = samples_generator.make_classification(n_informative=4, n_features=20, n_redundant=0, random_state=5)
# 特征选择器
selector_k_best = SelectKBest(f_regression, k=10) 
# 随机森林分类器
classifier = RandomForestClassifier(n_estimators=50, max_depth=4) 
# 构建机器学习流水线
pipeline_classifier = Pipeline([('selector', selector_k_best), ('rf', classifier)]) 

#可重新设置参数 特征点k=6 树的个数n_estimators=25
pipeline_classifier.set_params(selector__k=6, rf__n_estimators=25) 

# 训练分类器
pipeline_classifier.fit(X, Y)

Pipeline(steps=[('selector',
                 SelectKBest(k=6,
                             score_func=<function f_regression at 0x0000026BC40245E0>)),
                ('rf', RandomForestClassifier(max_depth=4, n_estimators=25))])

In [11]:
# 预测输出结果
prediction = pipeline_classifier.predict(X)
print("\nPredictions:\n", prediction) 


Predictions:
 [1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 1
 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 0 0 1 1 1 0 0 1 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1 0 1]


In [12]:
print("\nScore:", pipeline_classifier.score(X, Y))


Score: 0.98


In [13]:
features_status = pipeline_classifier.named_steps['selector'].get_support()
selected_features = []
for count, item in enumerate(features_status): 
    if item:
        selected_features.append(count)
print("\nSelected features (0-indexed):", ', '.join([str(x) for x in selected_features]))


Selected features (0-indexed): 0, 5, 9, 10, 11, 15
