# 网格搜索 + K折交叉验证

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.tree import DecisionTreeClassifier

## 导入乳腺癌数据

In [2]:
# Load the breast cancer dataset; later cells read its 'data' and 'target'
# entries as the feature matrix and the labels.
data = load_breast_cancer()

## 切割训练集和测试集

In [3]:
# Hold out 20% of the samples as a test set; the remaining 80% is what the
# grid search cross-validates on. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    train_size=0.8,
    random_state=0,
)



In [4]:
# Base estimator handed to the grid search.
# NOTE: despite the variable name, this is a classifier (DecisionTreeClassifier),
# not a regressor; the name is kept because later cells refer to it.
regressor = DecisionTreeClassifier(random_state=0)

## GridSearch的参数

In [5]:
# Hyper-parameter grid for the search: tree depths 1 through 6 are tried.
parameters = dict(max_depth=range(1, 7))

## 评分依据：准确率

In [6]:
# Wrap accuracy_score into a scorer object so it can be passed to the grid
# search as its scoring criterion.
scoring_fnc = make_scorer(accuracy_score)

## k折交叉验证：10折

In [7]:
# 10-fold cross-validation splitter used by the grid search.
# No shuffle/random_state is passed here, so KFold's defaults apply.
kfold = KFold(n_splits=10)

## 进行GridSearch

In [8]:
# Exhaustively evaluate every candidate in `parameters` with 10-fold CV.
# - `scoring` is passed by keyword: recent scikit-learn releases make every
#   GridSearchCV argument after `param_grid` keyword-only, so the positional
#   form raises a TypeError there.
# - `return_train_score=True` keeps the *_train_score entries in cv_results_
#   (shown in the last cell); newer releases omit them unless asked.
grid = GridSearchCV(
    regressor,
    parameters,
    scoring=scoring_fnc,
    cv=kfold,
    return_train_score=True,
)
# fit() returns the fitted search object itself, so `grid` stays bound to it.
grid = grid.fit(X_train, y_train)

## 使用GridSearch寻找最优参数组合

In [9]:
# Report the winning configuration found by the grid search.
# The best estimator is kept in `reg` because the evaluation cell below scores it.
reg = grid.best_estimator_
print('best score: %f' % grid.best_score_)
print('best parameters:')
# Read the values from best_params_ and format them with %s so that
# non-integer hyper-parameters (floats, strings, None) print correctly;
# %d would truncate floats and raise TypeError on strings.
for key in parameters:
    print('%s: %s' % (key, grid.best_params_[key]))

best score: 0.938462
best parameters:
max_depth: 4


## 在测试集上评估模型

In [10]:
# Report the best estimator's score() on the held-out test split.
print('test score: %f' % (reg.score(X_test, y_test),))

test score: 0.956140


## 输出GridSearch中间过程

In [11]:
# Show every per-candidate metric recorded by the search, transposed so that
# each column is one candidate parameter setting and each row is one metric.
# (pandas is imported in the notebook's top import cell.)
pd.DataFrame(grid.cv_results_).T



Unnamed: 0,0,1,2,3,4,5
mean_fit_time,0.00253072,0.00239308,0.00318389,0.0039603,0.0048275,0.00530453
mean_score_time,0.000342155,0.000220895,0.000206566,0.000282168,0.0002321,0.000216699
mean_test_score,0.901099,0.934066,0.936264,0.938462,0.934066,0.934066
mean_train_score,0.923811,0.952871,0.969717,0.984126,0.990721,0.995362
param_max_depth,1,2,3,4,5,6
params,{u'max_depth': 1},{u'max_depth': 2},{u'max_depth': 3},{u'max_depth': 4},{u'max_depth': 5},{u'max_depth': 6}
rank_test_score,6,3,2,1,3,3
split0_test_score,0.869565,0.934783,0.934783,0.934783,0.913043,0.913043
split0_train_score,0.929095,0.95599,0.97066,0.98044,0.99022,0.997555
split1_test_score,0.913043,0.934783,0.913043,0.913043,0.913043,0.891304
