In [1]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import load_breast_cancer

In [5]:
# Load the breast cancer dataset bundled with scikit-learn.
dataset = load_breast_cancer()

In [9]:
# Pull the feature matrix (x) and the class labels (t) out of the loaded dataset.
x, t = dataset.data, dataset.target

In [10]:
# Sanity check: number of samples/features in x and number of labels in t.
x.shape, t.shape

((569, 30), (569,))

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the samples as the final test set; the remainder stays
# together as the train+validation pool.
x_train_val, x_test, t_train_val, t_test = train_test_split(
    x,
    t,
    test_size=0.2,
    random_state=1,
)

In [13]:
# Carve a validation set (30%) out of the train+validation pool.
x_train, x_val, t_train, t_val = train_test_split(
    x_train_val,
    t_train_val,
    test_size=0.3,
    random_state=1,
)

In [16]:
# Confirm the sizes of the train / validation / test feature matrices.
x_train.shape,x_val.shape,x_test.shape

((318, 30), (137, 30), (114, 30))

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
# Baseline tree with hand-picked limits: depth capped at 10 and at least
# 30 samples required in a node before it may be split.
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=30,
    random_state=0,
)

In [25]:
# Train the baseline tree on the training split only.
dtree.fit(x_train, t_train)

DecisionTreeClassifier(max_depth=10, min_samples_split=30, random_state=0)

In [26]:
# Report baseline accuracy on the training and validation splits, one per line.
print(
    f'train score: {dtree.score(x_train, t_train)}',
    f'val score: {dtree.score(x_val, t_val)}',
    sep='\n',
)

train score: 0.9308176100628931
val score: 0.9562043795620438


In [27]:
# Accuracy of the baseline tree on the held-out test split.
# NOTE(review): the test set is scored here, before the grid search in the
# later cells, and is scored again for the tuned model at the end. Consulting
# it during model selection can bias the final estimate; confirm this is intended.
print(f'test score: {dtree.score(x_test, t_test)}')

test score: 0.9298245614035088


In [28]:
from sklearn.model_selection import GridSearchCV

In [34]:
# Template model handed to GridSearchCV; only the seed is fixed here,
# the searched hyperparameters come from param_grid.
estimator = DecisionTreeClassifier(random_state=0)

In [38]:
# Candidate hyperparameter values for the first, coarse search.
# Each dict in the list is explored as its own grid, so max_depth and
# min_samples_split are tuned one at a time and are never combined.
# NOTE(review): if a joint search was intended, use a single dict with both keys.
param_grid = [
    {'max_depth': [3, 20, 50]},
    {'min_samples_split': [3, 20, 30]},
]

In [39]:
# Number of cross-validation folds passed to GridSearchCV.
cv = 5

In [41]:
# Cross-validated search over param_grid, using `estimator` as the template model.
# Training-fold scores are not recorded (return_train_score=False).
tuned_model = GridSearchCV(estimator, param_grid, cv=cv, return_train_score=False)

In [42]:
# Display the configured search object to double-check its settings.
tuned_model

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid=[{'max_depth': [3, 20, 50]},
                         {'min_samples_split': [3, 20, 30]}])

In [44]:
# Run the search on the whole train+validation pool; cross-validation does the
# validation splitting here, so the manual x_val split is not used.
tuned_model.fit(x_train_val,t_train_val)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid=[{'max_depth': [3, 20, 50]},
                         {'min_samples_split': [3, 20, 30]}])

In [45]:
# Tabulate the search results, transposed so each candidate is a column
# and each metric is a row.
pd.DataFrame(tuned_model.cv_results_).transpose()

Unnamed: 0,0,1,2,3,4,5
mean_fit_time,0.009212,0.01049,0.008804,0.006599,0.005194,0.004807
std_fit_time,0.000395,0.000769,0.001718,0.0008,0.000403,0.000401
mean_score_time,0.000994,0.001,0.000996,0.000602,0.000201,0.000398
std_score_time,0.000005,0.000002,0.000005,0.000492,0.000402,0.000488
param_max_depth,3,20,50,,,
param_min_samples_split,,,,3,20,30
params,{'max_depth': 3},{'max_depth': 20},{'max_depth': 50},{'min_samples_split': 3},{'min_samples_split': 20},{'min_samples_split': 30}
split0_test_score,0.923077,0.956044,0.956044,0.956044,0.912088,0.912088
split1_test_score,0.901099,0.912088,0.912088,0.912088,0.901099,0.901099
split2_test_score,0.934066,0.923077,0.923077,0.923077,0.934066,0.934066


In [46]:
# Second, narrower set of candidate values (replaces the earlier param_grid).
# As before, each dict is searched as a separate grid, so the two
# hyperparameters are still tuned independently rather than in combination.
param_grid = [
    {'max_depth': [5, 10, 15]},
    {'min_samples_split': [10, 12, 15]},
]

In [47]:
# Number of cross-validation folds; same value as assigned for the first
# search, so this re-assignment is redundant.
cv = 5

In [49]:
# Rebuild the search with the refined param_grid; this rebinds tuned_model,
# discarding the first search object.
tuned_model = GridSearchCV(estimator, param_grid, cv=cv, return_train_score=False)

In [50]:
# Run the refined search on the train+validation pool.
tuned_model.fit(x_train_val,t_train_val)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid=[{'max_depth': [5, 10, 15]},
                         {'min_samples_split': [10, 12, 15]}])

In [51]:
# Tabulate the refined search results, one column per candidate setting.
pd.DataFrame(tuned_model.cv_results_).transpose()

Unnamed: 0,0,1,2,3,4,5
mean_fit_time,0.005797,0.005197,0.004,0.004199,0.003799,0.003799
std_fit_time,0.000753,0.001166,0.000001,0.000401,0.000399,0.0004
mean_score_time,0.000401,0.0006,0.0002,0.0002,0.000401,0.000401
std_score_time,0.000492,0.00049,0.0004,0.0004,0.000492,0.000492
param_max_depth,5,10,15,,,
param_min_samples_split,,,,10,12,15
params,{'max_depth': 5},{'max_depth': 10},{'max_depth': 15},{'min_samples_split': 10},{'min_samples_split': 12},{'min_samples_split': 15}
split0_test_score,0.956044,0.956044,0.956044,0.967033,0.923077,0.912088
split1_test_score,0.912088,0.912088,0.912088,0.912088,0.901099,0.901099
split2_test_score,0.923077,0.923077,0.923077,0.923077,0.934066,0.934066


In [53]:
# Parameter setting that achieved the best mean cross-validated score.
tuned_model.best_params_

{'min_samples_split': 10}

In [54]:
# Estimator built with the best parameters found by the search.
# With GridSearchCV's default refit behaviour (refit is not overridden above)
# it has been refit on all the data passed to fit(), i.e. x_train_val.
best_model = tuned_model.best_estimator_

In [56]:
# Final report for the tuned tree. The labels use the same format as the
# baseline score printouts so the two numbers can be told apart at a glance.
# x_train_val is the data the search was fitted on, so the first figure is a
# training-side score; only the test figure reflects unseen data.
print(f'train_val score: {best_model.score(x_train_val, t_train_val)}')
print(f'test score: {best_model.score(x_test, t_test)}')

0.9934065934065934
0.956140350877193
