In [85]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd

In [2]:
iris = load_iris()

In [5]:
iris_data = iris.data
iris_label = iris.target

In [23]:
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2, random_state=7)

In [24]:
model = DecisionTreeClassifier(random_state=77)

In [25]:
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=77, splitter='best')

In [26]:
pred = model.predict(X_test)

In [32]:
print('Prediction accuracy: {0:.2f}%'.format(accuracy_score(y_test, pred) * 100.0))

Prediction accuracy: 86.67%


In [34]:
# use KFold to improve prediction performance.
kfold = KFold(n_splits=5)
accuracy_list = []

In [44]:
n_iter = 0

for train_index, test_index in kfold.split(iris_data):
    X_train, X_test = iris_data[train_index], iris_data[test_index]
    y_train, y_test = iris_label[train_index], iris_label[test_index]
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    
    print('=' * 50)
    print('{0}-th Cross validation accuracy: {1}\nTraining data size: {2}\nValidation data size: {3}'
          .format(n_iter, accuracy, train_size, test_size))
    print('=' * 50)
    print()
    
    accuracy_list.append(accuracy)

1-th Cross validation accuracy: 1.0
Training data size: 120
Validation data size: 30

2-th Cross validation accuracy: 1.0
Training data size: 120
Validation data size: 30

3-th Cross validation accuracy: 0.8333
Training data size: 120
Validation data size: 30

4-th Cross validation accuracy: 0.9333
Training data size: 120
Validation data size: 30

5-th Cross validation accuracy: 0.9
Training data size: 120
Validation data size: 30



In [51]:
print('Average accuracy: {0:.2f}%'.format(np.mean(accuracy_list) * 100.0))

Average accuracy: 93.33%


In [54]:
# use Stratified KFold to improve prediction performance.
iris_df = pd.DataFrame(data=iris_data, columns=iris.feature_names)
iris_df['label'] = iris.target

In [55]:
# check dataframe
iris_df['label'].value_counts()

2    50
1    50
0    50
Name: label, dtype: int64

In [61]:
skf = StratifiedKFold(n_splits=3)
accuracy_list = []
n_iter = 0

for train_index, test_index in skf.split(iris_data, iris_label):
    
    X_train, X_test = iris_data[train_index], iris_data[test_index]
    y_train, y_test = iris_label[train_index], iris_label[test_index]
    
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    
    n_iter += 1
    
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    
    print('=' * 100)
    print('{0}-th\n- Cross validation accuracy: {1}\n- Training data size: {2}\n- Validation data size: {3}'
          .format(n_iter, accuracy, train_size, test_size))
    
    print('\nVerification set index: {0}'.format(test_index))
    print('=' * 100)
    print()
    
    accuracy_list.append(accuracy)

1-th
- Cross validation accuracy: 0.98
- Training data size: 100
- Validation data size: 50

Verification set index: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  50
  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115]

2-th
- Cross validation accuracy: 0.92
- Training data size: 100
- Validation data size: 50

Verification set index: [ 17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  67
  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82 116 117 118
 119 120 121 122 123 124 125 126 127 128 129 130 131 132]

3-th
- Cross validation accuracy: 0.98
- Training data size: 100
- Validation data size: 50

Verification set index: [ 34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  83  84
  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149]



In [76]:
print('Accuracy by cross validation: ', np.round(accuracy_list, 4))

Accuracy by cross validation:  [0.98 0.92 0.98]


In [77]:
print('Average verification accuracy: {0:.0f}%'.format(np.mean(accuracy_list) * 100.0))

Average verification accuracy: 96%


## Easily use cross-validation

cross_val_score(
    estimator,
    X,
    y=None,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=None,
    verbose=0,
    fit_params=None,
    pre_dispatch='2*n_jobs',
    error_score=nan,
)

In [84]:
scores = cross_val_score(model, iris_data, iris_label, scoring='accuracy', cv=3)

In [82]:
print('Accuracy by cross validation: ', np.round(scores, 4))

Accuracy by cross validation:  [0.98 0.92 0.98]


In [83]:
print('Average verification accuracy: {0:.0f}%'.format(np.mean(scores) * 100.0))

Average verification accuracy: 96%


GridSearchCV(
    estimator,
    param_grid,
    scoring=None,
    n_jobs=None,
    iid='deprecated',
    refit=True,
    cv=None,
    verbose=0,
    pre_dispatch='2*n_jobs',
    error_score=nan,
    return_train_score=False,
)

In [109]:
iris = load_iris()
iris_data = iris.data
iris_label = iris.target

dtc_model = DecisionTreeClassifier()

X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2, random_state=121)

parameters = {'max_depth': [1, 2, 3], 'min_samples_split':[2, 3]}
# refit = True is default, If True, re-learn with the best parameter setting
grid_dtree = GridSearchCV(dtc_model, param_grid=parameters, cv=3, refit=True)

In [110]:
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]},

In [111]:
scores_df = pd.DataFrame(grid_dtree.cv_results_)

In [112]:
scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [113]:
print('GridSearchCV Optimal parameters: ', grid_dtree.best_params_)

GridSearchCV Optimal parameters:  {'max_depth': 3, 'min_samples_split': 2}


In [114]:
print('GridSearchCV Highest accuracy: {0:.2f}%'.format(grid_dtree.best_score_ * 100.0))

GridSearchCV Highest accuracy: 97.50%


In [120]:
pred = grid_dtree.predict(X_test)
print('Test data set accuracy: {0:.2f}%'.format(accuracy_score(y_test,pred) * 100.0))

Test data set accuracy: 96.67%


In [121]:
# GridSearchCV's refit returns the already trained estimator
estimator = grid_dtree.best_estimator_

pred = estimator.predict(X_test)
print('Test data set accuracy: {0:.2f}%'.format(accuracy_score(y_test, pred) * 100.0))

Test data set accuracy: 96.67%
