In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import datasets

In [2]:
dataset = datasets.load_iris()      # count = 150

features = dataset.data
targets = dataset.target

feature_train, feature_test, target_train, target_test = train_test_split(features, targets, test_size=0.3)

In [3]:
model = DecisionTreeClassifier(criterion='gini')
model.fit(feature_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [4]:
predictions = model.predict(feature_test)

print(confusion_matrix(target_test, predictions))
print(accuracy_score(target_test, predictions))

[[17  0  0]
 [ 0 15  1]
 [ 0  0 12]]
0.977777777778


In [5]:
# cross validation
predicted = cross_val_predict(model, features, targets, cv=10)
print(accuracy_score(targets, predicted))

0.953333333333


## Grid Search

In [6]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [7]:
# with grid search you can find an optimal parameter "parameter tuning" !!!
param_grid = {'max_depth': np.arange(1, 10)}

In [8]:
# In every iteration, data split randomly in cross validation + DecisionTreeClassifier
# initializes the tree randomly: that's why you get different results !!!

tree = GridSearchCV(DecisionTreeClassifier(), param_grid)

In [9]:
tree.fit(feature_train, target_train)

tree_predictions = tree.predict_proba(feature_test)[:, 1]

print("Best parameter with Grid Search: ", tree.best_params_)

Best parameter with Grid Search:  {'max_depth': 7}


In [10]:
param = tree.best_params_['max_depth']

model = DecisionTreeClassifier(max_depth=param)
model.fit(feature_train, target_train)

predictions = model.predict(feature_test)

print(confusion_matrix(target_test, predictions))
print(accuracy_score(target_test, predictions))

[[17  0  0]
 [ 0 15  1]
 [ 0  0 12]]
0.977777777778
