In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz

### Load dataset

In [2]:
data = load_boston()
X, y = data.data, data.target
type(y)

numpy.ndarray

### Convert target to categorical values

In [3]:
max, min = y.max(), y.min()
range = (max - min)/3
y[y < (min+range)] = 0
y[((min+range) <= y) & (y < min+2*range)] = 1
y[y >= min+2*range] = 2

### Split train and test set

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

### Decision Tree learning

In [5]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

### Report on Decision Tree, and Decision Tree of various depths

In [6]:
print('----------------------------------------------------------------------')
print('Tree Depth: \t', clf.get_depth())
print('# of Leaves: \t', clf.get_n_leaves())
print('Train Score: \t', clf.score(X_train, y_train))
print('Test Score: \t', clf.score(X_test, y_test))
print('')

fi = np.argsort(clf.feature_importances_)
fi = reversed(fi)
for i in fi:
  print(data.feature_names[i], ': ', '%.4f'%clf.feature_importances_[i])

print('----------------------------------------------------------------------')
print('----------------------------------------------------------------------')

best_dt = None
best_score = -1
d = clf.get_depth() - 1
while d > 0: 
  dt_clf = tree.DecisionTreeClassifier(max_depth=d)
  dt_clf = dt_clf.fit(X_train, y_train)
  print('Tree Depth: \t', dt_clf.get_depth())
  print('# of Leaves: \t', dt_clf.get_n_leaves())
  print('Train Score: \t', dt_clf.score(X_train, y_train))

  score = dt_clf.score(X_test, y_test)
  print('Test Score: \t', score)
  print('')
  
  fi = np.argsort(dt_clf.feature_importances_)
  fi = reversed(fi)
  for i in fi:
    if dt_clf.feature_importances_[i] != 0:
      print(data.feature_names[i], ': ', '%.4f'%dt_clf.feature_importances_[i])
  
  print('----------------------------------------------------------------------')

  d -= 1

  if best_score < score:
    best_score = score
    best_dt = dt_clf

----------------------------------------------------------------------
Tree Depth: 	 13
# of Leaves: 	 50
Train Score: 	 1.0
Test Score: 	 0.7828947368421053

LSTAT :  0.4137
RM :  0.2079
AGE :  0.0739
DIS :  0.0673
CRIM :  0.0530
PTRATIO :  0.0411
B :  0.0359
NOX :  0.0337
TAX :  0.0317
INDUS :  0.0217
CHAS :  0.0200
RAD :  0.0000
ZN :  0.0000
----------------------------------------------------------------------
----------------------------------------------------------------------
Tree Depth: 	 12
# of Leaves: 	 49
Train Score: 	 0.9943502824858758
Test Score: 	 0.7960526315789473

LSTAT :  0.4261
RM :  0.2035
AGE :  0.0917
DIS :  0.0662
CRIM :  0.0642
PTRATIO :  0.0371
TAX :  0.0300
NOX :  0.0249
B :  0.0215
CHAS :  0.0203
INDUS :  0.0146
----------------------------------------------------------------------
Tree Depth: 	 11
# of Leaves: 	 47
Train Score: 	 0.9943502824858758
Test Score: 	 0.7894736842105263

LSTAT :  0.4292
RM :  0.2166
AGE :  0.0656
DIS :  0.0619
CRIM :  0.0493
P

### Decision Tree visualization

In [7]:
print('Tree Depth: \t', clf.get_depth())
print('# of Leaves: \t', clf.get_n_leaves())
print('Train Score: \t', clf.score(X_train, y_train))
print('Test Score: \t', clf.score(X_test, y_test))



dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("boston_dataset_decision_tree")

dot_data = tree.export_graphviz(clf, out_file=None,   
                    feature_names=data.feature_names, 
                    class_names=['low', 'mid', 'high'],
                    filled=True, rounded=True,  
                    special_characters=True)  
graph = graphviz.Source(dot_data)  
graph 

Tree Depth: 	 13
# of Leaves: 	 50
Train Score: 	 1.0
Test Score: 	 0.7828947368421053


ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

### Best Decision Tree on Test Score

In [None]:
print('Tree Depth: \t', best_dt.get_depth())
print('# of Leaves: \t', best_dt.get_n_leaves())
print('Train Score: \t', best_dt.score(X_train, y_train))
print('Test Score: \t', best_dt.score(X_test, y_test))

dot_data = tree.export_graphviz(best_dt, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("boston_dataset_decision_tree")

dot_data = tree.export_graphviz(best_dt, out_file=None,   
                    feature_names=data.feature_names, 
                    class_names=['low', 'mid', 'high'],
                    filled=True, rounded=True,  
                    special_characters=True)  
graph = graphviz.Source(dot_data)  
graph 