In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz

ModuleNotFoundError: No module named 'graphviz'

### Load dataset

In [None]:
# Load the dataset bundle returned by `load_boston`; `X` is its `.data`
# attribute (features) and `y` its `.target` attribute (value to predict).
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed scikit-learn version before re-running.
data = load_boston()
X = data.data
y = data.target

### Convert target to categorical values

In [None]:
# Bin the continuous target into three equal-width classes spanning its
# observed range: 0 = low, 1 = mid, 2 = high.
#
# The labels are derived from the untouched `data.target` in a single pass, so
# a label that has already been written can never be re-classified by a later
# threshold test, the source array is not mutated, and re-running this cell
# always yields the same result. Local names deliberately avoid shadowing the
# builtins `max`, `min` and `range`.
target = data.target
lowest, highest = target.min(), target.max()
bin_width = (highest - lowest) / 3

# np.digitize returns 0 for values below the first edge, 1 for values in
# [first edge, second edge), and 2 for values at or above the second edge.
# The result is an integer array of class labels.
y = np.digitize(target, [lowest + bin_width, lowest + 2 * bin_width])

### Split train and test set

In [None]:
# Hold out 30% of the samples for testing. `random_state` is fixed so that the
# split, and therefore every score reported below, is reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

### Decision Tree learning

In [None]:
# Fit an unrestricted decision tree on the training split.
# `random_state` is fixed because scikit-learn randomly permutes the features
# at each split, so equally good splits could otherwise be chosen differently
# from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

### Report on the full decision tree and on trees of decreasing maximum depth

In [None]:
def _report_tree(model, X_train, y_train, X_test, y_test, feature_names,
                 skip_zero_importance=False):
    """Print a summary of a fitted decision tree and return its test score.

    Prints the tree depth, the number of leaves, the train and test scores,
    a blank line, and then one line per feature in order of decreasing
    importance.

    Parameters
    ----------
    model : fitted tree classifier exposing get_depth, get_n_leaves, score
        and feature_importances_.
    X_train, y_train : data the train score is computed on.
    X_test, y_test : data the test score is computed on.
    feature_names : sequence indexable by feature position.
    skip_zero_importance : when True, features whose importance is exactly 0
        are not printed.

    Returns
    -------
    The value of ``model.score(X_test, y_test)``.
    """
    print('Tree Depth: \t', model.get_depth())
    print('# of Leaves: \t', model.get_n_leaves())
    print('Train Score: \t', model.score(X_train, y_train))

    test_score = model.score(X_test, y_test)
    print('Test Score: \t', test_score)
    print('')

    importances = model.feature_importances_
    # argsort is ascending; reverse it to list the most important feature first.
    for idx in np.argsort(importances)[::-1]:
        if skip_zero_importance and importances[idx] == 0:
            continue
        print(feature_names[idx], ': ', '%.4f' % importances[idx])

    return test_score


separator = '----------------------------------------------------------------------'

# Report on the unrestricted tree (all features listed, including zeros).
print(separator)
_report_tree(clf, X_train, y_train, X_test, y_test, data.feature_names)
print(separator)
print(separator)

# Refit with every smaller max_depth, from (full depth - 1) down to 1, and keep
# the tree with the highest test score (the deepest one wins ties).
# `best_dt` starts as the full tree so that it is never None when the full tree
# is too shallow for the loop to run; with `best_score = -1` the first
# depth-limited tree always replaces it.
# NOTE(review): selecting on the test score makes that score an optimistic
# estimate for the chosen tree — consider a separate validation split.
best_dt = clf
best_score = -1
depth = clf.get_depth() - 1
while depth > 0:
    # Fixed random_state keeps each refit reproducible.
    dt_clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt_clf = dt_clf.fit(X_train, y_train)

    score = _report_tree(dt_clf, X_train, y_train, X_test, y_test,
                         data.feature_names, skip_zero_importance=True)
    print(separator)

    depth -= 1

    if best_score < score:
        best_score = score
        best_dt = dt_clf

### Decision Tree visualization

In [None]:
# Summarise the unrestricted tree before drawing it.
for label, value in (
    ('Tree Depth: \t', clf.get_depth()),
    ('# of Leaves: \t', clf.get_n_leaves()),
    ('Train Score: \t', clf.score(X_train, y_train)),
    ('Test Score: \t', clf.score(X_test, y_test)),
):
    print(label, value)

# Plain export (no feature or class names), rendered to a file on disk.
graphviz.Source(tree.export_graphviz(clf, out_file=None)).render(
    "boston_dataset_decision_tree"
)

# Styled export with feature and class names; left as the cell's last
# expression so the notebook displays it inline.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=data.feature_names,
    class_names=['low', 'mid', 'high'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

### Best Decision Tree on Test Score

In [None]:
# Fail with a clear message if the depth search produced no candidate tree.
assert best_dt is not None, 'best_dt is None: no depth-limited tree was fitted'

# Summarise the tree that achieved the highest test score in the depth search.
print('Tree Depth: \t', best_dt.get_depth())
print('# of Leaves: \t', best_dt.get_n_leaves())
print('Train Score: \t', best_dt.score(X_train, y_train))
print('Test Score: \t', best_dt.score(X_test, y_test))

# Plain export rendered to disk. It uses its own file name so that it does not
# overwrite "boston_dataset_decision_tree", which the full-tree visualisation
# renders to.
dot_data = tree.export_graphviz(best_dt, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("boston_dataset_best_decision_tree")

# Styled export with feature and class names; left as the cell's last
# expression so the notebook displays it inline.
dot_data = tree.export_graphviz(best_dt, out_file=None,
                    feature_names=data.feature_names,
                    class_names=['low', 'mid', 'high'],
                    filled=True, rounded=True,
                    special_characters=True)
graph = graphviz.Source(dot_data)
graph