# Decision Trees - Visualization

In [None]:
import sklearn.datasets as datasets
import pandas as pd
import pydotplus

from sklearn.tree import DecisionTreeClassifier
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six was removed in scikit-learn 0.23; io.StringIO is the
    # standard-library equivalent and works as the export_graphviz out_file buffer.
    from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz

In [None]:
# Load scikit-learn's bundled digits dataset and list the fields of the returned container.
# `D` is the shared dataset object used by every later cell (D.data / D.target).
D=datasets.load_digits()
print(D.keys())

In [None]:
# Print the dataset's own description text (stored under the 'DESCR' key).
print(D['DESCR'])

In [None]:
# Show the entry at index 10 of the feature matrix that the trees in this notebook are fitted on.
print(D['data'][10])

In [None]:
# Show the entry at index 10 of the 'images' field — presumably the same sample in its
# 2-D pixel-grid form (check against the dataset description); TODO confirm.
print(D['images'][10])

Refer: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

Parameters for a decision tree:
<ul>
    <li>splitter</li>
    <li>max_depth</li>
    <li>criterion</li>
    <li>max_features</li>
    <li>max_leaf_nodes</li>
    <li>min_impurity_decrease</li>
    <li>min_impurity_split</li>
    <li>min_samples_leaf</li>
    <li>min_samples_split</li>
    <li>min_weight_fraction_leaf</li>
    <li>presort</li>
    <li>random_state</li>
    <li>class_weight</li>
</ul>

In [None]:
import re
def remove_gini_impurity(graph_val):
    """Strip the ``gini = ...`` entry from every node label in Graphviz DOT text.

    ``export_graphviz(..., special_characters=True)`` writes node labels of the
    form ``...<br/>gini = 0.5<br/>samples = 10<br/>value = [...]``.  This removes
    each ``gini = <value><br/>`` fragment that is immediately followed by the
    ``samples`` field (anything starting with ``s``), leaving the rest of the
    label untouched.

    Args:
        graph_val: DOT source as a string.

    Returns:
        The DOT source with the gini fragments removed.
    """
    # ``[^<]*`` stops at the first tag after the gini value.  The previous greedy
    # ``(.*)`` ran to the LAST ``<br/>s`` on the line, which would swallow whole
    # node definitions whenever more than one node shared a line.
    # The lookahead keeps the leading ``s`` of ``samples`` in place.
    return re.sub(r'gini = [^<]*<br/>(?=s)', '', graph_val)

# Visualization of a Decision Tree

## First step (Depth=1)

In [None]:
# Limit the tree to a single split (max_depth=1) and fit it on the full dataset.
# Author's note: initial distinction of one entire category.
dtree=DecisionTreeClassifier(max_depth=1)
dtree.fit(D.data,D.target)

In [None]:
# Serialise the fitted tree to Graphviz DOT text in an in-memory buffer.
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data, filled=True, rounded=True, special_characters=True)
# Drop the per-node "gini = ..." entry so the labels are easier to read.
graph_val = remove_gini_impurity(dot_data.getvalue())
# (A second export_graphviz call used to sit here; it only appended an unused
# duplicate of the DOT text to the buffer after it had already been read.)
# Parse the cleaned DOT text into a pydotplus graph for rendering.
graph = pydotplus.graph_from_dot_data(graph_val)

In [None]:
# Render the current `graph` to PNG bytes and display the image inline.
Image(graph.create_png())

## Second step (Depth=2)

In [None]:
# Replace `dtree` with a new, unfitted classifier allowed to grow two levels deep.
dtree=DecisionTreeClassifier(max_depth=2)

In [None]:
# Train the classifier configured in the previous cell on the whole dataset.
dtree.fit(D.data, D.target)

# Write the fitted tree as Graphviz DOT text into a string buffer ...
dot_data = StringIO()
export_graphviz(
    dtree,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
)

# ... then strip the gini entries from the labels and parse the result for rendering.
graph_val = remove_gini_impurity(dot_data.getvalue())
graph = pydotplus.graph_from_dot_data(graph_val)

In [None]:
# Render the current `graph` to PNG bytes and display the image inline.
Image(graph.create_png())

### Observations
1. With the remaining samples, the condition X3 &le; 1.75 provides the best split between the two categories. 
2. There are however some outliers in each category of this classification.

## The Fully grown Decision Tree

In [None]:
# Replace `dtree` with a classifier using all default settings (no max_depth is passed,
# so growth is not capped by this notebook).
dtree=DecisionTreeClassifier()

In [None]:
# Fit the unconstrained tree on every sample.
dtree.fit(D.data, D.target)

# Export to DOT via an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    dtree,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Remove the gini entries, then build the pydotplus graph that the next cell renders.
graph_val = remove_gini_impurity(dot_data.getvalue())
graph = pydotplus.graph_from_dot_data(graph_val)

In [None]:
# Render the full tree to PNG and display it with an explicit height/width for the image.
Image(graph.create_png(),height=1500,width=900)

In [None]:
# Highlighting the over-fit portion: a small hand-written sub-tree whose last split
# separates a 3-sample node into leaves of 2 samples and 1 sample.
# NOTE(review): the value vectors below have 3 classes, whereas the rest of this notebook
# prints labels 0-9 for the digits data — this DOT snippet looks illustrative rather than
# exported from the tree fitted above; confirm.

new_graph_val = """
digraph Tree {
node [shape=box, style="filled, rounded", color="black", fontname=helvetica] ;
edge [fontname=helvetica] ;
0 [label=<X<SUB>3</SUB> &le; 1.55<br/>samples = 6<br/>value = [0, 2, 4]>, fillcolor="#8139e57f"] ;
1 [label=<samples = 3<br/>value = [0, 0, 3]>, fillcolor="#8139e5ff"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label=<X<SUB>2</SUB> &le; 5.45<br/>samples = 3<br/>value = [0, 2, 1]>, fillcolor="#39e5817f"] ;
0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
3 [label=<samples = 2<br/>value = [0, 2, 0]>, fillcolor="#39e581ff"] ;
2 -> 3 ;
4 [label=<samples = 1<br/>value = [0, 0, 1]>, fillcolor="#8139e5ff"] ;
2 -> 4 ;
}
"""
graph = pydotplus.graph_from_dot_data(new_graph_val)
# Actually display the highlighted sub-tree: previously the graph was built but never
# rendered, and `graph` is overwritten before the next Image(...) call.
Image(graph.create_png())

### Reducing Over-Fitting

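* Use the **min_samples_split** parameter to set the minimum number of samples required to make a split.
* Use the **max_depth** parameter to cap how deep the tree may grow; the cells below sweep this parameter and compare the cross-validated scores.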

In [None]:
from sklearn.model_selection import cross_val_score

# For every max_depth from 1 to 99, record the mean of the 4-fold cross-validation scores.
# scores[k] therefore corresponds to max_depth = k + 1.
scores=[]
for i in range(1,100):
    dtree=DecisionTreeClassifier(max_depth=i)
    # Use the score array's own .mean(): the name `np` only exists after the later
    # `%pylab inline` cell has run, so np.mean(...) raised NameError on a fresh kernel.
    scores.append(cross_val_score(dtree, D.data, D.target, cv=4).mean())

In [None]:
# NOTE(review): %pylab star-imports numpy and matplotlib names into the global namespace;
# the later cells rely on it for `plot`, `legend` and `np` (none are imported explicitly).
%pylab inline

In [None]:
# scores[k] belongs to max_depth = k + 1 (the sweep starts at depth 1), so plot against the
# actual depths instead of the 0-based list index, and label both axes.
plot(range(1, len(scores) + 1), scores)
xlabel('max_depth')
ylabel('mean cross-validation score')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples as a test set; the fixed random_state makes the split repeatable.
D_train, D_test, t_train, t_test = train_test_split(
    D.data,
    D.target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Replace `dtree` with a new, unfitted classifier limited to three levels.
dtree=DecisionTreeClassifier(max_depth=3)

In [None]:
# Fit on the training portion only (the held-out part is kept for evaluation later).
dtree.fit(D_train, t_train)

# Export the fitted tree as DOT text through a string buffer.
dot_data = StringIO()
export_graphviz(
    dtree,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Strip the gini entries and parse the DOT text into a renderable graph.
graph_val = remove_gini_impurity(dot_data.getvalue())
graph = pydotplus.graph_from_dot_data(graph_val)

In [None]:
# Render the current `graph` to PNG and display it with an explicit height/width.
Image(graph.create_png(),height=1500,width=900)

**You are encouraged to explore the DecisionTreeClassifier by altering the parameters passed to it.**

In [None]:
def err(dtree,X,y):
    """Return the misclassification rate of ``dtree`` on ``(X, y)``.

    Args:
        dtree: any object exposing ``predict(X)``.
        X: samples to classify; ``len(X)`` is used as the denominator.
        y: expected labels, compared element-wise with the predictions.

    Returns:
        Number of mismatching predictions divided by ``len(X)``, as a float.
    """
    mismatches = dtree.predict(X) != y
    # Sum the boolean mask as floats so the division below is always a true division.
    return np.sum(mismatches, dtype=float) / len(X)

In [None]:
# Sweep max_depth from 1 to 29 and record, for each depth, the training error, the test
# error and the size of the fitted tree.  Index k of each list corresponds to depth k + 1.
train_err=[]
test_err=[]
node_count=[]
for i in range(1,30):
    dtree=DecisionTreeClassifier(max_depth=i)
    dtree.fit(D_train,t_train)
    # (A stray trailing comma used to turn this statement into a throwaway 1-tuple.)
    train_err.append(err(dtree,D_train,t_train))
    test_err.append(err(dtree,D_test,t_test))
    node_count.append(dtree.tree_.node_count)
# After the loop `dtree` is the last model fitted (max_depth=29).

In [None]:
# Entry k of each error list was measured at max_depth = k + 1, so plot against the real
# depths rather than the 0-based list index.
depths = range(1, len(train_err) + 1)
plot(depths, train_err, label='training error')  # label typo 'trainig' fixed
plot(depths, test_err, label='test error')
xlabel('max_depth')
ylabel('error rate')
legend()

In [None]:
# node_count[k] was recorded at max_depth = k + 1; plot tree size against the real depth.
plot(range(1, len(node_count) + 1), node_count)
xlabel('max_depth')
ylabel('number of tree nodes')

In [None]:
# Inspect the last ten recorded training errors, i.e. those of the deepest trees in the sweep.
train_err[-10:]

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the held-out labels against the model's predictions on the test set.
# NOTE(review): `dtree` is whichever model was fitted last — in run order that is the final
# (max_depth=29) tree left over from the error sweep; confirm that this is the intended model.
A=confusion_matrix(t_test,dtree.predict(D_test))

In [None]:
# Header line: the ten predicted-label column indices, each right-aligned in 3 characters.
print('pred label  ', ''.join('{:3d}'.format(j) for j in range(10)))
# One line per true label, followed by that row of the confusion matrix.
for i, row in enumerate(A):
    print('true label {:d}'.format(i), row)

### comments
* `1` is predicted as `2` 7 times
* `8`, `3` and `1` are confused with each other

In [None]:
# Extract the 3x3 sub-block of the confusion matrix for labels 1, 3 and 8:
# select those rows first, then the same columns.
choice=[1,3,8]
A[choice,:][:,choice]