In [6]:
import numpy as np
import os
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Larger fonts for axis labels and tick labels in every figure of the notebook
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures: image_path() joins these as
# <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"

def image_path(fig_id):
    """Return the path of ``fig_id`` inside this chapter's image folder.

    The folder is ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>``; ``fig_id`` is
    appended as-is, so callers add any file extension themselves.
    """
    chapter_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    return os.path.join(chapter_dir, fig_id)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under the chapter's image folder.

    Parameters
    ----------
    fig_id : str
        File name without extension; resolved through ``image_path``.
    tight_layout : bool, default True
        Call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        File extension, also passed to ``savefig`` as the output format.
    resolution : int, default 300
        Value passed to ``savefig`` as ``dpi``.
    """
    path = image_path(fig_id) + "." + fig_extension
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    # savefig does not create missing folders, so a fresh checkout without
    # images/<chapter> would otherwise fail with FileNotFoundError.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
# Decision Trees do not require feature scaling or centering at all
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Load the iris dataset and keep the feature columns from index 2 onward
iris = load_iris()
X = iris.data[:, 2:] # petal length and width
y = iris.target

# Limit the tree to a depth of 2 so it stays small enough to inspect
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [8]:
# Let's visualize this decision tree using the export_graphviz() function
from sklearn.tree import export_graphviz

# Write the tree to an explicit .dot file instead of relying on the default for
# out_file, which differs between scikit-learn releases (a "tree.dot" file in the
# working directory vs. returning the DOT source as a string).
# Convert the dot file to png or pdf later, e.g.:
#   dot -Tpng iris_tree.dot -o iris_tree.png
dot_path = image_path("iris_tree.dot")
# The target folder is not created automatically, so make sure it exists first
os.makedirs(os.path.dirname(dot_path), exist_ok=True)
export_graphviz(
    tree_clf,
    out_file=dot_path,
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)



In [9]:
'''
Different attributes of each node in decision tree.
Samples: counts the total number of training instances the node applies to
Value: counts how many training instances of each class the node applies to
Gini: Impurity Score (Formula given in the book). A gini score of 0 means the node is pure: all training instances it applies to belong to the same class
'''

# Scikit-Learn uses the CART (classification and regression tree) algorithm, which always produces a binary tree.
# A flower with petals 5 cm long and 1.5 cm wide lands in a leaf whose estimated
# probability of Iris-Versicolor is about 90.7% (second column of the output below).
tree_clf.predict_proba([[5, 1.5]])

array([[ 0.        ,  0.90740741,  0.09259259]])

In [10]:
# predict() returns the class with the highest estimated probability for the same flower
tree_clf.predict([[5, 1.5]])

array([1])

In [12]:
# Note: the estimated probabilities do not change as long as the instance falls in the same leaf node
# (here a petal length of 6 cm instead of 5 cm gives the same estimate)
tree_clf.predict_proba([[6, 1.5]])

array([[ 0.        ,  0.90740741,  0.09259259]])

In [13]:
'''
Decision Trees are known as nonparametric models, not because they do not have parameters, but because
the number of parameters is not determined prior to training. As a result, the model structure is free to
stick closely to the training data, which can lead to overfitting. By restricting the degrees of freedom,
or regularizing, we can reduce the risk that the model overfits the data. One form of regularizing the
decision tree model is by restricting the max_depth.
'''
from sklearn.tree import DecisionTreeRegressor

# NOTE(review): X and y appear to still be the iris petal features and class labels
# from the classifier cell, so this tree regresses on class indices — confirm that
# this is intended rather than a dedicated regression dataset.
tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [14]:
'''
The regression CART algorithm works by minimizing the MSE as opposed to the impurity.
Note, Decision Trees always set their decision boundaries to be perpendicular to a feature axis.
If the true boundary runs at a 45 degree angle to the axes, this results in a model that does not generalize well.
One way to fix this issue is with principal component analysis (PCA). Moreover, the main issue with
Decision Trees is that they're very sensitive to small variations in the training data. One way
to solve this issue is to implement a Random Forest, which averages predictions over many trees.
'''

"\nThe regression CART algorithm works by minimizing the MSE as opposed to the impurity.\nNote, Decision Trees always set their decision boundaries to be perpendicular to a feature axis.\nIf the true boundary runs at a 45 degree angle to the axes, this results in a model that does not generalize well.\nOne way to fix this issue is with principal component analysis (PCA). Moreover, the main issue with\nDecision Trees is that they're very sensitive to small variations in the training data. One way\nto solve this issue is to implement a Random Forest, which averages predictions over many trees.\n"