In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer


## 1[a]. Get your first DT Classifier running

In [7]:
# Load the breast cancer dataset that ships with scikit-learn (sklearn.datasets).
data = load_breast_cancer()

In [8]:
# Peek at the loaded dataset: which fields it exposes and the shape of its
# feature matrix.
dataset_keys = data.keys()
feature_matrix_shape = data.data.shape
print("data.keys(): {}".format(dataset_keys))
print("Shape of cancer data: {}".format(feature_matrix_shape))

data.keys(): dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
Shape of cancer data: (569, 30)


In [9]:
# Pair each class name with the number of samples carrying that label
# (np.bincount counts occurrences of each integer label in data.target).
class_counts = dict(zip(data.target_names, np.bincount(data.target)))
print("Sample counts per class:\n{}".format(class_counts))

Sample counts per class:
{'malignant': 212, 'benign': 357}


In [10]:
# List the names of the feature columns provided by the dataset.
all_feature_names = data.feature_names
print("Feature names:\n{}".format(all_feature_names))

Feature names:
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [11]:
# NOTE(review): this re-runs the same load_breast_cancer() call made earlier in
# the notebook and rebinds `data`; it looks redundant but is harmless.
data = load_breast_cancer()

In [12]:
# Split into train and test sets. Stratifying on the labels keeps the class
# proportions the same in both splits; random_state pins the shuffle so the
# split is reproducible.
features, labels = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    stratify=labels,
    random_state=42,
)

In [13]:
# Fit a decision tree on the training split. Only random_state is passed, so no
# depth limit or other growth constraint is set here; the fixed seed keeps
# re-runs of this cell consistent.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

Now that we have fit the model to the training data, the next step is to figure out how well the model performs. This can be done by calling the `.score` method, which returns the mean accuracy on the given data and labels.

In [14]:
# Mean accuracy of the fitted tree on the data it was trained on versus the
# held-out test split.
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print('Accuracy on training set: {:.3f}'.format(train_accuracy))
print('Accuracy on test set: {:.3f}'.format(test_accuracy))

Accuracy on training set: 1.000
Accuracy on test set: 0.937


 From the above analysis involving different depths of the decision tree, we can see that a depth of 3 is optimal. It reaches good accuracy on the training data (95.3%) without memorizing the training labels, and it generalizes well to the test data (94.4%).

Now that we have identified the optimal model, let's move on to visualizing the decision tree we came up with.

## 1[b]: Plotting - same example

In [6]:
# NOTE(review): both names below are already imported in the notebook's first
# cell, and the same dataset is loaded again here under the name `cancer`
# (earlier cells bind it to `data`).
from sklearn.tree import DecisionTreeClassifier 
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

Next, we export the fitted tree to a DOT file and then read that file back to create a visualization using the graphviz module.

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Re-fit the classifier on the training split so this cell only needs
# X_train / y_train and `data` to be in scope.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

# Write the fitted tree to "tree.dot" in Graphviz DOT format so it can be
# rendered in a later cell.
# Fix: the keyword is `feature_names` (plural). `feature_name` is not a
# parameter of export_graphviz and makes the call fail with a TypeError.
export_graphviz(
    tree,
    out_file="tree.dot",
    class_names=["malignant", "benign"],
    feature_names=data.feature_names,
    impurity=False,  # leave the impurity value out of the node labels
    filled=True,     # colour the nodes
)


In [None]:
from sklearn.tree import export_graphviz
export_graphviz(model, out_file = "model.dot", class_names = ["malignant", "benign"], feature_names = canc

In [None]:
import graphviz

# Read back the DOT text stored in tree.dot.
with open("tree.dot") as dot_file:
    dot_graph = dot_file.read()

# Wrap the DOT text in a graphviz.Source and hand it to display() so the
# notebook shows it as cell output.
rendered_tree = graphviz.Source(dot_graph)
display(rendered_tree)

Our decision tree is shown above; let's note down some inferences that we can draw from it below:

- Right node of the tree (Depth 1, right): The node starts off with 129 samples, out of which 127 are malignant and 2 are benign. This node splits from the root since the worst perimeter for these samples is > 115.35.
- As we move further down that node, some finer distinctions are used to split off the 2 benign samples. All the malignant cases accumulate on the rightmost leaf. Here, the gini score is 0.0, which means the leaf contains only malignant samples. A node's gini attribute measures its impurity.



- Left node of the tree (Depth 1, left): The node starts off with 32 malignant and 265 benign cases (worst perimeter <= 115.35).
With each step, more distinctions are applied and we end up with most of the benign cases in the leftmost leaf, where we have a gini score of 0.034.

 Visualizing Decision Trees

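The `export_graphviz` function converts the decision tree classifier into a DOT file, and a rendering library (graphviz in this notebook; pydotplus is an alternative) converts this DOT file into a PNG or another form that can be displayed in Jupyter.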

In the decision tree chart, each internal node has a decision rule that splits the data. "Gini" refers to the Gini impurity (also called the Gini index), which measures the impurity of the node. A node is pure when all of its records belong to the same class; such a node is not split any further and is known as a leaf node.

Here, the resultant tree is unpruned. An unpruned tree is hard to interpret and not easy to understand. In the next section, let's optimize it by pruning.

In [None]:
feature_importances = model.feature_importances_
print("The feature importances for different features are: \n{}".format(feature_importanc

The feature importances for different features are:

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.00576963 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.81041464 0.
 0.         0.         0.02239144 0.1614243  0.         0.        ]

These numbers don't make much sense, do they? Let's now plot the above results to make it more comprehensible.

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Plot a fitted model's feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing ``feature_importances_`` (for example the
        ``DecisionTreeClassifier`` trained in this notebook).
    feature_names : sequence of str, optional
        One label per feature, used for the y-axis ticks. When omitted, the
        ``feature_names`` of the ``data`` dataset loaded earlier in this
        notebook are used.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.

    Example
    -------
    ``plot_feature_importances(tree)``
    """
    importances = np.asarray(model.feature_importances_)

    # The previous body took the feature count and labels from `data_train`,
    # a DataFrame that is never created in this notebook (NameError). The
    # count now comes from the model itself and the labels from the dataset.
    if feature_names is None:
        feature_names = data.feature_names

    n_features = importances.shape[0]
    if len(feature_names) != n_features:
        raise ValueError(
            "Expected {} feature names, got {}".format(n_features, len(feature_names))
        )

    positions = np.arange(n_features)
    plt.figure(figsize=(20, 20))
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')


From the barplot above:

- Worst perimeter is the most important feature. This is evident from the fact that this was the first condition that was applied for coming up with the decision tree.
- A low feature importance does not mean that a feature is uninformative; it only means the tree did not pick it, which can happen when features are correlated and another feature already encodes the same information.
- Feature importances tell us that worst perimeter is important, but not whether a high value of it points to a sample being benign or malignant.