# Tree-based explanation example

A demonstration of the tree explanation method on the UCI Breast Cancer Wisconsin dataset.

In [18]:
# Import Modules
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from generalizedtrees.trepanlike import make_trepanlike_classifier
from generalizedtrees.sampling import gaussian_rejection_sample
from generalizedtrees.core import test_all_x

In [2]:
# Build the training and testing partitions

# Seed NumPy's global random state before any randomized step runs
np.random.seed(20200508)

data = load_breast_cancer()

# Hold out 30% of the rows for testing, stratified on the class label
features_train, features_test, target_train, target_test = train_test_split(
    data.data,
    data.target,
    test_size=0.3,
    stratify=data.target,
)

The black-box model is a random forest.



In [3]:
# Black-box model: a 500-tree random forest fit on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(n_estimators=500).fit(features_train, target_train)
rf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [4]:
# Score the random forest against the true labels of the held-out test split
print(
    classification_report(
        y_true=target_test,
        y_pred=rf.predict(features_test),
    )
)

precision    recall  f1-score   support

           0       0.97      0.97      0.97        64
           1       0.98      0.98      0.98       107

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171



We also learn a regular decision tree:

In [5]:
# Baseline for comparison: a default decision tree fit directly on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
dtree = DecisionTreeClassifier().fit(features_train, target_train)
dtree

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [6]:
# Score the plain decision tree against the true labels of the held-out test split
print(
    classification_report(
        y_true=target_test,
        y_pred=dtree.predict(features_test),
    )
)

precision    recall  f1-score   support

           0       0.91      0.94      0.92        64
           1       0.96      0.94      0.95       107

    accuracy                           0.94       171
   macro avg       0.94      0.94      0.94       171
weighted avg       0.94      0.94      0.94       171



Explanation model for the data (a Trepan-like tree fit to the random forest)

In [24]:
# Constrained generator, local:
def local_constrained_generator(n, constraints):
    """Generate ``n`` constrained samples from Gaussians fit to the matching training rows.

    The per-feature mean and standard deviation are estimated only from the
    rows of the notebook-level ``features_train`` array that pass the test
    built from ``constraints``, and are then handed to
    ``gaussian_rejection_sample`` together with ``n`` and ``constraints``.

    If no training row passes the test, the statistics of the whole training
    set are used instead; otherwise the mean/std of zero rows would be NaN and
    the sampler would receive unusable parameters.

    Parameters
    ----------
    n : number of samples requested (forwarded unchanged to the sampler).
    constraints : constraint collection understood by ``test_all_x`` and
        ``gaussian_rejection_sample`` (forwarded unchanged to both).

    Returns
    -------
    Whatever ``gaussian_rejection_sample`` returns for these arguments.
    """
    # NOTE(review): presumably `tester` maps one feature row to a single
    # boolean (True when every constraint holds) -- confirm against
    # generalizedtrees.core.test_all_x.
    tester = test_all_x(constraints)

    # axis=1 applies the test to each row, giving one mask entry per sample
    passing_mask = np.apply_along_axis(tester, 1, features_train)
    passing_samples = features_train[passing_mask]

    if passing_samples.shape[0] == 0:
        # No training row lies in the constrained region: fall back to the
        # global training distribution instead of producing NaN statistics.
        passing_samples = features_train

    means = np.mean(passing_samples, axis=0)
    sds = np.std(passing_samples, axis=0)

    return gaussian_rejection_sample(means, sds, n, constraints)

In [11]:
# Global generator: per-feature statistics taken over the entire training split
feature_means = np.mean(features_train, axis=0)
feature_sds = np.std(features_train, axis=0)


def constrained_generator(n, constraints):
    """Forward to ``gaussian_rejection_sample`` using the training-set means and standard deviations."""
    return gaussian_rejection_sample(feature_means, feature_sds, n, constraints)


# Explainer class bound to the random forest and the global generator
Explainer = make_trepanlike_classifier(
    rf,
    constrained_generator=constrained_generator,
)

etree = Explainer()
etree.build()
etree

Tree: (None: [([21]<=33.4377831759953: [([24]<=0.09812917132564093: [([28]<=0.3045044248313924: [([19]<=0.003359373777158404: [([27]<=0.1201280077649357: Predict: [1]), ([27]>0.1201280077649357: Predict: [0])]), ([19]>0.003359373777158404: [([20]<=15.485980279148066: Predict: [0]), ([20]>15.485980279148066: Predict: [0])])]), ([28]>0.3045044248313924: [([27]<=0.22717031243135738: [([27]<=0.21677927058979218: Predict: [0]), ([27]>0.21677927058979218: Predict: [0])]), ([27]>0.22717031243135738: [([16]<=0.08158985186219958: Predict: [0]), ([16]>0.08158985186219958: Predict: [0])])])]), ([24]>0.09812917132564093: [([9]<=0.05207297625781396: [([24]<=0.09890610854269627: [([27]<=0.06756550666546102: Predict: [0]), ([27]>0.06756550666546102: Predict: [0])]), ([24]>0.09890610854269627: [([27]<=0.18686542538204376: Predict: [0]), ([27]>0.18686542538204376: Predict: [0])])]), ([9]>0.05207297625781396: [([27]<=0.1760035743506675: [([27]<=0.17310423018413673: Predict: [0]), ([27]>0.173104230184136

In [12]:
# Classification performance: explanation tree predictions vs. the true test labels
print(
    classification_report(
        y_true=target_test,
        y_pred=etree.predict(features_test),
    )
)

precision    recall  f1-score   support

           0       0.39      1.00      0.56        64
           1       1.00      0.07      0.12       107

    accuracy                           0.42       171
   macro avg       0.70      0.53      0.34       171
weighted avg       0.77      0.42      0.29       171



In [13]:
# Fidelity: explanation tree predictions vs. the random forest's own test predictions
print(
    classification_report(
        y_true=rf.predict(features_test),
        y_pred=etree.predict(features_test),
    )
)

precision    recall  f1-score   support

           0       0.38      0.98      0.55        64
           1       0.86      0.06      0.11       107

    accuracy                           0.40       171
   macro avg       0.62      0.52      0.33       171
weighted avg       0.68      0.40      0.27       171



In [23]:
# Second explainer: same black-box model, but samples come from the local generator
Explainer2 = make_trepanlike_classifier(
    rf,
    constrained_generator=local_constrained_generator,
)

e2tree = Explainer2()
e2tree.build()
e2tree

IndexError: boolean index did not match indexed array along dimension 0; dimension is 398 but corresponding boolean dimension is 30

In [None]:
# Evaluate the tree built with the local constrained generator (e2tree).
# This cell previously reported on `etree`, which only repeated the two
# reports already printed for the first explainer.
e2tree_predictions = e2tree.predict(features_test)

# Explanation tree classification performance (against the true labels):
print("Classification")
print(classification_report(target_test, e2tree_predictions))

# Explanation tree fidelity performance (against the random forest's predictions):
print("Fidelity")
print(classification_report(rf.predict(features_test), e2tree_predictions))

* Get RF performance on test set
* Get explainer performance on test set
* Get decision tree performance on test set
* Get fidelity measurement