In [82]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier


STATE = 42

np.random.seed(STATE)
tf.random.set_seed(STATE)

In [83]:
def disp_cf_results(cf):
    """Display counterfactual results nicely."""
    for feature, change in zip(features, cf['delta']):
        if change != 0:
            print(feature, change)
            
            
# Load data
dataset = load_breast_cancer() 
X, y = dataset['data'], dataset['target']
features = dataset['feature_names']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=STATE)

## CEML
https://github.com/andreArtelt/ceml
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer

In [84]:
from ceml.tfkeras import generate_counterfactual
from ceml.backend.tensorflow.costfunctions import NegLogLikelihoodCost
from ceml.model import ModelWithLoss


class Model(ModelWithLoss):
    def __init__(self, X):
        super(Model, self).__init__()

        self.model = keras.models.Sequential([
            keras.layers.Input(shape=[X.shape[1],]),
            keras.layers.Dense(16, activation='selu'),
            keras.layers.Dense(8, activation='selu'),
            keras.layers.Dense(1, activation='sigmoid'),
        ])
    
    def fit(self, X_train, y_train, X_test, y_test, num_epochs=100):
        self.model.compile(loss='binary_crossentropy', optimizer='adam', 
                           metrics=['accuracy'])

        self.model.fit(X_train, y_train, epochs=num_epochs, verbose=False,
                       validation_data=(X_test, y_test))

    def predict(self, x):
        return (self.predict_proba(x) > 0.5).astype(int)
    
    def predict_proba(self, x):
        return self.model(x).numpy().reshape(-1,)
    
    def __call__(self, x):
        return self.predict(x)

    def get_loss(self, y_target, pred=None):
        return NegLogLikelihoodCost(input_to_output=self.model.predict_proba, y_target=y_target)

In [85]:
model = Model(X_train)
model.fit(X_train, y_train, X_test, y_test)

y_pred = model.predict(X_test)
print("Model accuracy: {0}".format(accuracy_score(y_test, y_pred)))

Model accuracy: 0.9574468085106383


In [97]:
# Select data point for explaining its prediction
x = X_test[1,:]
instance_pred = model.predict(np.array([x]))[0]
print("Prediction on x: {0}".format(instance_pred))

optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=1.0)    # Init optimization algorithm
optimizer_args = {"max_iter": 5000}

# Compute counterfactual
cf = generate_counterfactual(
    model, 
    x, 
    y_target=(1-instance_pred), 
    features_whitelist=None,
    regularization="l1", 
    C=0.1, 
    optimizer=optimizer, 
    optimizer_args=optimizer_args
)
disp_cf_results(cf)

Prediction on x: 0


Exception: No counterfactual found - Consider changing parameters 'C', 'regularization', 'features_whitelist', 'optimizer' and try again

## Decision Tree

In [15]:
from ceml.sklearn import generate_counterfactual

# Whitelist of features - list of features we can change/use when computing a counterfactual 
features_whitelist = None   # We can use all features

# Create and fit model
model = DecisionTreeClassifier(max_depth=3, random_state=STATE)
model.fit(X_train, y_train)
print('Model accuracy = ', accuracy_score(y_test, model.predict(X_test)))

# Select data point for explaining its prediction
x = X_test[1,:]
instance_pred = model.predict([x])[0]
print("Prediction on x: {0}".format(instance_pred))

# Compute counterfactual
cf = generate_counterfactual(model, x, y_target=(1-instance_pred), 
                             features_whitelist=features_whitelist)

disp_cf_results(cf)

Model accuracy =  0.9574468085106383
Prediction on x: 0
worst concavity 0.05250999846220017
