In [None]:
!pip install alibi

In [10]:
import tensorflow as tf
tf.get_logger().setLevel(40) # suppress deprecation messages
tf.compat.v1.disable_v2_behavior() # disable TF2 behaviour as alibi code still relies on TF1 constructs
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from alibi.explainers import CounterFactualProto
from sklearn.model_selection import train_test_split

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False

TF version:  2.3.0
Eager execution enabled:  False


## Data set



In [11]:
names = [
    'id', 
    'clump_thickness', 
    'cell_size_uniformity', 
    'cell_shape_uniformity',
    'marginal_adhesion',
    'epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class'
]

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
df1 = pd.read_csv(url, names=names)
df1.head(10)
# Dataset is now stored in a Pandas Dataframe

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [8]:
# remove NaN values
dataset1 = df1.replace('?', np.NaN)
dataset1.dropna(inplace=True)

# remove non-feature col
dataset1.drop(columns=['id'], inplace=True)


# convert labels to M=1 and B=0
dataset1['class'] = (dataset1['class'] == 4).astype(int)
dataset1 = dataset1.astype(np.float64)
dataset1['class'] = dataset1['class'].astype('int32')
dataset1.head(10)

Unnamed: 0,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,0
2,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,0
3,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,0
4,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,0
5,8.0,10.0,10.0,8.0,7.0,10.0,9.0,7.0,1.0,1
6,1.0,1.0,1.0,1.0,2.0,10.0,3.0,1.0,1.0,0
7,2.0,1.0,2.0,1.0,2.0,1.0,3.0,1.0,1.0,0
8,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,5.0,0
9,4.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0


In [23]:
# Split Train and Test datasets
# Shuffle the data and define the train and test set:
X = dataset1.iloc[:,:-1].values
y = dataset1.iloc[:,9:].values

#Standardise
mu = X.mean(axis=0)
sigma = X.std(axis=0)
X = (X - mu) / sigma

#Split Train and Test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42 )

In [16]:
#Modeling a ligistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [19]:
#Modeling a ligistic Regression
lr = LogisticRegression(penalty='l2', max_iter=200, fit_intercept=True)
lr.fit(X_train, np.ravel(y_train))
accuracy_score(np.ravel(y_test), lr.predict(X_test))

0.9532163742690059

In [24]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [25]:
def nn_model():
    x_in = Input(shape=(9,))
    x = Dense(40, activation='relu')(x_in)
    x = Dense(40, activation='relu')(x)
    x_out = Dense(2, activation='softmax')(x)
    nn = Model(inputs=x_in, outputs=x_out)
    nn.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    return nn

In [27]:
nn = nn_model()
nn.summary()
nn.fit(X_train, y_train, batch_size=64, epochs=500, verbose=0)
nn.save('nn_breastCancer.h5', save_format='h5')

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 9)]               0         
_________________________________________________________________
dense_3 (Dense)              (None, 40)                400       
_________________________________________________________________
dense_4 (Dense)              (None, 40)                1640      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 82        
Total params: 2,122
Trainable params: 2,122
Non-trainable params: 0
_________________________________________________________________


In [28]:
nn = load_model('nn_breastCancer.h5')
score = nn.evaluate(X_test, y_test, verbose=0)
print('Test accuracy: ', score[1])

Test accuracy:  0.9590643


Generate counterfactual guided by the nearest class prototype

In [29]:
# Original instance:
X = X_test[1].reshape((1,) + X_test[1].shape)
shape = X.shape

# Run counterfactual:

# define model
nn = load_model('nn_breastCancer.h5')

# initialize explainer, fit and generate counterfactual
cf = CounterFactualProto(nn, shape, use_kdtree=True, theta=10., max_iterations=1000,
                         feature_range=(X_train.min(axis=0), X_train.max(axis=0)),
                         c_init=1., c_steps=10)

cf.fit(X_train)
explanation = cf.explain(X)

print('Original prediction: {}'.format(explanation.orig_class))
print('Counterfactual prediction: {}'.format(explanation.cf['class']))

No encoder specified. Using k-d trees to represent class prototypes.


Original prediction: 1
Counterfactual prediction: 0


The explainer outputs We a counterfactual to see which variables need to be changed to increase or decrease to get the cancer diagnostic. The output below shows the result of undoing the pre-processing step and then checks where the counterfactual differs from the original instance:

In [37]:
feature_names = names[1:-1]
orig = X * sigma + mu
counterfactual = explanation.cf['X'] * sigma + mu
delta = counterfactual - orig
for ii, ff in enumerate(feature_names):
    if np.abs(delta[0][ii]) > 1e-4:
        print('{}: {}'.format(ff, delta[0][ii]))

clump_thickness: -4.526571275874357
cell_size_uniformity: -2.010276286841414
cell_shape_uniformity: -0.3861853841621876
marginal_adhesion: -1.634759570244614
epithelial_cell_size: -1.3719380826456486
bare_nuclei: -6.525691631301381
bland_chromatin: -3.389368207491753
normal_nucleoli: -0.543684742930715
mitoses: -5.531326410432158


In [39]:
print('Proportion of the observation %{} mitoses'.format(orig[0][8]))

Proportion of the observation %7.0 mitoses
