In [1]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Tensorflow import
import tensorflow as tf
import os

C:\Users\diddy\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\diddy\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
from data_interfaces import *
from explainer_interfaces import  *
from model_interfaces import base_model
import data, dice
import model as dice_model
#from utils import helpers

In [3]:
# Load the adult-income data from a CSV file located in the working directory.
dataset = pd.read_csv(filepath_or_buffer='adultdataset.csv')

In [4]:
# Preview the first five rows of the loaded data.
dataset.head(n=5)

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,White,Female,60,0
1,30,Self-Employed,Assoc,Married,Professional,White,Male,65,1
2,32,Private,Some-college,Married,White-Collar,White,Male,50,0
3,20,Private,Some-college,Single,Service,White,Female,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,White,Male,50,0


In [5]:
# Wrap the dataframe in the DiCE data interface, naming the continuous
# feature columns and the outcome column.
d = data.Data(
    dataframe=dataset,
    continuous_features=['age', 'hours_per_week'],
    outcome_name='income',
)

In [6]:
# Split data into train and test (stratified on the outcome, fixed seed).
target = dataset["income"]
datasetX = dataset.drop("income", axis=1)
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=target)

numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

# Data preprocessing
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# One-hot encode the categorical columns and pass the numerical columns
# through unchanged. ColumnTransformer drops every column that is not listed
# (remainder='drop' is its default), so without the 'num' entry the classifier
# would be trained without age and hours_per_week.
transformations = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical),
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# random_state is fixed so that re-running the notebook gives the same forest.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=0))])
model = clf.fit(x_train, y_train)

In [7]:
# Wrap the fitted sklearn pipeline for DiCE and build an explainer
# that uses the "random" method.
m = dice_model.Model(
    model=model,
    backend="sklearn",
)
exp = dice.Dice(
    d,
    m,
    method="random",
)

In [8]:
# Request two counterfactuals of the opposite class for the first training row,
# then display them with only the changed feature values shown.
e1 = exp.generate_counterfactuals(
    x_train.iloc[0:1],
    total_CFs=2,
    desired_class="opposite",
)
e1.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  3.86it/s]

Query instance (original outcome うんこ: 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,38,Private,HS-grad,Married,Blue-Collar,White,Male,44,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,-,-,Bachelors,-,-,-,-,-,1
1,-,-,Assoc,-,Professional,-,-,-,1


### Explaining a TensorFlow model

In [9]:
# Suppress deprecation warnings from TF
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Backend string is 'TF' followed by the major TensorFlow version, e.g. 'TF2'.
# Splitting on '.' (instead of taking the first character) uses the whole
# major-version component.
backend = 'TF' + tf.__version__.split('.')[0]
# The backend built above always starts with 'TF', so the model file
# extension is always '.h5' here.
model_ext = '.h5'
ML_modelpath = os.path.join('utils', 'sample_trained_models', 'adult' + model_ext)

# Step 2: dice_ml.Model
m = dice_model.Model(model_path=ML_modelpath, backend=backend)
# Step 3: initiate DiCE
exp = dice.Dice(d, m)

In [10]:
# Instance to explain. It may be given as a dictionary or a dataframe;
# here it is a dictionary mapping each feature name to its value.
query_instance = dict(
    age=22,
    workclass='Private',
    education='HS-grad',
    marital_status='Single',
    occupation='Service',
    race='White',
    gender='Female',
    hours_per_week=45,
)

In [11]:
# Generate two counterfactuals of the opposite class for the query instance.
dice_exp = exp.generate_counterfactuals(
    query_instance,
    total_CFs=2,
    desired_class="opposite",
    verbose=False,
)

[[0.01903973]]
[[0.02296001]]
[[0.03946911]]
[[0.02879182]]
[[0.07286217]]
[[0.0378578]]
[[0.11332627]]
[[0.05208436]]
[[0.1833602]]
[[0.06773687]]
[[0.21411113]]
[[0.07123592]]
[[0.24016248]]
[[0.07807194]]
[[0.25869396]]
[[0.08568421]]
[[0.2781319]]
[[0.09376674]]
[[0.29844236]]
[[0.09826366]]
[[0.30828902]]
[[0.5478091]]
[[0.30991265]]
[[0.9354572]]
[[0.31127876]]
[[0.93568754]]
[[0.31227884]]
[[0.9360657]]
[[0.3132807]]
[[0.93614584]]
[[0.31391397]]
[[0.93637383]]
[[0.73432326]]
[[0.93660104]]
[[0.7352316]]
[[0.9366805]]
[[0.94079477]]
[[0.9368275]]
[[0.94100726]]
[[0.9369067]]
[[0.94100726]]
[[0.907653]]
[[0.9412189]]
[[0.907653]]
[[0.9412189]]
[[0.90786076]]
[[0.9412189]]
[[0.9079727]]
[[0.94143]]
[[0.9079727]]
[[0.94143]]
[[0.90776515]]
[[0.94143]]
[[0.90787715]]
[[0.94143]]
[[0.90787715]]
[[0.94143]]
[[0.90787715]]
[[0.94143]]
[[0.90787715]]
[[0.94143]]
[[0.9076694]]
[[0.9415037]]
[[0.9076694]]
[[0.9415037]]
[[0.9076694]]
[[0.9415037]]
[[0.9074612]]
[[0.9413669]]
[[0.90757364]]

In [12]:
# Display the query instance and the generated counterfactuals as dataframes.
# show_only_changes=True highlights only the feature values that changed.
dice_exp.visualize_as_dataframe(show_only_changes=True)

Query instance (original outcome うんこ: 0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,22.0,Private,HS-grad,Single,Service,White,Female,45.0,0.019



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,34.0,-,Doctorate,Married,-,-,-,-,0
1,29.0,Self-Employed,Prof-school,Married,-,-,-,-,0
