In [1]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Tensorflow import
import tensorflow as tf
import os

C:\Users\diddy\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\diddy\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
from model_interfaces import base_model
import data, dice
import model as dice_model

In [3]:
# Load the dataset from a CSV file; the path is relative to the notebook's working directory.
dataset = pd.read_csv('csv/adultdataset.csv')

In [4]:
# Preview the first rows to check the available columns and their values.
dataset.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,White,Female,60,0
1,30,Self-Employed,Assoc,Married,Professional,White,Male,65,1
2,32,Private,Some-college,Married,White-Collar,White,Male,50,0
3,20,Private,Some-college,Single,Service,White,Female,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,White,Male,50,0


In [5]:
# Wrap the raw dataframe for DiCE, declaring which features are continuous
# and which column holds the outcome to be explained.
d = data.Data(dataframe=dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income')

# Split data into train and test (stratified on the outcome, fixed seed)
target = dataset["income"]
datasetX = dataset.drop("income", axis=1)
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=target)

numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

# Data preprocessing
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# ColumnTransformer drops every column that is not listed (remainder='drop'
# is the default), so the numerical columns must be listed explicitly.
# Without the 'num' entry the classifier never sees `age` or `hours_per_week`,
# although they are declared above as continuous features DiCE may vary.
transformations = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical),
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# The forest is seeded (like the split above) so a re-run trains the same model.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=0))])
model = clf.fit(x_train, y_train)

# Using sklearn backend
m = dice_model.Model(model=model, backend="sklearn")
exp = dice.Dice(d, m, method="random")

# Explain the first training row: ask for two counterfactuals of the opposite class
# and display only the feature values that differ from the query.
e1 = exp.generate_counterfactuals(x_train[0:1], total_CFs=2, desired_class="opposite")
e1.visualize_as_dataframe(show_only_changes=True)

### Explaining a TensorFlow model

In [6]:
# Suppress deprecation warnings from TF
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Backend identifier: "TF" followed by the first character of the installed
# TensorFlow version string (e.g. "TF2").
backend = 'TF' + tf.__version__[0]

# The trained model is read from a local path; an earlier variant of this cell
# obtained the path via helpers.get_adult_income_modelpath(backend=backend).
if 'TF' in backend:
    model_ext = '.h5'
else:
    model_ext = '.pth'
ML_modelpath = os.path.join('utils', 'sample_trained_models', 'adult' + model_ext)

# Step 2: dice_ml.Model
# NOTE(review): model_type='regressor' is used here although the outcome column
# is binary in the preview above — confirm this is intentional.
m = dice_model.Model(model_path=ML_modelpath, backend=backend, model_type='regressor')

# Step 3: initiate DiCE
exp = dice.Dice(d, m)

In [7]:
# Query instance to explain, given as a mapping of feature name -> feature value
# (a dataframe is accepted as well).
query_instance = dict(
    age=29,
    workclass='Private',
    education='HS-grad',
    marital_status='Married',
    occupation='Blue-Collar',
    race='White',
    gender='Female',
    hours_per_week=38,
)

In [8]:
# Generate counterfactuals for the query instance.
# The recorded run of this cell logged a loss every 50 steps and took about
# 40 minutes, so expect a long wait when re-running with these settings.
dice_exp = exp.generate_counterfactuals(
    query_instance,
    total_CFs=100,
    # Desired outcome of the counterfactuals
    desired_class="opposite",
    desired_range=[0.0, 1.0],
    # Weights of the proximity and diversity terms
    proximity_weight=1.5,
    diversity_weight=0.5,
    # Loss definitions
    yloss_type="difference_loss",
    diversity_loss_type="avg_dist",
    # Iteration cap and progress logging
    max_iter=500,
    verbose=True,
)

step 1,  loss=489.76
step 51,  loss=0.145155
step 101,  loss=-0.434511
step 151,  loss=-0.511164
step 201,  loss=-0.566507
step 251,  loss=-0.60791
step 301,  loss=-0.640718
step 351,  loss=-0.664712
step 401,  loss=-0.683668
step 451,  loss=-0.696736
Diverse Counterfactuals found! total time taken: 40 min 07 sec


In [10]:
# Visualize the query instance and the generated counterfactuals as dataframes.
# With show_only_changes=True, the recorded output shows "-" for feature values
# that are unchanged from the query instance.
dice_exp.visualize_as_dataframe(show_only_changes=True)

Query instance (original outcome: 0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29.0,Private,HS-grad,Married,Blue-Collar,White,Female,38.0,0.293



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,32.0,-,Doctorate,-,White-Collar,-,-,-,0.9127997756004333
1,36.0,-,Doctorate,-,White-Collar,-,-,-,0.941864013671875
2,38.0,-,Doctorate,-,White-Collar,-,-,36.0,0.9367575645446777
3,37.0,-,Doctorate,-,White-Collar,-,-,39.0,0.9444723725318909
4,37.0,-,Doctorate,-,White-Collar,-,-,-,0.9419694542884827
...,...,...,...,...,...,...,...,...,...
95,37.0,-,Doctorate,-,White-Collar,-,-,-,0.9419694542884827
96,33.0,-,Doctorate,-,White-Collar,-,-,40.0,0.9282369017601013
97,33.0,-,Doctorate,-,White-Collar,-,-,41.0,0.9315187931060791
98,33.0,-,Doctorate,-,White-Collar,-,-,37.0,0.917503297328949


In [3]:
import tensorflow as tf

# Two scalar variables to optimise, both starting at 1.0.
x = tf.Variable(1.0)
y = tf.Variable(1.0)
# Plain SGD. `learning_rate` is the supported argument name; `lr` is only a
# deprecated alias and is rejected by newer Keras optimizers.
opt = tf.keras.optimizers.SGD(learning_rate=0.1)

@tf.function
def step():
    """Run one SGD step on L = (x**2 + 3*x) + y**2 and print the progress.

    Prints the gradients of L with respect to [x, y], applies them with `opt`,
    then prints L together with x and y. In the recorded output, L is the value
    computed before the update while x and y are the updated values.
    """
    # The whole forward pass is computed inside the tape so that both terms
    # are recorded and contribute to the gradient.
    with tf.GradientTape() as tape:
        x_term = x ** 2 + 3 * x  # Dummy objective in x
        y_term = y ** 2
        loss = x_term + y_term
    grad = tape.gradient(loss, [x, y])
    tf.print(grad)
    opt.apply_gradients(zip(grad, [x, y]))
    tf.print("L=", loss, "x=", x, "y=", y)

step()
# Recorded output of this first call (x and y both start at 1.0):
#   [5, 2]
#   L= 5 x= 0.5 y= 0.8
# NOTE(review): an earlier note here read "[0, 2] ??" / "L= 2 x= 1 y= 0.8",
# presumably from a variant where the x term was computed outside the tape — confirm.

[5, 2]
L= 5 x= 0.5 y= 0.8
