# Evaluation of DiCE

## Preparation

In [5]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import LinearSVC
import dice_ml
from dice_ml.data_interfaces.public_data_interface import PublicData
from dice_ml.utils import helpers # helper functions
import ce_helpers

In [6]:
# Upper bound on the number of factual instances evaluated per dataset.
# Each evaluation loop below uses fewer when the test split contains fewer
# rows predicted as class 0.
num_iterations = 30

# CREDIT

## data & model prep

The (preprocessed) data can be downloaded from: https://datahub.io/machine-learning/credit-g

In [7]:
### reading in data
dataset_name = 'german_credit'
target = 'class'
df = pd.read_csv('./data/credit-g_csv.csv')

# Recode the target labels to 0 ("bad") and 1 ("good").
recode = {"class": {"bad": 0, "good": 1}}
# Map and cast explicitly instead of relying on DataFrame.replace to downcast
# the object column to integers: that implicit downcast is deprecated in
# pandas 2.2, and an object-dtype target is not a valid label array for the
# classifier. Any label missing from the mapping becomes NaN, so astype(int)
# fails loudly instead of letting it through.
df[target] = df[target].map(recode[target]).astype(int)

X = df.drop(target, axis=1)
y = df[target]

# Stratified 80/20 split with a fixed seed so the evaluated instances are reproducible.
x_train_dice, x_test_dice, y_train_dice, y_test_dice = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y)

# DiCE's data interface takes features and outcome in a single frame.
train_dataset = pd.merge(pd.DataFrame(x_train_dice),pd.DataFrame(y_train_dice), left_index=True, right_index=True)

# Continuous features; every remaining feature column is treated as categorical.
numerical = ['duration', 'credit_amount', 'installment_commitment', 'age', 
             'residence_since', 'existing_credits', 'num_dependents']
categorical = x_train_dice.columns.difference(numerical)

In [8]:
# NOTE(review): `categorical_transformer` is not referenced by `transformations`
# below, which instantiates its own OneHotEncoder; it is kept only so the name
# stays defined.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# One-hot encode the categorical columns and min-max scale the numerical ones.
transformations = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', MinMaxScaler(), numerical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# max_iter is passed as an int: scikit-learn documents it as an integer and
# rejects the float 1e5 in versions that validate estimator parameters.
clf_dice = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', LinearSVC(random_state=0, max_iter=100000, dual=False, penalty='l2', C=0.1))])

In [9]:
# Fit the preprocessing + LinearSVC pipeline on the training split.
# `fit` returns the pipeline itself, so `model` and `clf_dice` are the same object.
model = clf_dice.fit(x_train_dice, y_train_dice)

In [10]:
# Keep only the test rows the model assigns to class 0; the evaluation loop
# draws its factual instances from this subset.
y_pred = model.predict(x_test_dice)
is_class_0 = (y_pred == 0)
y_pred_0 = np.where(is_class_0)  # tuple holding one array of row positions
class_0_positions = y_pred_0[0]
X_test_0 = x_test_dice.iloc[class_0_positions].copy()

## run DiCE

In [11]:
# --- Actionability constraints ---
# Features that must stay fixed (excluded from features_to_vary in the loop).
I = ['personal_status', 'foreign_worker', 'purpose']

# Employment levels in the order the loop relies on: only the instance's
# current level and the levels listed after it are permitted.
employment = ['unemployed', '<1', '1<=X<4','4<=X<7', '>=7']
# The permitted ranges depend on the factual instance u, so they are built inside the loop.

# --- DiCE wiring: data interface -> model wrapper -> explainer ---
d = dice_ml.Data(
    dataframe=train_dataset,
    continuous_features=numerical,
    outcome_name=target,
)
m = dice_ml.Model(model=model, backend="sklearn")
exp = dice_ml.Dice(d, m, method="random")

In [12]:
## loop over several factual instances
# Per-instance evaluation frames are collected in a list and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
perf_frames = []

# Summary table (mean / std over all evaluated instances).
# NOTE(review): `eval` shadows the Python builtin; the name is kept because it
# is this cell's displayed result.
eval = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# Evaluate at most num_iterations instances, fewer if fewer are available.
iters = min(num_iterations, X_test_0.shape[0])

for u_index in range(iters):
    print('u_index: %d' % u_index)
    # Factual instance as a one-row DataFrame.
    u = pd.DataFrame(X_test_0.iloc[u_index,:]).T

    # Actionability
    # NOTE(review): the lower bounds are one-element Series rather than scalars;
    # presumably ce_helpers.get_features_range unwraps them -- confirm.
    permitted_ranges = {
      'age': [u['age'], df['age'].max()], # age should only increase
      'residence_since': [u['residence_since'], df['residence_since'].max()],
      # only the current employment level or a later one in `employment`
      'employment': employment[employment.index(u['employment'].item()):]
      }

    # Non-numerical feature columns (everything except numerical features and the target).
    F_b = df.columns.difference(numerical + [target])
    # Only the first element of the helper's return value is used.
    feature_ranges = ce_helpers.get_features_range(df, numerical, F_b, permitted_ranges)[0]

    # Every feature with a range may vary unless it is listed as immutable in I.
    features = list(feature_ranges.keys())
    features_to_vary = [ele for ele in features if ele not in I]

    # generate counterfactuals
    e1 = exp.generate_counterfactuals(u, total_CFs=3, desired_class="opposite", random_seed=0,
                                    features_to_vary = features_to_vary, permitted_range = feature_ranges)

    # extract counterfactuals as data frame, dropping the last column
    # (presumably the outcome column -- confirm against the DiCE output)
    CFs = e1.cf_examples_list[0].final_cfs_df.iloc[:,:-1]
    CFs = CFs.reset_index(drop=True)

    number_of_solutions = len(CFs.index)

    # Stack the factual instance on top of its counterfactuals and label the rows.
    CEs = pd.concat([u, CFs])
    ix_names = ['original']+['sol'+str(i) for i in range(number_of_solutions)]
    CEs = CEs.set_index(pd.Index(ix_names))

    # NOTE(review): assumes ce_helpers.evaluation returns a DataFrame -- confirm.
    CE_perf1 = ce_helpers.evaluation(model, CEs, numerical, categorical, rounding = True)
    perf_frames.append(CE_perf1)

# Same rows as repeated appends would have produced, built in one pass.
CE_perf = pd.concat(perf_frames) if perf_frames else pd.DataFrame()

eval.loc['mean'] = np.array(CE_perf.mean())
eval.loc['std'] = np.array(CE_perf.std())
eval.round(2)

u_index: 0


100%|██████████| 1/1 [00:00<00:00, 11.80it/s]


u_index: 1


100%|██████████| 1/1 [00:00<00:00, 13.01it/s]


u_index: 2


100%|██████████| 1/1 [00:00<00:00, 13.27it/s]


u_index: 3


100%|██████████| 1/1 [00:00<00:00, 13.27it/s]


u_index: 4


100%|██████████| 1/1 [00:00<00:00, 13.18it/s]


u_index: 5


100%|██████████| 1/1 [00:00<00:00, 13.94it/s]


u_index: 6


100%|██████████| 1/1 [00:00<00:00, 13.40it/s]


u_index: 7


100%|██████████| 1/1 [00:00<00:00, 13.66it/s]


u_index: 8


100%|██████████| 1/1 [00:00<00:00, 12.86it/s]


u_index: 9


100%|██████████| 1/1 [00:00<00:00, 13.01it/s]


u_index: 10


100%|██████████| 1/1 [00:00<00:00, 13.25it/s]


u_index: 11


100%|██████████| 1/1 [00:00<00:00, 13.72it/s]


u_index: 12


100%|██████████| 1/1 [00:00<00:00, 13.72it/s]


u_index: 13


100%|██████████| 1/1 [00:00<00:00, 13.22it/s]


u_index: 14


100%|██████████| 1/1 [00:00<00:00, 13.60it/s]


u_index: 15


100%|██████████| 1/1 [00:00<00:00, 13.37it/s]


u_index: 16


100%|██████████| 1/1 [00:00<00:00, 13.06it/s]


u_index: 17


100%|██████████| 1/1 [00:00<00:00, 13.17it/s]


u_index: 18


100%|██████████| 1/1 [00:00<00:00, 14.13it/s]


u_index: 19


100%|██████████| 1/1 [00:00<00:00, 13.66it/s]


u_index: 20


100%|██████████| 1/1 [00:00<00:00, 13.57it/s]


u_index: 21


100%|██████████| 1/1 [00:00<00:00, 12.04it/s]


u_index: 22


100%|██████████| 1/1 [00:00<00:00, 13.07it/s]


u_index: 23


100%|██████████| 1/1 [00:00<00:00, 13.47it/s]


u_index: 24


100%|██████████| 1/1 [00:00<00:00, 12.94it/s]


u_index: 25


100%|██████████| 1/1 [00:00<00:00, 13.42it/s]


u_index: 26


100%|██████████| 1/1 [00:00<00:00, 13.07it/s]


u_index: 27


100%|██████████| 1/1 [00:00<00:00, 13.46it/s]


u_index: 28


100%|██████████| 1/1 [00:00<00:00, 13.26it/s]


u_index: 29


100%|██████████| 1/1 [00:00<00:00, 12.73it/s]


Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
mean,1.0,0.9,-240.6,0.91,0.16,479.67,0.15
std,0.0,0.03,775.33,0.01,0.07,1551.1,0.03


# ADULT

## data & model prep

In [9]:
### reading in data
dataset_name = 'adult_income'
target = 'income'

# Collapse several education labels into the coarser 'School' and 'Bachelors' groups.
recode = {"education": {"HS-grad": 'School', 'Prof-school': 'School', "Assoc": 'Bachelors', "Some-college": 'Bachelors'}}
df = helpers.load_adult_income_dataset(only_train = False).replace(recode)

# Separate the features from the outcome column.
y = df[target]
X = df.drop(target, axis=1)

# Stratified 80/20 split with a fixed seed.
x_train_dice, x_test_dice, y_train_dice, y_test_dice = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Training features and outcome re-joined on the row index for DiCE's data interface.
train_dataset = pd.merge(x_train_dice, y_train_dice, left_index=True, right_index=True)

# Continuous features; all other feature columns count as categorical.
numerical = ["age", "hours_per_week"]
categorical = x_train_dice.columns.difference(numerical)

In [10]:
# NOTE(review): `categorical_transformer` is not referenced by `transformations`
# below, which instantiates its own OneHotEncoder; it is kept only so the name
# stays defined.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# One-hot encode the categorical columns and min-max scale the numerical ones.
transformations = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', MinMaxScaler(), numerical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# max_iter is passed as an int: scikit-learn documents it as an integer and
# rejects the float 1e5 in versions that validate estimator parameters.
clf_dice = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', LinearSVC(random_state=0, max_iter=100000, dual=False, penalty='l2', C=100))])

In [11]:
# Fit the preprocessing + LinearSVC pipeline on the training split.
# `fit` returns the pipeline itself, so `model` and `clf_dice` are the same object.
model = clf_dice.fit(x_train_dice, y_train_dice)

In [12]:
# Keep only the test rows the model assigns to class 0; the evaluation loop
# draws its factual instances from this subset.
y_pred = model.predict(x_test_dice)
is_class_0 = (y_pred == 0)
y_pred_0 = np.where(is_class_0)  # tuple holding one array of row positions
class_0_positions = y_pred_0[0]
X_test_0 = x_test_dice.iloc[class_0_positions].copy()

## run DiCE

In [13]:
# Step 1: dice_ml.Data
# Uses the `numerical` and `target` variables defined with the data instead of
# repeating their values as literals, so the DiCE data interface cannot drift
# from the preprocessing pipeline (and matches the other dataset sections).
d = dice_ml.Data(dataframe=train_dataset, continuous_features=numerical, outcome_name=target)

# Step 2: dice_ml.Model
m = dice_ml.Model(model=model, backend="sklearn")

# Step 3: dice_ml.Dice
exp = dice_ml.Dice(d, m, method="random")

# Actionability 
# immutable features
I = ['gender', 'marital_status','race']
# Education levels in the order the loop relies on: only the instance's current
# level and the levels listed after it are permitted.
education = ['School', 'Bachelors', 'Masters', 'Doctorate']
# permitted ranges depend on u, and hence have to be inside the loop

In [14]:
## loop over several factual instances
# Per-instance evaluation frames are collected in a list and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
perf_frames = []

# Summary table (mean / std over all evaluated instances).
# NOTE(review): `eval` shadows the Python builtin; the name is kept because it
# is this cell's displayed result.
eval = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# Evaluate at most num_iterations instances, fewer if fewer are available.
iters = min(num_iterations, X_test_0.shape[0])

for u_index in range(iters):
    print('u_index: %d' % u_index)
    # Factual instance as a one-row DataFrame.
    u = pd.DataFrame(X_test_0.iloc[u_index,:]).T

    # Actionability
    # NOTE(review): the lower bound is a one-element Series rather than a scalar;
    # presumably ce_helpers.get_features_range unwraps it -- confirm.
    permitted_ranges = {
      'age': [u['age'], df['age'].max()], # age should only increase
      # only the current education level or a later one in `education`
      'education': education[education.index(u['education'].item()):]
      }

    # Non-numerical feature columns (everything except numerical features and the target).
    F_b = df.columns.difference(numerical + [target])
    # Only the first element of the helper's return value is used.
    feature_ranges = ce_helpers.get_features_range(df, numerical, F_b, permitted_ranges)[0]

    # Every feature with a range may vary unless it is listed as immutable in I.
    features = list(feature_ranges.keys())
    features_to_vary = [ele for ele in features if ele not in I]

    # generate counterfactuals
    e1 = exp.generate_counterfactuals(u, total_CFs=3, desired_class="opposite", random_seed=0,
                                    features_to_vary = features_to_vary, permitted_range = feature_ranges)

    # extract counterfactuals as data frame, dropping the last column
    # (presumably the outcome column -- confirm against the DiCE output)
    CFs = e1.cf_examples_list[0].final_cfs_df.iloc[:,:-1]
    CFs = CFs.reset_index(drop=True)

    number_of_solutions = len(CFs.index)

    # Stack the factual instance on top of its counterfactuals and label the rows.
    CEs = pd.concat([u, CFs])
    ix_names = ['original']+['sol'+str(i) for i in range(number_of_solutions)]
    CEs = CEs.set_index(pd.Index(ix_names))

    # NOTE(review): assumes ce_helpers.evaluation returns a DataFrame -- confirm.
    CE_perf1 = ce_helpers.evaluation(model, CEs, numerical, categorical, rounding = True)
    perf_frames.append(CE_perf1)

# Same rows as repeated appends would have produced, built in one pass.
CE_perf = pd.concat(perf_frames) if perf_frames else pd.DataFrame()

eval.loc['mean'] = np.array(CE_perf.mean())
eval.loc['std'] = np.array(CE_perf.std())
eval.round(2)

u_index: 0


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.92it/s]


u_index: 1


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.31it/s]

u_index: 2



100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.18it/s]


u_index: 3


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.44it/s]


u_index: 4


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.70it/s]


u_index: 5


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.67it/s]


u_index: 6


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.34it/s]


u_index: 7


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.24it/s]


u_index: 8


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.19it/s]


u_index: 9


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.86it/s]


u_index: 10


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.78it/s]


u_index: 11


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.34it/s]


u_index: 12


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.10it/s]


u_index: 13


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.57it/s]


u_index: 14


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.34it/s]


u_index: 15


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.34it/s]


u_index: 16


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.04it/s]


u_index: 17


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 17.11it/s]


u_index: 18


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.24it/s]


u_index: 19


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.72it/s]


u_index: 20


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.57it/s]


u_index: 21


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.26it/s]


u_index: 22


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.25it/s]


u_index: 23


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.67it/s]


u_index: 24


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.47it/s]


u_index: 25


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.24it/s]


u_index: 26


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.11it/s]


u_index: 27


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.31it/s]


u_index: 28


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.16it/s]


u_index: 29


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.46it/s]


Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
mean,1.0,0.84,-54.71,0.72,0.16,36.6,0.32
std,0.0,0.07,25.03,0.07,0.1,17.75,0.09


# DIABETES

## data & model prep

The (preprocessed) data can be downloaded from: https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

In [15]:
### reading in data
dataset_name = 'diabetes'
target = 'Outcome'
df = pd.read_csv('./data/diabetes.csv')

# Separate the features from the outcome column.
y = df[target]
X = df.drop(target, axis=1)

# Stratified 80/20 split with a fixed seed.
x_train_dice, x_test_dice, y_train_dice, y_test_dice = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Training features and outcome re-joined on the row index for DiCE's data interface.
train_dataset = pd.merge(x_train_dice, y_train_dice, left_index=True, right_index=True)

# Every feature is treated as numerical here, which leaves `categorical` empty.
numerical = list(X.columns)
categorical = x_train_dice.columns.difference(numerical)

In [16]:
# NOTE(review): `categorical_transformer` is not referenced by `transformations`
# below, which instantiates its own OneHotEncoder; it is kept only so the name
# stays defined.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# One-hot encode the categorical columns and min-max scale the numerical ones.
transformations = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', MinMaxScaler(), numerical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# max_iter is passed as an int: scikit-learn documents it as an integer and
# rejects the float 1e5 in versions that validate estimator parameters.
clf_dice = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', LinearSVC(random_state=0, max_iter=100000, dual=False, penalty='l2', C=100))])

In [17]:
# Fit the preprocessing + LinearSVC pipeline on the training split.
# `fit` returns the pipeline itself, so `model` and `clf_dice` are the same object.
model = clf_dice.fit(x_train_dice, y_train_dice)

In [18]:
# Keep only the test rows the model assigns to class 0; the evaluation loop
# draws its factual instances from this subset.
y_pred = model.predict(x_test_dice)
is_class_0 = (y_pred == 0)
y_pred_0 = np.where(is_class_0)  # tuple holding one array of row positions
class_0_positions = y_pred_0[0]
X_test_0 = x_test_dice.iloc[class_0_positions].copy()

## run DiCE

In [19]:
# --- Actionability constraints ---
# No feature is declared immutable for this dataset.
I = []

# --- DiCE wiring: data interface -> model wrapper -> explainer ---
d = dice_ml.Data(
    dataframe=train_dataset,
    continuous_features=numerical,
    outcome_name=target,
)
m = dice_ml.Model(model=model, backend="sklearn")
exp = dice_ml.Dice(d, m, method="random")

In [20]:
## loop over several factual instances
# Per-instance evaluation frames are collected in a list and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
perf_frames = []

# Summary table (mean / std over all evaluated instances).
# NOTE(review): `eval` shadows the Python builtin; the name is kept because it
# is this cell's displayed result.
eval = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# Evaluate at most num_iterations instances, fewer if fewer are available.
iters = min(num_iterations, X_test_0.shape[0])

for u_index in range(iters):
    print('u_index: %d' % u_index)
    # Factual instance as a one-row DataFrame.
    u = pd.DataFrame(X_test_0.iloc[u_index,:]).T

    # Actionability
    # NOTE(review): the lower bounds are one-element Series rather than scalars;
    # presumably ce_helpers.get_features_range unwraps them -- confirm.
    permitted_ranges = {
      'Age': [u['Age'], df['Age'].max()], # age should only increase
      'Pregnancies': [u['Pregnancies'], df['Pregnancies'].max()]
      }

    # Non-numerical feature columns (everything except numerical features and the target).
    F_b = df.columns.difference(numerical + [target])
    # Only the first element of the helper's return value is used.
    feature_ranges = ce_helpers.get_features_range(df, numerical, F_b, permitted_ranges)[0]

    # Every feature with a range may vary unless it is listed as immutable in I.
    features = list(feature_ranges.keys())
    features_to_vary = [ele for ele in features if ele not in I]

    # generate counterfactuals
    e1 = exp.generate_counterfactuals(u, total_CFs=3, desired_class="opposite", random_seed=0,
                                    features_to_vary = features_to_vary, permitted_range = feature_ranges)

    # extract counterfactuals as data frame, dropping the last column
    # (presumably the outcome column -- confirm against the DiCE output)
    CFs = e1.cf_examples_list[0].final_cfs_df.iloc[:,:-1]
    CFs = CFs.reset_index(drop=True)

    number_of_solutions = len(CFs.index)

    # Stack the factual instance on top of its counterfactuals and label the rows.
    CEs = pd.concat([u, CFs])
    ix_names = ['original']+['sol'+str(i) for i in range(number_of_solutions)]
    CEs = CEs.set_index(pd.Index(ix_names))

    # NOTE(review): assumes ce_helpers.evaluation returns a DataFrame -- confirm.
    CE_perf1 = ce_helpers.evaluation(model, CEs, numerical, categorical, rounding = True)
    perf_frames.append(CE_perf1)

# Same rows as repeated appends would have produced, built in one pass.
CE_perf = pd.concat(perf_frames) if perf_frames else pd.DataFrame()

eval.loc['mean'] = np.array(CE_perf.mean())
eval.loc['std'] = np.array(CE_perf.std())
eval.round(2)

u_index: 0


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.79it/s]


u_index: 1


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.09it/s]


u_index: 2


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.18it/s]


u_index: 3


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.07it/s]


u_index: 4


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 17.69it/s]


u_index: 5


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.01it/s]


u_index: 6


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.40it/s]


u_index: 7


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.40it/s]


u_index: 8


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.56it/s]


u_index: 9


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.51it/s]


u_index: 10


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.92it/s]


u_index: 11


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.20it/s]


u_index: 12


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.51it/s]


u_index: 13


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.49it/s]


u_index: 14


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.72it/s]


u_index: 15


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.55it/s]


u_index: 16


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.84it/s]


u_index: 17


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 17.35it/s]


u_index: 18


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 15.36it/s]


u_index: 19


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.68it/s]


u_index: 20


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 17.43it/s]


u_index: 21


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.72it/s]


u_index: 22


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.19it/s]


u_index: 23


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.94it/s]


u_index: 24


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.50it/s]


u_index: 25


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 16.26it/s]


u_index: 26


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.47it/s]


u_index: 27


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.66it/s]


u_index: 28


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  7.77it/s]


u_index: 29


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 13.27it/s]


Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
mean,1.0,,-64.12,0.78,,93.6,0.36
std,0.0,,40.12,0.03,,68.42,0.06


# HEART

## data & model prep

The (preprocessed) data can be downloaded from: https://www.kaggle.com/datasets/shubamsumbria/statlog-heart-data-set

In [21]:
### reading in data
dataset_name = 'heart'
target = 'presence'

# Per-column value mapping: the target codes 1/2 become 0/1, and the coded
# categorical columns get readable string labels.
recode = {
    "presence": {1: 0, 2:1},
    'cp': {1:'typical angina', 2:'atypical angina', 3:'nonanginal pain', 4:'asymptomatic'},
    'sex': {0:'female', 1:'male'},
    'fbs': {0:'false',1:'true'},
    'restecg': {0:'normal', 1:'having ST-T wave abnormality',2:'left ventricular hypertrophy'},
    'exang': {0:'no', 1:'yes'},
    'slope': {1:'upsloping', 2:'flat', 3:'downsloping'},
    'thal': {3:'normal', 6:'fixed defect', 7:'reversible defect'},
    'ca': {0:'0',1:'1', 2:'2',3:'3'},
}
df = pd.read_csv('./data/statlog.csv').replace(recode)

# Separate the features from the outcome column.
y = df[target]
X = df.drop(target, axis=1)

# Stratified 80/20 split with a fixed seed.
x_train_dice, x_test_dice, y_train_dice, y_test_dice = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Training features and outcome re-joined on the row index for DiCE's data interface.
train_dataset = pd.merge(x_train_dice, y_train_dice, left_index=True, right_index=True)

# Continuous features; all other feature columns count as categorical.
numerical = ['age','trestbps','chol', 'thalach', 'oldpeak']
categorical = x_train_dice.columns.difference(numerical)

In [22]:
# NOTE(review): `categorical_transformer` is not referenced by `transformations`
# below, which instantiates its own OneHotEncoder; it is kept only so the name
# stays defined.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# One-hot encode the categorical columns and min-max scale the numerical ones.
transformations = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', MinMaxScaler(), numerical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# max_iter is passed as an int: scikit-learn documents it as an integer and
# rejects the float 1e5 in versions that validate estimator parameters.
clf_dice = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', LinearSVC(random_state=0, max_iter=100000, dual=False, penalty='l2', C=0.1))])

In [23]:
# Fit the preprocessing + LinearSVC pipeline on the training split.
# `fit` returns the pipeline itself, so `model` and `clf_dice` are the same object.
model = clf_dice.fit(x_train_dice, y_train_dice)

In [24]:
# Keep only the test rows the model assigns to class 0; the evaluation loop
# draws its factual instances from this subset.
y_pred = model.predict(x_test_dice)
is_class_0 = (y_pred == 0)
y_pred_0 = np.where(is_class_0)  # tuple holding one array of row positions
class_0_positions = y_pred_0[0]
X_test_0 = x_test_dice.iloc[class_0_positions].copy()

## run DiCE

In [25]:
# --- Actionability constraints ---
# Features that must stay fixed (excluded from features_to_vary in the loop).
I = ['sex']

# --- DiCE wiring: data interface -> model wrapper -> explainer ---
d = dice_ml.Data(
    dataframe=train_dataset,
    continuous_features=numerical,
    outcome_name=target,
)
m = dice_ml.Model(model=model, backend="sklearn")
exp = dice_ml.Dice(d, m, method="random")

In [26]:
## loop over several factual instances
# Per-instance evaluation frames are collected in a list and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
perf_frames = []

# Summary table (mean / std over all evaluated instances).
# NOTE(review): `eval` shadows the Python builtin; the name is kept because it
# is this cell's displayed result.
eval = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# Evaluate at most num_iterations instances, fewer if fewer are available.
iters = min(num_iterations, X_test_0.shape[0])

for u_index in range(iters):
    print('u_index: %d' % u_index)
    # Factual instance as a one-row DataFrame.
    u = pd.DataFrame(X_test_0.iloc[u_index,:]).T

    # Actionability
    # NOTE(review): the lower bound is a one-element Series rather than a scalar;
    # presumably ce_helpers.get_features_range unwraps it -- confirm.
    permitted_ranges = {
      'age': [u['age'], df['age'].max()] # age should only increase
      }

    # Non-numerical feature columns (everything except numerical features and the target).
    F_b = df.columns.difference(numerical + [target])
    # Only the first element of the helper's return value is used.
    feature_ranges = ce_helpers.get_features_range(df, numerical, F_b, permitted_ranges)[0]

    # Every feature with a range may vary unless it is listed as immutable in I.
    features = list(feature_ranges.keys())
    features_to_vary = [ele for ele in features if ele not in I]

    # generate counterfactuals
    e1 = exp.generate_counterfactuals(u, total_CFs=3, desired_class="opposite", random_seed=0,
                                    features_to_vary = features_to_vary, permitted_range = feature_ranges)

    # extract counterfactuals as data frame, dropping the last column
    # (presumably the outcome column -- confirm against the DiCE output)
    CFs = e1.cf_examples_list[0].final_cfs_df.iloc[:,:-1]
    CFs = CFs.reset_index(drop=True)

    number_of_solutions = len(CFs.index)

    # Stack the factual instance on top of its counterfactuals and label the rows.
    CEs = pd.concat([u, CFs])
    ix_names = ['original']+['sol'+str(i) for i in range(number_of_solutions)]
    CEs = CEs.set_index(pd.Index(ix_names))

    # NOTE(review): assumes ce_helpers.evaluation returns a DataFrame -- confirm.
    CE_perf1 = ce_helpers.evaluation(model, CEs, numerical, categorical, rounding = True)
    perf_frames.append(CE_perf1)

# Same rows as repeated appends would have produced, built in one pass.
CE_perf = pd.concat(perf_frames) if perf_frames else pd.DataFrame()

eval.loc['mean'] = np.array(CE_perf.mean())
eval.loc['std'] = np.array(CE_perf.std())
eval.round(2)

u_index: 0


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  5.02it/s]


u_index: 1


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.52it/s]


u_index: 2


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  8.70it/s]


u_index: 3


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.62it/s]


u_index: 4


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.68it/s]


u_index: 5


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.77it/s]


u_index: 6


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.99it/s]


u_index: 7


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.37it/s]


u_index: 8


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.03it/s]


u_index: 9


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.82it/s]


u_index: 10


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.56it/s]


u_index: 11


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  9.73it/s]


u_index: 12


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  9.02it/s]


u_index: 13


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.18it/s]


u_index: 14


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  9.09it/s]


u_index: 15


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.99it/s]


u_index: 16


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.19it/s]


u_index: 17


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.45it/s]


u_index: 18


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.49it/s]


u_index: 19


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.24it/s]


u_index: 20


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.46it/s]


u_index: 21


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.27it/s]


u_index: 22


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.58it/s]


u_index: 23


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.80it/s]


u_index: 24


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.50it/s]


u_index: 25


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.87it/s]


u_index: 26


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.33it/s]


u_index: 27


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  6.63it/s]


u_index: 28


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 11.97it/s]


u_index: 29


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.57it/s]


Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
mean,1.0,0.86,-33.81,0.84,0.19,59.1,0.24
std,0.0,0.09,42.65,0.04,0.12,73.38,0.05
