# DiCE Case Study

---

In [267]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import dice_ml
from dice_ml.data_interfaces.public_data_interface import PublicData
from dice_ml.utils import helpers # helper functions
import ce_helpers
import os

## Preparation

### Data prep

The (preprocessed) German Credit data can be downloaded from: https://datahub.io/machine-learning/credit-g. The notebook reads it from `./data/credit-g_csv.csv`.

In [268]:
# Experiment configuration.
# `alg` picks the classifier: one of 'svm', 'linear', 'mlp', 'rf'.
# `method` is the generation method handed to dice_ml.Dice.
alg, method = 'mlp', 'kdtree'

In [269]:
### Load the data, recode the outcome and create the train/test split
dataset_name = 'german_credit'
target = 'class'

# Outcome labels become integers: bad -> 0, good -> 1.
recode = {"class": {"bad": 0, "good": 1}}
df = pd.read_csv('./data/credit-g_csv.csv').replace(recode)

# Feature frame and outcome column.
X, y = df.drop(target, axis=1), df[target]

# 80/20 split, stratified on the outcome, with a fixed seed.
x_train_dice, x_test_dice, y_train_dice, y_test_dice = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Training features and outcome side by side, aligned on the row index.
train_dataset = pd.merge(x_train_dice, y_train_dice, left_index=True, right_index=True)

# Columns treated as numerical; every other feature column counts as categorical.
numerical = ['duration', 'credit_amount', 'installment_commitment', 'age',
             'residence_since', 'existing_credits', 'num_dependents']
categorical = x_train_dice.columns.difference(numerical)

In [270]:
# Preprocessing: one-hot encode the categorical columns and min-max scale the
# numerical ones.
transformations = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', MinMaxScaler(), numerical)])

# Candidate classifiers, keyed by the value of `alg`.
classifiers = {
    # max_iter is an int: recent scikit-learn releases reject the float 1e5.
    'svm': LinearSVC(random_state=0, max_iter=100000, dual=False, penalty='l2', C=0.1),
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was removed in scikit-learn 1.3.
    'rf': RandomForestClassifier(random_state=0, max_depth=3, max_features='sqrt',
                                 n_estimators=25),
    'mlp': MLPClassifier(random_state=0, max_iter=10000, solver='lbfgs'),
    'linear': LogisticRegression(random_state=0, max_iter=10000, solver='saga', C=0.401),
}

# Fail here with a clear message instead of a NameError on `clf_dice` later on.
if alg not in classifiers:
    raise ValueError("unknown alg %r; expected one of %s" % (alg, sorted(classifiers)))

# Append the classifier to the preprocessing step: a full prediction pipeline.
clf_dice = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier', classifiers[alg])])

In [271]:
# Fit the preprocessing + classifier pipeline on the training split; later cells predict through `model`.
model = clf_dice.fit(x_train_dice, y_train_dice)

In [272]:
# Keep the test rows that the model assigns to class 0 (label "bad" after recoding);
# the factual instance is picked from these.
y_pred = model.predict(x_test_dice)
y_pred_0 = np.where(y_pred == 0)
class_0_positions = y_pred_0[0]
X_test_0 = x_test_dice.iloc[class_0_positions].copy()

### Factual instance

In [273]:
# Factual instance: the row at position `u_index` among the class-0 test
# predictions, kept as a one-row DataFrame.
u_index = 2
# NOTE(review): transposing a single-row Series probably leaves every column with
# object dtype; `X_test_0.iloc[[u_index]]` would keep the per-column dtypes.
# Confirm the downstream helpers accept that before switching.
u = pd.DataFrame(X_test_0.iloc[u_index,:]).T
# predict() returns an array; format its single element rather than the array itself.
print('predicted label: %d' % model.predict(u)[0])

predicted label: 0


### DiCE setup

In [274]:
# DiCE is assembled from three objects: data interface, model wrapper, explainer.

# 1) Data interface built from the training frame (features + outcome column).
d = dice_ml.Data(
    dataframe=train_dataset,
    continuous_features=numerical,
    outcome_name=target,
)

# 2) Wrapper around the fitted scikit-learn pipeline.
m = dice_ml.Model(model=model, backend="sklearn")

# 3) Explainer using the generation method selected in `method`.
exp = dice_ml.Dice(d, m, method=method)

---

## Part B: validity, proximity, sparsity

In [275]:
## Diversity
num_counterfactuals = 1

## Actionability

# immutable features (none in this part)
I = []

# actionability constraints (none in this part)
permitted_ranges = {}

# Ranges are read from the feature frame `X`, which is never reassigned in this
# notebook, so this cell gives the same result whenever it is (re-)run.
F_b = X.columns.difference(numerical + [target])
feature_ranges = ce_helpers.get_features_range(X, numerical, F_b, permitted_ranges)[0]

# Every feature that is not declared immutable may be changed by DiCE.
features = list(feature_ranges.keys())
features_to_vary = [name for name in features if name not in I]

In [276]:
# Generate counterfactuals for the factual instance `u`.
# A fixed seed is passed only when the 'random' method is selected.
cf_kwargs = dict(total_CFs=num_counterfactuals, desired_class="opposite",
                 features_to_vary=features_to_vary)
if method == 'random':
    cf_kwargs['random_seed'] = 0
e1 = exp.generate_counterfactuals(u, **cf_kwargs)

100%|██████████| 1/1 [00:00<00:00,  7.46it/s]


In [277]:
# Counterfactuals as a data frame; the last column of final_cfs_df is left out.
CFs = e1.cf_examples_list[0].final_cfs_df.iloc[:, :-1].reset_index(drop=True)
number_of_solutions = len(CFs)

# Stack the factual instance on top of its counterfactuals and label the rows
# 'original', 'sol0', 'sol1', ...
ix_names = ['original'] + [f'sol{i}' for i in range(number_of_solutions)]
CEs = pd.concat([u, CFs])
CEs.index = pd.Index(ix_names)

### Visualize DF

In [278]:
# Comparison table for Part B: the factual row followed by one row per counterfactual.
# Each counterfactual cell is produced by ce_helpers.ce_change (presumably it marks
# values that equal the factual instance; see ce_helpers for the exact rule).
orig = CEs.iloc[:1]
# Dedicated names keep the dataset frame `df` from being overwritten with
# counterfactual rows.
cf_rows = CEs.iloc[1:].copy()
cf_changes = pd.DataFrame()
for c in cf_rows.columns:
    cf_changes[c] = cf_rows.apply(lambda row: ce_helpers.ce_change(row, cf_changes, orig, c), axis=1)

df_1 = pd.concat([orig, cf_changes])
df_1

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
original,<0,24,existing paid,new car,1371.0,no known savings,1<=X<4,4,female div/dep/mar,none,4.0,real estate,25.0,none,rent,1.0,skilled,1,none,yes
sol0,-,-,critical/other existing credit,business,1382.0,100<=X<500,4<=X<7,-,male single,-,1.0,-,26.0,-,own,2.0,-,-,yes,


### Evaluation

In [279]:
# Score the Part B explanations; the resulting row is labelled 'Part B'.
part_b_perf = ce_helpers.evaluation(model, CEs, numerical, categorical, rounding=True)
CE_perf = part_b_perf.set_index(pd.Index(['Part B']))
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part B,1.0,0.38,-16.0,0.4,,,


---

## Part C: validity, proximity, sparsity, diversity

In [280]:
## Diversity
num_counterfactuals = 3

## Actionability

# immutable features (none in this part)
I = []

# actionability constraints (none in this part)
permitted_ranges = {}

# Ranges are read from the feature frame `X`, which is never reassigned in this
# notebook, so they always come from the full feature data regardless of what
# `df` currently holds.
F_b = X.columns.difference(numerical + [target])
feature_ranges = ce_helpers.get_features_range(X, numerical, F_b, permitted_ranges)[0]

# Every feature that is not declared immutable may be changed by DiCE.
features = list(feature_ranges.keys())
features_to_vary = [name for name in features if name not in I]

In [281]:
# Generate counterfactuals for the factual instance `u`.
# A fixed seed is passed only when the 'random' method is selected.
cf_kwargs = dict(total_CFs=num_counterfactuals, desired_class="opposite",
                 features_to_vary=features_to_vary)
if method == 'random':
    cf_kwargs['random_seed'] = 0
e1 = exp.generate_counterfactuals(u, **cf_kwargs)

100%|██████████| 1/1 [00:00<00:00,  2.67it/s]


In [282]:
# Counterfactuals as a data frame; the last column of final_cfs_df is left out.
CFs = e1.cf_examples_list[0].final_cfs_df.iloc[:, :-1].reset_index(drop=True)
number_of_solutions = len(CFs)

# Stack the factual instance on top of its counterfactuals and label the rows
# 'original', 'sol0', 'sol1', ...
ix_names = ['original'] + [f'sol{i}' for i in range(number_of_solutions)]
CEs = pd.concat([u, CFs])
CEs.index = pd.Index(ix_names)

### Visualize DF

In [283]:
# Comparison table for Part C: the factual row followed by one row per counterfactual.
# Each counterfactual cell is produced by ce_helpers.ce_change (presumably it marks
# values that equal the factual instance; see ce_helpers for the exact rule).
orig = CEs.iloc[:1]
# Dedicated names keep the dataset frame `df` from being overwritten with
# counterfactual rows.
cf_rows = CEs.iloc[1:].copy()
cf_changes = pd.DataFrame()
for c in cf_rows.columns:
    cf_changes[c] = cf_rows.apply(lambda row: ce_helpers.ce_change(row, cf_changes, orig, c), axis=1)

df_2 = pd.concat([orig, cf_changes])
df_2

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
original,<0,24.0,existing paid,new car,1371.0,no known savings,1<=X<4,4,female div/dep/mar,none,4,real estate,25.0,none,rent,1,skilled,1,none,yes
sol0,no checking,9.0,-,furniture/equipment,1388.0,<100,-,-,-,-,2.0,-,26.0,-,-,-,-,-,-,
sol1,-,15.0,-,-,1403.0,<100,-,2.0,-,-,-,car,28.0,-,-,-,-,-,-,
sol2,no checking,15.0,-,radio/tv,1386.0,-,-,-,male mar/wid,-,2.0,-,40.0,-,-,-,-,-,yes,


### Evaluation

In [284]:
# Score the Part C explanations and place them under the Part B row.
# Only the 'Part B' row is taken from the existing table, so re-running this cell
# replaces the Part C row instead of appending another one (which would then fail
# when the two index labels are assigned).
part_c_perf = ce_helpers.evaluation(model, CEs, numerical, categorical, rounding=True)
part_c_perf = part_c_perf.set_index(pd.Index(['Part C']))
CE_perf = pd.concat([CE_perf.loc[['Part B']], part_c_perf])
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part B,1.0,0.38,-16.0,0.4,,,
Part C,1.0,0.69,-40.67,0.6,0.41,27.33,0.47


---

## Part D: validity, proximity, sparsity, diversity, actionability

In [285]:
## Diversity
num_counterfactuals = 3

## Actionability

# immutable features
# I = []
I = ['personal_status', 'foreign_worker', 'purpose']

# actionability constraints
employment = ['unemployed', '<1', '1<=X<4','4<=X<7', '>=7']

permitted_ranges = {}
permitted_ranges = {
  'age': [u['age'], df['age'].max()], # age should only increase
  'residence_since': [u['residence_since'], df['residence_since'].max()],
  'employment': employment[employment.index(u['employment'].item()):]
  }

F_b = df.columns.difference(numerical + [target])
feature_ranges = ce_helpers.get_features_range(df, numerical, F_b, permitted_ranges)[0]

features = list(feature_ranges.keys())
features_to_vary = [ele for ele in features if ele not in I]

In [286]:
# generate counterfactuals
if method == 'random':
    e1 = exp.generate_counterfactuals(u, total_CFs=num_counterfactuals, desired_class="opposite", random_seed=0,
                                    features_to_vary = features_to_vary)
else:
    e1 = exp.generate_counterfactuals(u, total_CFs=num_counterfactuals, desired_class="opposite",
                                features_to_vary = features_to_vary)

100%|██████████| 1/1 [00:00<00:00,  2.83it/s]


In [287]:
# Counterfactuals as a data frame; the last column of final_cfs_df is left out.
CFs = e1.cf_examples_list[0].final_cfs_df.iloc[:, :-1].reset_index(drop=True)
number_of_solutions = len(CFs)

# Stack the factual instance on top of its counterfactuals and label the rows
# 'original', 'sol0', 'sol1', ...
ix_names = ['original'] + [f'sol{i}' for i in range(number_of_solutions)]
CEs = pd.concat([u, CFs])
CEs.index = pd.Index(ix_names)

### Visualize DF

In [288]:
# Comparison table for Part D: the factual row followed by one row per counterfactual.
# Each counterfactual cell is produced by ce_helpers.ce_change (presumably it marks
# values that equal the factual instance; see ce_helpers for the exact rule).
orig = CEs.iloc[:1]
# Dedicated names keep the dataset frame `df` from being overwritten with
# counterfactual rows.
cf_rows = CEs.iloc[1:].copy()
cf_changes = pd.DataFrame()
for c in cf_rows.columns:
    cf_changes[c] = cf_rows.apply(lambda row: ce_helpers.ce_change(row, cf_changes, orig, c), axis=1)

df_3 = pd.concat([orig, cf_changes])
df_3

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
original,<0,24.0,existing paid,new car,1371.0,no known savings,1<=X<4,4,female div/dep/mar,none,4,real estate,25.0,none,rent,1,skilled,1,none,yes
sol0,-,15.0,-,-,1403.0,<100,-,2.0,-,-,-,car,28.0,-,-,-,-,-,-,
sol1,no checking,11.0,critical/other existing credit,-,1393.0,<100,<1,-,-,-,-,car,35.0,-,own,2.0,high qualif/self emp/mgmt,-,-,
sol2,no checking,10.0,-,-,1364.0,<100,-,2.0,-,-,-,car,64.0,-,own,-,-,-,yes,


### Evaluation

In [289]:
# Score the Part D explanations and place them under the Part B and Part C rows.
# Only those two rows are taken from the existing table, so re-running this cell
# replaces the Part D row instead of appending another one (which would then fail
# when the three index labels are assigned).
part_d_perf = ce_helpers.evaluation(model, CEs, numerical, categorical, rounding=True)
part_d_perf = part_d_perf.set_index(pd.Index(['Part D']))
CE_perf = pd.concat([CE_perf.loc[['Part B', 'Part C']], part_d_perf])
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part B,1.0,0.38,-16.0,0.4,,,
Part C,1.0,0.69,-40.67,0.6,0.41,27.33,0.47
Part D,1.0,0.56,-51.33,0.52,0.38,55.33,0.47


---

## Table

In [290]:
# Stack the Part B, C and D comparison tables, in that order, into one table.
part_tables = (df_1, df_2, df_3)
df_complete = pd.concat(part_tables)
df_complete

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
original,<0,24,existing paid,new car,1371.0,no known savings,1<=X<4,4,female div/dep/mar,none,4,real estate,25.0,none,rent,1,skilled,1,none,yes
sol0,-,-,critical/other existing credit,business,1382.0,100<=X<500,4<=X<7,-,male single,-,1.0,-,26.0,-,own,2.0,-,-,yes,
original,<0,24,existing paid,new car,1371.0,no known savings,1<=X<4,4,female div/dep/mar,none,4,real estate,25.0,none,rent,1,skilled,1,none,yes
sol0,no checking,9.0,-,furniture/equipment,1388.0,<100,-,-,-,-,2.0,-,26.0,-,-,-,-,-,-,
sol1,-,15.0,-,-,1403.0,<100,-,2.0,-,-,-,car,28.0,-,-,-,-,-,-,
sol2,no checking,15.0,-,radio/tv,1386.0,-,-,-,male mar/wid,-,2.0,-,40.0,-,-,-,-,-,yes,
original,<0,24,existing paid,new car,1371.0,no known savings,1<=X<4,4,female div/dep/mar,none,4,real estate,25.0,none,rent,1,skilled,1,none,yes
sol0,-,15.0,-,-,1403.0,<100,-,2.0,-,-,-,car,28.0,-,-,-,-,-,-,
sol1,no checking,11.0,critical/other existing credit,-,1393.0,<100,<1,-,-,-,-,car,35.0,-,own,2.0,high qualif/self emp/mgmt,-,-,
sol2,no checking,10.0,-,-,1364.0,<100,-,2.0,-,-,-,car,64.0,-,own,-,-,-,yes,


In [291]:
# Write the combined comparison table and the evaluation summary to
# results/<dataset_name>/<alg>/.
out_dir = 'results/%s/%s/' % (dataset_name, alg)
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence check and the creation, and re-running the cell is a no-op.
os.makedirs(out_dir, exist_ok=True)

# Both files share the same stem; only the suffix differs.
file_stem = '%sDiCE_%s_%s_%s' % (out_dir, method, alg, dataset_name)
df_complete.to_csv(file_stem + '.csv', index=True)
CE_perf.to_csv(file_stem + '_eval.csv', index=True)