In [220]:
import Datasets as DS
import warnings
import numpy as np
import ce_helpers
from itertools import chain
warnings.filterwarnings("ignore")
from importlib import reload
import dice_ml
import pandas as pd

In [221]:
import os

# Directory with the data files handed to DS.adult. The original machine-specific
# location stays the default; set the CE_OCL_DATA_DIR environment variable to run
# the notebook elsewhere without editing this cell.
# os.path.join(..., '') appends a trailing separator when it is missing, so the
# value always ends the same way the default path does.
wd = os.path.join(
    os.environ.get('CE_OCL_DATA_DIR') or '/Users/tabearober/Documents/Counterfactuals/CE-OCL/data/',
    '',
)

In [222]:
# Re-execute the already-imported dice_ml module. Being the last expression of
# the cell, the returned module object is echoed as the cell output.
# NOTE(review): presumably kept to pick up local edits to the installed package
# without restarting the kernel — confirm it is still needed.
reload(dice_ml)

<module 'dice_ml' from '/Users/tabearober/Documents/Counterfactuals/CE-OCL/venv/lib/python3.8/site-packages/dice_ml/__init__.py'>

In [223]:
# Load every object the notebook works with from the Adult dataset helper.
# How the results are used further down:
#   df          - source of X (all columns except the target) and of the label-1 rows X1
#   df_train    - frame given to dice_ml.Data
#   df_test     - not referenced by the cells visible in this notebook
#   d           - dict read with the keys 'target', 'numerical', 'categorical', 'immutable'
#   clf         - classifier used for clf.predict and wrapped by dice_ml.Model
#   df_factuals - rows from which the factual instances u are taken
#   model_master, scaler - passed through to ce_helpers.opt
#   encoder     - its categories_ attribute provides the one-hot column names
# NOTE(review): what DS.adult does internally (splits, training, scaling) is not
# visible here — check Datasets.adult before relying on more than the above.
df, df_train, df_test, d, clf, df_factuals, model_master, encoder, scaler = DS.adult(wd)

In [224]:
# Column roles as declared in the dataset description dict.
target = d['target']
numerical = d['numerical']
categorical = d['categorical']

# For each categorical feature, the names of its one-hot columns, built as
# "<feature>_<category>" from the encoder categories stored at the same position.
mapping = {
    feature: [feature + '_' + category for category in list(encoder.categories_[position])]
    for position, feature in enumerate(categorical)
}
mapping

{'marital-status': ['marital-status_Married', 'marital-status_Non-Married'],
 'native-country': ['native-country_Non-US', 'native-country_US'],
 'occupation': ['occupation_Managerial-Specialist', 'occupation_Other'],
 'race': ['race_Non-White', 'race_White'],
 'relationship': ['relationship_Husband', 'relationship_Non-Husband'],
 'sex': ['sex_Female', 'sex_Male'],
 'workclass': ['workclass_Non-Private', 'workclass_Private']}

---

## CE-OCL

### Optimization Prep

X: the complete dataset without the target column.

In [225]:
# Feature matrix: every column of df except the target.
X = df.drop(columns=target)

X1: the points in X whose label is 1. They are used to build the trust region.

In [226]:
# Positions of the rows labelled 1 (np.where returns a tuple; element 0 holds the row positions).
y_ix_1 = np.where(df[target] == 1)

# Matching rows of X, copied and renumbered from 0.
X1 = (
    X.iloc[y_ix_1[0], :]
    .copy()
    .reset_index(drop=True)
)

Specify parameters

In [227]:
# Settings for the CE-OCL runs; sp, mu, tr_region and enlarge_tr are passed
# positionally to ce_helpers.opt in the cells below.
# NOTE(review): the meanings noted here are inferred from the names and the solver
# log, not from ce_helpers itself — confirm against ce_helpers.opt.
sp = True  # presumably switches a sparsity term on — TODO confirm
mu = 10000  # presumably the weight of that term — TODO confirm
tr_region = True  # trust region enabled; the run log prints "... Trust region defined."
enlarge_tr = False  # the run log prints "The trust region is not being enlarged."
num_counterfactuals = 3  # presumably the number of counterfactuals wanted per factual

In [228]:
# Feature sets passed to ce_helpers.opt. The containers left empty here list no feature.
L = []       # features that can only increase (become larger)
Pers_I = []  # conditionally mutable features, such as education level (can only take on higher categories)
P = []       # features that can only be positive
F_coh = {}   # dictionary with one-hot encoding used to ensure coherence
F_int = []   # integer features

# Immutable features: the one-hot columns of every immutable categorical feature
# that are present in df, followed by the immutable numerical features.
I = list(
    set(
        chain.from_iterable(
            mapping.get(feature)
            for feature in set(d['immutable']).intersection(categorical)
        )
    ).intersection(df.columns)
)
I.extend(set(d['immutable']).intersection(numerical))

# One-hot encoded categorical columns: all columns of df that are neither
# numerical nor the target.
F_b = df.columns.difference(numerical + [target])

# F_r is the same list object as numerical.
F_r = numerical

### Optimize for all factuals

In [229]:
# Run CE-OCL on the first factuals, score each run with the CARLA-style metrics
# and report mean / std over the runs.
eval_final = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# The target column is dropped once instead of on every iteration.
factual_features = df_factuals.drop(target, axis=1)

# Only the first 4 factuals are evaluated (fewer if df_factuals is shorter).
# For the full evaluation use: n_factuals = len(df_factuals)
n_factuals = min(4, len(df_factuals))

# One metrics frame per factual. They are concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
per_factual = []
for u_index in range(n_factuals):
    print('u_index: %i' % u_index)
    u = factual_features.iloc[u_index, :]

    # num_counterfactuals (set in the parameter cell) replaces the former literal 3.
    CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, numerical, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                           tr_region, enlarge_tr, num_counterfactuals, model_master, scaler)

    df_orig = ce_helpers.visualise_changes(clf, d, encoder, method = 'CE-OCL', CEs=CEs, CEs_ = CEs_)
    df_performance_1 = ce_helpers.evaluation_carla(df_orig, d)

    per_factual.append(df_performance_1)

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df_performance = pd.concat(per_factual) if per_factual else pd.DataFrame()

# Values are assigned positionally, so the metric order returned by
# evaluation_carla has to match the columns of eval_final.
eval_final.loc['mean'] = np.array(df_performance.mean())
eval_final.loc['std'] = np.array(df_performance.std())

eval_final.round(2)

u_index: 0
Generating constraints for the trust region using 11687 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 20000.194661322494
The optimal solution is: [0.315068493, 0.230798686, 0.7666676805638417, 0.08296182913181838, -1.3144331323263546e-13, 0.346938776, 0.9999999999999843, 0.9999999999999843, 0.0, 0.9999999999999843, 1.0, 0.0, 0.9999999999999843]
u_index: 1
Generating constraints for the trust region using 11687 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ:

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
mean,0.92,1.0,-9295.66,0.83,0.0,5294.39,0.09
std,0.16,0.0,4383.07,0.04,0.0,9167.27,0.03


---

# DiCE: several factuals

In [230]:
# Step 1: dice_ml.Data
data = dice_ml.Data(dataframe=df_train, continuous_features=numerical, outcome_name=target)

# Step 2: dice_ml.Model
m = dice_ml.Model(model=clf, backend="sklearn")

# Step 3: dice_ml.Dice
meth = 'random'
exp = dice_ml.Dice(data, m, method=meth)

In [231]:
# immutable features
I = [mapping.get(key) for key in list(set(d['immutable']).intersection(categorical))]
I = list(set(list(chain.from_iterable(I))).intersection(df.columns))
I.extend(set(d['immutable']).intersection(numerical))

features = df_train.drop(target,axis=1).columns
features_to_vary = [ele for ele in features if ele not in I]

In [232]:
# Run DiCE on the first factuals, score each run with the CARLA-style metrics
# and report mean / std over the runs.
eval_final = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# The target column is dropped once instead of on every iteration.
factual_features = df_factuals.drop(target, axis=1)

# Only the first 4 factuals are evaluated (fewer if df_factuals is shorter).
# For the full evaluation use: n_factuals = len(df_factuals)
n_factuals = min(4, len(df_factuals))

# Arguments shared by every generate_counterfactuals call. As before, random_seed
# is passed only for the 'random' method. num_counterfactuals (set in the
# parameter cell) replaces the former literal 3.
cf_kwargs = {
    'total_CFs': num_counterfactuals,
    'desired_class': "opposite",
    'features_to_vary': features_to_vary,
}
if meth == 'random':
    cf_kwargs['random_seed'] = 0

# One metrics frame per factual. They are concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
per_factual = []
for u_index in range(n_factuals):
    print('u_index: %i' % u_index)
    u = factual_features.iloc[u_index, :]

    # A fresh single-row frame is built for each consumer, exactly as before, so
    # neither call can be affected by what the other does to its input.
    e1 = exp.generate_counterfactuals(pd.DataFrame(u).T, **cf_kwargs)

    df_orig = ce_helpers.visualise_changes(clf, d, encoder, method='DiCE', exp = e1, factual=pd.DataFrame(u).T, scaler=scaler, only_changes=False)
    df_performance_1 = ce_helpers.evaluation_carla(df_orig, d)

    per_factual.append(df_performance_1)

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df_performance = pd.concat(per_factual) if per_factual else pd.DataFrame()

# Values are assigned positionally, so the metric order returned by
# evaluation_carla has to match the columns of eval_final.
eval_final.loc['mean'] = np.array(df_performance.mean())
eval_final.loc['std'] = np.array(df_performance.std())

eval_final.round(2)

u_index: 0


100%|██████████| 1/1 [00:00<00:00,  6.89it/s]


u_index: 1


100%|██████████| 1/1 [00:00<00:00,  7.28it/s]


u_index: 2


100%|██████████| 1/1 [00:00<00:00,  6.74it/s]


u_index: 3


100%|██████████| 1/1 [00:00<00:00,  6.68it/s]


Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
mean,1.0,0.93,-63883.78,0.8,0.1,35666.21,0.18
std,0.0,0.06,38642.53,0.02,0.08,28664.88,0.07


---

# DiCE: single factual

In [233]:
# Factual instance
u_index = 0
u = df_factuals.drop(target, axis=1).iloc[u_index, :]
# print(u)
print('predicted label: %d' % (clf.predict([u])))

predicted label: 0


In [234]:
# Step 1: dice_ml.Data
data = dice_ml.Data(dataframe=df_train, continuous_features=numerical, outcome_name=target)

# Step 2: dice_ml.Model
m = dice_ml.Model(model=clf, backend="sklearn")

# Step 3: dice_ml.Dice
exp = dice_ml.Dice(data, m, method="random")

In [235]:
# immutable features
I = [mapping.get(key) for key in list(set(d['immutable']).intersection(categorical))]
I = list(set(list(chain.from_iterable(I))).intersection(df.columns))
I.extend(set(d['immutable']).intersection(numerical))

features = df_train.drop(target,axis=1).columns
features_to_vary = [ele for ele in features if ele not in I]

In [236]:
# Generate counterfactuals of the opposite class for the single factual u, passed
# as a one-row frame. Only the columns in features_to_vary may change and the
# seed is fixed. num_counterfactuals (set in the parameter cell) replaces the
# former literal 3, so the configured value is actually used.
e1 = exp.generate_counterfactuals(
    pd.DataFrame(u).T,
    total_CFs=num_counterfactuals,
    desired_class="opposite",
    random_seed=0,
    features_to_vary=features_to_vary,
)

100%|██████████| 1/1 [00:00<00:00,  7.03it/s]


### Visualize

In [237]:
# Display the factual ("original") together with the DiCE counterfactuals.
# With only_changes=True the recorded output table shows '-' for values that did
# not change with respect to the factual.
ce_helpers.visualise_changes(clf, d, encoder, method='DiCE', exp = e1, factual=pd.DataFrame(u).T, scaler=scaler, only_changes=True)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,hours-per-week,capital-loss,income,marital-status,native-country,occupation,race,relationship,sex,workclass
original,40.0,353431.999757,6.0,0.0,35.0,0.0,0,Non-Married,US,Managerial-Specialist,White,Non-Husband,Female,Non-Private
sol0,-,-,14.825786,99999.0,-,-,1,-,-,-,-,-,-,-
sol1,-,-,-,89999.1,-,-,1,Married,-,-,-,-,-,-
sol2,-,-,-,109998.9,-,-,1,Married,-,-,-,-,-,-


### Evaluation

In [238]:
# Full table (only_changes=False) of the factual and its DiCE counterfactuals,
# which is the input of the metric computation.
df_orig = ce_helpers.visualise_changes(
    clf,
    d,
    encoder,
    method='DiCE',
    exp=e1,
    factual=pd.DataFrame(u).T,
    scaler=scaler,
    only_changes=False,
)

# Metrics for this single factual, displayed as the cell output.
df_performance = ce_helpers.evaluation_carla(df_orig, d)
df_performance

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
0,1.0,0.9,-100001.94,0.79,0.1,13339.08,0.17


---

# CE-OCL: single factual

In [239]:
# Factual instance
u_index = 0
u = df_factuals.drop(target, axis=1).iloc[u_index, :]
# print(u)
print('predicted label: %d' % (clf.predict([u])))

predicted label: 0


In [240]:
# Solve the CE-OCL optimisation for the single factual u with the feature sets and
# settings defined in the "Optimization Prep" section. num_counterfactuals (set in
# the parameter cell) replaces the former literal 3, so the configured value is
# actually used.
# NOTE(review): the roles of the three returned objects are not visible here; CEs
# and CEs_ are the ones passed on to ce_helpers.visualise_changes.
CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, numerical, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                           tr_region, enlarge_tr, num_counterfactuals, model_master, scaler)

Generating constraints for the trust region using 11687 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 20000.194661322494
The optimal solution is: [0.315068493, 0.230798686, 0.7666676805638417, 0.08296182913181838, -1.3144331323263546e-13, 0.346938776, 0.9999999999999843, 0.9999999999999843, 0.0, 0.9999999999999843, 1.0, 0.0, 0.9999999999999843]


## Inspect and Visualise

In [241]:
# Display the factual ("original") together with the CE-OCL counterfactuals.
# With only_changes=True the recorded output table shows '-' for values that did
# not change with respect to the factual.
ce_helpers.visualise_changes(clf, d, encoder, method = 'CE-OCL', CEs=CEs, CEs_ = CEs_, only_changes = True)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,hours-per-week,capital-loss,income,marital-status,native-country,occupation,race,relationship,sex,workclass
original,40.0023,353433.942,5.9995,0.0,34.9962,0.0,0,Non-Married,US,Managerial-Specialist,White,Non-Husband,Female,Private
sol0,-,-,12.5005,8299.917,-,-,1,-,-,-,-,-,-,-
sol1,-,-,13.4995,8299.917,-,-,1,-,-,-,-,-,-,-
sol2,-,-,14.5,8299.917,-,-,-,-,-,-,-,-,-,-


## Evaluation

In [242]:
# Table of the factual and its CE-OCL counterfactuals (only_changes left at its
# default), which is the input of the metric computation.
df_orig = ce_helpers.visualise_changes(
    clf,
    d,
    encoder,
    method='CE-OCL',
    CEs=CEs,
    CEs_=CEs_,
)

# Metrics for this single factual, displayed as the cell output.
df_performance = ce_helpers.evaluation_carla(df_orig, d)
df_performance

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
0,0.67,1.0,-8307.42,0.81,0.0,1.33,0.12
