In [16]:
import os
import warnings
from itertools import chain

import numpy as np

import Datasets as DS
import ce_helpers
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [17]:
# Directory that holds the CE-OCL data files.
# Set the CE_OCL_DATA_DIR environment variable to point the notebook at another
# location; when it is unset, the original hard-coded location is used.
# os.path.join(..., '') guarantees a trailing path separator, as the original value
# had one (NOTE(review): presumably DS.adult appends file names to wd — confirm).
wd = os.path.join(
    os.environ.get('CE_OCL_DATA_DIR', '/Users/tabearober/Documents/Counterfactuals/CE-OCL/data/'),
    '',
)

In [18]:
# Load the Adult dataset bundle from `wd`. Of the returned objects, this notebook uses:
#   df                   - the full dataset, including the target column
#   d                    - dict read with the keys 'target', 'numerical', 'categorical', 'immutable'
#   clf                  - classifier, used through .predict
#   df_factuals          - instances from which the factual instance is picked
#   encoder              - used through .categories_ and passed to DS.recreate_orig
#   model_master, scaler - forwarded to ce_helpers.opt
# df_train and df_test are not referenced in the cells shown here.
df, df_train, df_test, d, clf, df_factuals, model_master, encoder, scaler = DS.adult(wd)

In [19]:
# Pull the feature-role descriptions out of the dataset dictionary.
target, numerical, categorical = (d[key] for key in ('target', 'numerical', 'categorical'))

# For every categorical feature, list the names of its one-hot columns:
# '<feature>_<category>' for each category the encoder recorded at the same position.
mapping = {
    feature: [feature + '_' + level for level in list(encoder.categories_[pos])]
    for pos, feature in enumerate(categorical)
}
mapping

{'marital-status': ['marital-status_Married', 'marital-status_Non-Married'],
 'native-country': ['native-country_Non-US', 'native-country_US'],
 'occupation': ['occupation_Managerial-Specialist', 'occupation_Other'],
 'race': ['race_Non-White', 'race_White'],
 'relationship': ['relationship_Husband', 'relationship_Non-Husband'],
 'sex': ['sex_Female', 'sex_Male'],
 'workclass': ['workclass_Non-Private', 'workclass_Private']}

## recreate original dataset

Recreating the original dataset is not necessary at this point, so the call below is left commented out.

In [20]:
# df_dummies, df_orig = DS.recreate_orig(df, d, encoder)

## Optimization

### Optimization Prep

X: complete dataset without target

In [21]:
# Feature matrix: every column of df except the target.
X = df.drop(columns=target)

X1: the points in X whose label is 1. They are used to define the trust region.

In [22]:
# Row positions of all samples whose label equals 1 (np.where returns a tuple of arrays).
y_ix_1 = np.where(df[target] == 1)
# Take those rows from X as an independent copy and renumber them from 0.
X1 = X.iloc[y_ix_1[0]].copy().reset_index(drop=True)

Factual instance

In [23]:
# Pick the factual instance: row `u_index` of df_factuals with the target column removed.
u_index = 0
u = df_factuals.drop(target,axis=1).iloc[u_index,:]
# The original formatted the whole return value of clf.predict with '%d', which relies
# on implicitly converting a one-element array to a scalar (deprecated in NumPy 1.25;
# assumes clf.predict returns a NumPy array — TODO confirm). Flatten the result and
# take its single element so that a plain scalar is formatted.
print('predicted label: %d' % (np.ravel(clf.predict([u]))[0]))

predicted label: 0


Specify parameters

In [24]:
# Settings for the counterfactual search; all of them are passed to ce_helpers.opt.
sp = True  # NOTE(review): presumably switches a sparsity term on — confirm in ce_helpers.opt
mu = 10000  # NOTE(review): looks like a penalty weight — confirm in ce_helpers.opt
tr_region = True  # presumably enables the trust region built from X1 — confirm in ce_helpers.opt
enlarge_tr = False  # with False the run reports "The trust region is not being enlarged."
num_counterfactuals = 3  # number of counterfactuals requested

In [25]:
# Feature sets that describe what the optimizer may change; they are passed to ce_helpers.opt.

# features that can only increase (become larger)
L = []
# conditionally mutable features, such as education level (can only take on higher categories)
Pers_I = []
# features that can only be positive
P = []
# immutable features: the one-hot columns of every immutable categorical feature that
# are actually present in df. The list is built in the order of d['immutable'] and of
# the encoder categories, and de-duplicated with dict.fromkeys, so it is identical on
# every run (iterating over a set of strings gives an order that can change between
# kernel restarts).
# NOTE(review): immutable features that are not in `categorical` are not added here,
# exactly as before — confirm that this is intended.
I = list(dict.fromkeys(
    column
    for key in d['immutable']
    if key in categorical
    for column in mapping[key]
    if column in df.columns
))
# dictionary with one-hot encoding used to ensure coherence
F_coh = {}
# integer features
F_int = []
# categorical features one-hot encoded: every column of df that is neither numerical nor the target
F_b = df.columns.difference(numerical + [target])
# real-valued features (same list object as `numerical`)
F_r = numerical

### Optimize

In [26]:
# Run the counterfactual search for the factual instance `u`.
# CEs carries a 'scaled_distance' column, which is dropped before the counterfactuals are classified;
# CEs_ is the frame that is later mapped back to the original feature space by DS.recreate_orig.
# NOTE(review): final_model is presumably the solved optimization model — it is not used in the cells shown.
CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, numerical, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                           tr_region, enlarge_tr, num_counterfactuals, model_master, scaler)

Generating constraints for the trust region using 11687 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 20000.19466132187
The optimal solution is: [0.315068493, 0.230798686, 0.7666676805638417, 0.08296182913181838, -1.8189894035458565e-12, 0.34693877599999956, 1.0, 1.0, -0.0, 1.0, 1.0, 0.0, 1.0]


## Inspect and Visualise

We create df_dummies and df_orig.

The model trained by CARLA uses one-hot encoding and drops the reference categories from the df. Hence, we have only sex_Male in the dataframe, but not sex_Female. df_dummies includes all dummy variables, and df_orig is a dataframe that is mapped back into the original input space. So in df_orig, we have a variable 'sex' with values 'Male' and 'Female', instead of dummy variables.

We can use df_orig to visualise our counterfactuals.

In [27]:
# Label every counterfactual with the classifier's prediction. The 'scaled_distance'
# column is removed first so that it is not passed to clf as if it were a feature.
# The label column is addressed through `target`, like everywhere else in this notebook,
# instead of a hard-coded column name.
CEs_[target] = clf.predict(CEs.drop('scaled_distance', axis=1))
# Build the frame with all dummy columns and the frame in the original input space.
df_dummies, df_orig = DS.recreate_orig(CEs_, d, encoder)

In [28]:
# Show the factual instance next to the generated counterfactuals
# (rendered below as the rows 'original', 'sol0', 'sol1', ...).
ce_helpers.visualise_changes(df_orig)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,hours-per-week,capital-loss,income,marital-status,native-country,occupation,race,relationship,sex,workclass
original,40.0023,353433.942,5.9995,0.0,34.9962,0.0,0,Non-Married,US,Managerial-Specialist,White,Non-Husband,Female,Private
sol0,-,-,12.5005,8299.917,-,-,1,-,-,-,-,-,-,-
sol1,-,-,13.4995,8299.917,-,-,1,-,-,-,-,-,-,-
sol2,-,-,14.5,8299.917,-,-,-,-,-,-,-,-,-,-


## Evaluation

In [29]:
# Score the counterfactuals; the target column is removed from df_orig first,
# so only feature columns are handed to the evaluation.
df_performance = ce_helpers.evaluation(
    clf,
    df_orig.drop(columns=target),
    numerical,
    categorical,
    rounding=True,
    CEs_=CEs,
)
df_performance

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
0,0.67,1.0,-8307.42,0.85,0.0,1.33,0.08
