# Evaluation CE-OCL

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import opticl as oc
import os
import ce_helpers

In [2]:
# upper bound on the number of rows of X_test_0 for which counterfactuals
# are generated in each run (the evaluation loops cap `iters` with it)
num_iterations = 30

# CREDIT

## Preparation

### Data prep

The (preprocessed) data can be downloaded from: https://datahub.io/machine-learning/credit-g

In [4]:
dataset_name = 'german_credit'

# map the textual class label onto a binary target: bad -> 0, good -> 1
recode = {"class": {"bad": 0, "good": 1}}
df = pd.read_csv('./data/credit-g_csv.csv').replace(recode)

df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,1
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,0
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,1
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,1
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,0


In [5]:
# name of the label column
target = 'class'

# columns that are treated as continuous features
numerical = [
    'duration',
    'credit_amount',
    'installment_commitment',
    'age',
    'residence_since',
    'existing_credits',
    'num_dependents',
]

# integer variables (none declared for this dataset)
F_int = []

In [6]:
# separate the label column from the feature columns
y = df[target]
X = df.drop(columns=[target])

# X is rebound to the first value returned by prep_data
X, X_train, X_test, y_train, y_test, F_b, data_pip = ce_helpers.prep_data(
    X, y, numerical, one_hot_encoding=True, scaling=True
)

### Train predictive models

In [7]:
version = '%s_v1' % dataset_name
alg_list_cf = ['svm']
alg_list_dr = ['mlp']

# one entry per learned model:
#   - 'counterfactual_german': binary classifier for the target, on all X_train columns
#   - 'duration': continuous model for 'duration' with 'credit_amount' as its only feature
outcome_dict = {
    'counterfactual_german': {
        'task': 'binary',
        'X features': X_train.columns,
        'class': target,
        'alg_list': alg_list_cf,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    },
    'duration': {
        'task': 'continuous',
        'X features': ['credit_amount'],
        'class': 'duration',
        'alg_list': alg_list_dr,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': X_train['duration'],
        'y_test': X_test['duration'],
    },
}

# NOTE: this call is active, so the models are (re)trained every time the cell runs;
# comment it out to skip training
ce_helpers.train_models(outcome_dict, version)

performance = ce_helpers.perf_trained_models(version, outcome_dict)
performance

Learning a constraint for counterfactual_german
Training svm
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = svm, metric = None
saving... results/svm_counterfactual_german_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train Score: 0.8246354166666667
-------------------testing evaluation-----------------------
Test Score: 0.8285714285714286

Learning a constraint for duration
Training mlp
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = mlp, metric = None
saving... results/mlp_duration_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train MSE: 0.017391575513544238
Train R2: 0.44060188793997934
-------------------testing evaluation-----------------------
Test MSE: 0.020288389477887313
Test R2: 0.37999906334524114

Saving the 

Unnamed: 0,save_path,seed,cv_folds,parameters,best_params,valid_score,train_score,test_score,outcome,alg,task
0,results/svm/german_credit_v1_counterfactual_ge...,0,5,"{'C': [0.1, 1, 10, 100]}",{'C': 0.1},0.765923,0.824635,0.828571,counterfactual_german,svm,binary
1,results/mlp/german_credit_v1_duration_model.csv,0,5,"{'hidden_layer_sizes': [(10,), (20,), (50,), (...","{'hidden_layer_sizes': (100,)}",-0.017651,0.017392,0.020288,duration,mlp,continuous


In [8]:
# load models
algorithms = {'counterfactual_german':'svm',
             'duration':'mlp'}
y_pred, y_pred_0, X_test_0, models = ce_helpers.load_model(algorithms, outcome_dict, 'counterfactual_german') 

X_test_0.head()
clf = models['counterfactual_german']
clf_Duration = models['duration']

In [9]:
## for coherence
F_r = numerical
non_categorical = numerical + [target]
categorical = df.columns.difference(non_categorical)

# columns of X_train that are neither numerical nor the target; computed once
encoded_columns = list(X_train.columns.difference(non_categorical))

# map every categorical column of df to the X_train columns named '<column>_...'
F_coh = {
    f: [col for col in encoded_columns if col.startswith('%s_' % f)]
    for f in categorical
}

### Prep for Optimization

Define what the counterfactual constraint should look like.

In [10]:
# algorithm names looked up from `algorithms`: one for the classifier, one for the duration model
algorithm = algorithms['counterfactual_german']
SCM_algo = algorithms['duration']
# outcomes passed to oc.model_selection as constraints; the objectives dict is left empty
constraints_embed = ['counterfactual_german']
objectives_embed = {}

In [11]:
# keep only the performance rows of the classifier's algorithm
cf_performance = performance[performance['alg'] == algorithm]
model_master = oc.model_selection(cf_performance, constraints_embed, objectives_embed)

# bounds on the learned outcome: lower bound 0.5 (can be changed, but is generally 0.5), no upper bound
model_master['lb'] = 0.5
model_master['ub'] = None
model_master['SCM_counterfactuals'] = None
# the single row receives the full list of columns of X
model_master['features'] = [list(X.columns)]
model_master

                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  


Unnamed: 0,outcome,model_type,save_path,task,objective,lb,ub,SCM_counterfactuals,features
0,counterfactual_german,svm,results/svm/german_credit_v1_counterfactual_ge...,binary,0,0.5,,,"[checking_status_0<=X<200, checking_status_<0,..."


In [12]:
# positions of the samples labelled 1, and the matching rows of X with a fresh 0..n-1 index
y_ix_1 = np.where(y == 1)
positive_rows = y_ix_1[0]
X1 = X.iloc[positive_rows, :].copy().reset_index(drop=True)

## Several runs

In [13]:
# features that can only increase (become larger)
L = ['age', 'residence_since']

# immutable features
I = ['personal_status_male div/sep', 'personal_status_male mar/wid','personal_status_male single',
     'purpose_domestic appliance', 'purpose_education', 'purpose_furniture/equipment', 'purpose_new car',
     'purpose_other', 'purpose_radio/tv', 'purpose_repairs', 'purpose_retraining', 'purpose_used car',
     'foreign_worker_yes']

employment = ['employment_unemployed', 'employment_<1', 'employment_1<=X<4','employment_4<=X<7', 'employment_>=7']
Pers_I = [employment] # variables that must be considered for person specific immutable features

# NOTE(review): P is handed to ce_helpers.opt unchanged; presumably the features
# that must stay positive -- confirm against ce_helpers
P = ['duration', 'installment_commitment', 'num_dependents', 'credit_amount', 'existing_credits']

# one configuration per experiment; the model_master of 'OptiCL_Causality' depends
# on the factual instance and is therefore filled in inside the loop below
runs = {
    'OptiCL_Actionability': {'sp': True, 'mu': 10000, 'num_counterfactuals': 3,
                             'tr_region': False, 'enlarge_tr': False, 'model_master': model_master},
    'OptiCL_TR': {'sp': True, 'mu': 10000, 'num_counterfactuals': 3,
                  'tr_region': True, 'enlarge_tr': False, 'model_master': model_master},
    'OptiCL_Causality': {'sp': True, 'mu': 10000, 'num_counterfactuals': 3,
                         'tr_region': True, 'enlarge_tr': True, 'model_master': None},
}

# one '<run>_mean' and one '<run>_std' row per run
eval_final = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# never iterate over more factual instances than X_test_0 contains
iters = min(X_test_0.shape[0], num_iterations)

for r in runs:

    print('Running %s' % r)
    sp = runs[r]['sp']
    mu = runs[r]['mu']
    num_counterfactuals = runs[r]['num_counterfactuals']
    tr_region = runs[r]['tr_region']

    # evaluation frames of this run, one per factual instance; concatenated once
    # after the loop (DataFrame.append no longer exists in pandas >= 2.0)
    # NOTE(review): assumes ce_helpers.evaluation returns a DataFrame -- confirm
    run_perf = []

    for u_index in range(iters):
        print('u_index: %d' % u_index)
        u = X_test_0.iloc[u_index,:]

        ### causality constraint
        if r == 'OptiCL_Causality':

            # local name, so the notebook-level `constraints_embed` is not overwritten
            causal_constraints = ['duration', 'counterfactual_german']
            cf_rows = oc.model_selection(
                performance[(performance['alg']==algorithm) & (performance['outcome']=='counterfactual_german')],
                causal_constraints, objectives_embed)
            scm_rows = oc.model_selection(
                performance[(performance['alg']==SCM_algo) & (performance['outcome']!='counterfactual_german')],
                causal_constraints, objectives_embed)
            model_master_causality = pd.concat([cf_rows, scm_rows])

            # value for the 'duration' row: duration predicted from the factual
            # credit_amount minus the factual duration
            scm_shift = clf_Duration.predict(pd.DataFrame([u['credit_amount']], columns=['credit_amount'])) - u['duration']
            model_master_causality['SCM_counterfactuals'] = None
            model_master_causality.loc[model_master_causality['outcome']=='duration','SCM_counterfactuals'] = scm_shift
            model_master_causality['lb'] = 0.50
            model_master_causality['ub'] = None

            # attach the feature list of each model: all columns of X for the
            # classifier, only 'credit_amount' for the duration model
            result_1 = pd.merge(
                model_master_causality.loc[model_master_causality['outcome']=='counterfactual_german',:],
                pd.DataFrame({'features': [list(X.columns)]}),
                how='left',
                left_index=True, # Merge on both indexes, since right only has 0...
                right_index=True # all the other rows will be NaN
            )
            result_2 = pd.merge(
                model_master_causality.loc[model_master_causality['outcome']=='duration',:].reset_index(drop=True),
                pd.DataFrame({'features': [['credit_amount']]}),
                how='left',
                left_index=True, # Merge on both indexes, since right only has 0...
                right_index=True # all the other rows will be NaN
            )
            model_master_causality = pd.concat([result_1, result_2])

            runs['OptiCL_Causality']['model_master'] = model_master_causality

        model_m = runs[r]['model_master']

        # reset on every instance: a failed first attempt below switches it to True
        enlarge_tr = runs[r]['enlarge_tr']
        try:
            CEs, CEs_, final_model = ce_helpers.opt(pd.concat([X_train,X_test]), X1, u, F_r, F_b, F_int,F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)
        except Exception as err:
            # best-effort retry; `Exception` (not a bare except) so that a
            # KeyboardInterrupt still stops the notebook, and the reason is reported
            print('opt failed for u_index %d (%s); retrying with enlarge_tr=True' % (u_index, err))
            enlarge_tr = True
            CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)

        CEs = ce_helpers.vis_dataframe(df, CEs_, F_r, F_coh, target, only_changes = False)
        CE_perf1 = ce_helpers.evaluation(clf, CEs, numerical, categorical, CEs_ = CEs_, rounding = True)

        run_perf.append(CE_perf1)

    CE_perf = pd.concat(run_perf)
    eval_final.loc['%s_mean' % r] = np.array(CE_perf.mean())
    eval_final.loc['%s_std' % r] = np.array(CE_perf.std())

Running OptiCL_Actionability
u_index: 0
Embedding constraints for counterfactual_german
Set parameter Username
Academic license - for non-commercial use only - expires 2023-04-23
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10011.206155164686
The optimal solution is: [0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.5588235294117112, 0.38375135618116474, 0.33333333333393966, 3.5975595834406704, 0.0, 0.33333333333303017, 0.0]
u_index: 1
Embedding constraints for counterfactual_german
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.001139021035
The optimal solution is: [1.0, 0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0,

Set parameter PoolSearchMode to value 1
OBJ: 10000.053819440238
The optimal solution is: [0.0, 1.0, 0.0, -0.0, 0.0, -0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.13070134843768932, 0.2380496521794626, 0.6666666666666667, 0.5254265329942205, 0.35961449316517047, 0.0, 0.0]
u_index: 15
Embedding constraints for counterfactual_german
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 0.000628620164050513
The optimal solution is: [0.0, 1.0, 0.0, -0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, -0.0, 0.0, 1.0, 0.0, -0.0, 0.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.38235294117647056, 0.6603484587401875, 0.66666666

u_index: 29
Embedding constraints for counterfactual_german
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.181910855928
The optimal solution is: [1.0, 0.0, 0.0, -0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.04702948131668483, 0.1623587976258855, 1.0, 0.142857142855064, 1.0500883005984276, 0.3333333333333333, 0.0]
Running OptiCL_TR
u_index: 0
Generating constraints for the trust region using 700 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_german
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 140011.17533583453
The optimal solution is: [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.

... Trust region defined.
Embedding constraints for counterfactual_german
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 80006.01753007405
The optimal solution is: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.16176470588235292, 0.007403152721934925, 1.0, 0.071428571427532, 1.0, 0.0, 0.0]
u_index: 11
Generating constraints for the trust region using 700 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_german
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 130008.56718810994
The optimal solution is: [0.0, -0.0, -0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 1.0, -0.0, -0.0, -0.0, 0.0, 0.0, 1.0, -0.0, 1.0, 0.0, 0.0, 1.0, 0.0, -0.0, 

ERROR: evaluating object as numeric value: OBJ
        (object: <class 'pyomo.core.base.objective.ScalarObjective'>)
    No value for uninitialized NumericValue object x[checking_status_0<=X<200]
Generating constraints for the trust region using 700 samples.
The trust region is being enlarged with penatly Beta: 100000.
... Trust region defined.
Embedding constraints for counterfactual_german
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 140725.44502856748
The optimal solution is: [-0.0, 1.0, -0.0, 0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0, 0.0, -0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, 0.0, 1.0, -0.0, 1.0, -0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 1.0, 0.11764705882352938, 0.06292679813644775, 0.3333333333333333, 0.642857142855064, 1.0, 0.0, 0.0]
u_index: 22
Generating constraints for the trust region using 700 samples.
The trust r

... Trust region defined.
Embedding constraints for counterfactual_german
Embedding constraints for duration
['credit_amount']
SCM: -0.08646442692952538
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 19968.186416453183
The optimal solution is: [1.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 1.0, -0.0, 1.0, -0.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, 1.0, -0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, -0.0, 1.0, -0.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 1.0, -0.0, -0.0, 0.38235294113262835, 0.22694492304840214, 0.23365156853334382, 0.053571428568830015, 0.0, 0.0, 0.0]
u_index: 2
                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  
    outcome model_type                                        save_path  \
0  durat

u_index: 8
                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  
    outcome model_type                                        save_path  \
0  duration        mlp  results/mlp/german_credit_v1_duration_model.csv   

         task  objective  
0  continuous          0  
Generating constraints for the trust region using 700 samples.
The trust region is being enlarged with penatly Beta: 100000.
... Trust region defined.
Embedding constraints for counterfactual_german
Embedding constraints for duration
['credit_amount']
SCM: 0.06400070497594773
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 48289.27738668465
The optimal solution is: [1.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, 1.0, 1.0, 0.0, 0.0, -0.0, 0.0, 0.0, 1.0, -0.0, 1.0, -0.0, -0.0, 1.0, 0.0, -0.0, -0.0, -0.0, 1.0, 1.0, 0.0

... Trust region defined.
Embedding constraints for counterfactual_german
Embedding constraints for duration
['credit_amount']
SCM: -0.049357170044410204
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 94846.61069193555
The optimal solution is: [-0.0, 1.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, 1.0, -0.0, 0.0, -0.0, 0.0, 0.0, 1.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0, 1.0, -0.0, 1.0, -0.0, -0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, -0.0, -0.0, -0.0, 0.35538550920112133, 0.24158154389253045, 0.6666666666666667, 0.4642857124714023, 1.0, 0.0, 1.0]
u_index: 15
                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  
    outcome model_type                                        save_path  \
0  durati

... Trust region defined.
Embedding constraints for counterfactual_german
Embedding constraints for duration
['credit_amount']
SCM: 0.13312828174431113
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 144247.68056359136
The optimal solution is: [-0.0, 1.0, 0.0, 0.0, -0.0, -0.0, 0.0, 1.0, -0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 1.0, -0.0, -0.0, 0.10741421026250464, 0.14685046907907334, 0.3333333333333333, 0.642857142855064, 0.0, 0.0, 0.0]
u_index: 22
                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  
    outcome model_type                                        save_path  \
0  duration        mlp  r

u_index: 28
                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  
    outcome model_type                                        save_path  \
0  duration        mlp  results/mlp/german_credit_v1_duration_model.csv   

         task  objective  
0  continuous          0  
Generating constraints for the trust region using 700 samples.
The trust region is being enlarged with penatly Beta: 100000.
... Trust region defined.
Embedding constraints for counterfactual_german
Embedding constraints for duration
['credit_amount']
SCM: -0.02183317888067106
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 16303.734951878
The optimal solution is: [1.0, -0.0, -0.0, -0.0, 0.0, 0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1

In [14]:
# per-run mean and standard deviation of every evaluation metric, rounded to two decimals
eval_final.round(2)

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
OptiCL_Actionability_mean,1.0,0.99,-404.21,0.93,0.01,808.03,0.12
OptiCL_Actionability_std,0.0,0.01,788.84,0.02,0.02,1577.79,0.03
OptiCL_TR_mean,1.0,0.68,-2887.89,0.57,0.31,1691.39,0.41
OptiCL_TR_std,0.0,0.08,2552.63,0.05,0.11,1837.87,0.1
OptiCL_Causality_mean,1.0,0.78,-2183.12,0.72,0.08,836.44,0.18
OptiCL_Causality_std,0.0,0.15,3000.06,0.13,0.11,1158.37,0.11


# ADULT 

## Preparation

### Data prep

To load the data, dice_ml must be installed from: https://github.com/interpretml/DiCE

In [15]:
from dice_ml.utils import helpers # helper functions

In [16]:
dataset_name = 'adult_income'

# merge education levels: HS-grad / Prof-school -> School, Assoc / Some-college -> Bachelors
recode = {"education": {"HS-grad": 'School', 'Prof-school': 'School', "Assoc": 'Bachelors', "Some-college": 'Bachelors'}}
df = helpers.load_adult_income_dataset(only_train=False).replace(recode)
df.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,39,Government,Bachelors,Single,White-Collar,White,Male,40,0
1,50,Self-Employed,Bachelors,Married,White-Collar,White,Male,13,0
2,38,Private,School,Divorced,Blue-Collar,White,Male,40,0
3,53,Private,School,Married,Blue-Collar,Other,Male,40,0
4,28,Private,Bachelors,Married,Professional,Other,Female,40,0


In [17]:
# name of the label column
target = 'income'

# columns that are treated as continuous features
numerical = [
    'age',
    'hours_per_week',
]

# integer variables (none declared for this dataset)
F_int = []

In [18]:
# separate the label column from the feature columns
y = df[target]
X = df.drop(columns=[target])

# X is rebound to the first value returned by prep_data
X, X_train, X_test, y_train, y_test, F_b, data_pip = ce_helpers.prep_data(
    X, y, numerical, one_hot_encoding=True, scaling=True
)

### Train predictive models

In [19]:
version = '%s_v1' % dataset_name
alg_list_cf = ['svm']

# single learned model: binary classifier for the target, on all X_train columns
outcome_dict = {
    'counterfactual_adult': {
        'task': 'binary',
        'X features': X_train.columns,
        'class': target,
        'alg_list': alg_list_cf,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    },
}

# NOTE: this call is active, so the model is (re)trained every time the cell runs;
# comment it out to skip training
ce_helpers.train_models(outcome_dict, version)

performance = ce_helpers.perf_trained_models(version, outcome_dict)
performance

Learning a constraint for counterfactual_adult
Training svm
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = svm, metric = None
saving... results/svm_counterfactual_adult_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train Score: 0.8650729482382203
-------------------testing evaluation-----------------------
Test Score: 0.8566055694269619

Saving the performance...
Done!


Unnamed: 0,save_path,seed,cv_folds,parameters,best_params,valid_score,train_score,test_score,outcome,alg,task
0,results/svm/adult_income_v1_counterfactual_adu...,0,5,"{'C': [0.1, 1, 10, 100]}",{'C': 100},0.864507,0.865073,0.856606,counterfactual_adult,svm,binary


In [20]:
# load models: `algorithms` names the algorithm used for each learned outcome
algorithms = {'counterfactual_adult':'svm'}
# NOTE(review): the trailing remark below is the original author's; it is unclear
# which argument it refers to -- verify against ce_helpers.load_model
y_pred, y_pred_0, X_test_0, models = ce_helpers.load_model(algorithms, outcome_dict, 'counterfactual_adult')  # it should be X_test instead of X

# classifier passed to ce_helpers.evaluation in the evaluation loop
clf = models['counterfactual_adult']

In [21]:
## for coherence
F_r = numerical
non_categorical = numerical + [target]
categorical = df.columns.difference(non_categorical)

# columns of X_train that are neither numerical nor the target; computed once
encoded_columns = list(X_train.columns.difference(non_categorical))

# map every categorical column of df to the X_train columns named '<column>_...'
F_coh = {
    f: [col for col in encoded_columns if col.startswith('%s_' % f)]
    for f in categorical
}

### Prep for Optimization

Define what the counterfactual constraint should look like.

In [22]:
# outcomes passed to oc.model_selection as constraints; the objectives dict is left empty
constraints_embed = ['counterfactual_adult']
objectives_embed = {}

In [23]:
# keep only the performance rows of the classifier's algorithm
cf_performance = performance[performance['alg'] == algorithms['counterfactual_adult']]
model_master = oc.model_selection(cf_performance, constraints_embed, objectives_embed)

# bounds on the learned outcome: lower bound 0.5 (can be changed, but is generally 0.5), no upper bound
model_master['lb'] = 0.5
model_master['ub'] = None
model_master['SCM_counterfactuals'] = None
# the single row receives the full list of columns of X
model_master['features'] = [list(X.columns)]
model_master

                outcome model_type  \
0  counterfactual_adult        svm   

                                           save_path    task  objective  
0  results/svm/adult_income_v1_counterfactual_adu...  binary          0  


Unnamed: 0,outcome,model_type,save_path,task,objective,lb,ub,SCM_counterfactuals,features
0,counterfactual_adult,svm,results/svm/adult_income_v1_counterfactual_adu...,binary,0,0.5,,,"[education_Bachelors, education_Doctorate, edu..."


X1: the points in X whose label is 1. They are used to build the trust region.

In [24]:
# positions of the samples labelled 1, and the matching rows of X with a fresh 0..n-1 index
y_ix_1 = np.where(y == 1)
positive_rows = y_ix_1[0]
X1 = X.iloc[positive_rows, :].copy().reset_index(drop=True)

## Several runs

In [25]:
# features that can only increase (become larger)
L = ['age']

# immutable features
I = ['gender_Male', 'gender_Female', 'marital_status_Separated', 'marital_status_Married', 'marital_status_Divorced', 'marital_status_Single', 'marital_status_Widowed',
    'race_Other', 'race_White']

education = ['education_School', 'education_Bachelors', 'education_Masters', 'education_Doctorate']
Pers_I = [education] # variables that must be considered for person specific immutable features

# NOTE(review): P is handed to ce_helpers.opt unchanged; presumably the features
# that must stay positive -- confirm against ce_helpers
P = ['hours_per_week']

# one configuration per experiment
runs = {
    'OptiCL_Actionability': {'sp': True, 'mu': 10000, 'num_counterfactuals': 3,
                             'tr_region': False, 'enlarge_tr': False, 'model_master': model_master},
    'OptiCL_TR': {'sp': True, 'mu': 10000, 'num_counterfactuals': 3,
                  'tr_region': True, 'enlarge_tr': False, 'model_master': model_master},
}

# one '<run>_mean' and one '<run>_std' row per run
eval_final = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# never iterate over more factual instances than X_test_0 contains
iters = min(X_test_0.shape[0], num_iterations)

for r in runs:

    print('Running %s' % r)
    sp = runs[r]['sp']
    mu = runs[r]['mu']
    num_counterfactuals = runs[r]['num_counterfactuals']
    tr_region = runs[r]['tr_region']
    model_m = runs[r]['model_master']

    # evaluation frames of this run, one per factual instance; concatenated once
    # after the loop (DataFrame.append no longer exists in pandas >= 2.0)
    # NOTE(review): assumes ce_helpers.evaluation returns a DataFrame -- confirm
    run_perf = []

    for u_index in range(iters):
        print('u_index: %d' % u_index)
        u = X_test_0.iloc[u_index,:]

        # reset on every instance: a failed first attempt below switches it to True
        enlarge_tr = runs[r]['enlarge_tr']
        try:
            CEs, CEs_, final_model = ce_helpers.opt(pd.concat([X_train,X_test]), X1, u, F_r, F_b, F_int,F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)
        except Exception as err:
            # best-effort retry; `Exception` (not a bare except) so that a
            # KeyboardInterrupt still stops the notebook, and the reason is reported
            print('opt failed for u_index %d (%s); retrying with enlarge_tr=True' % (u_index, err))
            enlarge_tr = True
            CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)

        CEs = ce_helpers.vis_dataframe(df, CEs_, F_r, F_coh, target, only_changes = False)
        CE_perf1 = ce_helpers.evaluation(clf, CEs, numerical, categorical, CEs_ = CEs_, rounding = True)

        run_perf.append(CE_perf1)

    CE_perf = pd.concat(run_perf)
    eval_final.loc['%s_mean' % r] = np.array(CE_perf.mean())
    eval_final.loc['%s_std' % r] = np.array(CE_perf.std())

Running OptiCL_Actionability
u_index: 0
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 0.07978506083658202
The optimal solution is: [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.547945205479452, 0.47222833732581554]
u_index: 1
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.15550468891
The optimal solution is: [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.3972602739726027, 0.7616877671322795]
u_index: 2
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.230064623463
The optimal solution is: [1.0, -0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.

u_index: 24
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10001.409471759476
The optimal solution is: [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.2054794520536234, 1.564762778889417]
u_index: 25
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10001.299520512941
The optimal solution is: [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.26027397260077123, 1.5379243203951773]
u_index: 26
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10001.4398170031
The optimal solution is: [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.123

Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 40002.02789840859
The optimal solution is: [-0.0, 1.0, -0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 1.0, 0.0, 1.0, 1.0, -0.0, -0.0, 0.0, 0.553481839160351, 0.5306654879936941]
u_index: 8
Generating constraints for the trust region using 7841 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.00176837884
The optimal solution is: [-0.0, -0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, 1.0, -0.0, 0.0, 0.0, 1.0, -0.0, -0.0, 1.0, 0.0, 0.6027397260259022, 0.4400112804060483]
u_index: 9
Generating constraints for the trust region using 7841 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103

u_index: 21
Generating constraints for the trust region using 7841 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 20002.0
The optimal solution is: [-0.0, -0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, -0.0, -0.0, 1.0, -0.0, -0.0, 0.0, 1.0, 0.0, -0.0, -0.0, 1.0, 0.0, 0.49315068492978753, 0.3979591836734695]
u_index: 22
Generating constraints for the trust region using 7841 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_adult
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
    model.name="CE";
      - termination condition: infeasibleOrUnbounded
      - message from solver: <undefined>
ERROR: evaluating object as numeric value: x[education_Bachelors]
        (object: <class 'pyomo.core.base.var._GeneralVarData'>)
    No value

In [26]:
# show the per-run mean/std summary with two decimals
eval_final.round(decimals=2)

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
OptiCL_Actionability_mean,1.0,1.0,-85.78,0.83,0.0,116.24,0.25
OptiCL_Actionability_std,0.0,0.0,44.8,0.01,0.0,60.75,0.0
OptiCL_TR_mean,1.0,0.73,-32.24,0.6,0.19,26.79,0.39
OptiCL_TR_std,0.0,0.16,20.25,0.14,0.12,21.86,0.09


# DIABETES

## Preparation

### Data prep

The (preprocessed) data can be downloaded from: https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

In [27]:
# `dataset_name` is reused further down to build the model version tag
dataset_name = 'diabetes'

# read the raw table and preview its first rows
df = pd.read_csv(filepath_or_buffer='./data/diabetes.csv')
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# name of the label column
target = 'Outcome'

# every column except the label is treated as a continuous feature
numerical = df.columns.tolist()
numerical.remove(target)

# integer variables (none declared for this dataset)
F_int = list()

In [29]:
# separate the label from the predictors
y = df[target]
X = df.drop(columns=[target])

# preprocessing is delegated to the project helper; no one-hot encoding is
# requested here, only scaling
(X, X_train, X_test,
 y_train, y_test,
 F_b, data_pip) = ce_helpers.prep_data(X, y, numerical,
                                       one_hot_encoding=False,
                                       scaling=True)

### Train predictive models

In [30]:
version = '%s_v1' % dataset_name
alg_list_cf = ['svm']

# description of the single outcome to learn, in the layout expected by the
# ce_helpers training utilities
outcome_dict = {
    'counterfactual_diabetes': {
        'task': 'binary',
        'X features': X_train.columns,
        'class': target,
        'alg_list': alg_list_cf,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    },
}

## training runs every time this cell is executed;
## presumably the line below can be commented out to reuse previously saved models - TODO confirm
ce_helpers.train_models(outcome_dict, version)

performance = ce_helpers.perf_trained_models(version, outcome_dict)
performance

Learning a constraint for counterfactual_diabetes
Training svm
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = svm, metric = None
saving... results/svm_counterfactual_diabetes_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train Score: 0.8265070093457944
-------------------testing evaluation-----------------------
Test Score: 0.8833333333333334

Saving the performance...
Done!


Unnamed: 0,save_path,seed,cv_folds,parameters,best_params,valid_score,train_score,test_score,outcome,alg,task
0,results/svm/diabetes_v1_counterfactual_diabete...,0,5,"{'C': [0.1, 1, 10, 100]}",{'C': 100},0.818242,0.826507,0.883333,counterfactual_diabetes,svm,binary


In [31]:
# pick the algorithm used for the outcome and load the corresponding model;
# X_test_0 is the pool of factual instances iterated over in the experiment loop
# NOTE(review): the original remark here read "it should be X_test instead of X" -
# unclear what it refers to; confirm against ce_helpers.load_model
algorithms = {'counterfactual_diabetes': 'svm'}
y_pred, y_pred_0, X_test_0, models = ce_helpers.load_model(
    algorithms,
    outcome_dict,
    'counterfactual_diabetes',
)

# classifier handed to the evaluation helper later on
clf = models['counterfactual_diabetes']

In [32]:
## for coherence
F_r = numerical

# columns that are neither numerical nor the label
_excluded = numerical + [target]
categorical = df.columns.difference(_excluded)

# processed (training) columns that are not numerical; computed once and
# then matched against each categorical feature by its '<feature>_' prefix
_encoded_cols = list(X_train.columns.difference(_excluded))
F_coh = {
    f: [col for col in _encoded_cols if col.startswith('%s_' % f)]
    for f in categorical
}

### Prep for Optimization

Define what the counterfactual constraint should look like.

In [33]:
# outcomes embedded as constraints / as objectives (no objectives here)
objectives_embed = dict()
constraints_embed = ['counterfactual_diabetes']

In [34]:
# keep only the performance rows of the algorithm chosen for this outcome
_chosen_alg = algorithms['counterfactual_diabetes']
_perf_rows = performance[performance['alg'] == _chosen_alg]
model_master = oc.model_selection(_perf_rows, constraints_embed, objectives_embed)

# extra columns set on the selection table
model_master['lb'] = 0.5  # this can be changed but it is generally equal to 0.5
model_master['ub'] = None
model_master['SCM_counterfactuals'] = None
model_master['features'] = [list(X.columns)]
model_master

                   outcome model_type  \
0  counterfactual_diabetes        svm   

                                           save_path    task  objective  
0  results/svm/diabetes_v1_counterfactual_diabete...  binary          0  


Unnamed: 0,outcome,model_type,save_path,task,objective,lb,ub,SCM_counterfactuals,features
0,counterfactual_diabetes,svm,results/svm/diabetes_v1_counterfactual_diabete...,binary,0,0.5,,,"[Pregnancies, Glucose, BloodPressure, SkinThic..."


In [35]:
# positions of the rows labelled 1, and the matching rows of X re-indexed from 0
y_ix_1 = np.where(y == 1)
_rows_with_label_1 = y_ix_1[0]
X1 = X.iloc[_rows_with_label_1].copy().reset_index(drop=True)

## Several runs

In [36]:
# features that can only increase (become larger)
L = ['Age', 'Pregnancies']

# immutable features
I = []

Pers_I = [] # variables that must be considered for person specific immutable features

P = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']


# the two settings differ only in whether the trust region is used
runs = {'OptiCL_Actionability':{'sp':True, 'mu':10000, 'num_counterfactuals':3, 'tr_region':False, 'enlarge_tr':False, 'model_master':model_master},
       'OptiCL_TR':{'sp':True, 'mu':10000, 'num_counterfactuals':3, 'tr_region':True, 'enlarge_tr':False, 'model_master':model_master}}


# one '<run>_mean' and one '<run>_std' row is added per setting
eval_final = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# evaluate at most `num_iterations` factual instances
iters = min(X_test_0.shape[0], num_iterations)

for r in runs:

    print('Running %s' % r)
    sp = runs[r]['sp']
    mu = runs[r]['mu']
    num_counterfactuals = runs[r]['num_counterfactuals']
    tr_region = runs[r]['tr_region']
    model_m = runs[r]['model_master']

    # evaluation results are collected in a list and concatenated once;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop
    perf_frames = []

    for u_index in range(iters):
        print('u_index: %d' % u_index)
        u = X_test_0.iloc[u_index,:]

        # reset for every instance: the fallback below switches it on
        enlarge_tr = runs[r]['enlarge_tr']
        try:
            CEs, CEs_, final_model = ce_helpers.opt(pd.concat([X_train,X_test]), X1, u, F_r, F_b, F_int,F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)

        except Exception:
            # retry on the full data with the trust region enlarged;
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working
            enlarge_tr = True
            CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)

        CEs = ce_helpers.vis_dataframe(df, CEs_, F_r, F_coh, target, only_changes = False)
        CE_perf1 = ce_helpers.evaluation(clf, CEs, numerical, categorical, CEs_ = CEs_, rounding = True)

        perf_frames.append(CE_perf1)

    CE_perf = pd.concat(perf_frames) if perf_frames else pd.DataFrame()

    eval_final.loc['%s_mean' % r] = np.array(CE_perf.mean())
    eval_final.loc['%s_std' % r] = np.array(CE_perf.std())

# last expression of the cell, so the summary is actually displayed
eval_final

Running OptiCL_Actionability
u_index: 0
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.396959052481
The optimal solution is: [0.11764705882352941, 0.8962999787827054, 0.45901639344262296, 0.2828282828282829, 0.05319148936170213, 0.37917837017466066, 0.10845431255337319, 0.016666666666666663]
u_index: 1
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.25716082345
The optimal solution is: [0.05882352941080171, 0.9694218253353029, 0.5081967213118332, 0.2525252525247197, 0.04846335697402537, 0.2906110283165617, 0.17250213492661715, 0.06666666666569654]
u_index: 2
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.272248990397
The optimal solution is: [0.05882352941080171, 1.0242874092728016, 0.6065573770483752, 0.12

u_index: 24
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.006811691956
The optimal solution is: [0.11764705882160342, 0.6837549772660331, 0.5573770491810137, 0.2828282828282829, 0.24231678486997635, 0.5414485354557934, 0.3596136009078272, 0.1621235496577576]
u_index: 25
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.118785197816
The optimal solution is: [0.17647058823422412, 0.9124914813583018, 0.3606557377042918, 0.13131313131270872, 0.0, 0.33383010432226, 0.02647309991516522, 0.016666666664605145]
u_index: 26
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.020470894655
The optimal solution is: [0.23529411764502584, 0.8767448759645049, 0.6967213114749029, 0.2727272727279342, 0.11820330969203496, 0.4307004

... Trust region defined.
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 20000.09884893462
The optimal solution is: [0.058823529411764705, 0.6825627414471418, 0.5081967213114754, 0.24242424242424243, 0.05200945626477541, 0.6482856701587356, 0.14688300597779674, 0.03333333333102928]
u_index: 14
Generating constraints for the trust region using 268 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_diabetes
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 20000.002808059195
The optimal solution is: [0.05882352941080171, 0.47542906890353615, 0.2871889065499265, 0.42424242424242425, 0.11702127659574468, 0.8196721311469446, 0.17847993168288667, 0.08333333333212067]
u_index: 15
Generating constraints for the trust region using 268 samples.
The trust region is not being enlarged.
... Trust region defined.

In [37]:
# show the per-run mean/std summary with two decimals
eval_final.round(decimals=2)

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
OptiCL_Actionability_mean,1.0,,-28.26,0.83,,53.66,0.32
OptiCL_Actionability_std,0.0,,14.94,0.07,,31.87,0.1
OptiCL_TR_mean,1.0,,-55.35,0.7,,45.05,0.42
OptiCL_TR_std,0.0,,42.27,0.08,,63.63,0.09


# HEART

## Preparation

### Data prep

The (preprocessed) data can be downloaded from: https://www.kaggle.com/datasets/shubamsumbria/statlog-heart-data-set

In [38]:
# `dataset_name` is reused further down to build the model version tag
dataset_name = 'heart'

# per-column value mapping: the label `presence` goes from {1, 2} to {0, 1},
# the coded columns get readable string labels
recode = {
    "presence": {1: 0, 2:1},
    'cp': {
        1:'typical angina',
        2:'atypical angina',
        3:'nonanginal pain',
        4:'asymptomatic',
    },
    'sex': {0:'female', 1:'male'},
    'fbs': {0:'false',1:'true'},
    'restecg': {
        0:'normal',
        1:'having ST-T wave abnormality',
        2:'left ventricular hypertrophy',
    },
    'exang': {0:'no', 1:'yes'},
    'slope': {1:'upsloping', 2:'flat', 3:'downsloping'},
    'thal': {3:'normal', 6:'fixed defect', 7:'reversible defect'},
    'ca': {0:'0',1:'1', 2:'2',3:'3'},
}

# read the raw table, apply the mapping, preview the first rows
df = pd.read_csv('./data/statlog.csv').replace(recode)
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,presence
0,70.0,male,asymptomatic,130.0,322.0,False,left ventricular hypertrophy,109.0,no,2.4,flat,3,normal,1
1,67.0,female,nonanginal pain,115.0,564.0,False,left ventricular hypertrophy,160.0,no,1.6,flat,0,reversible defect,0
2,57.0,male,atypical angina,124.0,261.0,False,normal,141.0,no,0.3,upsloping,0,reversible defect,1
3,64.0,male,asymptomatic,128.0,263.0,False,normal,105.0,yes,0.2,flat,1,reversible defect,0
4,74.0,female,atypical angina,120.0,269.0,False,left ventricular hypertrophy,121.0,yes,0.2,upsloping,1,normal,0


In [39]:
# name of the label column
target = 'presence'

# columns treated as continuous features
numerical = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# integer variables (none declared for this dataset)
F_int = list()

In [40]:
# separate the label from the predictors
y = df[target]
X = df.drop(columns=[target])

# preprocessing is delegated to the project helper; both one-hot encoding
# and scaling are requested here
(X, X_train, X_test,
 y_train, y_test,
 F_b, data_pip) = ce_helpers.prep_data(X, y, numerical,
                                       one_hot_encoding=True,
                                       scaling=True)

### Train predictive models

In [41]:
version = '%s_v1' % dataset_name
alg_list_cf = ['svm']

# description of the single outcome to learn, in the layout expected by the
# ce_helpers training utilities
outcome_dict = {
    'counterfactual_heart': {
        'task': 'binary',
        'X features': X_train.columns,
        'class': target,
        'alg_list': alg_list_cf,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    },
}

## training runs every time this cell is executed;
## presumably the line below can be commented out to reuse previously saved models - TODO confirm
ce_helpers.train_models(outcome_dict, version)
performance = ce_helpers.perf_trained_models(version, outcome_dict)
performance

Learning a constraint for counterfactual_heart
Training svm
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = svm, metric = None
saving... results/svm_counterfactual_heart_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train Score: 0.9427083333333334
-------------------testing evaluation-----------------------
Test Score: 0.8972222222222223

Saving the performance...
Done!


Unnamed: 0,save_path,seed,cv_folds,parameters,best_params,valid_score,train_score,test_score,outcome,alg,task
0,results/svm/heart_v1_counterfactual_heart_mode...,0,5,"{'C': [0.1, 1, 10, 100]}",{'C': 0.1},0.924846,0.942708,0.897222,counterfactual_heart,svm,binary


In [42]:
# pick the algorithm used for the outcome and load the corresponding model;
# X_test_0 is the pool of factual instances iterated over in the experiment loop
# NOTE(review): the original remark here read "it should be X_test instead of X" -
# unclear what it refers to; confirm against ce_helpers.load_model
algorithms = {'counterfactual_heart': 'svm'}
y_pred, y_pred_0, X_test_0, models = ce_helpers.load_model(
    algorithms,
    outcome_dict,
    'counterfactual_heart',
)

# classifier handed to the evaluation helper later on
clf = models['counterfactual_heart']

In [43]:
## for coherence
F_r = numerical

# columns that are neither numerical nor the label
_excluded = numerical + [target]
categorical = df.columns.difference(_excluded)

# processed (training) columns that are not numerical; computed once and
# then matched against each categorical feature by its '<feature>_' prefix
_encoded_cols = list(X_train.columns.difference(_excluded))
F_coh = {
    f: [col for col in _encoded_cols if col.startswith('%s_' % f)]
    for f in categorical
}

### Prep for Optimization

Define what the counterfactual constraint should look like.

In [44]:
# outcomes embedded as constraints / as objectives (no objectives here)
objectives_embed = dict()
constraints_embed = ['counterfactual_heart']

In [45]:
# keep only the performance rows of the algorithm chosen for this outcome
_chosen_alg = algorithms['counterfactual_heart']
_perf_rows = performance[performance['alg'] == _chosen_alg]
model_master = oc.model_selection(_perf_rows, constraints_embed, objectives_embed)

# extra columns set on the selection table
model_master['lb'] = 0.5  # this can be changed but it is generally equal to 0.5
model_master['ub'] = None
model_master['SCM_counterfactuals'] = None
model_master['features'] = [list(X.columns)]
model_master

                outcome model_type  \
0  counterfactual_heart        svm   

                                           save_path    task  objective  
0  results/svm/heart_v1_counterfactual_heart_mode...  binary          0  


Unnamed: 0,outcome,model_type,save_path,task,objective,lb,ub,SCM_counterfactuals,features
0,counterfactual_heart,svm,results/svm/heart_v1_counterfactual_heart_mode...,binary,0,0.5,,,"[ca_0, ca_1, ca_2, ca_3, cp_asymptomatic, cp_a..."


In [46]:
# positions of the rows labelled 1, and the matching rows of X re-indexed from 0
y_ix_1 = np.where(y == 1)
_rows_with_label_1 = y_ix_1[0]
X1 = X.iloc[_rows_with_label_1].copy().reset_index(drop=True)

## Several runs

In [47]:
# features that can only increase (become larger)
L = ['age']

# immutable features
I = ['sex_male','sex_female']


Pers_I = [] # variables that must be considered for person specific immutable features

P = ['trestbps','chol', 'thalach', 'oldpeak']


# the two settings differ only in whether the trust region is used
runs = {'OptiCL_Actionability':{'sp':True, 'mu':10000, 'num_counterfactuals':3, 'tr_region':False, 'enlarge_tr':False, 'model_master':model_master},
       'OptiCL_TR':{'sp':True, 'mu':10000, 'num_counterfactuals':3, 'tr_region':True, 'enlarge_tr':False, 'model_master':model_master}}


# one '<run>_mean' and one '<run>_std' row is added per setting
eval_final = pd.DataFrame(columns=['validity', 'cat_prox', 'cont_prox', 'sparsity', 'cat_diver',
       'cont_diver', 'cont_count_divers'])

# evaluate at most `num_iterations` factual instances
iters = min(X_test_0.shape[0], num_iterations)

for r in runs:

    print('Running %s' % r)
    sp = runs[r]['sp']
    mu = runs[r]['mu']
    num_counterfactuals = runs[r]['num_counterfactuals']
    tr_region = runs[r]['tr_region']
    model_m = runs[r]['model_master']

    # evaluation results are collected in a list and concatenated once;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop
    perf_frames = []

    for u_index in range(iters):
        print('u_index: %d' % u_index)
        u = X_test_0.iloc[u_index,:]

        # reset for every instance: the fallback below switches it on
        enlarge_tr = runs[r]['enlarge_tr']
        try:
            CEs, CEs_, final_model = ce_helpers.opt(pd.concat([X_train,X_test]), X1, u, F_r, F_b, F_int,F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)

        except Exception:
            # retry on the full data with the trust region enlarged;
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working
            enlarge_tr = True
            CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                               tr_region, enlarge_tr, num_counterfactuals, model_m, data_pip)

        CEs = ce_helpers.vis_dataframe(df, CEs_, F_r, F_coh, target, only_changes = False)
        CE_perf1 = ce_helpers.evaluation(clf, CEs, numerical, categorical, CEs_ = CEs_, rounding = True)

        perf_frames.append(CE_perf1)

    CE_perf = pd.concat(perf_frames) if perf_frames else pd.DataFrame()

    eval_final.loc['%s_mean' % r] = np.array(CE_perf.mean())
    eval_final.loc['%s_std' % r] = np.array(CE_perf.std())

# last expression of the cell, so the summary is actually displayed
eval_final

Running OptiCL_Actionability
u_index: 0
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10000.011924107053
The optimal solution is: [1.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.1875, 0.3253459286381637, 0.2397260273972603, 0.847328244275559, 0.7041135800753965]
u_index: 1
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10001.05019239132
The optimal solution is: [0.0, 1.0, -0.0, 0.0, -0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.47916666666424135, 0.40816326530693914, 0.17123287671232862, 0.6641221374045801, 1.1538212077120056]
u_index: 2
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10005.210805982535
The op

Set parameter PoolSearchMode to value 1
OBJ: 10004.461790883437
The optimal solution is: [1.0, 0.0, -0.0, 0.0, -0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.5416666666642413, 0.3673469387758814, 0.3105022831041424, 0.6412213740459265, 2.1122951695815573]
u_index: 21
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 10001.256278979983
The optimal solution is: [1.0, 0.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -0.0, 1.0, -0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.6666666666642413, 0.571428571429351, 0.2702505210794684, 0.5038167938931296, 1.2807285598174967]
u_index: 22
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 0.02200372382560167
The optimal solution is: [0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 0.0, 1.0, 1.0, -

u_index: 8
Generating constraints for the trust region using 120 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 100006.034938605
The optimal solution is: [1.0, 0.0, -0.0, -0.0, 1.0, 0.0, -0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, -0.0, 1.0, 0.3095238095233981, 0.22157434402288345, 0.20547945205544238, 0.6030534351145037, 0.07373271889400937]
u_index: 9
Generating constraints for the trust region using 120 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 120008.9388269013
The optimal solution is: [0.0, 0.0, 1.0, -0.0, 1.0, 0.0, -0.0, 0.0, -0.0, 1.0, -0.0, 1.0, -0.0, 1.0, -0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.5624999999999998, 1

... Trust region defined.
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 80004.07230083471
The optimal solution is: [1.0, 0.0, -0.0, -0.0, 1.0, 0.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, -0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.0, -0.0, 1.0, 0.40544871794871795, 0.36734693877551017, 0.32156656129258865, 0.658250146799765, 0.15880893300248144]
u_index: 24
Generating constraints for the trust region using 120 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_heart
Set parameter PoolSolutions to value 103
Set parameter PoolSearchMode to value 1
OBJ: 100006.19446173501
The optimal solution is: [0.0, 0.0, 1.0, -0.0, 1.0, 0.0, -0.0, 0.0, 1.0, 0.0, 1.0, 0.0, -0.0, 1.0, -0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 1.0, -0.0, 0.6874999999999998, 0.46938775510204034, 0.324200913242009, 0.6793893129771158, 0.5806451612903223]
u_index: 25
Generating constraint

In [48]:
# show the per-run mean/std summary with two decimals
eval_final.round(decimals=2)

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
OptiCL_Actionability_mean,1.0,1.0,-459.13,0.91,0.0,916.6,0.16
OptiCL_Actionability_std,0.0,0.0,302.71,0.02,0.0,606.17,0.03
OptiCL_TR_mean,1.0,0.66,-89.77,0.45,0.28,90.01,0.54
OptiCL_TR_std,0.0,0.1,34.55,0.08,0.14,39.77,0.09
