# CE-OCL Case Study

In [1]:
import warnings
import numpy as np
import pandas as pd
import embed_mip as em
import ce_helpers
import os

In [2]:
warnings.filterwarnings("ignore")

## Preparation

### Data prep

The (preprocessed) data can be downloaded from: https://datahub.io/machine-learning/credit-g

In [3]:
# Algorithm of the classifier used for the counterfactuals.
# Supported values: svm, linear, mlp, rf, cart, gbm
alg = "svm"

In [4]:
dataset_name = 'german_credit'

# Mapping applied to the target column: "bad" -> 0, "good" -> 1.
recode = {"class": {"bad": 0, "good": 1}}

# Read the CSV and recode the class labels in a single chain.
df = pd.read_csv('./data/credit-g_csv.csv').replace(recode)

df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,1
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,0
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,1
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,1
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,0


In [5]:
# Column roles: the outcome variable and the numerical features.
d = {
    'target': 'class',
    'numerical': [
        'duration', 'credit_amount', 'installment_commitment', 'age',
        'residence_since', 'existing_credits', 'num_dependents',
    ],
}

# Every column that is neither numerical nor the target is categorical.
non_categorical = d['numerical'] + [d['target']]
d['categorical'] = df.columns.difference(non_categorical)

# integer features
# (alternatives tried: additionally 'age' and 'duration', or no integer features at all)
F_int = ['num_dependents', 'existing_credits', 'residence_since']

In [6]:
# Split the frame into the target column and the predictors.
y = df[d['target']]
X = df.drop(columns=d['target'])

# prep_data returns the processed X, the train/test split, F_b and the fitted data pipeline.
X, X_train, X_test, y_train, y_test, F_b, data_pip = ce_helpers.prep_data(
    X, y, d['numerical'],
    one_hot_encoding=True,
    scaling=True,
)


### Train predictive models

In [7]:
version = '%s_v1' % dataset_name
alg_list_cf = [alg]
# Alternative list for the 'duration' model: ['mlp', 'linear', 'svm', 'rf']
alg_list_dr = ['mlp']

# Binary classifier on all columns of X_train; its prediction is the counterfactual outcome.
cf_spec = {
    'task': 'binary',
    'X features': X_train.columns,
    'class': d['target'],
    'alg_list': alg_list_cf,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}

# Continuous model that predicts 'duration' from 'credit_amount' only.
dr_spec = {
    'task': 'continuous',
    'X features': ['credit_amount'],
    'class': 'duration',
    'alg_list': alg_list_dr,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': X_train['duration'],
    'y_test': X_test['duration'],
}

outcome_dict = {'counterfactual_german': cf_spec, 'duration': dr_spec}

# Train the models; comment this call out if the models have already been trained.
ce_helpers.train_models(outcome_dict, version)

performance = ce_helpers.perf_trained_models(version, outcome_dict)
performance

Learning a constraint for counterfactual_german
Training svm
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = svm, metric = None
saving... results/svm_counterfactual_german_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train Score: 0.8246354166666667
-------------------testing evaluation-----------------------
Test Score: 0.8285714285714286
------------- Save results  ----------------

Learning a constraint for duration
Training mlp
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = mlp, metric = None
saving... results/mlp_duration_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train MSE: 0.01739157551354752
Train R2: 0.44060188793987376
-------------------testing evaluation-----------------------
Test MSE: 0.0202883894777875

Unnamed: 0,save_path,seed,cv_folds,task,parameters,best_params,valid_score,train_score,test_score,outcome,alg
0,results/svm/german_credit_v1_counterfactual_ge...,0,5,binary,"{'C': [0.1, 1, 10, 100]}",{'C': 0.1},0.765923,0.824635,0.828571,counterfactual_german,svm
1,results/mlp/german_credit_v1_duration_model.csv,0,5,continuous,"{'hidden_layer_sizes': [(10,), (20,), (50,), (...","{'hidden_layer_sizes': (100,)}",-0.017651,0.017392,0.020288,duration,mlp


In [8]:
# load models: one algorithm per outcome
algorithms = {
    'counterfactual_german': alg,
    'duration': 'mlp',
}

# NOTE (from the original author): it should be X_test instead of X
y_pred, y_pred_0, X_test_0, models = ce_helpers.load_model(
    algorithms, outcome_dict, 'counterfactual_german'
)

X_test_0.head()  # not the last expression of the cell, so nothing is displayed
clf, clf_Duration = models['counterfactual_german'], models['duration']
# clf.intercept_, clf.coef_  # check it

In [9]:
## for coherence
F_r = d['numerical']

# Columns of X_train that are neither numerical nor the target (computed once).
encoded_cols = list(X_train.columns.difference(d['numerical'] + [d['target']]))

# For each categorical feature, the X_train columns named '<feature>_...'.
F_coh = {
    feat: [col for col in encoded_cols if col.startswith('%s_' % feat)]
    for feat in d['categorical']
}

### Prep for Optimization

Define what the counterfactual constraint should look like.

In [10]:
# Algorithms chosen for the classifier and for the 'duration' model.
algorithm, SCM_algo = algorithms['counterfactual_german'], algorithms['duration']

# Outcomes embedded as constraints / as objectives (no objectives here).
constraints_embed = ['counterfactual_german']
objectives_embed = dict()

In [11]:
# Performance rows of the selected classifier algorithm.
perf_selected = performance[performance['alg'] == algorithm]
model_master = em.model_selection(perf_selected, constraints_embed, objectives_embed)

model_master['lb'] = 0.5  # this can be changed but it is generally equal to 0.5
model_master['ub'] = None
model_master['SCM_counterfactuals'] = None
# Single cell holding the full list of X's column names.
model_master['features'] = [list(X.columns)]
model_master

                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  


Unnamed: 0,outcome,model_type,save_path,task,objective,lb,ub,SCM_counterfactuals,features
0,counterfactual_german,svm,results/svm/german_credit_v1_counterfactual_ge...,binary,0,0.5,,,"[checking_status_0<=X<200, checking_status_<0,..."


`X1`: the points in `X` whose label is 1. They are used to build the trust region.

In [12]:
# Positions of the rows labelled 1 (same tuple-of-arrays shape that np.where returns).
y_ix_1 = np.asarray(y == 1).nonzero()
# Select those rows of X and renumber them from 0.
X1 = X.iloc[y_ix_1[0]].reset_index(drop=True)

### Factual instance

In [13]:
# Factual instance: row `u_index` of X_test_0.
u_index = 2
u = X_test_0.iloc[u_index, :]
print(u)
# predict() returns a length-1 array. Index it explicitly: formatting the array itself
# with %d relies on NumPy's implicit array-to-scalar conversion, which is deprecated.
print('predicted label: %d' % clf.predict([u])[0])

checking_status_0<=X<200       0.000000
checking_status_<0             1.000000
checking_status_>=200          0.000000
checking_status_no checking    0.000000
credit_history_all paid        0.000000
                                 ...   
installment_commitment         1.000000
age                            0.107143
residence_since                1.000000
existing_credits               0.000000
num_dependents                 0.000000
Name: 775, Length: 61, dtype: float64
predicted label: 0


## Part A: validity, proximity

In [14]:
# Part A settings handed to ce_helpers.opt.
# sp / mu presumably control the sparsity term (switched on from Part B) - TODO confirm.
sp, mu = False, 0
# Trust-region switches (both off here).
tr_region, enlarge_tr = False, False
num_counterfactuals = 1

In [15]:
# No actionability restrictions in this part; all four lists are empty.
# L: features that can only increase (become larger)
# I: immutable features
# Pers_I: conditionally mutable features
# P: passed to ce_helpers.opt; not described in this notebook
L, I, Pers_I, P = [], [], [], []

### Optimization

In [16]:
# Solve the Part A problem; arguments are passed positionally in the order opt expects.
CEs, CEs_, final_model = ce_helpers.opt(
    X, X1, u,
    F_r, F_b, F_int, F_coh,
    I, L, Pers_I, P,
    sp, mu, tr_region, enlarge_tr, num_counterfactuals,
    model_master, data_pip,
)

Embedding constraints for counterfactual_german
Academic license - for non-commercial use only - expires 2023-07-04
Using license file C:\gurobi912\win64\gurobi.lic
Changed value of parameter PoolSolutions to 101
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 0.03276461923702208
The optimal solution is: [0.0, 1.0, 0.0, -0.0, 0.0, -0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.16210973515182273, -0.038941993579050135, 0.9533450411973824, 0.1434598030362687, 1.0, -0.0, -0.0]


### Visualize DF

In [17]:
# Part A table; only_changes=True is requested so unchanged cells are shown as '-'.
df_1 = ce_helpers.visualise_changes(
    clf, d,
    F_coh=F_coh,
    method='CE-OCL',
    CEs=CEs,
    CEs_=CEs_,
    only_changes=True,
)
df_1

Unnamed: 0,duration,credit_amount,installment_commitment,age,residence_since,existing_credits,num_dependents,class,checking_status,credit_history,...,foreign_worker,housing,job,other_parties,other_payment_plans,own_telephone,personal_status,property_magnitude,purpose,savings_status
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,15.02,-333.52,3.86,27.04,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-


### Evaluation

In [18]:
# Same table as the visualisation cell, but without only_changes=True.
df_orig = ce_helpers.visualise_changes(
    clf, d, F_coh=F_coh, method='CE-OCL', CEs=CEs, CEs_=CEs_
)
# Start the evaluation summary with the row for Part A.
part_a_scores = ce_helpers.evaluation(df_orig, d)
CE_perf = part_a_scores.set_index(pd.Index(['Part A']))
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part A,1.0,1.0,-1715.94,0.76,,,


---

## Part B: validity, proximity, sparsity

In [19]:
# Part B settings: sp is switched on and mu is raised to 10000
# (presumably the weight of the sparsity term - TODO confirm in ce_helpers.opt).
# enlarge_tr is not reassigned here and keeps its value from Part A.
sp, mu = True, 10000
tr_region, num_counterfactuals = False, 1

In [20]:
# Part B still has no actionability restrictions.
L = list()       # features that can only increase (become larger)
I = list()       # immutable features
Pers_I = list()  # conditionally mutable features
P = list()       # passed to ce_helpers.opt; not described in this notebook

### Optimization

In [21]:
# Solve the Part B problem (same call as Part A, with the Part B settings).
CEs, CEs_, final_model = ce_helpers.opt(
    X, X1, u, F_r, F_b, F_int, F_coh,
    I, L, Pers_I, P,
    sp, mu, tr_region, enlarge_tr, num_counterfactuals,
    model_master, data_pip,
)

Embedding constraints for counterfactual_german
Changed value of parameter PoolSolutions to 101
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 10000.061604200753
The optimal solution is: [0.0, 1.0, 0.0, -0.0, 0.0, -0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.04591571162563923, 0.06988320888558519, 1.0, 0.10714285714129801, 1.0, -0.0, -0.0]


### Visualize DF

In [22]:
# Part B table, restricted to the changed cells.
df_2 = ce_helpers.visualise_changes(
    clf, d,
    F_coh=F_coh, method='CE-OCL',
    CEs=CEs, CEs_=CEs_,
    only_changes=True,
)
df_2

Unnamed: 0,duration,credit_amount,installment_commitment,age,residence_since,existing_credits,num_dependents,class,checking_status,credit_history,...,foreign_worker,housing,job,other_parties,other_payment_plans,own_telephone,personal_status,property_magnitude,purpose,savings_status
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,7.12,-,-,-,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-


### Evaluation

In [23]:
# Evaluate the Part B counterfactuals and add them to the summary as row 'Part B'.
df_orig = ce_helpers.visualise_changes(clf, d, F_coh=F_coh, method='CE-OCL', CEs=CEs, CEs_=CEs_)
part_b_scores = ce_helpers.evaluation(df_orig, d).set_index(pd.Index(['Part B']))
# Drop a stale 'Part B' row first, so that re-running this cell replaces the row.
# Previously a re-run kept growing CE_perf and set_index failed with a length mismatch.
CE_perf = pd.concat([CE_perf.drop(index='Part B', errors='ignore'), part_b_scores])
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part A,1.0,1.0,-1715.94,0.76,,,
Part B,1.0,1.0,-16.88,0.9,,,


---

## Part C: validity, proximity, sparsity, diversity

In [24]:
# Part C settings: three counterfactuals are requested.
# mu and enlarge_tr are not reassigned here; they keep the values set in earlier cells.
sp, tr_region = True, False
num_counterfactuals = 3

In [25]:
# Part C still has no actionability restrictions.
# L: features that can only increase (become larger)
# I: immutable features
L, I = [], []
# Pers_I: conditionally mutable features
# P: passed to ce_helpers.opt; not described in this notebook
Pers_I, P = [], []

### Optimization

In [26]:
# Solve the Part C problem (num_counterfactuals = 3).
CEs, CEs_, final_model = ce_helpers.opt(
    X, X1, u,
    F_r, F_b, F_int, F_coh,
    I, L, Pers_I, P,
    sp, mu,
    tr_region, enlarge_tr, num_counterfactuals,
    model_master, data_pip,
)

Embedding constraints for counterfactual_german
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 10000.061604200753
The optimal solution is: [0.0, 1.0, 0.0, -0.0, 0.0, -0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.04591571162563923, 0.06988320888558519, 1.0, 0.10714285714129801, 1.0, -0.0, -0.0]


### Visualize DF

In [27]:
# Part C table, restricted to the changed cells.
df_3 = ce_helpers.visualise_changes(
    clf, d, F_coh=F_coh, method='CE-OCL',
    CEs=CEs, CEs_=CEs_, only_changes=True,
)
df_3

Unnamed: 0,duration,credit_amount,installment_commitment,age,residence_since,existing_credits,num_dependents,class,checking_status,credit_history,...,foreign_worker,housing,job,other_parties,other_payment_plans,own_telephone,personal_status,property_magnitude,purpose,savings_status
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,7.12,-,-,-,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
sol1,-,-2873.47,-,30.06,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
sol2,-,-,1.96,26.63,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-


### Evaluation

In [28]:
# Evaluate the Part C counterfactuals and add them to the summary as row 'Part C'.
df_orig = ce_helpers.visualise_changes(clf, d, F_coh=F_coh, method='CE-OCL', CEs=CEs, CEs_=CEs_)
part_c_scores = ce_helpers.evaluation(df_orig, d).set_index(pd.Index(['Part C']))
# Drop a stale 'Part C' row first, so that re-running this cell replaces the row.
# Previously a re-run kept growing CE_perf and set_index failed with a length mismatch.
CE_perf = pd.concat([CE_perf.drop(index='Part C', errors='ignore'), part_c_scores])
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part A,1.0,1.0,-1715.94,0.76,,,
Part B,1.0,1.0,-16.88,0.9,,,
Part C,1.0,1.0,-1423.45,0.87,0.0,2845.81,0.14


---

## Part D: validity, proximity, sparsity, diversity, actionability

In [29]:
# Part D settings; same switches as Part C.
# mu and enlarge_tr keep the values set in earlier cells.
sp, tr_region, num_counterfactuals = True, False, 3

In [30]:
# features that can only increase (become larger)
# (alternative tried: ['residence_since'] only)
L = ['age', 'residence_since']

# immutable features, grouped by the categorical feature they encode
personal_status_cols = ['personal_status_male div/sep', 'personal_status_male mar/wid',
                        'personal_status_male single']
purpose_cols = ['purpose_domestic appliance', 'purpose_education', 'purpose_furniture/equipment',
                'purpose_new car', 'purpose_other', 'purpose_radio/tv', 'purpose_repairs',
                'purpose_retraining', 'purpose_used car']
I = personal_status_cols + purpose_cols + ['foreign_worker_yes']

# variables that must be considered for person specific immutable features
employment = [
    'employment_unemployed',
    'employment_<1',
    'employment_1<=X<4',
    'employment_4<=X<7',
    'employment_>=7',
]
Pers_I = [employment]

# P: passed to ce_helpers.opt; not described in this notebook
P = ['duration', 'installment_commitment', 'num_dependents', 'credit_amount', 'existing_credits']

### Optimization

In [31]:
# Solve the Part D problem (Part C settings plus the actionability lists I, L, Pers_I, P).
CEs, CEs_, final_model = ce_helpers.opt(
    X, X1, u,
    F_r, F_b, F_int, F_coh,
    I, L, Pers_I, P,
    sp, mu, tr_region, enlarge_tr,
    num_counterfactuals, model_master, data_pip,
)

Embedding constraints for counterfactual_german
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 10000.061604200788
The optimal solution is: [0.0, 1.0, 0.0, 0.0, 0.0, -0.0, 0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -0.0, 1.0, 0.0, 1.0, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 1.0, 0.04591571155543263, 0.06988320888378308, 1.0, 0.10714285714129801, 1.0, -0.0, -0.0]


### Visualize DF

In [32]:
# Part D table, restricted to the changed cells.
df_4 = ce_helpers.visualise_changes(
    clf, d,
    F_coh=F_coh, method='CE-OCL', CEs=CEs, CEs_=CEs_,
    only_changes=True,
)
df_4

Unnamed: 0,duration,credit_amount,installment_commitment,age,residence_since,existing_credits,num_dependents,class,checking_status,credit_history,...,foreign_worker,housing,job,other_parties,other_payment_plans,own_telephone,personal_status,property_magnitude,purpose,savings_status
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,7.12,-,-,-,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
sol1,-,-,1.96,26.63,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
sol2,-,-,-,75.52,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-


### Evaluation

In [33]:
# Evaluate the Part D counterfactuals and add them to the summary as row 'Part D'.
df_orig = ce_helpers.visualise_changes(clf, d, F_coh=F_coh, method='CE-OCL', CEs=CEs, CEs_=CEs_)
part_d_scores = ce_helpers.evaluation(df_orig, d).set_index(pd.Index(['Part D']))
# Drop a stale 'Part D' row first, so that re-running this cell replaces the row.
# Previously a re-run kept growing CE_perf and set_index failed with a length mismatch.
CE_perf = pd.concat([CE_perf.drop(index='Part D', errors='ignore'), part_d_scores])
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part A,1.0,1.0,-1715.94,0.76,,,
Part B,1.0,1.0,-16.88,0.9,,,
Part C,1.0,1.0,-1423.45,0.87,0.0,2845.81,0.14
Part D,1.0,1.0,-23.69,0.89,0.0,46.29,0.11


---

## Part E: validity, proximity, sparsity, diversity, actionability, trust region

In [34]:
# Part E settings: the trust region is switched on and starts out not enlarged.
# mu keeps the value set in Part B.
sp, num_counterfactuals = True, 3
tr_region, enlarge_tr = True, False

In [35]:
# features that can only increase (become larger)
L = ['age', 'residence_since']

# immutable features, grouped by the categorical feature they encode
personal_status_cols = [
    'personal_status_male div/sep',
    'personal_status_male mar/wid',
    'personal_status_male single',
]
purpose_cols = [
    'purpose_domestic appliance', 'purpose_education', 'purpose_furniture/equipment',
    'purpose_new car', 'purpose_other', 'purpose_radio/tv',
    'purpose_repairs', 'purpose_retraining', 'purpose_used car',
]
I = personal_status_cols + purpose_cols + ['foreign_worker_yes']

# variables that must be considered for person specific immutable features
employment = ['employment_unemployed', 'employment_<1', 'employment_1<=X<4',
              'employment_4<=X<7', 'employment_>=7']
Pers_I = [employment]

# P: passed to ce_helpers.opt; not described in this notebook
P = ['duration', 'installment_commitment', 'num_dependents', 'credit_amount', 'existing_credits']

### Optimization

In [36]:
# First attempt with the trust region as configured; if the solve fails, retry once with
# enlarge_tr=True.
try:
    CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                                            tr_region, enlarge_tr, num_counterfactuals, model_master, data_pip)
except Exception as err:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate
    print('----TRUST REGION IS BEING ENLARGED----')
    # Report why the first attempt failed instead of discarding the error silently.
    print('first attempt failed with %s: %s' % (type(err).__name__, err))
    enlarge_tr = True
    CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                                            tr_region, enlarge_tr, num_counterfactuals, model_master, data_pip)

Generating constraints for the trust region using 700 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_german
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
OBJ: 80006.00089659344
The optimal solution is: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.0, -0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.26470588235294246, 0.06426702406024631, 1.0, 0.10714285714129801, 1.0, 0.0, 0.0]


### Visualize DF

In [37]:
# Part E table, restricted to the changed cells.
df_5 = ce_helpers.visualise_changes(
    clf, d,
    F_coh=F_coh,
    method='CE-OCL',
    CEs=CEs, CEs_=CEs_,
    only_changes=True,
)
df_5

Unnamed: 0,duration,credit_amount,installment_commitment,age,residence_since,existing_credits,num_dependents,class,checking_status,credit_history,...,foreign_worker,housing,job,other_parties,other_payment_plans,own_telephone,personal_status,property_magnitude,purpose,savings_status
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,22.0,1283.52,-,-,-,-,-,1.0,no checking,-,...,-,-,-,-,-,-,-,life insurance,-,-
sol1,12.0,1893.04,-,29.0,-,-,-,1.0,-,-,...,-,own,-,guarantor,-,yes,-,life insurance,-,<100
sol2,10.0,1363.43,2.0,64.0,-,-,-,1.0,no checking,-,...,-,own,-,-,-,yes,-,car,-,<100


### Evaluation

In [38]:
# Evaluate the Part E counterfactuals and add them to the summary as row 'Part E'.
df_orig = ce_helpers.visualise_changes(clf, d, F_coh=F_coh, method='CE-OCL', CEs=CEs, CEs_=CEs_)
part_e_scores = ce_helpers.evaluation(df_orig, d).set_index(pd.Index(['Part E']))
# Drop a stale 'Part E' row first, so that re-running this cell replaces the row.
# Previously a re-run kept growing CE_perf and set_index failed with a length mismatch.
CE_perf = pd.concat([CE_perf.drop(index='Part E', errors='ignore'), part_e_scores])
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part A,1.0,1.0,-1715.94,0.76,,,
Part B,1.0,1.0,-16.88,0.9,,,
Part C,1.0,1.0,-1423.45,0.87,0.0,2845.81,0.14
Part D,1.0,1.0,-23.69,0.89,0.0,46.29,0.11
Part E,1.0,0.67,-230.12,0.6,0.36,441.68,0.4


---

## Part F: validity, proximity, sparsity, diversity, actionability, trust region, causality

In [39]:
# Part F settings: same switches as Part E.
# enlarge_tr is reset to False because the Part E cell may have set it to True.
sp, tr_region = True, True
enlarge_tr, num_counterfactuals = False, 3

In [40]:
# features that can only increase (become larger)
L = ['age', 'residence_since']

# immutable features: the listed personal_status and purpose columns, plus foreign_worker_yes
personal_status_cols = ['personal_status_male div/sep',
                        'personal_status_male mar/wid',
                        'personal_status_male single']
purpose_cols = ['purpose_domestic appliance', 'purpose_education',
                'purpose_furniture/equipment', 'purpose_new car',
                'purpose_other', 'purpose_radio/tv', 'purpose_repairs',
                'purpose_retraining', 'purpose_used car']
I = personal_status_cols + purpose_cols + ['foreign_worker_yes']

# variables that must be considered for person specific immutable features
employment = ['employment_unemployed', 'employment_<1',
              'employment_1<=X<4', 'employment_4<=X<7', 'employment_>=7']
Pers_I = [employment]

# P: passed to ce_helpers.opt; not described in this notebook
P = ['duration', 'installment_commitment', 'num_dependents', 'credit_amount', 'existing_credits']

### Define constraints

In [41]:
# Part F embeds the 'duration' model in addition to the classifier.
constraints_embed = [
    'duration',
    'counterfactual_german',
]

In [42]:
# Build the model table for Part F: one row for the classifier ('counterfactual_german')
# and one row for the 'duration' model.
is_clf_row = (performance['alg'] == algorithm) & (performance['outcome'] == 'counterfactual_german')
is_scm_row = (performance['alg'] == SCM_algo) & (performance['outcome'] != 'counterfactual_german')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
model_master_causality = pd.concat([
    em.model_selection(performance[is_clf_row], constraints_embed, objectives_embed),
    em.model_selection(performance[is_scm_row], constraints_embed, objectives_embed),
])

model_master_causality['SCM_counterfactuals'] = None
# For the duration row: the duration predicted from the factual credit_amount, minus the
# factual duration.
factual_credit = pd.DataFrame([u['credit_amount']], columns=['credit_amount'])
model_master_causality.loc[model_master_causality['outcome'] == 'duration', 'SCM_counterfactuals'] = (
    clf_Duration.predict(factual_credit) - u['duration']
)
model_master_causality['lb'] = 0.50
model_master_causality['ub'] = None

# Attach the list of input features to each row: all columns of X for the classifier ...
df_arr = pd.DataFrame({'features': [list(X.columns)]})
result_1 = pd.merge(
    model_master_causality.loc[model_master_causality['outcome'] == 'counterfactual_german', :],
    df_arr,
    how='left',
    left_index=True,  # Merge on both indexes, since right only has 0...
    right_index=True  # all the other rows will be NaN
)
# ... and only 'credit_amount' for the duration model.
df_arr = pd.DataFrame({'features': [['credit_amount']]})
result_2 = pd.merge(
    model_master_causality.loc[model_master_causality['outcome'] == 'duration', :].reset_index(drop=True),
    df_arr,
    how='left',
    left_index=True,  # Merge on both indexes, since right only has 0...
    right_index=True  # all the other rows will be NaN
)
model_master_causality = pd.concat([result_1, result_2])
# NOTE: this overwrites model_master with the first row of the new table.
model_master = pd.DataFrame(model_master_causality.iloc[0, :]).T
model_master_causality

                 outcome model_type  \
0  counterfactual_german        svm   

                                           save_path    task  objective  
0  results/svm/german_credit_v1_counterfactual_ge...  binary          0  
    outcome model_type                                        save_path  \
0  duration        mlp  results/mlp/german_credit_v1_duration_model.csv   

         task  objective  
0  continuous          0  


Unnamed: 0,outcome,model_type,save_path,task,objective,SCM_counterfactuals,lb,ub,features
0,counterfactual_german,svm,results/svm/german_credit_v1_counterfactual_ge...,binary,0,,0.5,,"[checking_status_0<=X<200, checking_status_<0,..."
0,duration,mlp,results/mlp/german_credit_v1_duration_model.csv,continuous,0,-0.137631,0.5,,[credit_amount]


In [43]:
# First attempt with the trust region as configured; if the solve fails, retry once with
# enlarge_tr=True. Both calls use model_master_causality.
try:
    CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                                            tr_region, enlarge_tr, num_counterfactuals, model_master_causality, data_pip)
except Exception as err:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate
    print('----TRUST REGION IS BEING ENLARGED----')
    # Report why the first attempt failed instead of discarding the error silently.
    print('first attempt failed with %s: %s' % (type(err).__name__, err))
    enlarge_tr = True
    CEs, CEs_, final_model = ce_helpers.opt(X, X1, u, F_r, F_b, F_int, F_coh, I, L, Pers_I, P, sp, mu,
                                            tr_region, enlarge_tr, num_counterfactuals, model_master_causality, data_pip)

Generating constraints for the trust region using 700 samples.
The trust region is not being enlarged.
... Trust region defined.
Embedding constraints for counterfactual_german
Embedding constraints for duration
['credit_amount']
SCM: -0.13763087010393413
Changed value of parameter PoolSolutions to 103
   Prev: 10  Min: 1  Max: 2000000000  Default: 10
Changed value of parameter PoolSearchMode to 1
   Prev: 0  Min: 0  Max: 2  Default: 0
    model.name="CE";
      - termination condition: infeasibleOrUnbounded
      - message from solver: <undefined>
ERROR: evaluating object as numeric value: x[checking_status_0<=X<200]
        (object: <class 'pyomo.core.base.var._GeneralVarData'>)
    No value for uninitialized NumericValue object x[checking_status_0<=X<200]
ERROR: evaluating object as numeric value: OBJ
        (object: <class 'pyomo.core.base.objective.ScalarObjective'>)
    No value for uninitialized NumericValue object x[checking_status_0<=X<200]
----TRUST REGION IS BEING ENLARGED-

### Visualize DF

In [44]:
# Part F table, restricted to the changed cells.
df_6 = ce_helpers.visualise_changes(
    clf, d,
    F_coh=F_coh, method='CE-OCL',
    CEs=CEs, CEs_=CEs_, only_changes=True,
)
df_6

Unnamed: 0,duration,credit_amount,installment_commitment,age,residence_since,existing_credits,num_dependents,class,checking_status,credit_history,...,foreign_worker,housing,job,other_parties,other_payment_plans,own_telephone,personal_status,property_magnitude,purpose,savings_status
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,-,-,-,-,-,-,-,1.0,no checking,-,...,-,-,-,-,-,-,-,life insurance,-,-
sol1,22.0,990.51,-,-,-,-,-,1.0,no checking,-,...,-,-,-,-,-,-,-,life insurance,-,-
sol2,26.83,1910.28,-,-,-,-,-,1.0,no checking,-,...,-,-,-,-,-,-,-,life insurance,-,-


### Evaluation

In [45]:
# Evaluate the Part F counterfactuals and add them to the summary as row 'Part F'.
df_orig = ce_helpers.visualise_changes(clf, d, F_coh=F_coh, method='CE-OCL', CEs=CEs, CEs_=CEs_)
part_f_scores = ce_helpers.evaluation(df_orig, d).set_index(pd.Index(['Part F']))
# Drop a stale 'Part F' row first, so that re-running this cell replaces the row.
# Previously a re-run kept growing CE_perf and set_index failed with a length mismatch.
CE_perf = pd.concat([CE_perf.drop(index='Part F', errors='ignore'), part_f_scores])
CE_perf

Unnamed: 0,validity,cat_prox,cont_prox,sparsity,cat_diver,cont_diver,cont_count_divers
Part A,1.0,1.0,-1715.94,0.76,,,
Part B,1.0,1.0,-16.88,0.9,,,
Part C,1.0,1.0,-1423.45,0.87,0.0,2845.81,0.14
Part D,1.0,1.0,-23.69,0.89,0.0,46.29,0.11
Part E,1.0,0.67,-230.12,0.6,0.36,441.68,0.4
Part F,1.0,0.77,-308.2,0.75,0.0,616.4,0.1


## Table

In [46]:
# Stack the changed-cells tables of Parts A-F (df_1 ... df_6) row-wise into one table.
df_complete = pd.concat((df_1, df_2, df_3, df_4, df_5, df_6), axis=0)
df_complete

Unnamed: 0,duration,credit_amount,installment_commitment,age,residence_since,existing_credits,num_dependents,class,checking_status,credit_history,...,foreign_worker,housing,job,other_parties,other_payment_plans,own_telephone,personal_status,property_magnitude,purpose,savings_status
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,15.02,-333.52,3.86,27.04,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,7.12,-,-,-,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,7.12,-,-,-,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
sol1,-,-2873.47,-,30.06,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
sol2,-,-,1.96,26.63,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-
original,24.0,1371.26,4.0,25.0,4.0,1.0,1.0,0.0,<0,existing paid,...,yes,rent,skilled,none,none,none,female div/dep/mar,real estate,new car,no known savings
sol0,7.12,-,-,-,-,-,-,1.0,-,-,...,-,-,-,-,-,-,-,-,-,-


In [47]:
# Optional export (disabled): uncomment to write the combined counterfactual table and the
# evaluation summary to CSV files under results/, named after dataset_name and alg.
# df_complete.to_csv('results/CE-OCL_%s_%s.csv' % (dataset_name, alg), index=True)
# CE_perf.to_csv('results/CE-OCL_%s_%s_eval.csv' % (dataset_name, alg), index=True)