## Conterfactual methods

In [1]:
import time
import pandas as pd
import numpy as np

from utils import read_diabetes_dataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

from cfmining.algorithms import MAPOCAM, BruteForce, Greedy
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import MonotoneClassifier
from cfmining.visualization import buildTable, PlotCounterfactuals
from cfmining.mip_builder import RecourseBuilder
from cfmining.action_set import ActionSet


### Dataset

In [2]:
columns = ['gender', 'age', 'time_in_hospital', 'diag_1', 'diag_2', 'diag_3', 'metformin',
           'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin']


In [3]:
X, y = read_diabetes_dataset(binary=True)
X = X.sample(2500)[columns]
y = y[X.index]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)


In [4]:
clf_logreg = LogisticRegression(max_iter=1000, solver='lbfgs')
grid = GridSearchCV(
    clf_logreg, param_grid={'C': np.logspace(-12, 12, num=25)},
    cv=5,
    scoring='roc_auc',
)

grid.fit(X_train, y_train)
clf_logreg = grid.best_estimator_


In [5]:
print('AUC performance:', roc_auc_score(y_test, clf_logreg.predict_proba(X_test)[:,1]))

AUC performance: 0.5859142202074668


In [6]:
prob = 0.78
clf_logreg_ = MonotoneClassifier(clf_logreg, X_train, y_train, threshold=prob)
coefficients = clf_logreg.coef_[0]
intercept = clf_logreg.intercept_[0]

In [7]:
X.head()

Unnamed: 0,gender,age,time_in_hospital,diag_1,diag_2,diag_3,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin
63404,1,65,3,13.0,3.0,7.0,0,2,0,0,2,0,0
27733,1,55,3,9.0,7.0,4.0,0,0,0,0,0,0,2
24552,0,55,3,7.0,7.0,17.0,0,2,0,0,0,0,0
52022,0,45,3,3.0,0.0,12.0,0,0,0,0,0,0,2
22585,0,75,1,3.0,7.0,7.0,0,0,0,0,0,0,0


In [8]:
action_set = ActionSet(X = X)
action_set.embed_linear_clf(coefficients=coefficients)

for feat in action_set:
    feat.step_type ="relative"
    feat.step_size = 0.1
    feat.update_grid()
    
action_set['age'].mutable = False
action_set['time_in_hospital'].mutable = False
action_set['metformin'].mutable = False

In [9]:
pd_coef = pd.Series(coefficients, index=X.columns).to_frame('Coefficients')
pd_coef

Unnamed: 0,Coefficients
gender,-0.007913
age,0.004908
time_in_hospital,0.024927
diag_1,-0.019223
diag_2,-0.005601
diag_3,0.008904
metformin,-0.016016
glimepiride,-0.011935
glipizide,0.001835
glyburide,-0.004349


In [10]:
scores = pd.Series(clf_logreg.predict_proba(X)[:, 1])
denied_individuals = scores.loc[lambda s: (s < prob)].index

In [11]:
choice = 162
choice = choice if choice in denied_individuals else np.random.choice(denied_individuals)
print('Choice:', choice)
individual = X.iloc[choice].values

Choice: 162


In [12]:
percCalc = PercentileCalculator(action_set=action_set)
percCriteria = PercentileCriterion(individual, percCalc)
percChCriteria = PercentileChangesCriterion(individual, percCalc)


In [13]:
p = prob
start = time.perf_counter()
rb = RecourseBuilder(
    optimizer="cplex",
    coefficients=coefficients,
    intercept=intercept - (np.log(p / (1. - p))),
    action_set=action_set,
    x=individual
)

build_info, indices = rb._get_mip_build_info()
output_1 = rb.fit()
print('Elapsed time:', time.perf_counter()-start)


Elapsed time: 0.07541174600009981


In [14]:
print('Number of actions', sum(output_1['actions']!=0))
(pd.Series(output_1['actions']+individual, index=X.columns).iloc[output_1['actions']!=0].to_frame('Actions')).replace(0,'-')

Number of actions 13


Unnamed: 0,Actions
gender,
age,
time_in_hospital,
diag_1,
diag_2,
diag_3,
metformin,
glimepiride,
glipizide,
glyburide,


In [15]:
en_nd_feat = MAPOCAM(action_set, individual, clf_logreg_, max_changes=3)
start = time.perf_counter()
en_nd_feat.fit()
print('Elapsed time:', time.perf_counter()-start)


Elapsed time: 0.005265698000584962




In [16]:
names = action_set.df['name'].values
overview_clean = buildTable(en_nd_feat, individual, percCriteria, names, include_original=False, include_cost=False)
overview_full = buildTable(en_nd_feat, individual, percCriteria, names, include_original=True, include_cost=True)

In [17]:
overview_full.replace(np.nan, '-')


Unnamed: 0,Orig
gender,1.0
age,55.0
time_in_hospital,3.0
diag_1,13.0
diag_2,13.0
diag_3,13.0
metformin,2.0
glimepiride,0.0
glipizide,2.0
glyburide,0.0


In [18]:
pltW = PlotCounterfactuals(overview_clean.iloc[:,:], individual)
pltW.show(None, bbox_to_anchor=(-0.01, -0.01))

  self.ax.set_ylim([-0.5, self.H-0.5])
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


ValueError: 'box_aspect' and 'fig_aspect' must be positive

<Figure size 50x0 with 1 Axes>