### Import Libraries

In [329]:
import dice_ml
from dice_ml.utils import helpers # helper functions
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
print(tf.__version__)

1.13.0-rc1


### Loading dataset

In [330]:
train = pd.read_csv('./loan_train.csv')
print(train.shape)
display(train.head())

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [331]:
train = train.drop('Loan_ID', axis=1)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [332]:
# separate the categorical and numeric columns
cat_data = []
num_data = []

for i,c in enumerate(train.dtypes):
    if c == object:
        cat_data.append(train.iloc[:, i])
    else :
        num_data.append(train.iloc[:, i])

cat_data = pd.DataFrame(cat_data).T
num_data = pd.DataFrame(num_data).T
display(cat_data)
display(num_data)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,Urban,Y
1,Male,Yes,1,Graduate,No,Rural,N
2,Male,Yes,0,Graduate,Yes,Urban,Y
3,Male,Yes,0,Not Graduate,No,Urban,Y
4,Male,No,0,Graduate,No,Urban,Y
...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,Rural,Y
610,Male,Yes,3+,Graduate,No,Rural,Y
611,Male,Yes,1,Graduate,No,Urban,Y
612,Male,Yes,2,Graduate,No,Urban,Y


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849.0,0.0,,360.0,1.0
1,4583.0,1508.0,128.0,360.0,1.0
2,3000.0,0.0,66.0,360.0,1.0
3,2583.0,2358.0,120.0,360.0,1.0
4,6000.0,0.0,141.0,360.0,1.0
...,...,...,...,...,...
609,2900.0,0.0,71.0,360.0,1.0
610,4106.0,0.0,40.0,180.0,1.0
611,8072.0,240.0,253.0,360.0,1.0
612,7583.0,0.0,187.0,360.0,1.0


### 欠損値補完

In [333]:
cat_data = cat_data.apply(lambda x:x.fillna(x.value_counts().index[0]))
cat_data.isnull().sum().any() # no more missing data

False

In [334]:
num_data.fillna(num_data.median(), inplace=True)
num_data.isnull().sum().any() # no more missing data 

False

In [335]:
num_data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849.0,0.0,128.0,360.0,1.0
1,4583.0,1508.0,128.0,360.0,1.0
2,3000.0,0.0,66.0,360.0,1.0
3,2583.0,2358.0,120.0,360.0,1.0
4,6000.0,0.0,141.0,360.0,1.0


### 目的変数のバイナリ変数化

In [336]:
target_values = {'Y': 1 , 'N' : 0}

target = cat_data['Loan_Status']
cat_data.drop('Loan_Status', axis=1, inplace=True)

target = target.map(target_values)

### マージ

In [337]:
df = pd.concat([cat_data, num_data, target], axis=1)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,No,0,Graduate,No,Urban,5849.0,0.0,128.0,360.0,1.0,1
1,Male,Yes,1,Graduate,No,Rural,4583.0,1508.0,128.0,360.0,1.0,0
2,Male,Yes,0,Graduate,Yes,Urban,3000.0,0.0,66.0,360.0,1.0,1
3,Male,Yes,0,Not Graduate,No,Urban,2583.0,2358.0,120.0,360.0,1.0,1
4,Male,No,0,Graduate,No,Urban,6000.0,0.0,141.0,360.0,1.0,1


In [338]:
df.isna().any()

Gender               False
Married              False
Dependents           False
Education            False
Self_Employed        False
Property_Area        False
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
Loan_Status          False
dtype: bool

In [339]:
df.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
Property_Area         object
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Loan_Status            int64
dtype: object

In [340]:
df['Credit_History'].unique()

array([1., 0.])

In [341]:
df['Dependents'].unique()

array(['0', '1', '2', '3+'], dtype=object)

In [342]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,No,0,Graduate,No,Urban,5849.0,0.0,128.0,360.0,1.0,1
1,Male,Yes,1,Graduate,No,Rural,4583.0,1508.0,128.0,360.0,1.0,0
2,Male,Yes,0,Graduate,Yes,Urban,3000.0,0.0,66.0,360.0,1.0,1
3,Male,Yes,0,Not Graduate,No,Urban,2583.0,2358.0,120.0,360.0,1.0,1
4,Male,No,0,Graduate,No,Urban,6000.0,0.0,141.0,360.0,1.0,1


In [343]:
np.random.seed(1)
tf.set_random_seed(2)

In [344]:
# d = dice_ml.Data(dataframe=df, continuous_features=['ApplicantIncome',
#                                                                                   'CoapplicantIncome',
#                                                                                   'LoanAmount',
#                                                                                   'Loan_Amount_Term',
#                                                                                   'Credit_History'
#                                                                                   ], outcome_name='Loan_Status')#NG

d = dice_ml.Data(dataframe=df, continuous_features=['ApplicantIncome',
                                                                                  'CoapplicantIncome',
                                                                                  'LoanAmount',
                                                                                  ], outcome_name='Loan_Status')

In [345]:
sess = tf.InteractiveSession()

train, _ = d.split_data(d.normalize_data(d.one_hot_encoded_data))
X_train = train.loc[:, train.columns != 'Loan_Status']
y_train = train.loc[:, train.columns == 'Loan_Status']

ann_model = keras.Sequential()
ann_model.add(keras.layers.Dense(20, input_shape=(X_train.shape[1],), kernel_regularizer=keras.regularizers.l1(0.001), activation=tf.nn.relu))
ann_model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

ann_model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'])
ann_model.fit(X_train, y_train, validation_split=0.20, epochs=100, verbose=0, class_weight={0:1,1:2})



<tensorflow.python.keras.callbacks.History at 0x158041d68>

In [346]:
# provide the trained ML model to DiCE's model object
backend = 'TF'+tf.__version__[0] # TF1
m = dice_ml.Model(model=ann_model, backend=backend)

In [347]:
# initiate DiCE
exp = dice_ml.Dice(d, m)

In [351]:
factual_sample = df.iloc[7, 1:].to_dict()
display(factual_sample)
dice_exp = exp.generate_counterfactuals(factual_sample,
                                                                  total_CFs=4,
                                                                  desired_class='opposite')

{'Married': 'Yes',
 'Dependents': '3+',
 'Education': 'Graduate',
 'Self_Employed': 'No',
 'Property_Area': 'Semiurban',
 'ApplicantIncome': 3036.0,
 'CoapplicantIncome': 2504.0,
 'LoanAmount': 158.0,
 'Loan_Amount_Term': 360.0,
 'Credit_History': 0.0,
 'Loan_Status': 0}

Diverse Counterfactuals found! total time taken: 00 min 09 sec


In [352]:
# visualize the resutls
dice_exp.visualize_as_dataframe()

Query instance (original outcome : 0)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Female,Yes,3+,Graduate,No,Semiurban,3036.0,2504.0,158.0,360.0,0.0,0.146271



Diverse Counterfactual set (new outcome : 1)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,Yes,3+,Graduate,No,Urban,2726.3,3992.5,127.0,360.0,1.0,0.89
1,Male,Yes,1,Graduate,No,Semiurban,3959.3,1547.1,158.6,180.0,1.0,0.937
2,Female,Yes,3+,Graduate,No,Semiurban,5528.0,2462.4,158.6,180.0,0.0,0.751
3,Female,Yes,3+,Graduate,No,Semiurban,504.5,3021.0,176.2,360.0,1.0,0.895
