In [1]:
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder
from alibi.datasets import fetch_adult
from alibi.explainers import CounterFactualProto
import pandas as pd 
import tensorflow as tf
from alibi.explainers import CounterFactual
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [43]:
# Seed for the train/test split so the split (and every number printed further
# down) is the same after Restart & Run All.
RANDOM_STATE = 42

# Load the passenger table; 'Survived' is renamed to 'class' because it is the
# prediction target used below.
data = pd.read_csv('titanic.csv').rename(columns={'Survived': 'class'})

# Replace the raw codes with readable category labels.
data['Sex'] = data['Sex'].map({'male':'Male','female':'Female'})
data['Embarked'] = data['Embarked'].map({'S':'Southampton','C':'Cherbourg','Q':'Queenstown'})
data['Pclass'] = data['Pclass'].map({1:'First', 2:'Second', 3:'Third'})

# Collapse the two columns SibSp and Parch into a single count column.
data['Relatives'] = data['SibSp'] + data['Parch']

# Drop the columns that are not used as features (SibSp/Parch are replaced by
# 'Relatives'), then discard every row that still has a missing value.
data = data.drop(['PassengerId', 'Name','Ticket','Cabin', 'SibSp', 'Parch'], axis=1)
data = data.dropna()

# Names of the raw (pre-encoding) feature columns; later cells use len(f).
f = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Relatives']

features = data.drop('class', axis=1)
# Guard against the hand-written list drifting away from the real columns.
assert list(features.columns) == f, "feature list `f` does not match the feature columns"

training_features, testing_features, training_target, testing_target = \
    train_test_split(features, data['class'].values, random_state=RANDOM_STATE)

In [3]:
# Show the training split (pandas truncates the middle rows when printing).
print(training_features)

Pclass   Sex   Age     Fare     Embarked  Relatives
183  Second  Male   1.0  39.0000  Southampton          3
231   Third  Male  29.0   7.7750  Southampton          0
171   Third  Male   4.0  29.1250   Queenstown          5
422   Third  Male  29.0   7.8750  Southampton          0
509   Third  Male  26.0  56.4958  Southampton          0
..      ...   ...   ...      ...          ...        ...
129   Third  Male  45.0   6.9750  Southampton          0
370   First  Male  25.0  55.4417    Cherbourg          1
217  Second  Male  42.0  27.0000  Southampton          1
386   Third  Male   1.0  46.9000  Southampton          7
514   Third  Male  24.0   7.4958  Southampton          0

[534 rows x 6 columns]


In [4]:
# Column groups handled by the two branches of the preprocessor.
numeric_features = ['Age', 'Fare', 'Relatives']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric columns are min-max scaled.
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Categorical columns are one-hot encoded; categories unseen during fit are
# ignored instead of raising.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# The numeric branch is listed first, the categorical branch second.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [5]:
# Full pipeline: the column preprocessor followed by an XGBoost classifier with
# default settings.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
]
model = Pipeline(steps=pipeline_steps)


In [6]:
# Fit the preprocessor and the classifier together on the training split.
# Kept as the last expression so the fitted pipeline is displayed.
model.fit(training_features, training_target)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   MinMaxScaler(copy=True,
                                                                                feature_range=(0,
                                                                                               1)))],
                                                           verbose=False),
                                                  ['Age', 'Fare', 'Relatives']),
                                                 ('cat',
                                                  P

In [7]:
# Accuracy of the fitted pipeline on the held-out split.
test_predictions = model.predict(testing_features)
xgb_accuracy = accuracy_score(testing_target, test_predictions)
print("XGB {}".format(xgb_accuracy))

XGB 0.7921348314606742


In [8]:
# Hand-written passenger used to sanity-check the fitted pipeline.
# Age/Fare/Relatives are given as numbers rather than strings so the frame has
# the same numeric dtypes as the training columns and nothing depends on
# implicit string-to-float coercion inside the scaler.
record = {'Pclass': 'Third', 'Sex': 'Male', 'Age': 47.0, 'Fare': 7.25, 'Embarked': 'Southampton', 'Relatives': 0}
record_frame = pd.DataFrame([record])  # built once, reused for display and scoring
print(record_frame)

# Class probabilities for the single row.
prediction = model.predict_proba(record_frame)[0]
print(prediction)

# Index of the most probable class (the first one if there is a tie).
print(np.argmax(prediction))

Pclass   Sex Age  Fare     Embarked Relatives
0  Third  Male  47  7.25  Southampton         0
[0.9598487 0.0401513]
0


In [9]:
#print(model.predict_proba([np.array([3, 'Male', 47, 7.25, 'Southampton', 0])]))

In [9]:
def predict_fn(x):
    """Return class probabilities from the classifier step of ``model``.

    The input is handed straight to ``model['classifier']``; the pipeline's
    preprocessor is NOT applied here, so ``x`` must already be in the
    preprocessed layout the classifier was trained on.

    Parameters
    ----------
    x : array-like
        Preprocessed rows to score.

    Returns
    -------
    Whatever ``model['classifier'].predict_proba(x)`` returns.
    """
    return model['classifier'].predict_proba(x)

In [10]:
# A hand-encoded 11-column row scored by the classifier step alone ...
encoded_row = np.array([[1.30669356, 0.000000, 0.000000, 0.00, 0.00, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0]])
print(predict_fn(encoded_row))

# ... and a raw record scored through the whole pipeline (preprocessor included).
raw_row = pd.DataFrame([{'Pclass': 'Third', 'Sex': 'Male', 'Age': 47.0, 'Fare': 0.0000, 'Embarked': 'Southampton', 'Relatives': 0}])
print(model.predict_proba(raw_row))

[[0.97625226 0.02374776]]
[[0.975354   0.02464599]]


In [28]:
# Instance to explain. Age and Fare are numbers (not strings) so the frame has
# the same numeric dtypes as the training data.
test_record = {'Pclass': 'First', 'Sex': 'Male', 'Age': 10.0, 'Fare': 10.0, 'Embarked': 'Southampton', 'Relatives': 0}

# Only the preprocessor step is applied: `test` is the encoded row that the
# classifier (and the explainer built on predict_fn) consumes.
test = model['preprocessor'].transform(pd.DataFrame([test_record]))
print(test)

[[0.12038201 0.0195187  0.         1.         0.         0.
  0.         1.         0.         0.         1.        ]]


In [30]:
# Build {first encoded column of a categorical feature: number of categories}.
# The preprocessor emits the numeric columns first and the one-hot blocks
# after them, so the first categorical block starts at len(numeric_features).
cat_vars = {}
start = len(numeric_features)  # number of continuous features
# The loop variable is deliberately NOT called `f`: `f` is the module-level
# feature-name list and later cells rely on len(f).
for column in categorical_features:
    n_categories = len(np.unique(training_features[column]))
    cat_vars[start] = n_categories
    start += n_categories  # next block begins right after this one
print(cat_vars)


{3: 3, 6: 2, 8: 3}


In [45]:
# Scratch check of the two (1, len(f)) arrays later passed as feature_range.
# Only the last expression is displayed; the np.zeros result is discarded.
# Assumes `f` is still the list of feature names at this point.
np.zeros((1,len(f)))
np.ones((1,len(f)))


array([[1., 1., 1., 1., 1., 1.]])

In [46]:
# Lower/upper bounds handed to the explainer: one entry per name in `f`,
# all zeros and all ones respectively.
n_raw_features = len(f)
feature_min = np.zeros((1, n_raw_features))
feature_max = np.ones((1, n_raw_features))

# Counterfactual explainer driven by predict_fn. (1, 11) is the shape of one
# preprocessed instance; cat_vars describes the one-hot blocks inside it.
cf = CounterFactualProto(
    predict_fn,
    (1, 11),
    beta=0.01,
    cat_vars=cat_vars,
    ohe=True,
    max_iterations=1000,
    feature_range=(feature_min, feature_max),
    c_init=1.,
    c_steps=5,
    eps=(.01, .01),  # perturbation size for numerical gradients
)

In [15]:
# Show the training split after the fitted preprocessor has been applied.
print(model['preprocessor'].transform(training_features))

[[0.00728826 0.07612293 0.42857143 ... 0.         0.         1.        ]
 [0.35913546 0.01517579 0.         ... 0.         0.         1.        ]
 [0.04498618 0.05684821 0.71428571 ... 0.         1.         0.        ]
 ...
 [0.52249309 0.05270049 0.14285714 ... 0.         0.         1.        ]
 [0.00728826 0.0915427  1.         ... 0.         0.         1.        ]
 [0.2963056  0.01463083 0.         ... 0.         0.         1.        ]]


In [26]:
# Fit the explainer on the preprocessed training matrix, i.e. the same encoded
# layout that predict_fn scores.
training_encoded = model['preprocessor'].transform(training_features)
cf.fit(training_encoded, d_type='abdm', disc_perc=[25, 50, 75])

In [27]:
# Run the counterfactual search for the preprocessed instance `test`,
# passing [0] as the target class list.
explanation = cf.explain(test, target_class=[0])

In [29]:
# Show the counterfactual entry of the explanation; this prints None when the
# explanation holds no counterfactual.
print(explanation['cf'])

None


In [16]:
# Score one hand-encoded 11-column row with the classifier step, then show the
# encoded instance being explained for comparison.
probe_row = np.array([[
    0.30887157, 0.        , 0.        , 0.        , 0.        ,
    1.        , 1.        , 0.        , 0.        , 0.        ,
    1.        ,
]])
print(predict_fn(probe_row))
print(test)

[[0.28084832 0.7191517 ]]
[[0.12038201 0.0195187  0.         1.         0.         0.
  0.         1.         0.         0.         1.        ]]


In [17]:
# Encoded counterfactual instance stored under explanation['cf']['X'].
# NOTE(review): explanation['cf'] can be None (one recorded run prints None for
# it); in that case this subscript fails - confirm a counterfactual was found.
x = explanation['cf']['X']
print(x)

[[0.16006179 0.0195187  0.         1.         0.         0.
  0.         1.         0.         0.         1.        ]]


In [18]:
# Raw categorical columns of the training split as a plain array of labels.
categorical_columns = ['Pclass', 'Sex', 'Embarked']
one_hot_training = training_features[categorical_columns].to_numpy()
print(one_hot_training)

[['Third' 'Male' 'Southampton']
 ['Third' 'Male' 'Southampton']
 ['Second' 'Female' 'Southampton']
 ...
 ['Second' 'Male' 'Southampton']
 ['First' 'Female' 'Cherbourg']
 ['Second' 'Female' 'Cherbourg']]


In [19]:
# Stand-alone encoder (separate from the one inside `model`), used below to
# decode one-hot columns back to labels.
one = OneHotEncoder(handle_unknown='ignore')

In [20]:
# Learn the categories from the training split's categorical columns.
one.fit(one_hot_training)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [21]:
# Encode the first training row and print it as a dense array.
first_label_row = np.array(one_hot_training[0])
encoded_first_row = one.transform([first_label_row])
print(encoded_first_row.toarray())

[[0. 0. 1. 0. 1. 0. 0. 1.]]


In [22]:
# Decode a hand-written one-hot row back to category labels.
encoded_example = [[0., 0., 1., 1., 0., 0., 0., 1.]]
print(one.inverse_transform(encoded_example))

[['Third' 'Female' 'Southampton']]


In [23]:
# Columns from index 3 onward of the counterfactual row are decoded back to
# category labels.
cf_onehot_part = explanation['cf']['X'][0][3:]
print(one.inverse_transform([cf_onehot_part]))

[['First' 'Male' 'Southampton']]


In [24]:
# Raw numeric columns of the training split as a plain array; show the first row.
numeric_columns = ['Age', 'Fare', 'Relatives']
scaler_training = training_features[numeric_columns].to_numpy()
print(scaler_training[0])

[26.      8.6625  2.    ]


In [25]:
# Stand-alone scaler (separate from the one inside `model`), fitted on the raw
# numeric training columns so scaled values can be mapped back to raw units.
scaler = MinMaxScaler()
# Kept as the last expression so the fitted scaler is displayed.
scaler.fit(scaler_training)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [26]:
# Scale the first numeric training row with the stand-alone scaler.
first_numeric_row = np.array(scaler_training[0])
print(scaler.transform([first_numeric_row]))

[[0.32143755 0.01690807 0.28571429]]


In [27]:
# Map the first three (scaled) columns of the counterfactual row back to raw
# units with the stand-alone scaler.
cf_numeric_part = explanation['cf']['X'][0][0:3]
print(scaler.inverse_transform([cf_numeric_part]))

[[13.15771737  9.9999997   0.        ]]


In [28]:
# First three columns of the counterfactual row, still in scaled form.
cf_row = explanation['cf']['X'][0]
print(cf_row[0:3])

[0.16006179 0.0195187  0.        ]
