In [68]:
import warnings
warnings.filterwarnings('ignore') 

from utils import *
from get_data import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import pandas as pd

import time

from copy import deepcopy

In [None]:
EXPERIMENT = 'compass'

In [2]:
# Get the data set and do some preprocessing
params = Params("data/experiment_params.json")
np.random.seed(params.seed)

if EXPERIMENT == 'compass':
    X, y, cols = get_and_preprocess_compas_data(params)

# Add a random column -- this is what we'll have LIME/SHAP explain.
X['unrelated_column'] = np.random.choice([0,1],size=X.shape[0])
features = [c for c in X]

categorical_feature_name = ['two_year_recid', 'c_charge_degree_F', 'c_charge_degree_M',\
                            'sex_Female', 'sex_Male', 'race', 'unrelated_column']

categorical_feature_indcs = [features.index(c) for c in categorical_feature_name]

race_indc = features.index('race')
unrelated_indcs = features.index('unrelated_column')
X = X.values

In [3]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=0.1)
ss = StandardScaler().fit(X)
xtrain = ss.transform(xtrain)
xtest = ss.transform(xtest)
xall = ss.transform(X)

In [72]:
from importlib import reload  

import manifold_sampling_tabular as mst

reload(mst)

<module 'manifold_sampling_tabular' from '/blue/thai/minhvu/manifold_sampling/umap_discriminator/manifold_sampling_tabular.py'>

In [73]:
DIM = 2
all_labels = y

In [74]:
start_time = time.time()
mani_sampler = mst.Manifold_Tabular_Sampler(xall, dim = DIM, labels = all_labels)
duration = time.time() - start_time
print("Initialize duration: ", duration)

Initialize duration:  10.941585063934326


In [75]:
start_time = time.time()
mani_sampler.train_multiplier = 100
mani_sampler.std_train = 0.1
mani_sampler.train_pivot(no_pivots_per_label = 20, shuffle = False)
duration = time.time() - start_time
print("Train duration: ", duration)

Train duration:  61.78624677658081


In [76]:
def get_discriminator(X,y,n_estimators = 100, train_ratio = 0.5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 1 - train_ratio)
    the_rf = RandomForestClassifier(n_estimators=n_estimators).fit(X_train, y_train)
    y_pred = the_rf.predict(X_test)
    the_rf_result = (y_pred == y_test).sum()
    return the_rf, the_rf_result/y_test.shape[0], X_train.shape[0]

def get_discriminator_performance(X,y,rf):
    y_pred = rf.predict(X)
    the_rf_result = (y_pred == y).sum()
    return the_rf_result/y.shape[0], y.shape[0]

X_in = mani_sampler.pivots
X_rawper = np.vstack([perturbs[0] for perturbs in mani_sampler.raw_perturbs])
X_per = np.vstack([perturbs[0] for perturbs in mani_sampler.perturbs])
X_plane = np.zeros_like(X_in)
X_ortho = np.zeros_like(X_in)
for i in range(X_plane.shape[0]):
    X_plane[i] = mani_sampler.pivots[i] + mani_sampler.plane_noise[i][0]
    X_ortho[i] = mani_sampler.pivots[i] + mani_sampler.ortho_noise[i][0]

In [87]:
TRAIN_RATIO = 0.9

In [88]:
X_discriminator_rawper = np.vstack((X_in, X_rawper))
X_discriminator_per = np.vstack((X_in, X_per))
X_discriminator_plane = np.vstack((X_in, X_plane))
X_discriminator_ortho = np.vstack((X_in, X_ortho))
y_discriminator = np.concatenate((np.zeros(X_in.shape[0]), np.ones(X_per.shape[0])))

the_rf_rawper, test_acc_rawper, no_trains = get_discriminator(X_discriminator_rawper,y_discriminator, n_estimators = 100, train_ratio = TRAIN_RATIO)
print(test_acc_rawper, no_trains)
the_rf_per, test_acc_per, no_trains = get_discriminator(X_discriminator_per,y_discriminator, n_estimators = 100, train_ratio = TRAIN_RATIO)
print(test_acc_per, no_trains)
the_rf_plane, test_acc_plane, no_trains = get_discriminator(X_discriminator_plane,y_discriminator, n_estimators = 100, train_ratio = TRAIN_RATIO)
print(test_acc_plane, no_trains)
the_rf_ortho, test_acc_ortho, no_trains = get_discriminator(X_discriminator_ortho,y_discriminator, n_estimators = 100, train_ratio = TRAIN_RATIO)
print(test_acc_ortho, no_trains)


1.0 72
1.0 72
1.0 72
1.0 72


In [86]:
print("Create Testing environment")

start_time = time.time()
explanation_sampler = mst.Manifold_Tabular_Sampler(xall, dim = DIM, labels = all_labels)
duration = time.time() - start_time
print("Initialize duration: ", duration)

start_time = time.time()
explanation_sampler.train_multiplier = 100
explanation_sampler.std_train = 0.1
explanation_sampler.train_pivot(no_pivots_per_label = 20, shuffle = True)
duration = time.time() - start_time
print("Train duration: ", duration)

Create Testing environment
Initialize duration:  11.452292680740356
Train duration:  59.31259727478027


In [89]:
NUM_PERTURBATIONS = 10

In [92]:
Z_in = explanation_sampler.pivots
acc_rawper = 0
acc_per = 0
acc_plane = 0
acc_ortho = 0
var_rawper = 0
var_per = 0 
var_plane = 0
var_ortho = 0
for p in range(NUM_PERTURBATIONS):
    Z_rawper = np.vstack([perturbs[0] for perturbs in explanation_sampler.raw_perturbs])
    Z_per = np.vstack([perturbs[p] for perturbs in explanation_sampler.perturbs])
    Z_plane = np.zeros_like(Z_in)
    Z_ortho = np.zeros_like(Z_in)
    for i in range(Z_plane.shape[0]):
        Z_plane[i] = explanation_sampler.pivots[i] + explanation_sampler.plane_noise[i][p]
        Z_ortho[i] = explanation_sampler.pivots[i] + explanation_sampler.ortho_noise[i][p]
    
    Z_discriminator_rawper = np.vstack((Z_in, Z_rawper))
    Z_discriminator_per = np.vstack((Z_in, Z_per))
    Z_discriminator_plane = np.vstack((Z_in, Z_plane))
    Z_discriminator_ortho = np.vstack((Z_in, Z_ortho))
    y_discriminator = np.concatenate((np.zeros(Z_in.shape[0]), np.ones(Z_per.shape[0])))
    
    test_acc_rawper, no_test = get_discriminator_performance(Z_discriminator_rawper, y_discriminator, the_rf_rawper)
    test_acc_per, no_test = get_discriminator_performance(Z_discriminator_per, y_discriminator, the_rf_per)
    test_acc_plane, no_test = get_discriminator_performance(Z_discriminator_plane, y_discriminator, the_rf_plane)
    test_acc_ortho, no_test = get_discriminator_performance(Z_discriminator_ortho, y_discriminator, the_rf_ortho)
    
    acc_rawper = acc_rawper + test_acc_rawper
    acc_per = acc_per + test_acc_per
    acc_plane = acc_plane + test_acc_plane
    acc_ortho = acc_ortho + test_acc_ortho
    var_rawper = var_rawper + np.var(Z_rawper-Z_in)
    var_per = var_per + np.var(Z_per-Z_in)
    var_plane = var_plane + np.var(Z_plane-Z_in)
    var_ortho = var_ortho + np.var(Z_ortho-Z_in)

acc_rawper = acc_rawper/NUM_PERTURBATIONS
acc_per = acc_per/NUM_PERTURBATIONS
acc_plane = acc_plane/NUM_PERTURBATIONS
acc_ortho = acc_ortho/NUM_PERTURBATIONS
var_rawper = var_rawper/NUM_PERTURBATIONS
var_per = var_per/NUM_PERTURBATIONS
var_plane = var_plane/NUM_PERTURBATIONS
var_ortho = var_ortho/NUM_PERTURBATIONS

In [95]:
print(acc_rawper)
print(acc_per)
print(acc_plane)
print(acc_ortho)
print(var_rawper)
print(var_per)
print(var_plane)
print(var_ortho)

0.9875
0.9912500000000002
0.9824999999999999
0.845
0.010567870886417674
0.041613823904583916
0.0018870068284950394
0.007826615018643563
