In [1]:
import warnings
warnings.filterwarnings('ignore') 

from utils import *
from get_data import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

import torch

import pickle
import numpy as np
import pandas as pd
import argparse
import time

from copy import deepcopy

# from manifold_torch import Manifold_Tabular_Sampler

def arg_parse():
    parser = argparse.ArgumentParser(description="UMAP discriminator")
    parser.add_argument(
            "--exp", dest="exp", help="Experiments"
        )
    parser.add_argument(
            "--std", dest="std", type=float, help="Perturbation std"
        )
    parser.add_argument(
        "--multiplier", dest="multiplier", type=int, help="Number of times an image is perturbed"
    )
    parser.add_argument(
        "--perturbations", dest="num_perturbations", type=int, help="Number of perturbations"
    )
    parser.add_argument(
        "--dim", dest="dim", type=int, help="Number of low dim"
    )
    parser.add_argument(
        "--pivots", dest="pivots", type=int, help="Number of pivots"
    )
    parser.add_argument(
        "--shuffle", dest="shuffle", type=bool, help="Shuffle the pivots"
    )
    parser.add_argument(
            "--train_ratio", dest="train_ratio", type=float, help="ratio of training for the rf"
        )
    
        
    parser.set_defaults(
        exp = 'compass',
        std = 0.0001,
        num_perturbations = 100,
        multiplier = 100,
        dim = 2,
        pivots = 20,
        shuffle = True,
        train_ratio = 0.9
    )
    return parser.parse_args()

# prog_args = arg_parse()

In [8]:
EXPERIMENT = 'cc'
PERTURBATION_STD = 0.0001
NUM_PERTURBATIONS = 100
MULTIPLIER = 100
DIM = 2
PIVOTS = 20
# TRAIN_RATIO = 0.9

print("EXPERIMENT: ", EXPERIMENT)
print("MULTIPLIER: ", MULTIPLIER)
print("PERTURBATION_STD: ", PERTURBATION_STD)
print("DIM: ", DIM)
print("PIVOTS: ", PIVOTS)

EXPERIMENT:  cc
MULTIPLIER:  100
PERTURBATION_STD:  0.0001
DIM:  2
PIVOTS:  20


In [9]:
# Get the data set and do some preprocessing
params = Params("data/experiment_params.json")
np.random.seed(params.seed)
if EXPERIMENT == 'compass':
    X, y, cols = get_and_preprocess_compas_data(params)
elif EXPERIMENT == 'german':
    X, y, cols = get_and_preprocess_german(params)
elif EXPERIMENT == 'cc':
    X, y, cols = get_and_preprocess_cc(params)

In [10]:
X.shape

(1994, 100)

In [4]:
print("Data shape: ", X.shape)

xtrain, xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.1)
ss = StandardScaler().fit(X)
xtrain = ss.transform(xtrain)
xtest = ss.transform(xtest)
xall = ss.transform(X)

Data shape:  (1994, 100)


In [5]:
def get_discriminator(X,y,n_estimators = 100):
    the_rf = RandomForestClassifier(n_estimators=n_estimators).fit(X, y)
    y_pred = the_rf.predict(X)
    the_rf_result = (y_pred == y).sum()
    return the_rf, the_rf_result/y.shape[0]

def get_discriminator_performance(X,y,rf):
    y_pred = rf.predict(X)
    the_rf_result = (y_pred == y).sum()
    return the_rf_result/y.shape[0]

In [6]:
import umap

def normalize(v, r):
    return v/np.sqrt(np.sum(v**2))*r

# class Manifold_Tabular_Sampler(object):
#     # Expect torch data
#     def __init__(self, data, dim = 2, random_state = 1,
#                 labels = None,
#                 std_train = 0.2):
#         """Init function.

#         Args:
#             data: traning data
#         """
#         self.data = data
#         self.dim = dim
#         self.std_train = std_train
#         self.no_training, self.num_features = self.data.shape
#         self.data_min = torch.min(self.data)
#         self.data_max = torch.max(self.data)
#         self.mapper = umap.UMAP(n_components = self.dim, random_state = random_state)
#         self.mapper.fit(data)
#         self.labels = labels
#         self.pivots = None
#         self.planes = None
        
#     def get_pivots(self, labels, no_pivots_per_label = 1, shuffle = False, target_labels = None):        
#         if target_labels == None:
#             target_labels = torch.unique(labels)
        
#         buff = []
#         for l in target_labels:
#             all_idx = (labels == l).nonzero(as_tuple=False)

#             if shuffle == False:
#                 idx = all_idx[range(no_pivots_per_label)]
#             else:
#                 idx = all_idx[random.sample(range(len(all_idx)),no_pivots_per_label )]
            
#             for i in idx:
#                 buff.append(i)
#         buff = [buff[i].cpu().detach().numpy()[0] for i in range(len(buff))]
            
#         self.pivots = self.data[buff].clone()
#         return buff
    
#     def transform(self, x_data):
#         return self.mapper.transform(x_data)

#     def inv_transform(self, low_data):
# #         Expect [N,d] data
#         return self.mapper.inverse_transform(low_data)

    
#     def get_G_from_samples(self, x_sample):
# #         x_sample should be inside the input distribution
#         matA = self.mapper.transform(x_sample.cpu().detach().numpy())
#         matB = x_sample.cpu().detach().numpy()
#         Xt = np.transpose(matA)
#         XtX = np.dot(Xt,matA)
#         Xty = np.dot(Xt,matB)
#         matG = np.linalg.solve(XtX,Xty)
#         return matG
    
#     def get_G_from_pivots(self):
#         matA = self.mapper.transform(self.pivots.cpu().detach().numpy())
#         matB = self.pivots.cpu().detach().numpy()
#         Xt = np.transpose(matA)
#         XtX = np.dot(Xt,matA)
#         Xty = np.dot(Xt,matB)
#         matG = np.linalg.solve(XtX,Xty)
#         return matG
    
#     def gen_perturbation_base(self, X, perturbation_multiplier=10, perturbation_std = 0.001):
# #         Gauss perturbations
#         all_x, all_y = [], []
#         var = 0
#         for _ in range(perturbation_multiplier):
#             perturbed_xtrain = np.random.normal(0, perturbation_std, size=X.shape)
#             p_train_x = np.vstack((X, np.clip(X + perturbed_xtrain, self.data_min, self.data_max)))
#             p_train_y = np.concatenate((np.zeros(X.shape[0]), np.ones(X.shape[0])))
#             all_x.append(p_train_x)
#             all_y.append(p_train_y)
#         all_x = np.vstack(all_x)
#         all_y = np.concatenate(all_y)
#         return torch.tensor(all_x), torch.tensor(all_y)

        
#     def get_G_local(self, x, 
#                     number_of_local_perturbations = 100,
#                     local_std = 0.001,
#                     perturb_only = True):
#         x_sample, y_sample = self.gen_perturbation_base(x.unsqueeze(0),
#                                             perturbation_multiplier=number_of_local_perturbations,
#                                             perturbation_std = local_std)
#         x_reverse = torch.tensor(self.inv_transform(self.transform(x_sample)))
#         if perturb_only == True:
#             return self.get_G_from_samples(x_reverse[y_sample == 1])
#         else:
#             return self.get_G_from_samples(x_reverse)
    
#     def train_pivots(self,  
#                      number_of_pivots_per_label = 10,
#                      number_of_local_perturbations = 50,
#                      local_std = 0.001, 
#                      shuffle = False):
#         if self.pivots == None:
#             _ =  self.get_pivots(self.labels, number_of_pivots_per_label, shuffle = shuffle)
        
#         Gs = []
#         Gvs = []
#         for pivot in self.pivots:
#             G = self.get_G_local(pivot, 
#                                  number_of_local_perturbations = number_of_local_perturbations,
#                                  local_std = local_std)
#             Gu, Gd, Gv = np.linalg.svd(G, full_matrices=False)
#             Gs.append(G)
#             Gvs.append(Gv)
        
#         self.G_pivots = Gs
#         self.Gv_pivots = Gvs
        
#     def get_pivot_perturbations(self, radius, 
#                                  number_of_perturbations_per_pivot = 1,
#                                  test = False):
#         gauss_perturbs = []
#         ortho_perturbs = []
#         plane_perturbs = []
        
#         for pivot in range(self.pivots.shape[0]):
#             Gv = self.Gv_pivots[pivot]
# #             Gv = self.Gv
            
#             for _ in range(number_of_perturbations_per_pivot):    
#                 gauss_noise = np.random.normal(0, 1, size=self.pivots[pivot].shape)
#                 plane_noise = np.zeros_like(gauss_noise)
#                 for d in range(Gv.shape[0]):
#                     proj = np.dot(gauss_noise, Gv[d])
#                     plane_noise = plane_noise + proj*Gv[d]        
#                 ortho_noise = gauss_noise - plane_noise

#                 # noise
#                 r = np.random.uniform()*radius
#                 ortho_norm = normalize(ortho_noise, r)
#                 plane_norm = normalize(plane_noise, r)
#                 gauss_norm = normalize(gauss_noise, r)

#                 # point clouds
#                 ortho_pc = self.pivots[pivot] + ortho_norm
#                 plane_pc = self.pivots[pivot] + plane_norm
#                 gauss_pc = self.pivots[pivot] + gauss_norm

#                 ortho_perturbs.append(ortho_pc)
#                 if test == True:
#                     plane_perturbs.append(plane_pc)
#                     gauss_perturbs.append(gauss_pc)
        
#         self.ortho_perturbs = torch.stack(ortho_perturbs)
#         if test == True:
#             self.gauss_perturbs = torch.stack(gauss_perturbs)
#             self.plane_perturbs = torch.stack(plane_perturbs)
#         else:
#             self.gauss_perturbs = None
#             self.plane_perturbs = None
#         return ortho_perturbs   

class Manifold_Tabular_Sampler(object):
    # Expect torch data
    def __init__(self, data, dim = 2, random_state = 1,
                labels = None,
                std_train = 0.2):
        """Init function.

        Args:
            data: traning data
        """
        self.data = data
        self.dim = dim
        self.std_train = std_train
        self.no_training, self.num_features = self.data.shape
        self.data_min = torch.min(self.data)
        self.data_max = torch.max(self.data)
        self.mapper = umap.UMAP(n_components = self.dim, random_state = random_state)
        self.mapper.fit(data)
        self.labels = labels
        self.pivots = None
        self.planes = None
        
    def get_pivots(self, labels, no_pivots_per_label = 1, shuffle = False, target_labels = None):        
        if target_labels == None:
            target_labels = torch.unique(labels)
        
        buff = []
        for l in target_labels:
            all_idx = (labels == l).nonzero(as_tuple=False)

            if shuffle == False:
                idx = all_idx[range(no_pivots_per_label)]
            else:
                idx = all_idx[random.sample(range(len(all_idx)),no_pivots_per_label )]
            
            for i in idx:
                buff.append(i)
        buff = [buff[i].cpu().detach().numpy()[0] for i in range(len(buff))]
            
        self.pivots = self.data[buff].clone()
        return buff
    
    def transform(self, x_data):
        return self.mapper.transform(x_data)

    def inv_transform(self, low_data):
#         Expect [N,d] data
        return self.mapper.inverse_transform(low_data)

    
    def get_G_from_samples(self, x_sample):
        matA = self.mapper.transform(x_sample.cpu().detach().numpy())
        matB = x_sample.cpu().detach().numpy()
        Xt = np.transpose(matA)
        XtX = np.dot(Xt,matA)
        Xty = np.dot(Xt,matB)
        matG = np.linalg.solve(XtX,Xty)
        return matG
    
    def get_G_from_pivots(self):
        matA = self.mapper.transform(self.pivots.cpu().detach().numpy())
        matB = self.pivots.cpu().detach().numpy()
        Xt = np.transpose(matA)
        XtX = np.dot(Xt,matA)
        Xty = np.dot(Xt,matB)
        matG = np.linalg.solve(XtX,Xty)
        return matG

    


In [7]:
TARGET = 0
SHUFFLE = False

start_time = time.time()
manifold_sampler = Manifold_Tabular_Sampler(torch.tensor(xall), dim = DIM, labels = torch.tensor(y))
duration = time.time() - start_time
print("Initialize duration: ", duration)

if TARGET == None:
    targets = torch.unique(torch.tensor(y))
    target_str = 'all'
else:
    targets = [TARGET]
    target_str = str(TARGET)

_ = manifold_sampler.get_pivots(manifold_sampler.labels, MULTIPLIER, shuffle = SHUFFLE, target_labels=targets)
manifold_G = manifold_sampler.get_G_from_pivots()
Gu, Gd, Gv = np.linalg.svd(manifold_G, full_matrices=False)

Initialize duration:  11.615175485610962


In [8]:
RADIUS = 0.005
base_RADIUS = 0.0

In [12]:
# start_time = time.time()
base_gauss_ = np.random.normal(0, 1, size=manifold_sampler.pivots.shape)
r = np.random.uniform()*base_RADIUS
base_gauss_norm = normalize(base_gauss_, r)
base_pc = manifold_sampler.pivots + base_gauss_norm

gauss_noise = np.random.normal(0, 1, size=manifold_sampler.pivots.shape)
plane_noise = np.zeros_like(gauss_noise)
for d in range(Gv.shape[0]):
    proj = np.dot(gauss_noise, Gv[d])
    for s in range(plane_noise.shape[0]):
        plane_noise[s] = plane_noise[s] + proj[s]*Gv[d]        
ortho_noise = gauss_noise - plane_noise

# noise
r = np.random.uniform()*RADIUS
ortho_norm = normalize(ortho_noise, r)
plane_norm = normalize(plane_noise, r)
gauss_norm = normalize(gauss_noise, r)

# point clouds
ortho_pc = base_pc + ortho_norm
plane_pc = base_pc + plane_norm
gauss_pc = base_pc + gauss_norm
# ortho_pc = manifold_sampler.to_1d(manifold_sampler.pivots) + ortho_norm
# plane_pc = manifold_sampler.to_1d(manifold_sampler.pivots) + plane_norm
# gauss_pc = manifold_sampler.to_1d(manifold_sampler.pivots) + gauss_norm

ori_pc = manifold_sampler.pivots.cpu().detach().numpy()
base_pc = base_pc.cpu().detach().numpy()
ortho_pc = ortho_pc.cpu().detach().numpy()
plane_pc = plane_pc.cpu().detach().numpy()
gauss_pc = gauss_pc.cpu().detach().numpy()

In [13]:
X_discriminator_gauss = np.vstack((base_pc[::2], gauss_pc[::2]))
X_discriminator_plane = np.vstack((base_pc[::2], plane_pc[::2]))
X_discriminator_ortho = np.vstack((base_pc[::2], ortho_pc[::2]))
y_discriminator = np.concatenate((np.zeros(ori_pc[::2].shape[0]), np.ones(ori_pc[::2].shape[0])))

the_rf_gauss, train_acc_gauss = get_discriminator(X_discriminator_gauss, y_discriminator, n_estimators = 100)
the_rf_plane, train_acc_plane = get_discriminator(X_discriminator_plane, y_discriminator, n_estimators = 100)
the_rf_ortho, train_acc_ortho = get_discriminator(X_discriminator_ortho, y_discriminator, n_estimators = 100)

print("Gauss:")
# print(train_acc_gauss)
print(get_discriminator_performance(gauss_pc[1::2], np.ones(gauss_pc[1::2].shape[0]) , the_rf_gauss))
print(get_discriminator_performance(base_pc, np.zeros(base_pc.shape[0]) , the_rf_gauss))

print("Plane:")
# print(train_acc_plane)
print(get_discriminator_performance(plane_pc[1::2], np.ones(plane_pc[1::2].shape[0]) , the_rf_plane))
print(get_discriminator_performance(base_pc, np.zeros(base_pc.shape[0]) , the_rf_plane))

print("Ortho:")
# print(train_acc_ortho)
print(get_discriminator_performance(ortho_pc[1::2], np.ones(ortho_pc[1::2].shape[0]) , the_rf_ortho))
print(get_discriminator_performance(base_pc, np.zeros(base_pc.shape[0]) , the_rf_ortho))

Gauss:
0.8
0.94
Plane:
0.78
0.98
Ortho:
0.84
0.89


In [58]:
p_LIME = 0.8

lime_mask = np.random.uniform(0, 1, size=manifold_sampler.pivots.shape)
lime_mask[lime_mask > p_LIME] = 1
lime_mask[lime_mask <= p_LIME] = 0

In [59]:
data_max = manifold_sampler.data_max.cpu().item()
data_min = manifold_sampler.data_min.cpu().item()
noise_uniform = np.random.uniform(data_min, data_max, size=manifold_sampler.pivots.shape)
noise_mask = noise_uniform * lime_mask

In [60]:
lime_pc = base_pc.copy()
lime_pc[noise_mask != 0] = noise_mask[noise_mask != 0]

In [61]:
X_discriminator_lime = np.vstack((base_pc[::2], lime_pc[::2]))
y_discriminator = np.concatenate((np.zeros(ori_pc[::2].shape[0]), np.ones(ori_pc[::2].shape[0])))

# the_rf_lime, train_acc_lime = get_discriminator(X_discriminator_lime, y_discriminator, n_estimators = 100)

print("LIME:")
# print(train_acc_gauss)
print(get_discriminator_performance(lime_pc[1::2], np.ones(lime_pc[1::2].shape[0]) , the_rf_lime))
print(get_discriminator_performance(base_pc, np.zeros(base_pc.shape[0]) , the_rf_gauss))


LIME:
0.46
0.94


In [74]:
SHAP_bg = ori_pc.mean(axis = 0)

In [78]:
SHAP_bg = ori_pc.mean(axis = 0)
SHAP_bg = np.tile(SHAP_bg,(ori_pc.shape[0],1))


In [66]:
# alpha = 0.1
# print("Gauss:")
# # print(train_acc_gauss)
# print(get_discriminator_performance(gauss_pc[1::2]-alpha*(gauss_pc[1::2]-base_pc[1::2]), np.ones(gauss_pc[1::2].shape[0]) , the_rf_gauss))

# print("Plane:")
# # print(train_acc_plane)
# print(get_discriminator_performance(plane_pc[1::2]-alpha*(plane_pc[1::2]-base_pc[1::2]), np.ones(plane_pc[1::2].shape[0]) , the_rf_plane))

# print("Ortho:")
# # print(train_acc_ortho)
# print(get_discriminator_performance(ortho_pc[1::2]-alpha*(ortho_pc[1::2]-base_pc[1::2]), np.ones(ortho_pc[1::2].shape[0]) , the_rf_ortho))

In [11]:
# start_time = time.time()
# manifold_sampler = Manifold_Tabular_Sampler(torch.tensor(xall), dim = DIM, labels = torch.tensor(y))
# duration = time.time() - start_time
# print("Initialize duration: ", duration)

In [12]:
# start_time = time.time()
# manifold_sampler.train_pivots(number_of_pivots_per_label = 50,
#                               number_of_local_perturbations = 50,
#                               local_std = 0.01, 
#                               shuffle = False)
# duration = time.time() - start_time
# print("Train pivots duration: ", duration)

In [13]:
# X_gauss_old = manifold_sampler.gauss_perturbs.cpu().detach().numpy()


In [14]:
# start_time = time.time()
# ortho_perturbs = manifold_sampler.get_pivot_perturbations(radius = 1.0, 
#                                                          number_of_perturbations_per_pivot = 20,
#                                                          test = True)
# duration = time.time() - start_time
# print("Perturbations generation duration: ", duration)

In [15]:
# X_input = manifold_sampler.pivots.cpu().detach().numpy()
# X_gauss = manifold_sampler.gauss_perturbs.cpu().detach().numpy()
# X_plane = manifold_sampler.plane_perturbs.cpu().detach().numpy()
# X_ortho = manifold_sampler.ortho_perturbs.cpu().detach().numpy()

# print(X_input.shape)
# print(X_gauss.shape)
# print(X_plane.shape)
# print(X_ortho.shape)

In [16]:
# X_discriminator_gauss = np.vstack((X_gauss_old[::20], X_gauss[::20]))
# X_discriminator_plane = np.vstack((X_gauss_old[::20], X_plane[::20]))
# X_discriminator_ortho = np.vstack((X_gauss_old[::20], X_ortho[::20]))
# y_discriminator = np.concatenate((np.zeros(X_input.shape[0]), np.ones(X_input.shape[0])))

In [17]:
# the_rf_gauss, train_acc_gauss = get_discriminator(X_discriminator_gauss, y_discriminator, n_estimators = 100)
# the_rf_plane, train_acc_plane = get_discriminator(X_discriminator_plane, y_discriminator, n_estimators = 100)
# the_rf_ortho, train_acc_ortho = get_discriminator(X_discriminator_ortho, y_discriminator, n_estimators = 100)

# print("Gauss:")
# # print(train_acc_gauss)
# print(get_discriminator_performance(X_gauss, np.ones(X_gauss.shape[0]) , the_rf_gauss))
# # print(get_discriminator_performance(X_input, np.zeros(X_input.shape[0]) , the_rf_gauss))

# print("Plane:")
# # print(train_acc_plane)
# print(get_discriminator_performance(X_plane, np.ones(X_plane.shape[0]) , the_rf_plane))
# # print(get_discriminator_performance(X_input, np.zeros(X_input.shape[0]) , the_rf_plane))

# print("Ortho:")
# # print(train_acc_ortho)
# print(get_discriminator_performance(X_ortho, np.ones(X_ortho.shape[0]) , the_rf_ortho))
# # print(get_discriminator_performance(X_input, np.zeros(X_input.shape[0]) , the_rf_ortho))