In [61]:
import numpy as np
from dataLoader import importData


# Dataset selector handed to importData.
data_set = 1
# Anomaly rates to test, each a percentage of the inlier count.
percentage_anomalies = [0.1, 1, 10, 20, 30]

x_train, y_train, x_test, y_test, dataset_name = importData(data_set)
#x_train, x_test = pca_transform(x_train, x_test)

# Merge the train and test splits: from here on x_train / y_train hold the
# whole dataset, not just the training part.
x_train = np.concatenate([x_train, x_test], axis=0)
y_train = np.concatenate([y_train, y_test])
# Assumes labels are consecutive integers starting at 0 -- TODO confirm.
num_class = np.max(y_train) + 1
print('Number of classes:', num_class)

Data-set: STL-10
Number of classes: 10


In [56]:
# Directory prefix used when the evaluation results are written to disk.
store_folder = '../stored_results/'

In [65]:
from shell_anon import NormalizedClusterLearner
from normalization import InstanceNormalization, ErgoNormalization, NaiveNormalization, NoNormalization, PreTrainedNormalization
from sklearn.metrics import roc_auc_score, average_precision_score, pairwise_distances


def build_eval_set(x_train, y_train, ind, p_anon, rng=None):
    """Build a one-class evaluation set contaminated with anomalies.

    Every sample labelled ``ind`` is kept as the normal class; a random
    subset of the remaining samples is appended as anomalies.

    Parameters
    ----------
    x_train : array-like
        Samples, indexed along axis 0.
    y_train : array-like
        Label of each sample in ``x_train``.
    ind : int
        Label treated as the normal (inlier) class.
    p_anon : float
        Number of anomalies to add, as a percentage of the inlier count.
    rng : None, int or numpy.random.Generator, optional
        Seed or generator used to pick the anomalies. ``None`` (default)
        gives a different draw on every call.

    Returns
    -------
    data : numpy.ndarray
        The inliers followed by the sampled anomalies.
    gt : numpy.ndarray
        Integer labels aligned with ``data``: 1 for inliers, 0 for anomalies.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    is_inlier = y_train == ind
    x_in = x_train[is_inlier]
    x_out = x_train[~is_inlier]

    # Clamp the request so gt stays aligned with data when fewer outliers
    # exist than the percentage asks for.
    num_out = min(int(p_anon / 100 * x_in.shape[0]), x_out.shape[0])

    # random.shuffle swaps rows through views on a multi-dimensional ndarray,
    # which silently duplicates rows; draw row indices instead.
    picked = np.random.default_rng(rng).permutation(x_out.shape[0])[:num_out]

    data = np.concatenate([x_in, x_out[picked]], axis=0)
    gt = np.zeros(data.shape[0], dtype=int)
    gt[:x_in.shape[0]] = 1

    return data, gt



class AnonEvaluationStatistics():
    """AUROC / AUPRC bookkeeping for one anomaly detector.

    Each class is treated in turn as the normal class and contaminated with
    every anomaly percentage in ``percentiles``; the detector is refit and
    scored on each resulting set.

    The result attributes are ``None`` until :meth:`eval` has run:

    * ``auroc`` / ``auprc`` -- arrays of shape ``(num_class, len(percentiles))``
    * ``mean_auroc`` / ``mean_auprc`` -- those scores averaged over classes
    """

    def __init__(self,
                 percentiles=(0.1, 1, 10, 20, 30),
                 name='unnamed'):
        """
        Parameters
        ----------
        percentiles : iterable of float
            Anomaly rates to evaluate, each a percentage of the inlier count.
        name : str
            Free-form label for this evaluation run.
        """
        # Keep a per-instance copy: a shared mutable default (or the caller's
        # own list) must not change under us or leak between instances.
        self.percentiles = list(percentiles)
        self.name = name
        self.auroc = None
        self.auprc = None
        self.mean_auroc = None
        self.mean_auprc = None

    def eval(self, x_train, y_train, clf, print_summary=True):
        """Fit and score ``clf`` for every (class, anomaly percentage) pair.

        Parameters
        ----------
        x_train : array-like
            Samples, indexed along axis 0.
        y_train : array-like
            Labels; classes ``0 .. max(y_train)`` are each evaluated.
        clf : object
            Detector exposing ``fit(data)`` and ``score(data)``.
        print_summary : bool
            When true, print one AUROC line per (class, percentage) pair.

        Results are stored on the instance; nothing is returned.
        """
        # Labels may arrive as floats; range() and np.zeros need a plain int.
        num_class = int(np.max(y_train)) + 1

        auroc_scores = np.zeros([num_class, len(self.percentiles)])
        auprc_scores = np.zeros([num_class, len(self.percentiles)])

        for class_num in range(num_class):
            for anon_ind, p_anon in enumerate(self.percentiles):
                data, gt = build_eval_set(x_train, y_train, class_num, p_anon)

                # The detector is refit from scratch on every contaminated set.
                clf.fit(data)
                score = clf.score(data)
                # NOTE(review): gt marks inliers as 1, so the metrics expect a
                # higher score to mean "more normal" -- confirm clf.score
                # follows that orientation for every learner passed in.
                # NOTE(review): a percentage that yields zero anomalies leaves
                # gt with a single class -- confirm the metrics cope with that.
                auroc = roc_auc_score(gt, score)
                auprc = average_precision_score(gt, score)

                auroc_scores[class_num, anon_ind] = auroc
                auprc_scores[class_num, anon_ind] = auprc

                if print_summary:
                    print('class:', class_num + 1, '/', num_class,
                          ', anon percentage:', p_anon,
                          ', auroc:', auroc)

        self.auroc = auroc_scores
        self.auprc = auprc_scores

        # Average over classes (axis 0): one value per anomaly percentage.
        self.mean_auroc = np.mean(auroc_scores, axis=0)
        self.mean_auprc = np.mean(auprc_scores, axis=0)
            


                
    
# Run the full class x anomaly-percentage evaluation for a 5-cluster learner
# with ergo normalization.
num_clus = 5
clf_ergo = NormalizedClusterLearner(num_clus = num_clus, norm = ErgoNormalization())
# eval_name is reused as the file stem when the results are stored.
eval_name = dataset_name + '_naive_ergo_normalization_' + str(num_clus)
anon_eval = AnonEvaluationStatistics(name = eval_name)
anon_eval.eval(x_train, y_train, clf_ergo)
    

class: 1 / 10 , anon percentage: 0.1 , auroc: 0.9992307692307693
class: 1 / 10 , anon percentage: 1 , auroc: 0.9975739644970415
class: 1 / 10 , anon percentage: 10 , auroc: 0.865621301775148
class: 1 / 10 , anon percentage: 20 , auroc: 0.8722485207100591
class: 1 / 10 , anon percentage: 30 , auroc: 0.878534516765286
class: 2 / 10 , anon percentage: 0.1 , auroc: 0.9946153846153846
class: 2 / 10 , anon percentage: 1 , auroc: 0.9814201183431954
class: 2 / 10 , anon percentage: 10 , auroc: 0.3835029585798816
class: 2 / 10 , anon percentage: 20 , auroc: 0.38635798816568045
class: 2 / 10 , anon percentage: 30 , auroc: 0.241138067061144
class: 3 / 10 , anon percentage: 0.1 , auroc: 0.99
class: 3 / 10 , anon percentage: 1 , auroc: 0.9905325443786983
class: 3 / 10 , anon percentage: 10 , auroc: 0.7643372781065089
class: 3 / 10 , anon percentage: 20 , auroc: 0.6024541420118343
class: 3 / 10 , anon percentage: 30 , auroc: 0.6445187376725838
class: 4 / 10 , anon percentage: 0.1 , auroc: 0.99846153

In [None]:
    
# BayesClusterLearner is otherwise only imported further down the notebook;
# import it here so this cell runs on a fresh top-to-bottom execution.
from shell_anon import BayesClusterLearner

num_clus = 300
# The variable keeps the name clf_ergo for continuity with the other cells,
# but this is a Bayes learner with instance normalization.
clf_ergo = BayesClusterLearner(num_clus = num_clus, norm = InstanceNormalization())
eval_name = dataset_name + '_bayes_instance_normalization_' + str(num_clus)
anon_eval = AnonEvaluationStatistics(name = eval_name)
# NOTE(review): the recorded AUROC values for this run are close to 0, which
# suggests the score orientation is inverted relative to gt -- confirm.
anon_eval.eval(x_train, y_train, clf_ergo)
    

class: 1 / 10 , anon percentage: 0.1 , auroc: 0.0007692307692307692
class: 1 / 10 , anon percentage: 1 , auroc: 0.008106508875739644
class: 1 / 10 , anon percentage: 10 , auroc: 0.0049526627218934895


In [80]:
# Debugging probe: show which normalization class the current `clf` carries.
# NOTE(review): `inspect` is imported but not used in this cell.
import inspect
print(clf.norm.__class__)

<class 'normalization.InstanceNormalization'>


In [83]:
# Debugging probe: the recorded run raised AttributeError here because
# InstanceNormalization has no `dtype` attribute.
clf.norm.dtype

AttributeError: 'InstanceNormalization' object has no attribute 'dtype'

In [66]:
import pickle

import os

# store_folder already ends with a slash; os.path.join avoids the doubled
# separator that plain string concatenation produced.
store_path = os.path.join(store_folder, eval_name + '.pickle')
# Persist the whole statistics object so it can be reloaded later.
with open(store_path, 'wb') as file:
    pickle.dump(anon_eval, file)


In [68]:
# Read the stored statistics object back from store_path as a round-trip check.
# Only unpickle files this notebook wrote itself: pickle.load executes
# whatever the file encodes.
with open(store_path, 'rb') as file:
    b = pickle.load(file)

In [69]:
# AUPRC matrix of the reloaded object: one row per class, one column per anomaly percentage.
b.auprc

array([[0.99999941, 0.99997571, 0.98570921, 0.9741688 , 0.96443542],
       [0.99999585, 0.99981381, 0.88742899, 0.79310751, 0.62606375],
       [0.99999228, 0.99990553, 0.97280129, 0.88393178, 0.85337837],
       [0.99999882, 0.99996645, 0.89753218, 0.84220134, 0.6524824 ],
       [1.        , 0.99991725, 0.96075206, 0.87181886, 0.81882269],
       [0.99999882, 0.99995269, 0.86747757, 0.75494247, 0.68772233],
       [0.99999228, 0.99984447, 0.9467382 , 0.80887817, 0.75428585],
       [1.        , 0.99999882, 0.94410405, 0.89490554, 0.70199899],
       [0.99998085, 0.99969602, 0.96079609, 0.93299763, 0.82296736],
       [0.99990781, 0.99964908, 0.97387531, 0.94399604, 0.79003723]])

In [39]:
# Experiment: store the statistics object through np.save; it is read back from test.npy below.
np.save('../stored_results/test', anon_eval)

In [41]:
# The value comes back as a 0-d object array wrapping the instance (see the
# recorded output), not as the instance itself.
b = np.load('../stored_results/test.npy', allow_pickle=True)

In [45]:
# Display the value loaded from test.npy.
b

array(<__main__.AnonEvaluationStatistics object at 0x7fd494e8e3a0>,
      dtype=object)

In [19]:
# AUROC matrix: one row per class, one column per anomaly percentage.
anon_eval.auroc

array([[0.99923077, 0.99923077, 0.86133136, 0.84907396, 0.87331755],
       [0.99923077, 0.93526627, 0.36343787, 0.38632249, 0.22253057],
       [0.99692308, 0.9952071 , 0.72683432, 0.5299497 , 0.54893097],
       [0.99769231, 0.99639053, 0.38034911, 0.39021598, 0.26346351],
       [1.        , 0.99863905, 0.66591124, 0.44484911, 0.50415582],
       [0.99846154, 0.99781065, 0.32779882, 0.31661243, 0.15192899],
       [0.99615385, 0.99710059, 0.59268047, 0.41322189, 0.42093688],
       [1.        , 0.99970414, 0.56349704, 0.38806805, 0.3663432 ],
       [0.99692308, 0.97384615, 0.6452071 , 0.62826627, 0.54188166],
       [0.96461538, 0.94710059, 0.68539349, 0.5778284 , 0.55277318]])

In [20]:
# Average AUPRC over classes (axis 0), giving one value per anomaly percentage.
np.mean(anon_eval.auprc, axis=0)

array([0.99999605, 0.99983237, 0.9299021 , 0.81714435, 0.73221026])

In [None]:
# Scratch check: print the positional index of each anomaly percentage.
for anon_ind, p_anon in enumerate(percentage_anomalies):
    print(anon_ind)


In [None]:
# Per-algorithm result lists, keyed by display name. Each key gets its own
# empty list (the last entry previously had no value, a syntax error).
algo_stats = {
    'Ergo Normalization Naive': [],
    'Instance Normalization Naive': [],
    'Instance Normalization Bayes': [],
}

In [None]:
# Manual version of the evaluation: for every class and anomaly percentage,
# build a contaminated set, fit an ergo-normalized learner and report AUROC.
num_clus = 10
rng = np.random.default_rng()

for in_class in range(num_class):
    print('processing class:', in_class)
    for p_anon in percentage_anomalies:

        # in_class and out_class instances (filter on the loop variable, not
        # on a stale `ind` left over from another cell)
        x_in = x_train[y_train == in_class]
        x_out = x_train[y_train != in_class]
        # random.shuffle duplicates rows on a multi-dimensional ndarray;
        # reorder through a permutation of row indices instead.
        x_out = x_out[rng.permutation(x_out.shape[0])]

        # Clamp so gt stays aligned with data if outliers run short.
        num_out = min(int(p_anon/100 * x_in.shape[0]), x_out.shape[0])
        print('number of normal:',  x_in.shape[0])
        print('number of anomalies:',  num_out)

        data = np.concatenate([x_in, x_out[:num_out]], axis=0)
        gt = np.zeros(data.shape[0], dtype=int)
        gt[:x_in.shape[0]] = 1

        clf_ergo = NormalizedClusterLearner(num_clus=num_clus, norm=ErgoNormalization())
        clf_ergo.fit(data)
        # Score with the learner that was just fitted, not a leftover `clf`.
        d = clf_ergo.score(data)
        # NOTE(review): the score is negated here but not inside
        # AnonEvaluationStatistics.eval -- confirm which orientation is right.
        auroc = roc_auc_score(gt, -d)
        print('auroc score:', auroc)
    print('\n')



In [2]:
import random
    
# Scratch setup: class `ind` is the normal class, contaminated with num_out
# randomly chosen samples from all other classes.
num_out = 100
ind = 0
num_clus = 5
x_in = x_train[y_train==ind]
num_in = x_in.shape[0]

x_out = x_train[y_train!=ind]
# random.shuffle duplicates rows on a multi-dimensional ndarray; reorder
# through a permutation of row indices instead.
x_out = x_out[np.random.default_rng().permutation(x_out.shape[0])]

data = np.concatenate([x_in, x_out[:num_out]], axis=0)
# Ground truth: 1 for the leading inlier rows, 0 for the appended anomalies.
gt = np.zeros(data.shape[0], dtype=int)
gt[:num_in] = 1


In [4]:
from shell_anon import NormalizedClusterLearner, BayesClusterLearner
from normalization import InstanceNormalization, ErgoNormalization, NaiveNormalization, NoNormalization, PreTrainedNormalization
from sklearn.metrics import roc_auc_score, average_precision_score, pairwise_distances

# Baseline: cluster learner built without an explicit `norm` argument.
clf = NormalizedClusterLearner(num_clus=num_clus)
clf.fit(data)
d = clf.score(data)

# The score is negated before AUROC is computed against gt (1 = inlier) --
# presumably d is distance-like, larger meaning more anomalous; TODO confirm.
roc_auc_score(gt, -d)

0.8620153846153845

In [5]:
# Same scoring as the previous cell, with instance normalization.
clf = NormalizedClusterLearner(num_clus=num_clus, norm=InstanceNormalization())
clf.fit(data)
d = clf.score(data)

# Negated score, as in the cell before -- presumably distance-like; TODO confirm.
roc_auc_score(gt, -d)

0.8609615384615384

In [8]:
# Fit a Bayes learner with 300 clusters and instance normalization on the
# same contaminated set; scoring happens in the following cell.
clf = BayesClusterLearner(num_clus=300, norm=InstanceNormalization())
clf.fit(data)

In [7]:
# Here the score is used un-negated -- presumably a probability-like value
# where higher means more normal; TODO confirm.
# NOTE(review): the execution counts show this cell (7) ran before the fit
# cell above (8), so the recorded number may belong to a different `clf`.
prob = clf.score(data)
roc_auc_score(gt, prob)

0.9973