In [1]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

from sklearn.neighbors import NearestNeighbors as sknn

import pickle
import re

In [2]:
# push file to git repo
# split clean sample and put into test set as negative classes (instead of current methodology)
# cross validation on the clean sample is another idea?

In [3]:
with open("/users/albertwen/downloads/mae_data/val_losses.pkl", "rb") as f:
  raw_stdout = pickle.load(f)
split = raw_stdout.split('\n')
pattern = re.compile("Epoch: .*")
base_loss = [i for i in split if pattern.match(i)][:-1]
base_loss = [float(re.search('loss: (0\.\d*) .*', i).group(1)) for i in base_loss]

In [4]:
with open("/users/albertwen/downloads/mae_data/fgsm4_losses.pkl", "rb") as f:
  raw_stdout = pickle.load(f)
split = raw_stdout.split('\n')
pattern = re.compile("Epoch: .*")
fgsm4_loss = [i for i in split if pattern.match(i)][:-1]
fgsm4_loss = [float(re.search('loss: (0\.\d*) .*', i).group(1)) for i in fgsm4_loss]

In [5]:
with open("/users/albertwen/downloads/mae_data/fgsm8_losses.pkl", "rb") as f:
  raw_stdout = pickle.load(f)
split = raw_stdout.split('\n')
pattern = re.compile("Epoch: .*")
fgsm8_loss = [i for i in split if pattern.match(i)][:-1]
fgsm8_loss = [float(re.search('loss: (0\.\d*) .*', i).group(1)) for i in fgsm8_loss]

# Conformal Predictor Class

In [6]:
class ConformalAnomalyDetector():
    """
    Conformal Anomaly Detector Class
    @params
        ICM: class
            An object whose call operation should produce an array of conformal scores
        z: tuple (len==2)
            Each element is an (x,y) pair of the training set for CAD
        significance: float
            The significance level (must be between 0 and 1 exclusive)
    """
    def __init__ (self, train, ICM, significance=0.05):
        self._ICM = ICM
        self.train = train

        assert significance > 0 and significance < 1, \
            print('Significance must be in range (0,1).')
        self._significance = significance

        self.baseline = self._ICM(self.train, self.train)

        
    """
    Return true or false if the test example are an anomaly
    @params
        test: np.ndarray
            A 1xn test example where m is the number of test examples and n is the number of dimensions
    @return: bool
        True if test input is anomaly and false otherwise 
    """
    def testIfAnomaly(self, test_val):
        # conformal_set = np.concatenate((self.z,np.asarray(test)))
        # conformal_set = np.asarray(self.z + [self.test_test]).reshape((len(self.z) + 1, 1))
        conformal_scores = self._ICM(self.train, test_val)

        p = np.sum(self.baseline >= conformal_scores[0]) / len(self.baseline)
        # p = np.sum(conformal_scores >= conformal_scores[-1]) / (len(self.train)+1)
        
        return p < self._significance

    """
    Return array of true or false if the test examples are an anomaly
    @params
        test: np.ndarray
            A mxn test example where m is the number of test examples and n is the number of dimensions
    @return: np.ndarray
        An mx1 array of true if test input is anomaly and false otherwise 
    """ 
    def __call__(self,test_set):
        isAnomaly = [self.testIfAnomaly(test_set[i]) for i in range(len(test_set))]
        return isAnomaly

    """
    Change significance level (hyper-parameter)
    @params
        significance: float
            The significance level (must be between 0 and 1 exclusive)
    """ 
    def set_significance(self,significance):
        assert significance > 0 and significance < 1, \
            print('Significance must be in range (0,1).')
        self._significance = significance

# KNN class

In [7]:
class KNearestNeighbors():

    # change this to return distance[0] from sklearn nearest neighbors implementation
    # described below
    """
        A simple real-valued function to compute the conformal scores
        Each conformal score is the average k-nearest neighbors according to a specified metric
        @params
            k: int
                Determines k nearest neighbors
            metric: str
                distance metric (see scipy's pdist function for valid metrics)
    """
    def __init__(self, k, metric='euclidean'):
        self._k = k
        self._metric = metric
        self.neighbors = sknn(n_neighbors=k)

        # self.all_distances = pdist(np.asarray(train_set + test_set).reshape((len(train_set) + len(test_set), 1)), self._metric)
        # self.all_distances = squareform(self.all_distances)

        # self.train_distances = self.all_distances[:len(self.train)].T[:len(self.train)].T

    """
        Returns a pairwise distance matrix
        @params
            x: np.ndarray
                An m x n array with m samples and n dimensions
    """
    def get_pairwise_distance_matrix(self, test_val):
        distances = self.neighbors.kneighbors(test_val, self._k, return_distance=True)
        return np.mean(distances[0], axis=1)

    """
        Returns the mean pairwise distance between the k'th nearest neighbors
        @params
            x: np.ndarray
                An m x n array with m samples and n dimensions
    """
    def __call__(self, train, test_val):
        self.neighbors.fit(np.asarray(train).reshape((len(train), 1)))

        if np.array(test_val).shape == ():
            distance_matrix = self.get_pairwise_distance_matrix(np.array([[test_val]]))
        else: 
            distance_matrix = self.get_pairwise_distance_matrix(np.array(test_val).reshape((len(test_val), 1)))
        return distance_matrix

# Main

In [9]:
len(base_loss)

100000

In [31]:
train_data = base_loss[:50000]
test_data = fgsm8_loss
# test_data = base_loss[:100]

np.random.seed(123432) # set seed for reproducibility
k_nearest_neighbor = KNearestNeighbors(k=1000) # Initialize the ICM that uses k-nearest neighbors(k=10)
conformal_predictor = ConformalAnomalyDetector(train_data, ICM=k_nearest_neighbor) # initialize CAD

significances = [0.025,0.05,0.25,0.5] # see how different significance levels affect results
# significances = [0.025]

anomalies = []
for i in range(len(significances)):
    significance = significances[i]
    conformal_predictor.set_significance(significance) # change significance
    isAnomaly = conformal_predictor(test_data) # test if anomamlies according to current CAD
    anomalies.append(isAnomaly)

In [32]:
with open("/users/albertwen/downloads/mae_data/fgsm4_anomalies.pkl", "wb") as f:
    pickle.dump(anomalies, f)

In [30]:
train_data = base_loss[:50000]
# test_data = fgsm8_loss
test_data = base_loss[50000:]

# improve performance using vstack and cupy instead of numpy

np.random.seed(123432) # set seed for reproducibility
k_nearest_neighbor = KNearestNeighbors(k=1000) # Initialize the ICM that uses k-nearest neighbors(k=10)
conformal_predictor = ConformalAnomalyDetector(train_data, ICM=k_nearest_neighbor) # initialize CAD

significances = [0.025,0.05,0.25,0.5] # see how different significance levels affect results
# significances = [0.025]

base_anomalies = []
for i in range(len(significances)):
    significance = significances[i]
    conformal_predictor.set_significance(significance) # change significance
    isAnomaly = conformal_predictor(test_data) # test if anomamlies according to current CAD
    base_anomalies.append(isAnomaly)

In [33]:
with open("/users/albertwen/downloads/mae_data/baseline_anomalies.pkl", "wb") as f:
    pickle.dump(base_anomalies, f)

In [34]:
for result in anomalies:
    print("tpr", sum(result) / len(result))

tpr 0.48446
tpr 0.62556
tpr 0.9149
tpr 0.97602


In [35]:
for result in base_anomalies:
    print("fpr", sum(result) / len(result))

fpr 0.02796
fpr 0.05556
fpr 0.26408
fpr 0.51588


In [36]:
for i in range(len(anomalies)):
    print("accuracy", (sum(anomalies[i]) + (len(base_anomalies[i]) - sum(base_anomalies[i]))) / (len(base_anomalies[i]) + len(anomalies[i])))

accuracy 0.72825
accuracy 0.785
accuracy 0.82541
accuracy 0.73007


In [37]:
len(base_anomalies[3]) - sum(base_anomalies[3])

24206

In [38]:
len(base_anomalies[3])

50000

In [39]:
len(base_anomalies[3]) + len(anomalies[3])

100000

In [40]:
sum(anomalies[3])

48801

In [41]:
(50079 + 48775) / 150000

0.6590266666666666

# testing with different k

In [25]:
train_data = base_loss[:50000]
test_data = fgsm8_loss
# test_data = base_loss[50000:]

np.random.seed(123432) # set seed for reproducibility
k_nearest_neighbor = KNearestNeighbors(k=100) # Initialize the ICM that uses k-nearest neighbors(k=10)
conformal_predictor = ConformalAnomalyDetector(train_data, ICM=k_nearest_neighbor) # initialize CAD

significances = [0.025,0.05,0.25,0.5] # see how different significance levels affect results
# significances = [0.025]

anomalies = []
for i in range(len(significances)):
    significance = significances[i]
    conformal_predictor.set_significance(significance) # change significance
    isAnomaly = conformal_predictor(test_data) # test if anomamlies according to current CAD
    anomalies.append(isAnomaly)

with open("/users/albertwen/downloads/mae_data/fgsm8_anomalies_k100.pkl", "wb") as f:
    pickle.dump(anomalies, f)

In [26]:
train_data = base_loss[:50000]
# test_data = fgsm8_loss
test_data = base_loss[50000:]

np.random.seed(123432) # set seed for reproducibility
k_nearest_neighbor = KNearestNeighbors(k=100) # Initialize the ICM that uses k-nearest neighbors(k=10)
conformal_predictor = ConformalAnomalyDetector(train_data, ICM=k_nearest_neighbor) # initialize CAD

significances = [0.025,0.05,0.25,0.5] # see how different significance levels affect results
# significances = [0.025]

anomalies = []
for i in range(len(significances)):
    significance = significances[i]
    conformal_predictor.set_significance(significance) # change significance
    isAnomaly = conformal_predictor(test_data) # test if anomamlies according to current CAD
    anomalies.append(isAnomaly)

with open("/users/albertwen/downloads/mae_data/base_anomalies_k100.pkl", "wb") as f:
    pickle.dump(anomalies, f)

In [27]:
for result in anomalies:
    print("tpr", sum(result) / len(result))
for result in base_anomalies:
    print("fpr", sum(result) / len(result))
for i in range(len(anomalies)):
    print("accuracy", (sum(anomalies[i]) + (len(base_anomalies[i]) - sum(base_anomalies[i]))) / (len(base_anomalies[i]) + len(anomalies[i])))

tpr 0.03
tpr 0.06242
tpr 0.31272
tpr 0.58346
fpr 0.02378
fpr 0.0477
fpr 0.24348
fpr 0.49394
accuracy 0.50311
accuracy 0.50736
accuracy 0.53462
accuracy 0.54476


# testing with k=10

In [28]:
train_data = base_loss[:50000]
test_data = fgsm8_loss
# test_data = base_loss[50000:]

np.random.seed(123432) # set seed for reproducibility
k_nearest_neighbor = KNearestNeighbors(k=10) # Initialize the ICM that uses k-nearest neighbors(k=10)
conformal_predictor = ConformalAnomalyDetector(train_data, ICM=k_nearest_neighbor) # initialize CAD

significances = [0.025,0.05,0.25,0.5] # see how different significance levels affect results
# significances = [0.025]

anomalies = []
for i in range(len(significances)):
    significance = significances[i]
    conformal_predictor.set_significance(significance) # change significance
    isAnomaly = conformal_predictor(test_data) # test if anomamlies according to current CAD
    anomalies.append(isAnomaly)

with open("/users/albertwen/downloads/mae_data/fgsm8_anomalies_k10.pkl", "wb") as f:
    pickle.dump(anomalies, f)

train_data = base_loss[:50000]
# test_data = fgsm8_loss
test_data = base_loss[50000:]

# improve performance using vstack and cupy instead of numpy

np.random.seed(123432) # set seed for reproducibility
k_nearest_neighbor = KNearestNeighbors(k=10) # Initialize the ICM that uses k-nearest neighbors(k=10)
conformal_predictor = ConformalAnomalyDetector(train_data, ICM=k_nearest_neighbor) # initialize CAD

significances = [0.025,0.05,0.25,0.5] # see how different significance levels affect results
# significances = [0.025]

anomalies = []
for i in range(len(significances)):
    significance = significances[i]
    conformal_predictor.set_significance(significance) # change significance
    isAnomaly = conformal_predictor(test_data) # test if anomamlies according to current CAD
    anomalies.append(isAnomaly)

with open("/users/albertwen/downloads/mae_data/base_anomalies_k10.pkl", "wb") as f:
    pickle.dump(anomalies, f)

In [29]:
for result in anomalies:
    print("tpr", sum(result) / len(result))
for result in base_anomalies:
    print("fpr", sum(result) / len(result))
for i in range(len(anomalies)):
    print("accuracy", (sum(anomalies[i]) + (len(base_anomalies[i]) - sum(base_anomalies[i]))) / (len(base_anomalies[i]) + len(anomalies[i])))

tpr 0.05602
tpr 0.0852
tpr 0.0852
tpr 0.0852
fpr 0.02378
fpr 0.0477
fpr 0.24348
fpr 0.49394
accuracy 0.51612
accuracy 0.51875
accuracy 0.42086
accuracy 0.29563
