Imports libraries and configures the environment.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from utils import *

%matplotlib inline

Loads dataset for anomaly detection.

In [None]:
# Load the dataset: a training matrix plus a validation split.
# X_val and y_val are used further down to pick the anomaly threshold
# (y_val is passed to select_threshold as the ground truth).
# NOTE(review): load_data is not defined in this notebook; presumably it is
# provided by the `from utils import *` in the first cell — confirm.
X_train, X_val, y_val = load_data()

In [None]:
# Display the first five elements of X_train
# (a bare last expression is rendered as the cell's output)
X_train[:5]

Visualizes the training data as a scatter plot of latency versus throughput.

In [None]:
# Create a scatter plot of the two feature columns of X_train,
# drawing every example as a blue "x" marker (marker='x', c='b').
plt.scatter(X_train[:, 0], X_train[:, 1], marker='x', c='b') 

# Set the title
plt.title("The first dataset")
# Set the y-axis label
plt.ylabel('Throughput (mb/s)')
# Set the x-axis label
plt.xlabel('Latency (ms)')
# Set axis range: [xmin, xmax, ymin, ymax], i.e. both axes span 0 to 30
plt.axis([0, 30, 0, 30])
plt.show()

Estimates Gaussian parameters (mean and variance) for each feature.

In [None]:
# estimate_gaussian

def estimate_gaussian(X):
    """
    Fits an independent Gaussian to every feature (column) of the
    data matrix by computing its mean and variance.

    Args:
        X (ndarray): (m, n) Data matrix, one example per row

    Returns:
        mu (ndarray): (n,) Mean of each feature
        var (ndarray): (n,) Variance of each feature, normalised by m
                       (the number of examples), not m - 1
    """
    # Unpacking two values keeps the requirement that X is 2-D.
    num_examples, _ = X.shape
    scale = 1 / num_examples

    # Column-wise average of the examples.
    mu = scale * np.sum(X, axis=0)

    # Column-wise average of the squared deviations from the mean.
    squared_dev = (X - mu) ** 2
    var = scale * np.sum(squared_dev, axis=0)

    return mu, var

Applies `estimate_gaussian` to the training set and runs the provided unit test.

In [None]:
# Estimate mean and variance of each feature of the training set.
# mu and var are reused by the following cells (density evaluation, plotting).
mu, var = estimate_gaussian(X_train)              

    
# Run the provided check against the estimate_gaussian defined above.
# NOTE(review): estimate_gaussian_test is not defined in this notebook;
# presumably it comes from `from utils import *` — confirm.
estimate_gaussian_test(estimate_gaussian)

Evaluates the fitted Gaussian density at every training example and plots the fit over the data.

In [None]:
# Returns the density of the multivariate normal
# at each data point (row) of X_train.
# p is reused in a later cell to flag training outliers (p < epsilon).
p = multivariate_gaussian(X_train, mu, var)

# Plotting code: draw the training data together with the fitted parameters
visualize_fit(X_train, mu, var)

Selects the anomaly threshold using a validation set to maximize F1.

In [None]:
# select_threshold

def select_threshold(y_val, p_val): 
    """
    Finds the best threshold to use for selecting outliers 
    based on the results from a validation set (p_val) 
    and the ground truth (y_val)

    Candidate thresholds are 1000 evenly spaced values covering
    [min(p_val), max(p_val)). For each candidate, an example is predicted
    to be an anomaly when its p_val entry is strictly below the candidate;
    the candidate with the highest F1 score wins (the first one on ties).
    
    Args:
        y_val (ndarray): Ground truth on validation set
                         (1 = anomaly, 0 = normal)
        p_val (ndarray): Results on validation set, one value per
                         entry of y_val
        
    Returns:
        epsilon (float): Threshold chosen 
        F1 (float):      F1 score by choosing epsilon as threshold

        Both are 0 when no candidate yields a true positive, or when all
        entries of p_val are equal (there is no range to scan).

    Raises:
        ValueError: If p_val is empty.
    """ 

    # Flatten so that an (m, 1) label column and an (m,) density vector are
    # compared element-wise instead of broadcasting to an (m, m) matrix.
    y_val = np.asarray(y_val).ravel()
    p_val = np.asarray(p_val).ravel()

    best_epsilon = 0
    best_F1 = 0

    p_min, p_max = p_val.min(), p_val.max()
    step_size = (p_max - p_min) / 1000
    if step_size == 0:
        # Constant p_val: np.arange cannot step by 0 and there is nothing to scan.
        return best_epsilon, best_F1

    # Label masks do not depend on epsilon, so compute them once.
    is_anomaly = y_val == 1
    is_normal = y_val == 0

    for epsilon in np.arange(p_min, p_max, step_size):
        # Low density => flagged as an anomaly.
        predictions = p_val < epsilon

        tp = np.count_nonzero(predictions & is_anomaly)   # flagged, truly anomalous
        fp = np.count_nonzero(predictions & is_normal)    # flagged, actually normal
        fn = np.count_nonzero(~predictions & is_anomaly)  # missed anomalies

        if tp == 0:
            # Precision/recall are 0 (or undefined); avoid dividing by zero.
            F1 = 0
        else:
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            F1 = (2 * prec * rec) / (prec + rec)

        # Strict comparison keeps the smallest epsilon among equal scores.
        if F1 > best_F1:
            best_F1 = F1
            best_epsilon = epsilon

    return best_epsilon, best_F1

Evaluates the densities of the validation set, selects the threshold epsilon, and runs the provided unit test.

In [None]:
# Evaluate the validation set with the parameters fitted on the training set
# (mu, var), then pick the threshold using the validation ground truth.
p_val = multivariate_gaussian(X_val, mu, var)
epsilon, F1 = select_threshold(y_val, p_val)

    
# UNIT TEST
# NOTE(review): select_threshold_test is not defined in this notebook;
# presumably it comes from `from utils import *` — confirm.
select_threshold_test(select_threshold)

Flags the training examples whose density falls below the selected threshold and circles them on the plot.

In [None]:
# Find the outliers in the training set: a boolean mask of the examples whose
# density p (computed in an earlier cell) is below the selected threshold
outliers = p < epsilon

# Visualize the fit
visualize_fit(X_train, mu, var)

# Draw a red circle around those outliers
# ('ro' with markerfacecolor='none' leaves only the red ring visible).
# NOTE(review): assumes visualize_fit leaves its figure open so the circles
# are drawn on the same axes — confirm.
plt.plot(X_train[outliers, 0], X_train[outliers, 1], 'ro',
         markersize= 10,markerfacecolor='none', markeredgewidth=2)

Loads the second, larger dataset for anomaly detection.

In [None]:
# Load the second dataset: training set, validation set and the validation
# ground truth (y_val_high is passed to select_threshold further down).
# NOTE(review): load_data_multi is not defined in this notebook; presumably it
# comes from `from utils import *` — confirm.
X_train_high, X_val_high, y_val_high = load_data_multi()

Repeats the full pipeline (parameter estimation, density evaluation, threshold selection) on the larger dataset.

In [None]:
# Apply the same steps to the larger dataset

# Estimate the Gaussian parameters from the training set only
mu_high, var_high = estimate_gaussian(X_train_high)

# Evaluate the probabilities for the training set
p_high = multivariate_gaussian(X_train_high, mu_high, var_high)

# Evaluate the probabilities for the cross validation set
# (using the parameters estimated on the training set)
p_val_high = multivariate_gaussian(X_val_high, mu_high, var_high)

# Find the best threshold using the validation ground truth
epsilon_high, F1_high = select_threshold(y_val_high, p_val_high)