In [None]:
# Anomaly Detection Methods

In [None]:
# Tags
# pyod tutorial

In [None]:
# References
[Practical guide for anomaly detection in time series with Python](https://towardsdatascience.com/practical-guide-for-anomaly-detection-in-time-series-with-python-d4847d6c099f)

In [None]:
import os
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pyod

%matplotlib inline

# Notebook-wide settings
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# pandas display options, applied in this order:
#   - no cap on the number of rows / columns rendered,
#   - floats formatted with one decimal place,
#   - display precision set to 6.
for _option, _setting in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.float_format', lambda value: '%.1f' % value),
    ("display.precision", 6),
):
    pd.set_option(_option, _setting)

# Plot styling: use seaborn's 'darkgrid' style for the figures in this notebook.
# (Optional alternative for the column display limit:
#  pd.set_option('display.max_columns', 100).)
sns.set_style(style='darkgrid')

In [None]:
import datetime
from datetime import datetime

# Build the location of the input file: <current working directory>/data/eva_4.csv
path = os.path.abspath(os.getcwd())
my_file_4 = 'eva_4.csv'
input_path = os.path.join(path, 'data', my_file_4)

# Read the CSV and replace the '_time' column with its values cast to
# numpy datetime64 at minute resolution ('<M8[m]').
df = pd.read_csv(input_path).assign(
    _time=lambda frame: frame['_time'].values.astype('<M8[m]')
)

df.head()

In [None]:
## TOC:
* [Univariate anomaly detection](#univariate)
* [Multivariate anomaly detection](#multivariate)
* [Compare methods](#compare)

In [None]:
## Univariate anomaly detection<a class="anchor" id="univariate"></a>


1. Z-score (standard score): the z-score measures how many standard deviations a data point lies from the mean. Typically, instances whose absolute z-score exceeds 3 are flagged as outliers.
2. Interquartile range (IQR): the IQR is the range between the first quartile (Q1) and the third quartile (Q3) of a distribution. An instance that lies below Q1 or above Q3 by more than some multiple of the IQR is considered an outlier. The most common multiplier is 1.5, so values outside the range [Q1 − 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
3. Modified z-score: similar to the z-score, but it uses the median and the Median Absolute Deviation (MAD) instead of the mean and standard deviation. Because the mean and standard deviation are easily skewed by outliers, modified z-scores are generally considered more robust.


In [None]:
# MAD
from pyod.models.mad import MAD

# Demo on seaborn's "diamonds" sample dataset, restricted to the "price"
# column (kept as a one-column DataFrame).
diamonds = sns.load_dataset("diamonds")
X = diamonds.loc[:, ["price"]]

# Create the MAD detector and fit it on that single feature.
mad = MAD().fit(X)

# Tally the fitted labels (`mad.labels_`); the same tally could be produced
# for `mad.decision_scores_`.
label_frame = pd.DataFrame(mad.labels_)
label_frame.value_counts()

In [None]:
# MAD
from pyod.models.mad import MAD

# Fit pyod's MAD detector on the single 'count' column of `df`
# (kept as a one-column DataFrame).
X = df[["count"]]

# Initialize and fit a model
mad = MAD().fit(X)

# Predict on the same data, also requesting a confidence value per sample.
y_test_pred, pred_confidence = mad.predict(X, return_confidence=True)

# Store the results as plain arrays so they are written positionally.
# Assigning a DataFrame instead aligns on the index and would silently yield
# NaN wherever df's index is not the default 0..n-1.
df['labels'] = mad.labels_
df['decision_scores'] = mad.decision_scores_
df['pred_confidence'] = pred_confidence

# Number of samples per label, then a preview of the augmented frame.
print(pd.DataFrame(mad.labels_).value_counts())
df.head()

In [None]:
# Rows whose decision score is at least 7.
df.loc[df["decision_scores"] >= 7]

In [None]:
# The five rows with the smallest prediction confidence.
df.nsmallest(n=5, columns="pred_confidence")

In [None]:
# 'count' values (as a one-column frame) of the rows whose label is >= 1.
df.loc[df["labels"] >= 1, ["count"]]


In [None]:
# Plot the 'count' series over time and highlight the rows flagged as
# anomalies (label >= 1) in red.
anomalies = df.loc[df['labels'] >= 1]

fig, ax = plt.subplots(figsize=(14, 10))
ax.plot(df['_time'], df['count'], color='blue', label='Counts')
ax.scatter(anomalies['_time'], anomalies['count'], color='red', label='Anomalies')
ax.legend()
ax.set_title('Indexes counts')
ax.set_xlabel('Date')
ax.set_ylabel('Counts')
# Request the grid explicitly: a bare grid() call only toggles visibility,
# which switches the grid OFF under the 'darkgrid' style where it is already on.
ax.grid(True)
plt.show()

In [None]:
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
#from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP
from pyod.models.inne import INNE
from pyod.models.gmm import GMM
from pyod.models.kde import KDE
from pyod.models.lmdd import LMDD

from pyod.models.dif import DIF
from pyod.models.copod import COPOD
from pyod.models.ecod import ECOD
#from pyod.models.suod import SUOD
from pyod.models.qmcd import QMCD
from pyod.models.sampling import Sampling
from pyod.models.kpca import KPCA
from pyod.models.lunar import LUNAR

In [None]:
# Detectors compared on the univariate 'count' series, keyed by display name.
# Both are built without an explicit contamination; pass
# contamination=<fraction> to set the expected share of outliers.

# Seed for the stochastic detectors. Defined here so this cell runs on a fresh
# kernel: the name is otherwise first assigned much further down the notebook.
random_state = 42

classifiers = {
    # MAD stands for *median* absolute deviation.
    'Median Absolute Deviation (MAD)': MAD(),
    'Isolation Forest': IForest(random_state=random_state),
    # Optional: 'Angle-based Outlier Detector (ABOD)': ABOD(),
}

In [None]:
from pyod.models.mad import MAD

# Fit every detector in `classifiers` on the 'count' column, store its results
# on `df` (overwritten on each iteration) and draw one figure per detector.
for i, (clf_name, clf) in enumerate(classifiers.items()):
    print()
    print(i + 1, 'fitting', clf_name)
    X = df[["count"]]
    # fit the data and tag outliers
    clf.fit(X)
    y_pred, pred_confidence = clf.predict(X, return_confidence=True)

    # Labels AND scores must both come from the detector fitted in this
    # iteration (`clf`), not from the `mad` object left over from an earlier
    # cell. Plain arrays are assigned positionally, with no index alignment.
    df['labels'] = clf.labels_
    df['decision_scores'] = clf.decision_scores_
    df['pred_confidence'] = pred_confidence

    print(pd.DataFrame(clf.labels_).value_counts())
    print(df.head())

    # Rows flagged by this detector (label >= 1).
    anomalies = df.loc[df['labels'] >= 1]

    fig, ax = plt.subplots(figsize=(24, 4))
    ax.plot(df['_time'], df['count'], color='blue', label='Counts')
    ax.scatter(anomalies['_time'], anomalies['count'], color='red', label='Anomalies')
    fig.text(0.5, 0.7, 'Outliers: ' + str(len(anomalies)), fontsize=25,
             bbox=dict(facecolor='red', alpha=0.5))
    ax.legend()
    ax.set_title(clf_name, fontsize=25)
    ax.set_xlabel('Date')
    ax.set_ylabel('Counts')
    # Explicit True: a bare grid() call toggles the grid, which hides it under
    # the 'darkgrid' style where it is already on.
    ax.grid(True)
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(fig)

## Compare methods<a class="anchor" id="compare"></a>

In [None]:
from __future__ import division
from __future__ import print_function

#supress warnings for clean output
import warnings

warnings.filterwarnings("ignore")

import numpy as np
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Import all models
from pyod.models.abod import ABOD
from pyod.models.lof import LOF

# Synthetic-data settings: total sample count, share of outliers, and the
# offsets used to shift the two inlier clusters.
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0]

# 100 x 100 evaluation grid covering [-7, 7] on both axes.
grid_axis = np.linspace(-7, 7, 100)
xx, yy = np.meshgrid(grid_axis, grid_axis)

# Split the sample budget into inliers and outliers.
n_outliers = int(outliers_fraction * n_samples)
n_inliers = int((1. - outliers_fraction) * n_samples)

# Ground truth: 0 everywhere except the final n_outliers entries, which are 1.
ground_truth = np.concatenate([
    np.zeros(n_samples - n_outliers, dtype=int),
    np.ones(n_outliers, dtype=int),
])

# Base detectors for LSCP: LOF with n_neighbors = 5, 10, ..., 50.
detector_list = [LOF(n_neighbors=k) for k in range(5, 51, 5)]

# Summarize the synthetic-data setup.
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth, '\n')

# Seed passed to the detectors that accept one.
random_state = 42

# Detectors to compare, keyed by display name.
classifiers = {
    'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
}

# List the configured detectors, numbered from 1.
for position, detector_name in enumerate(classifiers, start=1):
    print('Model', position, detector_name)

# Fit the models with the generated data and
# compare model performances
# (The outer loop needs no index; the original reused `i` for both loops, so
# the inner loop silently overwrote the outer counter.)
for offset in clusters_separation:
    np.random.seed(42)
    # Data generation: two Gaussian inlier clusters shifted by -offset / +offset
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers drawn uniformly from [-6, 6) on both axes
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the model
    plt.figure(figsize=(20, 22))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        print()
        print(i + 1, 'fitting', clf_name)
        # fit the data and tag outliers
        clf.fit(X)
        # Negated decision scores, and their outliers_fraction-percentile
        # used as the boundary between the two filled regions below.
        scores_pred = clf.decision_function(X) * -1
        y_pred = clf.predict(X)
        threshold = percentile(scores_pred, 100 * outliers_fraction)
        # Number of predictions that disagree with the ground truth.
        n_errors = (y_pred != ground_truth).sum()

        # Evaluate the negated decision function on the grid and plot the
        # level regions together with the points.
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(5, 5, i + 1)
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # The last n_outliers rows of X are the generated outliers.
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white', s=20, edgecolor='k')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black', s=20, edgecolor='k')
        subplot.axis('tight')
        subplot.legend([b, c],
                       ['true inliers', 'true outliers'],
                       prop=matplotlib.font_manager.FontProperties(size=10),
                       loc='lower right')
        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
# Derive the detector count from `classifiers` instead of hard-coding "25",
# so the title matches whatever set of detectors was actually fitted.
n_detectors = len(classifiers)
plt.suptitle("%d outlier detection algorithm%s on synthetic data"
             % (n_detectors, "" if n_detectors == 1 else "s"), fontsize=35)
plt.savefig('ALL.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Regenerate the synthetic data set for each cluster offset, printing the
# loop index and the offset as it goes.
for i, offset in enumerate(clusters_separation):
    print(i)
    print(offset)
    np.random.seed(42)
    # Data generation: two Gaussian inlier clusters shifted by -offset / +offset
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    # Rebuild X from the fresh inlier clusters. With this line disabled the
    # outliers were appended to whatever X was left in the kernel, so X grew
    # on every run and X1 / X2 were never used.
    X = np.r_[X1, X2]
    # Add outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
    #print(X)

In [None]:
# FeatureBagging and SUOD are used below, but their imports are commented out
# in the notebook's pyod import cell, so they are brought into scope here.
from pyod.models.feature_bagging import FeatureBagging

try:
    from pyod.models.suod import SUOD
except ImportError:
    # NOTE(review): presumably the optional `suod` package is not installed
    # (which may be why the import was commented out) -- confirm.
    SUOD = None
    print('SUOD could not be imported; it is left out of `classifiers`.')

# Full set of detectors to compare, keyed by display name. Relies on
# `outliers_fraction`, `random_state` and `detector_list` from earlier cells.
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
    'Median KNN':
        KNN(method='median', contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),

    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'Deep Isolation Forest (DIF)':
        DIF(contamination=outliers_fraction, random_state=random_state),
    'INNE':
        INNE(max_samples=2, contamination=outliers_fraction,
             random_state=random_state),

    'Locally Selective Combination (LSCP)':
        LSCP(detector_list, contamination=outliers_fraction,
             random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
    # Included (at its original position) only when the SUOD import succeeded.
    **({'SUOD': SUOD(contamination=outliers_fraction)}
       if SUOD is not None else {}),

    'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),

    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
    'KPCA':
        KPCA(contamination=outliers_fraction),

    'Probabilistic Mixture Modeling (GMM)':
        GMM(contamination=outliers_fraction, random_state=random_state),

    'LMDD':
        LMDD(contamination=outliers_fraction, random_state=random_state),

    'Histogram-based Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),

    'Copula-base Outlier Detection (COPOD)':
        COPOD(contamination=outliers_fraction),

    'ECDF-baseD Outlier Detection (ECOD)':
        ECOD(contamination=outliers_fraction),
    'Kernel Density Functions (KDE)':
        KDE(contamination=outliers_fraction),

    'QMCD':
        QMCD(contamination=outliers_fraction),

    'Sampling':
        Sampling(contamination=outliers_fraction),

    # LUNAR is the only detector here built without an explicit contamination.
    'LUNAR': LUNAR(),

    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),

    'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
}