Skip to content

Commit

Permalink
Merge pull request #34 from winstonll/development
Browse files Browse the repository at this point in the history
Added LOCI
  • Loading branch information
yzhao062 committed Dec 3, 2018
2 parents d4c463a + 64382d2 commit f456fde
Show file tree
Hide file tree
Showing 3 changed files with 502 additions and 0 deletions.
139 changes: 139 additions & 0 deletions examples/loci_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import division
from __future__ import print_function

import os
import sys

# Temporary workaround for relative imports in case pyod is not installed;
# when pyod is installed the following line is unnecessary.
# NOTE(review): "__file__" is quoted, so os.path.dirname() receives a plain
# string literal and returns ''. The appended entry is therefore the parent of
# the current working directory, not the parent of this script's directory --
# confirm this is intended before relying on it from another directory.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

from sklearn.utils import check_X_y
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from pyod.models.loci import LOCI
from pyod.utils.data import generate_data
from pyod.utils.data import get_color_codes
from pyod.utils.data import evaluate_print


def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True,
              save_figure=False):  # pragma: no cover
    """Plot ground truth and detector predictions for 2-d train/test data.

    Utility function for visualizing the results in examples.
    Internal use only.

    :param clf_name: The name of the detector
    :type clf_name: str
    :param X_train: The training samples
    :type X_train: numpy array of shape (n_samples, 2)
    :param y_train: The ground truth of training samples
    :type y_train: list or array of shape (n_samples,)
    :param X_test: The test samples
    :type X_test: numpy array of shape (n_samples, 2)
    :param y_test: The ground truth of test samples
    :type y_test: list or array of shape (n_samples,)
    :param y_train_pred: The predictions on the training samples, used as
        the per-point colour values of the scatter plot
    :type y_train_pred: list or array of shape (n_samples,)
    :param y_test_pred: The predictions on the test samples, used as the
        per-point colour values of the scatter plot
    :type y_test_pred: list or array of shape (n_samples,)
    :param show_figure: If set to True, show the figure
    :type show_figure: bool, optional (default=True)
    :param save_figure: If set to True, save the figure as
        ``<clf_name>.png`` in the current working directory
    :type save_figure: bool, optional (default=False)
    :raises ValueError: If ``X_train`` or ``X_test`` does not have exactly
        two columns
    """

    # Check each array on its own so the error reports the shape of the
    # array that actually failed, not always the training set's.
    for data in (X_train, X_test):
        if data.shape[1] != 2:
            raise ValueError("Input data has to be 2-d for visualization. The "
                             "input data has {shape}.".format(shape=data.shape))

    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)
    c_train = get_color_codes(y_train)
    c_test = get_color_codes(y_test)

    fig = plt.figure(figsize=(12, 10))
    plt.suptitle("Demo of {clf_name}".format(clf_name=clf_name))

    # Top row: ground truth. The legend uses proxy artists because the
    # scatter call colours points individually.
    fig.add_subplot(221)
    plt.scatter(X_train[:, 0], X_train[:, 1], c=c_train)
    plt.title('Train ground truth')
    legend_elements = [Line2D([0], [0], marker='o', color='w', label='normal',
                              markerfacecolor='b', markersize=8),
                       Line2D([0], [0], marker='o', color='w', label='outlier',
                              markerfacecolor='r', markersize=8)]

    plt.legend(handles=legend_elements, loc=4)

    fig.add_subplot(222)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=c_test)
    plt.title('Test ground truth')
    plt.legend(handles=legend_elements, loc=4)

    # Bottom row: detector output, coloured directly by the prediction values.
    fig.add_subplot(223)
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train_pred)
    plt.title('Train prediction by {clf_name}'.format(clf_name=clf_name))
    legend_elements = [Line2D([0], [0], marker='o', color='w', label='normal',
                              markerfacecolor='0', markersize=8),
                       Line2D([0], [0], marker='o', color='w', label='outlier',
                              markerfacecolor='yellow', markersize=8)]
    plt.legend(handles=legend_elements, loc=4)

    fig.add_subplot(224)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test_pred)
    plt.title('Test prediction by {clf_name}'.format(clf_name=clf_name))
    plt.legend(handles=legend_elements, loc=4)

    # Save before showing: the original order is kept on purpose.
    if save_figure:
        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)
    if show_figure:
        plt.show()


if __name__ == "__main__":
    # Experiment configuration.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Build a reproducible 2-d synthetic data set.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42)

    # Fit the LOCI detector on the training split.
    clf_name = 'LOCI'
    clf = LOCI()
    clf.fit(X_train)

    # Training-set results are exposed as fitted attributes.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Test-set results come from the prediction API.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Report the evaluation for both splits, training first.
    for heading, y_true, y_scores in (
            ("\nOn Training Data:", y_train, y_train_scores),
            ("\nOn Test Data:", y_test, y_test_scores)):
        print(heading)
        evaluate_print(clf_name, y_true, y_scores)

    # Plot ground truth against the predicted labels.
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)
233 changes: 233 additions & 0 deletions pyod/models/loci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-
"""Local Correlation Integral (LOCI).
Part of the codes are adapted from https://github.com/Cloudy10/loci
"""
# Author: Winston Li <jk_zhengli@hotmail.com>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from scipy.spatial.distance import pdist, squareform

from .base import BaseDetector

class LOCI(BaseDetector):
    """Local Correlation Integral.

    LOCI is highly effective for detecting outliers and groups of
    outliers (a.k.a. micro-clusters), which offers the following advantages
    and novelties: (a) It provides an automatic, data-dictated cut-off to
    determine whether a point is an outlier—in contrast, previous methods
    force users to pick cut-offs, without any hints as to what cut-off value
    is best for a given dataset. (b) It can provide a LOCI plot for each
    point; this plot summarizes a wealth of information about the data in
    the vicinity of the point, determining clusters, micro-clusters, their
    diameters and their inter-cluster distances. None of the existing
    outlier-detection methods can match this feature, because they output
    only a single number for each point: its outlierness score. (c) It can
    be computed as quickly as the best previous methods.

    Read more in the :cite:`papadimitriou2003loci`.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    alpha : float, default = 0.5
        The neighbourhood parameter measures how large of a neighbourhood
        should be considered "local".

    k : int, default = 3
        An outlier cutoff threshold for determine whether or not a point
        should be considered an outlier.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is set by the user and is defaulted to be 3 as
        recommended by the authors.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    Examples
    --------
    >>> from pyod.models.loci import LOCI
    >>> from pyod.utils.data import generate_data
    >>> n_train = 50
    >>> n_test = 50
    >>> contamination = 0.1
    >>> X_train, y_train, X_test, y_test = generate_data(
    ...     n_train=n_train, n_test=n_test,
    ...     contamination=contamination, random_state=42)
    >>> clf = LOCI()
    >>> clf.fit(X_train)
    >>> print(clf.decision_scores_)
    """

    def __init__(self, contamination=0.1, alpha=0.5, k=3):
        super(LOCI, self).__init__(contamination=contamination)
        self._alpha = alpha
        # ``k`` is used directly as the cut-off, so it is stored under the
        # fitted-attribute name that ``fit`` and ``decision_function`` read.
        self.threshold_ = k

    def _get_critical_values(self, dist_matrix, p_ix, r_max, r_min=0):
        """Computes the critical values of a given distance matrix.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        p_ix : int
            Row index of the point under consideration.

        r_max : float
            Maximum neighbourhood radius (inclusive).

        r_min : float, default = 0
            Minimum neighbourhood radius (exclusive).

        Returns
        -------
        cv : array, shape (n_critical_val, )
            Sorted array holding every distance ``d`` from the point with
            ``r_min < d <= r_max`` together with ``d / alpha``.
        """
        distances = dist_matrix[p_ix, :]
        in_range = distances[(r_min < distances) & (distances <= r_max)]
        return np.sort(np.concatenate((in_range, in_range / self._alpha)))

    def _get_sampling_N(self, dist_matrix, p_ix, r):
        """Computes the set of r-neighbours.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        p_ix : int
            Row index of the point under consideration.

        r : float
            Neighbourhood radius.

        Returns
        -------
        sample : array, shape (n_sample, )
            Indices of the points whose distance to the point is at most
            ``r`` (the point itself included, its distance being 0).
        """
        return np.nonzero(dist_matrix[p_ix, :] <= r)[0]

    def _get_alpha_n(self, dist_matrix, indices, r):
        """Computes the alpha neighbourhood points.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        indices : int or array of int
            A single row index, or an array of row indices.

        r : float
            Neighbourhood radius.

        Returns
        -------
        alpha_n : int or array, shape (n_alpha, )
            Number of points strictly closer than ``r * alpha``; a single
            count for a scalar index, one count per row otherwise.
        """
        within = dist_matrix[indices, :] < (r * self._alpha)
        # numpy integer scalars also select a single 1-D row, for which
        # ``axis=1`` would be invalid, so they are treated like plain ints.
        if isinstance(indices, (int, np.integer)):
            return np.count_nonzero(within)
        return np.count_nonzero(within, axis=1)

    def _calculate_decision_score(self, X):
        """Computes the outlier scores.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data points.

        Returns
        -------
        outlier_scores : list
            Returns the list of outlier scores for input dataset. A point
            keeps the initial score 0 when no radius with an average
            neighbour count of at least 20 is reached.
        """
        n_samples = X.shape[0]
        outlier_scores = [0] * n_samples
        dist_matrix = squareform(pdist(X, metric="euclidean"))
        r_max = dist_matrix.max() / self._alpha

        for p_ix in range(n_samples):
            critical_values = self._get_critical_values(dist_matrix, p_ix,
                                                        r_max)
            for r in critical_values:
                neighbours = self._get_sampling_N(dist_matrix, p_ix, r)
                n_values = self._get_alpha_n(dist_matrix, neighbours, r)
                cur_alpha_n = self._get_alpha_n(dist_matrix, p_ix, r)

                n_hat = np.mean(n_values)
                mdef = 1 - (cur_alpha_n / n_hat)
                sigma_mdef = np.std(n_values) / n_hat

                # Only neighbourhoods averaging at least 20 points are scored.
                if n_hat >= 20:
                    if sigma_mdef > 0:
                        outlier_scores[p_ix] = mdef / sigma_mdef
                    else:
                        # Every neighbour has the same count, so there is
                        # no deviation to normalise by. Dividing would give
                        # NaN (0/0) and poison the statistics computed in
                        # ``fit``; report "no deviation" instead.
                        outlier_scores[p_ix] = 0.0
                    # Stop at the first radius where the point is flagged.
                    if mdef > (self.threshold_ * sigma_mdef):
                        break
        return outlier_scores

    def fit(self, X, y=None):
        """Fit the model using X as training data.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Training data.

        y : optional (default=None)
            Only forwarded to ``_set_n_classes``; not used for scoring.

        Returns
        -------
        self : object
        """
        X = check_array(X)
        self._set_n_classes(y)
        outlier_scores = self._calculate_decision_score(X)
        self.decision_scores_ = np.array(outlier_scores)
        self.labels_ = (self.decision_scores_ >
                        self.threshold_).astype('int').ravel()

        # calculate for predict_proba()
        self._mu = np.mean(self.decision_scores_)
        self._sigma = np.std(self.decision_scores_)
        return self

    def decision_function(self, X):
        """Compute the outlier scores of X.

        The scores are derived from the pairwise distances among the rows
        of ``X`` itself; the detector only has to be fitted beforehand.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            The samples to score.

        Returns
        -------
        outlier_scores : numpy array of shape (n_samples,)
            The outlier score of each input sample.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)
        outlier_scores = self._calculate_decision_score(X)
        return np.array(outlier_scores)

0 comments on commit f456fde

Please sign in to comment.