Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added loci #34

Merged
merged 1 commit into from
Dec 3, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions examples/loci_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
# Resolve the parent directory from this script's real location (``__file__``),
# not from the string literal "__file__", whose dirname is always '' and which
# therefore made the result depend on the current working directory.
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))

from sklearn.utils import check_X_y
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from pyod.models.loci import LOCI
from pyod.utils.data import generate_data
from pyod.utils.data import get_color_codes
from pyod.utils.data import evaluate_print


def _legend_handles(normal_color, outlier_color):
    """Build the two legend proxies ('normal' / 'outlier') for a scatter."""
    return [Line2D([0], [0], marker='o', color='w', label='normal',
                   markerfacecolor=normal_color, markersize=8),
            Line2D([0], [0], marker='o', color='w', label='outlier',
                   markerfacecolor=outlier_color, markersize=8)]


def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True,
              save_figure=False):  # pragma: no cover
    """
    Utility function for visualizing the results in examples.
    Internal use only.

    Draws a 2x2 figure: train ground truth, test ground truth, train
    prediction and test prediction.

    :param clf_name: The name of the detector
    :type clf_name: str

    :param X_train: The training samples
    :type X_train: numpy array of shape (n_samples, 2)

    :param y_train: The ground truth of training samples
    :type y_train: list or array of shape (n_samples,)

    :param X_test: The test samples
    :type X_test: numpy array of shape (n_samples, 2)

    :param y_test: The ground truth of test samples
    :type y_test: list or array of shape (n_samples,)

    :param y_train_pred: Per-sample prediction on the training samples,
        used as the colour value of each training point
    :type y_train_pred: array-like with one value per training sample

    :param y_test_pred: Per-sample prediction on the test samples,
        used as the colour value of each test point
    :type y_test_pred: array-like with one value per test sample

    :param show_figure: If set to True, show the figure
    :type show_figure: bool, optional (default=True)

    :param save_figure: If set to True, save the figure as
        ``<clf_name>.png`` (relative path, 300 dpi)
    :type save_figure: bool, optional (default=False)

    :raises ValueError: If ``X_train`` or ``X_test`` does not have exactly
        two columns.
    """

    if X_train.shape[1] != 2 or X_test.shape[1] != 2:
        # Report both shapes: either input may be the offending one.
        raise ValueError(
            "Input data has to be 2-d for visualization. The "
            "input data has {train_shape} (train) and {test_shape} "
            "(test).".format(train_shape=X_train.shape,
                             test_shape=X_test.shape))

    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)
    c_train = get_color_codes(y_train)
    c_test = get_color_codes(y_test)

    fig = plt.figure(figsize=(12, 10))
    plt.suptitle("Demo of {clf_name}".format(clf_name=clf_name))

    # Top row: ground truth, coloured by the codes from get_color_codes.
    truth_legend = _legend_handles('b', 'r')

    fig.add_subplot(221)
    plt.scatter(X_train[:, 0], X_train[:, 1], c=c_train)
    plt.title('Train ground truth')
    plt.legend(handles=truth_legend, loc=4)

    fig.add_subplot(222)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=c_test)
    plt.title('Test ground truth')
    plt.legend(handles=truth_legend, loc=4)

    # Bottom row: predictions, coloured directly by the prediction values.
    pred_legend = _legend_handles('0', 'yellow')

    fig.add_subplot(223)
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train_pred)
    plt.title('Train prediction by {clf_name}'.format(clf_name=clf_name))
    plt.legend(handles=pred_legend, loc=4)

    fig.add_subplot(224)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test_pred)
    plt.title('Test prediction by {clf_name}'.format(clf_name=clf_name))
    plt.legend(handles=pred_legend, loc=4)

    if save_figure:
        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)
    if show_figure:
        plt.show()


if __name__ == "__main__":
    # Experiment settings.
    contamination = 0.1  # proportion of outliers requested from generate_data
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Build a two-feature sample dataset with a fixed seed.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42,
    )

    # Fit the LOCI detector on the training split.
    clf_name = 'LOCI'
    clf = LOCI()
    clf.fit(X_train)

    # Training-side results are read from the fitted attributes.
    y_train_pred = clf.labels_  # binary labels
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Test-side results are obtained by calling the detector on X_test.
    y_test_pred = clf.predict(X_test)  # outlier labels
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Report the evaluation for both splits.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # Plot ground truth against the predicted labels.
    visualize(clf_name, X_train, y_train, X_test, y_test,
              y_train_pred, y_test_pred,
              show_figure=True, save_figure=False)
233 changes: 233 additions & 0 deletions pyod/models/loci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-
"""Local Correlation Integral (LOCI).
Part of the codes are adapted from https://github.com/Cloudy10/loci
"""
# Author: Winston Li <jk_zhengli@hotmail.com>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from scipy.spatial.distance import pdist, squareform

from .base import BaseDetector

class LOCI(BaseDetector):
    r"""Local Correlation Integral.

    LOCI is highly effective for detecting outliers and groups of
    outliers (a.k.a. micro-clusters), and offers the following advantages
    and novelties: (a) It provides an automatic, data-dictated cut-off to
    determine whether a point is an outlier -- in contrast, previous methods
    force users to pick cut-offs, without any hints as to what cut-off value
    is best for a given dataset. (b) It can provide a LOCI plot for each
    point; this plot summarizes a wealth of information about the data in
    the vicinity of the point, determining clusters, micro-clusters, their
    diameters and their inter-cluster distances. None of the existing
    outlier-detection methods can match this feature, because they output
    only a single number for each point: its outlierness score. (c) It can
    be computed as quickly as the best previous methods.
    Read more in the :cite:`papadimitriou2003loci`.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. It is forwarded to
        ``BaseDetector``; note that ``labels_`` in this class is derived
        from ``k`` (see ``threshold_``), not from ``contamination``.

    alpha : float, default = 0.5
        The neighbourhood parameter measures how large of a neighbourhood
        should be considered "local".

    k : int, default = 3
        An outlier cutoff threshold for determining whether or not a point
        should be considered an outlier.

    Attributes
    ----------
    decision_scores\_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold\_ : float
        The threshold is set by the user (``k``) and is defaulted to be 3
        as recommended by the authors.

    labels\_ : numpy array of shape (n_samples,), values 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.


    Examples
    --------
    >>> from pyod.models.loci import LOCI
    >>> from pyod.utils.data import generate_data
    >>> n_train = 50
    >>> n_test = 50
    >>> contamination = 0.1
    >>> X_train, y_train, X_test, y_test = generate_data(
    ...     n_train=n_train, n_test=n_test,
    ...     contamination=contamination, random_state=42)

    >>> clf = LOCI()
    >>> clf.fit(X_train)
    >>> print(clf.decision_scores_)
    """

    def __init__(self, contamination=0.1, alpha=0.5, k=3):
        super(LOCI, self).__init__(contamination=contamination)
        # Keep the constructor arguments under their own names as well.
        # NOTE(review): presumably BaseDetector follows the scikit-learn
        # get_params convention, which looks attributes up by parameter
        # name -- confirm against pyod.models.base.
        self.alpha = alpha
        self.k = k
        # Names used internally by the scoring routines below.
        self._alpha = alpha
        self.threshold_ = k

    def _get_critical_values(self, dist_matrix, p_ix, r_max, r_min=0):
        """Computes the critical values of a given distance matrix.

        The critical radii for point ``p_ix`` are its distances ``d`` to
        the other points with ``r_min < d <= r_max``, together with
        ``d / alpha`` for each of them, sorted in ascending order.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        p_ix : int
            Row index of the point under consideration.

        r_max : float
            Maximum neighbourhood radius

        r_min : float, default = 0
            Minimum neighbourhood radius (exclusive)

        Returns
        -------
        cv : array, shape (n_critical_val, )
            Returns the sorted critical values.
        """

        distances = dist_matrix[p_ix, :]
        mask = (r_min < distances) & (distances <= r_max)
        in_range = distances[mask]
        cv = np.sort(np.concatenate((in_range, in_range / self._alpha)))
        return cv

    def _get_sampling_N(self, dist_matrix, p_ix, r):
        """Computes the set of r-neighbours.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        p_ix : int
            Row index of the point under consideration.

        r : float
            Neighbourhood radius


        Returns
        -------
        sample : array, shape (n_sample, )
            Indices of the points whose distance to ``p_ix`` is <= ``r``.
        """

        p_distances = dist_matrix[p_ix, :]
        sample = np.nonzero(p_distances <= r)[0]
        return sample

    def _get_alpha_n(self, dist_matrix, indices, r):
        """Computes the alpha neighbourhood point counts.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        indices : int or array of int
            A single row index, or an array of row indices.

        r : float
            Neighbourhood radius

        Returns
        -------
        alpha_n : int or array, shape (n_indices, )
            For each requested row, the number of points strictly closer
            than ``r * alpha``. A scalar index yields a scalar count.
        """

        within = dist_matrix[indices, :] < (r * self._alpha)
        # np.ndim treats Python ints and NumPy integer scalars alike,
        # unlike an exact ``type(...) is int`` comparison.
        if np.ndim(indices) == 0:
            return np.count_nonzero(within)
        return np.count_nonzero(within, axis=1)

    def _calculate_decision_score(self, X):
        """Computes the outlier scores.

        Distances are computed between the rows of ``X`` only.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data points.

        Returns
        -------
        outlier_scores : list of float
            Returns the list of outlier scores for input dataset. A point
            whose sampling neighbourhood never reaches an average count
            of 20 keeps the score 0.
        """
        n_samples = X.shape[0]
        outlier_scores = [0.0] * n_samples
        dist_matrix = squareform(pdist(X, metric="euclidean"))
        max_dist = dist_matrix.max()
        r_max = max_dist / self._alpha

        for p_ix in range(n_samples):
            critical_values = self._get_critical_values(dist_matrix, p_ix,
                                                        r_max)
            for r in critical_values:
                neighbours = self._get_sampling_N(dist_matrix, p_ix, r)
                n_values = self._get_alpha_n(dist_matrix, neighbours, r)
                cur_alpha_n = self._get_alpha_n(dist_matrix, p_ix, r)
                n_hat = np.mean(n_values)
                mdef = 1 - (cur_alpha_n / n_hat)
                sigma_mdef = np.std(n_values) / n_hat
                # Scores are only assigned once the average count is >= 20.
                if n_hat >= 20:
                    if sigma_mdef > 0:
                        outlier_scores[p_ix] = mdef / sigma_mdef
                    else:
                        # All neighbour counts are identical. p_ix is one
                        # of its own neighbours, so mdef is 0 as well and
                        # the ratio would be 0/0 (NaN); score it as 0.
                        outlier_scores[p_ix] = 0.0
                    # Stop at the first radius where the point is flagged.
                    if mdef > (self.threshold_ * sigma_mdef):
                        break
        return outlier_scores

    def fit(self, X, y=None):
        """Fit the model using X as training data.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Training data.

        y : optional (default=None)
            Only forwarded to ``_set_n_classes``; not used for scoring.

        Returns
        -------
        self : object

        """
        X = check_array(X)
        self._set_n_classes(y)
        outlier_scores = self._calculate_decision_score(X)
        self.decision_scores_ = np.array(outlier_scores)
        self.labels_ = (
            self.decision_scores_ > self.threshold_).astype('int').ravel()

        # calculate for predict_proba()

        self._mu = np.mean(self.decision_scores_)
        self._sigma = np.std(self.decision_scores_)
        return self

    def decision_function(self, X):
        """Compute the outlier scores of X.

        The scores are computed from the pairwise distances among the
        rows of ``X`` itself; the training samples are not consulted.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            The samples to score.

        Returns
        -------
        outlier_scores : numpy array of shape (n_samples,)
            The outlier score of each input sample.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)
        outlier_scores = self._calculate_decision_score(X)
        return np.array(outlier_scores)
Loading