-
-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #34 from winstonll/development
Added LOCI
- Loading branch information
Showing
3 changed files
with
502 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
from __future__ import division | ||
from __future__ import print_function | ||
|
||
import os | ||
import sys | ||
|
||
# Temporary solution for relative imports in case pyod is not installed.
# If pyod is installed, the lines below are not needed.
# Resolve the parent directory relative to this script rather than to the
# current working directory, so the example also runs when launched from
# elsewhere.  ``__file__`` is undefined in some interactive sessions, in
# which case the working directory is used as before.
try:
    _example_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _example_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(_example_dir, '..')))
|
||
from sklearn.utils import check_X_y | ||
import matplotlib.pyplot as plt | ||
from matplotlib.lines import Line2D | ||
|
||
from pyod.models.loci import LOCI | ||
from pyod.utils.data import generate_data | ||
from pyod.utils.data import get_color_codes | ||
from pyod.utils.data import evaluate_print | ||
|
||
|
||
def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True,
              save_figure=False):  # pragma: no cover
    """Utility function for visualizing the results in examples.

    Internal use only. Draws a 2x2 grid of scatter plots: ground truth
    (top row) and detector output (bottom row) for the train and test
    sets.

    :param clf_name: The name of the detector
    :type clf_name: str

    :param X_train: The training samples
    :type X_train: numpy array of shape (n_samples, 2)

    :param y_train: The ground truth of training samples
    :type y_train: list or array of shape (n_samples,)

    :param X_test: The test samples
    :type X_test: numpy array of shape (n_samples, 2)

    :param y_test: The ground truth of test samples
    :type y_test: list or array of shape (n_samples,)

    :param y_train_pred: The predictions on the training samples, used
        directly as the colour values of the scatter plot
    :type y_train_pred: numpy array of shape (n_samples,)

    :param y_test_pred: The predictions on the test samples, used
        directly as the colour values of the scatter plot
    :type y_test_pred: numpy array of shape (n_samples,)

    :param show_figure: If set to True, show the figure
    :type show_figure: bool, optional (default=True)

    :param save_figure: If set to True, save the figure as
        ``<clf_name>.png`` in the current working directory
    :type save_figure: bool, optional (default=False)

    :raises ValueError: If the inputs fail ``check_X_y`` validation or
        either sample matrix does not have exactly two features.
    """
    # Validate first so that the shape check below always sees proper
    # 2-d arrays (and list input is accepted).
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    if X_train.shape[1] != 2 or X_test.shape[1] != 2:
        raise ValueError(
            "Input data has to be 2-d for visualization. X_train has "
            "shape {train} and X_test has shape {test}.".format(
                train=X_train.shape, test=X_test.shape))

    c_train = get_color_codes(y_train)
    c_test = get_color_codes(y_test)

    fig = plt.figure(figsize=(12, 10))
    plt.suptitle("Demo of {clf_name}".format(clf_name=clf_name))

    # Legend for the ground-truth panels.
    truth_legend = [
        Line2D([0], [0], marker='o', color='w', label='normal',
               markerfacecolor='b', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='outlier',
               markerfacecolor='r', markersize=8)]

    # Legend for the prediction panels, which are coloured by value.
    pred_legend = [
        Line2D([0], [0], marker='o', color='w', label='normal',
               markerfacecolor='0', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='outlier',
               markerfacecolor='yellow', markersize=8)]

    panels = [
        (221, X_train, c_train, 'Train ground truth', truth_legend),
        (222, X_test, c_test, 'Test ground truth', truth_legend),
        (223, X_train, y_train_pred,
         'Train prediction by {clf_name}'.format(clf_name=clf_name),
         pred_legend),
        (224, X_test, y_test_pred,
         'Test prediction by {clf_name}'.format(clf_name=clf_name),
         pred_legend),
    ]

    for position, samples, colors, title, legend in panels:
        ax = fig.add_subplot(position)
        ax.scatter(samples[:, 0], samples[:, 1], c=colors)
        ax.set_title(title)
        ax.legend(handles=legend, loc=4)

    if save_figure:
        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)
    if show_figure:
        plt.show()
|
||
|
||
if __name__ == "__main__":
    # Experiment settings.
    contamination = 0.1  # proportion of outliers requested from generate_data
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Build a synthetic two-feature data set with a fixed random_state.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42)

    # Fit the LOCI detector with its default settings.
    clf_name = 'LOCI'
    clf = LOCI()
    clf.fit(X_train)

    # Training-set results are stored on the fitted detector.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Test-set results are computed on demand.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Report the evaluation for each split, training data first.
    reports = (
        ("\nOn Training Data:", y_train, y_train_scores),
        ("\nOn Test Data:", y_test, y_test_scores),
    )
    for heading, truth, scores in reports:
        print(heading)
        evaluate_print(clf_name, truth, scores)

    # Plot ground truth against the predicted labels.
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,233 @@ | ||
# -*- coding: utf-8 -*- | ||
"""Local Correlation Integral (LOCI). | ||
Part of the codes are adapted from https://github.com/Cloudy10/loci | ||
""" | ||
# Author: Winston Li <jk_zhengli@hotmail.com> | ||
# License: BSD 2 clause | ||
|
||
from __future__ import division | ||
from __future__ import print_function | ||
|
||
import numpy as np | ||
from sklearn.utils import check_array | ||
from sklearn.utils.validation import check_is_fitted | ||
from scipy.spatial.distance import pdist, squareform | ||
|
||
from .base import BaseDetector | ||
|
||
class LOCI(BaseDetector):
    r"""Local Correlation Integral.

    LOCI is highly effective for detecting outliers and groups of
    outliers (a.k.a. micro-clusters), which offers the following advantages
    and novelties: (a) It provides an automatic, data-dictated cut-off to
    determine whether a point is an outlier; in contrast, previous methods
    force users to pick cut-offs, without any hints as to what cut-off value
    is best for a given dataset. (b) It can provide a LOCI plot for each
    point; this plot summarizes a wealth of information about the data in
    the vicinity of the point, determining clusters, micro-clusters, their
    diameters and their inter-cluster distances. None of the existing
    outlier-detection methods can match this feature, because they output
    only a single number for each point: its outlierness score. (c) It can
    be computed as quickly as the best previous methods.

    Read more in the :cite:`papadimitriou2003loci`.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. It is forwarded to
        ``BaseDetector``; the labels computed in :meth:`fit` are derived
        from ``k``, not from this value.

    alpha : float, default = 0.5
        The neighbourhood parameter measures how large of a neighbourhood
        should be considered "local".

    k : int, default = 3
        An outlier cutoff threshold for determining whether or not a point
        should be considered an outlier.

    Attributes
    ----------
    decision_scores\_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold\_ : float
        The threshold is set by the user through ``k`` and defaults to 3.

    labels\_ : numpy array of shape (n_samples,), values 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    Examples
    --------
    >>> from pyod.models.loci import LOCI
    >>> from pyod.utils.data import generate_data
    >>> n_train = 50
    >>> n_test = 50
    >>> contamination = 0.1
    >>> X_train, y_train, X_test, y_test = generate_data(
    ...     n_train=n_train, n_test=n_test,
    ...     contamination=contamination, random_state=42)
    >>> clf = LOCI()
    >>> clf.fit(X_train)
    >>> print(clf.decision_scores_)
    """

    def __init__(self, contamination=0.1, alpha=0.5, k=3):
        super(LOCI, self).__init__(contamination=contamination)
        self._alpha = alpha
        self.threshold_ = k

    def _get_critical_values(self, dist_matrix, p_ix, r_max, r_min=0):
        """Computes the critical values of a given distance matrix.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        p_ix : int
            Row index of the point under consideration.

        r_max : float
            Maximum neighbourhood radius (inclusive).

        r_min : float, default = 0
            Minimum neighbourhood radius (exclusive).

        Returns
        -------
        cv : array, shape (n_critical_val, )
            Sorted array holding every distance ``d`` from the point with
            ``r_min < d <= r_max`` together with ``d / alpha``.
        """
        distances = dist_matrix[p_ix, :]
        mask = (r_min < distances) & (distances <= r_max)
        in_range = distances[mask]
        cv = np.sort(np.concatenate((in_range, in_range / self._alpha)))
        return cv

    def _get_sampling_N(self, dist_matrix, p_ix, r):
        """Computes the set of r-neighbours.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        p_ix : int
            Row index of the point under consideration.

        r : float
            Neighbourhood radius (inclusive).

        Returns
        -------
        sample : array, shape (n_sample, )
            Indices of the points whose distance to ``p_ix`` is at most
            ``r``.
        """
        p_distances = dist_matrix[p_ix, :]
        sample = np.nonzero(p_distances <= r)[0]
        return sample

    def _get_alpha_n(self, dist_matrix, indices, r):
        """Counts the points in the alpha neighbourhood.

        Parameters
        ----------
        dist_matrix : array-like, shape (n_samples, n_samples)
            The pairwise distance matrix of the samples.

        indices : int or array of int
            A single row index, or an array of row indices.

        r : float
            Neighbourhood radius; points strictly closer than
            ``r * alpha`` are counted.

        Returns
        -------
        alpha_n : int or array, shape (n_indices, )
            The count for a single index, or one count per index when an
            array of indices is given.
        """
        within = dist_matrix[indices, :] < (r * self._alpha)
        # Accept NumPy integer scalars as well as Python ints: both select
        # a single 1-d row, for which a per-row (axis=1) count is invalid.
        if isinstance(indices, (int, np.integer)):
            return np.count_nonzero(within)
        return np.count_nonzero(within, axis=1)

    def _calculate_decision_score(self, X):
        """Computes the outlier scores.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data points.

        Returns
        -------
        outlier_scores : list
            Returns the list of outlier scores for input dataset.
        """
        n_samples = X.shape[0]
        outlier_scores = [0] * n_samples
        dist_matrix = squareform(pdist(X, metric="euclidean"))
        r_max = dist_matrix.max() / self._alpha

        for p_ix in range(n_samples):
            critical_values = self._get_critical_values(
                dist_matrix, p_ix, r_max)
            for r in critical_values:
                neighbours = self._get_sampling_N(dist_matrix, p_ix, r)
                n_values = self._get_alpha_n(dist_matrix, neighbours, r)
                cur_alpha_n = self._get_alpha_n(dist_matrix, p_ix, r)
                n_hat = np.mean(n_values)
                mdef = 1 - (cur_alpha_n / n_hat)
                sigma_mdef = np.std(n_values) / n_hat
                # Scores are only recorded once the average neighbourhood
                # count reaches 20.
                if n_hat >= 20:
                    if sigma_mdef > 0:
                        outlier_scores[p_ix] = mdef / sigma_mdef
                    else:
                        # All neighbours have the same count.  The point
                        # is part of its own sampling neighbourhood, so
                        # mdef is 0 as well; record 0 instead of letting
                        # 0/0 put a nan into the scores.
                        outlier_scores[p_ix] = 0.0
                    # Stop at the first radius that flags the point.
                    if mdef > (self.threshold_ * sigma_mdef):
                        break
        return outlier_scores

    def fit(self, X, y=None):
        """Fit the model using X as training data.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Training data.

        y : optional (default=None)
            Passed to ``_set_n_classes``; not otherwise used here.

        Returns
        -------
        self : object
        """
        X = check_array(X)
        self._set_n_classes(y)
        outlier_scores = self._calculate_decision_score(X)
        self.decision_scores_ = np.array(outlier_scores)
        self.labels_ = (
            self.decision_scores_ > self.threshold_).astype('int').ravel()

        # calculate for predict_proba()
        self._mu = np.mean(self.decision_scores_)
        self._sigma = np.std(self.decision_scores_)
        return self

    def decision_function(self, X):
        """Compute the outlier scores of X.

        The scores are computed from the pairwise distances within ``X``
        itself; the training data is not consulted.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        outlier_scores : numpy array of shape (n_samples,)
            The outlier score of each input sample.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)
        outlier_scores = self._calculate_decision_score(X)
        return np.array(outlier_scores)
Oops, something went wrong.