Merge pull request #34 from winstonll/development

Added LOCI
yzhao062 · Dec 3, 2018 · f456fde · f456fde
2 parents d4c463a + 64382d2
commit f456fde
Show file tree

Hide file tree

Showing 3 changed files with 502 additions and 0 deletions.
diff --git a/examples/loci_example.py b/examples/loci_example.py
@@ -0,0 +1,139 @@
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# temporary solution for relative imports in case pyod is not installed
+# if pyod is installed, no need to use the following line
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
+
+from sklearn.utils import check_X_y
+import matplotlib.pyplot as plt
+from matplotlib.lines import Line2D
+
+from pyod.models.loci import LOCI
+from pyod.utils.data import generate_data
+from pyod.utils.data import get_color_codes
+from pyod.utils.data import evaluate_print
+
+
+def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
+              y_test_pred, show_figure=True,
+              save_figure=False):  # pragma: no cover
+    """
+    Utility function for visualizing the results in examples
+    Internal use only
+
+    :param clf_name: The name of the detector
+    :type clf_name: str
+
+    :param X_train: The training samples
+    :param X_train: numpy array of shape (n_samples, n_features)
+
+    :param y_train: The ground truth of training samples
+    :type y_train: list or array of shape (n_samples,)
+
+    :param X_test: The test samples
+    :type X_test: numpy array of shape (n_samples, n_features)
+
+    :param y_test: The ground truth of test samples
+    :type y_test: list or array of shape (n_samples,)
+
+    :param y_train_pred: The predicted outlier scores on the training samples
+    :type y_train_pred: numpy array of shape (n_samples, n_features)
+
+    :param y_test_pred: The predicted outlier scores on the test samples
+    :type y_test_pred: numpy array of shape (n_samples, n_features)
+
+    :param show_figure: If set to True, show the figure
+    :type show_figure: bool, optional (default=True)
+
+    :param save_figure: If set to True, save the figure to the local
+    :type save_figure: bool, optional (default=False)
+    """
+
+    if X_train.shape[1] != 2 or X_test.shape[1] != 2:
+        raise ValueError("Input data has to be 2-d for visualization. The "
+                         "input data has {shape}.".format(shape=X_train.shape))
+
+    X_train, y_train = check_X_y(X_train, y_train)
+    X_test, y_test = check_X_y(X_test, y_test)
+    c_train = get_color_codes(y_train)
+    c_test = get_color_codes(y_test)
+
+    fig = plt.figure(figsize=(12, 10))
+    plt.suptitle("Demo of {clf_name}".format(clf_name=clf_name))
+
+    fig.add_subplot(221)
+    plt.scatter(X_train[:, 0], X_train[:, 1], c=c_train)
+    plt.title('Train ground truth')
+    legend_elements = [Line2D([0], [0], marker='o', color='w', label='normal',
+                              markerfacecolor='b', markersize=8),
+                       Line2D([0], [0], marker='o', color='w', label='outlier',
+                              markerfacecolor='r', markersize=8)]
+
+    plt.legend(handles=legend_elements, loc=4)
+
+    fig.add_subplot(222)
+    plt.scatter(X_test[:, 0], X_test[:, 1], c=c_test)
+    plt.title('Test ground truth')
+    plt.legend(handles=legend_elements, loc=4)
+
+    fig.add_subplot(223)
+    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train_pred)
+    plt.title('Train prediction by {clf_name}'.format(clf_name=clf_name))
+    legend_elements = [Line2D([0], [0], marker='o', color='w', label='normal',
+                              markerfacecolor='0', markersize=8),
+                       Line2D([0], [0], marker='o', color='w', label='outlier',
+                              markerfacecolor='yellow', markersize=8)]
+    plt.legend(handles=legend_elements, loc=4)
+
+    fig.add_subplot(224)
+    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test_pred)
+    plt.title('Test prediction by {clf_name}'.format(clf_name=clf_name))
+    plt.legend(handles=legend_elements, loc=4)
+
+    if save_figure:
+        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)
+    if show_figure:
+        plt.show()
+    return
+
+
+if __name__ == "__main__":
+    contamination = 0.1  # percentage of outliers
+    n_train = 200  # number of training points
+    n_test = 100  # number of testing points
+
+    # Generate sample data
+    X_train, y_train, X_test, y_test = \
+        generate_data(n_train=n_train,
+                      n_test=n_test,
+                      n_features=2,
+                      contamination=contamination,
+                      random_state=42)
+
+    # train LOCI detector
+    clf_name = 'LOCI'
+    clf = LOCI()
+    clf.fit(X_train)
+
+    # get the prediction labels and outlier scores of the training data
+    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+    y_train_scores = clf.decision_scores_  # raw outlier scores
+
+    # get the prediction on the test data
+    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+    y_test_scores = clf.decision_function(X_test)  # outlier scores
+
+    # evaluate and print the results
+    print("\nOn Training Data:")
+    evaluate_print(clf_name, y_train, y_train_scores)
+    print("\nOn Test Data:")
+    evaluate_print(clf_name, y_test, y_test_scores)
+
+    # visualize the results
+    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
+              y_test_pred, show_figure=True, save_figure=False)
diff --git a/pyod/models/loci.py b/pyod/models/loci.py
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+"""Local Correlation Integral (LOCI).
+Part of the codes are adapted from https://github.com/Cloudy10/loci
+"""
+# Author: Winston Li <jk_zhengli@hotmail.com>
+# License: BSD 2 clause
+
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted
+from scipy.spatial.distance import pdist, squareform
+
+from .base import BaseDetector
+
+class LOCI(BaseDetector):
+    """Local Correlation Integral.
+    
+    LOCI is highly effective for detecting outliers and groups of 
+    outliers ( a.k.a.micro-clusters), which offers the following advantages 
+    and novelties: (a) It provides an automatic, data-dictated cut-off to 
+    determine whether a point is an outlier—in contrast, previous methods 
+    force users to pick cut-offs, without any hints as to what cut-off value 
+    is best for a given dataset. (b) It can provide a LOCI plot for each 
+    point; this plot summarizes a wealth of information about the data in 
+    the vicinity of the point, determining clusters, micro-clusters, their 
+    diameters and their inter-cluster distances. None of the existing 
+    outlier-detection methods can match this feature, because they output 
+    only a single number for each point: its outlierness score.(c) It can 
+    be computed as quickly as the best previous methods
+    Read more in the :cite:`papadimitriou2003loci`.
+    
+    Parameters
+    ----------
+    contamination : float in (0., 0.5), optional (default=0.1) 
+        The amount of contamination of the data set, i.e.
+        the proportion of outliers in the data set. Used when fitting to
+        define the threshold on the decision function.
+    
+    alpha : int, default = 0.5
+        The neighbourhood parameter measures how large of a neighbourhood
+        should be considered "local".
+    
+    k: int, default = 3
+        An outlier cutoff threshold for determine whether or not a point 
+        should be considered an outlier.
+     
+    Attributes
+    ----------
+    decision_scores\_: numpy array of shape (n_samples,)
+        The outlier scores of the training data.
+        The higher, the more abnormal. Outliers tend to have higher
+        scores. This value is available once the detector is
+        fitted.
+    
+    threshold\_: float
+        The threshold is set by the user and is defaulted to be 3 as
+        recommended by the authors.
+    
+    labels\_: int, either 0 or 1
+        The binary labels of the training data. 0 stands for inliers
+        and 1 for outliers/anomalies. It is generated by applying
+        ``threshold_`` on ``decision_scores_``.
+        
+        
+    Examples
+    --------
+    >>> from pyod.models.loci import LOCI
+    >>> from pyod.utils.data import generate_data
+    >>> n_train = 50
+    >>> n_test = 50
+    >>> contamination = 0.1
+    >>> X_train, y_train, X_test, y_test = generate_data(
+            n_train=n_train, n_test=n_test,
+            contamination=contamination, random_state=42)
+
+    >>> clf = LOCI()
+    >>> clf.fit(X_train)
+    >>> print(clf.decision_scores_)
+    """
+
+    def __init__(self, contamination = 0.1, alpha = 0.5, k = 3):
+        super(LOCI, self).__init__(contamination=contamination)
+        self._alpha = alpha
+        self.threshold_ = k
+
+    def _get_critical_values(self, dist_matrix, p_ix, r_max, r_min = 0):
+        """Computes the critical values of a given distance matrix.
+        
+        Parameters
+        ----------
+        dist_matrix : array-like, shape (n_samples, n_features)
+            The distance matrix w.r.t. to the training samples.
+        
+        p_ix : int
+            Subsetting index
+        
+        r_max : int
+            Maximum neighbourhood radius
+        
+        r_min : int, default = 0
+            Minimum neighbourhood radius
+            
+        Returns
+        -------
+        cv : array, shape (n_critical_val, )
+            Returns a list of critical values.       
+        """
+
+        distances = dist_matrix[p_ix, :]
+        mask = (r_min < distances) & (distances <= r_max)
+        cv = np.sort(np.concatenate((distances[mask], distances[mask]/self._alpha)))
+        return cv
+
+    def _get_sampling_N(self, dist_matrix, p_ix, r):
+        """Computes the set of r-neighbours.
+        
+        Parameters
+        ----------
+        dist_matrix : array-like, shape (n_samples, n_features)
+            The distance matrix w.r.t. to the training samples.
+        
+        p_ix : int
+            Subsetting index
+        
+        r : int
+            Neighbourhood radius
+        
+            
+        Returns
+        -------
+        sample : array, shape (n_sample, )
+            Returns a list of neighbourhood data points.       
+        """
+
+        p_distances = dist_matrix[p_ix, :]
+        sample = np.nonzero(p_distances <= r)[0]
+        return sample
+
+    def _get_alpha_n(self, dist_matrix, indices, r):
+        """Computes the alpha neighbourhood points.
+        
+        Parameters
+        ----------
+        dist_matrix : array-like, shape (n_samples, n_features)
+            The distance matrix w.r.t. to the training samples.
+        
+        indices : int
+            Subsetting index
+        
+        r : int
+            Neighbourhood radius
+            
+        Returns
+        -------
+        alpha_n : array, shape (n_alpha, )
+            Returns the alpha neighbourhood points.       
+        """
+
+        if type(indices) is int:
+            alpha_n = np.count_nonzero(
+                dist_matrix[indices, :] < (r * self._alpha))
+            return alpha_n
+        else:
+            alpha_n = np.count_nonzero(
+                dist_matrix[indices, :] < (r * self._alpha), axis=1)
+            return alpha_n
+
+    def _calculate_decision_score(self, X):
+        """Computes the outlier scores.
+        
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            The input data points.
+            
+        Returns
+        -------
+        outlier_scores : list
+            Returns the list of outlier scores for input dataset.       
+        """
+        outlier_scores = [0] * X.shape[0]
+        dist_matrix = squareform(pdist(X, metric="euclidean"))
+        max_dist = dist_matrix.max()
+        r_max = max_dist/self._alpha    
+
+        for p_ix in range(X.shape[0]):
+            critical_values = self._get_critical_values(dist_matrix, p_ix, r_max)
+            for r in critical_values:
+                n_values = self._get_alpha_n(dist_matrix, 
+                                             self._get_sampling_N(dist_matrix, p_ix, r), r)
+                cur_alpha_n = self._get_alpha_n(dist_matrix, p_ix, r)
+                n_hat = np.mean(n_values)
+                mdef = 1 - (cur_alpha_n/n_hat)
+                sigma_mdef = np.std(n_values)/n_hat
+                if n_hat >= 20:
+                    outlier_scores[p_ix] = mdef/sigma_mdef
+                    if mdef > (self.threshold_ * sigma_mdef):
+                        break
+        return outlier_scores
+
+    def fit(self, X, y=None):
+        """Fit the model using X as training data.
+        
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            Training data.
+            
+        Returns
+        -------
+        self : object
+
+        """
+        X = check_array(X)
+        self._set_n_classes(y)
+        outlier_scores = self._calculate_decision_score(X)
+        self.decision_scores_ = np.array(outlier_scores)
+        self.labels_ = (self.decision_scores_ > self.threshold_).astype('int').ravel()
+
+        # calculate for predict_proba()
+
+        self._mu = np.mean(self.decision_scores_)
+        self._sigma = np.std(self.decision_scores_)
+        return self
+
+    def decision_function(self, X):
+        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
+        X = check_array(X)
+        outlier_scores = self._calculate_decision_score(X)
+        return np.array(outlier_scores)