Merge pull request #61 from John-Almardeny/SOD_Yahya

SOD implementation (Merge to dev for further development)
yzhao062 · Mar 18, 2019 · 27bf057 · 27bf057
2 parents 42ccf55 + a48ebbd
commit 27bf057
Show file tree

Hide file tree

Showing 2 changed files with 179 additions and 0 deletions.
diff --git a/pyod/models/sod.py b/pyod/models/sod.py
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+"""Subspace Outlier Detection (SOD)
+"""
+# Author: Yahya Almardeny <almardeny@gmail.com>
+# License: MIT
+
+from sklearn.neighbors import NearestNeighbors
+import numpy as np
+from sklearn.utils import check_array
+from .base import BaseDetector
+
+
+class SOD(BaseDetector):
+    """Subspace Outlier Detection (SOD) detector.
+
+    Each observation is scored by its distance to the mean of a reference
+    set of shared nearest neighbors, measured only along the attributes on
+    which that reference set has low variance. Based on the work of
+    Kriegel, H.-P., Kroger, P., Schubert, E., Zimek, A., Outlier detection
+    in axis-parallel subspaces of high dimensional data, 2009.
+
+    Parameters
+    ----------
+    contamination : float in (0., 0.5), optional (default=0.1)
+        The amount of contamination of the data set, i.e.
+        the proportion of outliers in the data set. Used when fitting to
+        define the threshold on the decision function.
+
+    n_neighbors : int, optional (default=10)
+        Number of neighbors to use by default for k neighbors queries.
+
+    ref_set : int, optional (default=5)
+        Number of shared nearest neighbors used to build the reference
+        set. Must be smaller than ``n_neighbors``.
+
+    alpha : float in (0., 1.), optional (default=0.8)
+        Lower limit for selecting the subspace: an attribute is kept when
+        its variance in the reference set is below ``alpha`` times the
+        mean per-attribute variance. 0.8 is the default, as suggested in
+        the original paper.
+
+    Attributes
+    ----------
+    decision_scores_ : numpy array of shape (n_samples,)
+        The outlier scores of the training data.
+        The higher, the more abnormal. Outliers tend to have higher
+        scores. This value is available once the detector is
+        fitted.
+
+    threshold_ : float
+        The threshold is based on ``contamination``. It is the
+        ``n_samples * contamination`` most abnormal samples in
+        ``decision_scores_``. The threshold is calculated for generating
+        binary outlier labels.
+
+    labels_ : int, either 0 or 1
+        The binary labels of the training data. 0 stands for inliers
+        and 1 for outliers/anomalies. It is generated by applying
+        ``threshold_`` on ``decision_scores_``.
+    """
+
+    def __init__(self, contamination=0.1, n_neighbors=10, ref_set=5,
+                 alpha=0.8):
+        super(SOD, self).__init__(contamination=contamination)
+        self.n_neighbors = n_neighbors
+        self.ref_set = ref_set
+        self.alpha = alpha
+
+    def fit(self, X, y=None):
+        """Fit detector. y is optional for unsupervised methods.
+
+        Parameters
+        ----------
+        X : numpy array of shape (n_samples, n_features)
+            The input samples.
+
+        y : numpy array of shape (n_samples,), optional (default=None)
+            The ground truth of the input samples (labels).
+
+        Returns
+        -------
+        self : object
+            The fitted detector.
+
+        Raises
+        ------
+        TypeError
+            If ``ref_set`` is not smaller than ``n_neighbors``.
+        """
+        # TypeError (rather than ValueError) is kept so that existing
+        # callers catching it keep working.
+        if self.ref_set >= self.n_neighbors:
+            raise TypeError(
+                "Number of Neighbors should be greater than Reference Set")
+        # validate inputs X and y (optional)
+        X = check_array(X)
+        self._set_n_classes(y)
+
+        self.X_train_ = X
+        self.n_train_ = X.shape[0]
+        self.decision_scores_ = self.decision_function(X)
+
+        self._process_decision_scores()
+
+        return self
+
+    def decision_function(self, X):
+        """Predict raw anomaly score of X.
+
+        Neighbors and reference sets are drawn from ``X`` itself, so every
+        row of ``X`` receives its own score. For consistency with other
+        detectors, outliers are assigned larger anomaly scores.
+
+        Parameters
+        ----------
+        X : numpy array of shape (n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        anomaly_scores : numpy array of shape (n_samples,)
+            The anomaly score of the input samples.
+        """
+        return self._sod(check_array(X))
+
+    def _snn(self, X=None):
+        """Find the shared nearest neighbors (SNN) of every observation.
+
+        The k nearest neighbors of each observation are computed first.
+        The SNN similarity between observation i and observation j is the
+        number of entries in j's neighbor list that also occur in i's.
+        Note that k (``n_neighbors``) must be greater than l (``ref_set``).
+
+        :param X: samples to process; defaults to the stored training data.
+        :return: integer numpy array of shape (n_samples, ref_set) holding,
+                 for each observation, the indices of its ``ref_set`` most
+                 similar observations.
+        """
+        X = self.X_train_ if X is None else X
+        knn = NearestNeighbors(n_neighbors=self.n_neighbors)
+        knn.fit(X)
+        # Querying without an argument returns the neighbors of the fitted
+        # points themselves; np.asarray guards against a non-array return.
+        ind = np.asarray(knn.kneighbors(return_distance=False))
+        n = ind.shape[0]
+        # np.intp can hold any valid row index; the previous uint16 storage
+        # silently wrapped indices above 65535.
+        ref_inds = np.zeros(shape=(n, self.ref_set), dtype=np.intp)
+        for i in range(n):
+            # shared-neighbor count between observation i and every row
+            shared = np.sum(np.isin(ind, ind[i]), axis=1).ravel()
+            # The point must not be in its own reference set: give it the
+            # uint16 maximum (assumed to exceed any real count) so that it
+            # sorts first, then skip that first slot.
+            shared[i] = np.iinfo(np.uint16).max
+            ref_inds[i] = np.argsort(shared)[::-1][1:self.ref_set + 1]
+
+        return ref_inds
+
+    def _sod(self, X=None):
+        """Compute the SOD outlier score of every observation.
+
+        For each observation, the attributes whose variance within its
+        reference set is below ``alpha`` times the mean per-attribute
+        variance form the relevant subspace. The score is the root mean
+        squared deviation of the observation from the reference mean over
+        those attributes, or 0 when no attribute qualifies.
+
+        :param X: samples to score; defaults to the stored training data.
+        :return: numpy array of shape (n_samples,) with the SOD scores.
+        """
+        X = self.X_train_ if X is None else X
+        ref_inds = self._snn(X)
+        n_samples, n_features = X.shape
+        scores = np.zeros(n_samples)
+        for i in range(n_samples):
+            ref = X[ref_inds[i]]
+            means = np.mean(ref, axis=0)  # mean of each column
+            # average squared distance of the reference to the mean
+            var_total = np.sum(np.square(ref - means)) / self.ref_set
+            var_expect = self.alpha * var_total / n_features
+            # attributes with lower-than-expected variance are relevant
+            relevant = np.var(ref, axis=0) < var_expect
+            rel_dim = np.count_nonzero(relevant)
+            if rel_dim > 0:
+                sq_dev = np.square(X[i] - means)
+                scores[i] = np.sqrt(np.dot(relevant, sq_dev) / rel_dim)
+        return scores
diff --git a/pyod/test/test_sod.py b/pyod/test/test_sod.py
@@ -0,0 +1,15 @@
+from pyod.models.sod import SOD
+from pyod.utils import generate_data
+
+
+# TO-DO by Yue Zhao
+
+# Smoke run: fit SOD on generated data and print the fitted attributes.
+X_train, y_train, X_test, y_test = generate_data(
+    n_train=100,
+    n_test=0,
+    n_features=10,
+    contamination=0.1,
+    random_state=0,
+)
+
+sod = SOD(contamination=0.1, n_neighbors=15, ref_set=10, alpha=0.8)
+sod.fit(X_train)
+
+# Same output order as before: scores, threshold, then binary labels.
+for fitted_attr in ("decision_scores_", "threshold_", "labels_"):
+    print(getattr(sod, fitted_attr))