# -*- coding: utf-8 -*-
"""Subspace Outlier Detection (SOD)
"""
# Author: Yahya Almardeny
# License: MIT

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array

from .base import BaseDetector


class SOD(BaseDetector):
    """Subspace outlier detection (SOD) algorithm.

    For every sample a reference set is built from its shared nearest
    neighbors (SNN). The attributes on which that reference set has a low
    variance span an axis-parallel subspace, and the outlier score is the
    normalized distance of the sample to the reference-set mean inside
    that subspace.

    The implementation is based on the work of
    Kriegel, H.P., Kröger, P., Schubert, E., Zimek, A., Outlier detection
    in axis-parallel subspaces of high dimensional data, 2009.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_neighbors : int, optional (default=10)
        Number of neighbors to use by default for k neighbors queries.

    ref_set : int, optional (default=5)
        Number of shared nearest neighbors used to create the reference
        set. Note that ``ref_set`` must be smaller than ``n_neighbors``.

    alpha : float in (0., 1.), optional (default=0.8)
        Lower limit for selecting the subspace: an attribute is relevant
        when its variance inside the reference set is below ``alpha``
        times the average attribute variance. 0.8 is the default
        suggested in the original paper.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, n_neighbors=10, ref_set=5,
                 alpha=0.8):
        super(SOD, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.ref_set = ref_set
        self.alpha = alpha

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).

        Returns
        -------
        self : object
            The fitted detector.

        Raises
        ------
        TypeError
            If ``ref_set`` is not smaller than ``n_neighbors``.
        """
        # The exception type is kept for backward compatibility with
        # callers that already catch it.
        if self.ref_set >= self.n_neighbors:
            raise TypeError(
                "Number of Neighbors should be greater than Reference Set")

        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.X_train_ = X
        self.n_train_ = X.shape[0]
        self.decision_scores_ = self.decision_function(X)

        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X.

        The scores are computed from the neighborhood structure of ``X``
        itself: the k nearest neighbors and the reference sets are
        searched among the rows of ``X``. ``X`` therefore has to contain
        more than ``n_neighbors`` samples. For consistency, outliers are
        assigned larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        X = check_array(X)
        return self._sod(X)

    def _snn(self, X=None):
        """Select the reference set of every sample via shared neighbors.

        First the k nearest neighbors of each sample are determined. The
        shared nearest neighbor (SNN) similarity of two samples is the
        number of entries their neighbor lists have in common, and the
        ``ref_set`` samples with the highest similarity form the
        reference set. Note that ``n_neighbors`` must be greater than
        ``ref_set``.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features), optional
            Samples to process. Defaults to the training data.

        Returns
        -------
        ref_inds : numpy array of shape (n_samples, ref_set)
            Row ``i`` holds the indices (into ``X``) of the reference set
            of sample ``i``.
        """
        if X is None:
            X = self.X_train_

        knn = NearestNeighbors(n_neighbors=self.n_neighbors)
        knn.fit(X)
        # Called without a query, kneighbors() does not count a sample as
        # its own neighbor.
        ind = np.asarray(knn.kneighbors(return_distance=False))

        n_samples = ind.shape[0]
        # Sample indices need the platform index type; a 16-bit container
        # would wrap around for data sets with more than 65535 rows.
        ref_inds = np.empty((n_samples, self.ref_set), dtype=np.intp)
        for i in range(n_samples):
            # Number of neighbors every sample shares with sample i.
            shared = np.sum(np.isin(ind, ind[i]), axis=1).ravel()
            # A sample must not be part of its own reference set: give it
            # the largest representable count so it is sorted to the
            # front, then skip that first position.
            shared[i] = np.iinfo(shared.dtype).max
            # Sorting per row keeps the working set at O(n_samples)
            # instead of materializing an n_samples x n_samples matrix.
            ref_inds[i] = np.argsort(shared)[::-1][1:self.ref_set + 1]

        return ref_inds

    def _sod(self, X=None):
        """Compute the SOD outlier score of every sample.

        The implemented method is based on the work of
        Kriegel, H.P., Kröger, P., Schubert, E., Zimek, A., Outlier
        detection in axis-parallel subspaces of high dimensional data,
        2009.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features), optional
            Samples to score. Defaults to the training data.

        Returns
        -------
        scores : numpy array of shape (n_samples,)
            The SOD outlier score of each sample. A sample whose
            reference set yields no relevant attribute gets a score of 0.
        """
        if X is None:
            X = self.X_train_

        ref_inds = self._snn(X)
        n_samples = X.shape[0]
        scores = np.zeros(n_samples)
        for i in range(n_samples):
            ref = X[ref_inds[i]]
            means = np.mean(ref, axis=0)  # mean of each attribute
            var_actual = np.var(ref, axis=0)  # variance of each attribute
            # The average squared distance of the reference set to its
            # mean, divided by the number of attributes, equals the mean
            # of the per-attribute variances.
            var_expect = self.alpha * np.mean(var_actual)

            # Attributes on which the reference set is tightly clustered
            # span the subspace the sample is evaluated in.
            relevant = var_actual < var_expect
            rel_dim = np.count_nonzero(relevant)
            if rel_dim > 0:
                sq_dev = np.square(X[i] - means)
                scores[i] = np.sqrt(np.sum(sq_dev[relevant]) / rel_dim)

        return scores


if __name__ == "__main__":
    # Smoke test carried over from pyod/test/test_sod.py. It is guarded so
    # that importing this module has no side effects.
    # TODO(Yue Zhao): replace with proper unit tests.
    from pyod.utils import generate_data

    X_train, y_train, X_test, y_test = generate_data(
        n_train=100, n_test=0, n_features=10, contamination=0.1,
        random_state=0)
    sod = SOD(contamination=0.1, n_neighbors=15, ref_set=10, alpha=0.8)
    sod.fit(X_train)
    print(sod.decision_scores_)
    print(sod.threshold_)
    print(sod.labels_)