Improve model docstrings
yzhao062 authored and yuezhao@cs.toronto.edu committed May 27, 2018
1 parent f5f7fa4 commit d33d308
Showing 10 changed files with 68 additions and 35 deletions.
10 changes: 5 additions & 5 deletions pyod/models/abod.py
@@ -59,17 +59,17 @@ def fit(self, X_train):
                                             100 * (1 - self.contamination))
         self.y_pred = (self.decision_scores > self.threshold_).astype('int')

-    def decision_function(self, X_test):
+    def decision_function(self, X):

         if not self._isfitted:
             NotFittedError('Model is not fitted yet')

-        X_test = check_array(X_test)
+        X = check_array(X)
         # initialize the output score
-        pred_score = np.zeros([X_test.shape[0], 1])
+        pred_score = np.zeros([X.shape[0], 1])

-        for i in range(X_test.shape[0]):
-            curr_pt = X_test[i, :]
+        for i in range(X.shape[0]):
+            curr_pt = X[i, :]

             # get the index pairs of the neighbors
             ind = list(range(0, self.n_train))
20 changes: 19 additions & 1 deletion pyod/models/base.py
@@ -1,3 +1,7 @@
+"""
+Abstract base class for outlier detector models
+"""
+
 from abc import ABC, abstractmethod
 import numpy as np

@@ -14,14 +18,28 @@ class BaseDetector(ABC):

     @abstractmethod
     def __init__(self, contamination=0.1):
+        """
+        :param contamination: percentage of outliers, range in (0, 0.5]
+        :type contamination: float
+        """
         self.contamination = contamination
         self.threshold_ = None
         self.decision_scores = None
         self.y_pred = None
         self._isfitted = False

     @abstractmethod
-    def decision_function(self, X_test):
+    def decision_function(self, X):
+        """
+        Anomaly score of X of the base classifiers.
+        The anomaly score of an input sample is computed based on
+        different detector algorithms.
+        For consistency, outliers have larger anomaly scores.
+        :param X: The training input samples. Sparse matrices are accepted
+            only if they are supported by the base estimator.
+        :type X: {array-like, sparse matrix}
+        :return: scores: The anomaly score of the input samples. The higher,
+            the more abnormal.
+        :rtype: array of shape (n_samples,)
+        """
         pass

     @abstractmethod
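To make the documented contract concrete, here is a minimal sketch of a detector built on this interface. It is illustrative only and not part of this commit: it assumes fit and decision_function are the abstract methods a subclass must implement (the hunk above is truncated, so there may be more), and the mean-distance score it uses is made up for the example.

# Illustrative sketch only, not part of commit d33d308.
# Assumes the BaseDetector attributes shown in the diff
# (contamination, decision_scores, threshold_, y_pred, _isfitted).
import numpy as np
from scipy.stats import scoreatpercentile

from pyod.models.base import BaseDetector


class MeanDistanceDetector(BaseDetector):
    """Toy detector: the score is the distance to the training mean."""

    def __init__(self, contamination=0.1):
        super().__init__(contamination=contamination)

    def fit(self, X_train):
        self.center_ = np.mean(X_train, axis=0)
        self.decision_scores = np.linalg.norm(X_train - self.center_, axis=1)
        self.threshold_ = scoreatpercentile(self.decision_scores,
                                            100 * (1 - self.contamination))
        self.y_pred = (self.decision_scores > self.threshold_).astype('int')
        self._isfitted = True
        return self

    def decision_function(self, X):
        # larger score = more abnormal, matching the docstring convention
        return np.linalg.norm(np.asarray(X) - self.center_, axis=1)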
10 changes: 5 additions & 5 deletions pyod/models/glosh.py
@@ -26,14 +26,14 @@ def fit(self, X_train):
         self.threshold = scoreatpercentile(self.scores,
                                            100 * (1 - self.contamination))

-    def decision_function(self, X_test):
+    def decision_function(self, X):

-        X_test = check_array(X_test)
+        X = check_array(X)
         # initialize the outputs
-        pred_score = np.zeros([X_test.shape[0], 1])
+        pred_score = np.zeros([X.shape[0], 1])

-        for i in range(X_test.shape[0]):
-            x_i = X_test[i, :]
+        for i in range(X.shape[0]):
+            x_i = X[i, :]

             x_i = np.asarray(x_i).reshape(1, x_i.shape[0])
             x_comb = np.concatenate((self.X_train, x_i), axis=0)
12 changes: 6 additions & 6 deletions pyod/models/hbos.py
@@ -81,15 +81,15 @@ def fit(self, X_train):
         self.mu = np.mean(self.decision_scores)
         self.sigma = np.std(self.decision_scores)

-    def decision_function(self, X_test):
+    def decision_function(self, X):

-        X_test = check_array(X_test)
-        n_test = X_test.shape[0]
+        X = check_array(X)
+        n_test = X.shape[0]
         out_scores = np.zeros([n_test, self.d])

         for i in range(self.d):
             # find histogram assignments of data points
-            bin_ind = np.digitize(X_test[:, i], self.bin_edges[:, i],
+            bin_ind = np.digitize(X[:, i], self.bin_edges[:, i],
                                   right=False)

             # very important to do scaling. Not necessary to use minmax
@@ -99,7 +99,7 @@ def decision_function(self, X):
             for j in range(n_test):
                 # out sample left
                 if bin_ind[j] == 0:
-                    dist = np.abs(X_test[j, i] - self.bin_edges[0, i])
+                    dist = np.abs(X[j, i] - self.bin_edges[0, i])
                     bin_width = self.bin_edges[1, i] - self.bin_edges[0, i]
                     # assign it to bin 0
                     if dist < bin_width * self.beta:
@@ -109,7 +109,7 @@ def decision_function(self, X):

                 # out sample right
                 elif bin_ind[j] == self.bin_edges.shape[0]:
-                    dist = np.abs(X_test[j, i] - self.bin_edges[-1, i])
+                    dist = np.abs(X[j, i] - self.bin_edges[-1, i])
                     bin_width = self.bin_edges[-1, i] - self.bin_edges[-2, i]
                     # assign it to bin k
                     if dist < bin_width * self.beta:
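The two out-of-range branches above lean on how numpy.digitize indexes points that fall outside the histogram: with right=False, a value below the first edge gets bin index 0 and a value at or above the last edge gets an index equal to the number of edges, which is exactly what the bin_ind[j] == 0 and bin_ind[j] == self.bin_edges.shape[0] checks catch. A small standalone illustration, not taken from hbos.py:

import numpy as np

edges = np.array([0.0, 1.0, 2.0, 3.0])   # bin edges for one feature
x = np.array([-0.4, 0.5, 2.9, 3.2])      # two in-range, two out-of-range points

bin_ind = np.digitize(x, edges, right=False)
print(bin_ind)                           # [0 1 3 4]

# index 0          -> left of the first edge  ("out sample left" branch)
# index len(edges) -> right of the last edge  ("out sample right" branch)
print(bin_ind == 0)                      # [ True False False False]
print(bin_ind == edges.shape[0])         # [False False False  True]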
4 changes: 2 additions & 2 deletions pyod/models/iforest.py
@@ -80,11 +80,11 @@ def fit(self, X_train, y=None, sample_weight=None):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):
         if not self._isfitted:
             NotFittedError('Model is not fitted yet')
         # invert scores. Outliers comes with higher scores
-        return self.detector_.decision_function(X_test) * -1
+        return self.detector_.decision_function(X) * -1

     @property
     def estimators_(self):
10 changes: 5 additions & 5 deletions pyod/models/knn.py
@@ -59,18 +59,18 @@ def fit(self, X_train):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):

         if not self._isfitted:
             NotFittedError('Model is not fitted yet')

-        X_test = check_array(X_test)
+        X = check_array(X)

         # initialize the output score
-        pred_score = np.zeros([X_test.shape[0], 1])
+        pred_score = np.zeros([X.shape[0], 1])

-        for i in range(X_test.shape[0]):
-            x_i = X_test[i, :]
+        for i in range(X.shape[0]):
+            x_i = X[i, :]
             x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

             # get the distance of the current point
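For orientation, the quantity this detector loops over, the distance from each query point to its k-th nearest training point, can be sketched directly with scikit-learn. The snippet below is a rough sketch of that idea rather than the pyod implementation; the data and the choice of n_neighbors=5 are arbitrary.

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(42)
X_train = rng.randn(200, 2)                                  # mostly inliers
X = np.vstack([rng.randn(10, 2),
               rng.uniform(4, 6, size=(3, 2))])              # a few obvious outliers

k = 5
tree = NearestNeighbors(n_neighbors=k).fit(X_train)
dist, _ = tree.kneighbors(X)                                 # shape (n_samples, k)

# outlier score = distance to the k-th nearest training point;
# larger scores indicate more abnormal points, matching pyod's convention
pred_score = dist[:, -1]
print(pred_score.round(3))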
4 changes: 2 additions & 2 deletions pyod/models/lof.py
@@ -53,12 +53,12 @@ def fit(self, X_train, y=None):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):
         if not self._isfitted:
             NotFittedError('Model is not fitted yet')

         # invert scores. Outliers comes with higher scores
-        return self.detector_._decision_function(X_test) * -1
+        return self.detector_._decision_function(X) * -1

     @property
     def negative_outlier_factor_(self):
4 changes: 2 additions & 2 deletions pyod/models/ocsvm.py
@@ -55,11 +55,11 @@ def fit(self, X_train, y=None, sample_weight=None, **params):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):
         if not self._isfitted:
             NotFittedError('Model is not fitted yet')
         # invert scores. Outliers comes with higher scores
-        return self.detector_.decision_function(X_test) * -1
+        return self.detector_.decision_function(X) * -1

     @property
     def support_(self):
2 changes: 2 additions & 0 deletions pyod/utils/load_data.py
@@ -20,8 +20,10 @@ def generate_data(n_train=1000, n_test=500, contamination=0.1,
     :param n_test: number of test points to generate
     :type n_test: int
     :param contamination: percentage of outliers
+    :type contamination: float
     :return: training data and test data (c_train and c_test are colors)
+    :rtype: tuple, (ndarray, ndarray, list, ndarray, ndarray, list)
     """
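As a usage sketch of the return value documented above: the :rtype: line implies a six-element tuple, with the two lists holding plotting colors. The unpacking order below (training arrays and colors first, then test) is an assumption for illustration and is not verified against this revision.

from pyod.utils.load_data import generate_data

# assumed unpacking order, see the note above
X_train, y_train, c_train, X_test, y_test, c_test = generate_data(
    n_train=1000, n_test=500, contamination=0.1)

print(X_train.shape, X_test.shape)  # e.g. (1000, 2) and (500, 2)
print(int(y_train.sum()))           # roughly contamination * n_train outliers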
27 changes: 20 additions & 7 deletions pyod/utils/utility.py
@@ -11,11 +11,18 @@
 def check_parameter_range(para, low=None, high=None):
     """
     check if input parameter is with in the range low and high
     :param para: the input parameter to check
-    :type para:
-    :param low:
-    :param high:
-    :return:
+    :type para: int, float
+    :param low: lower bound of the range
+    :type low: int, float
+    :param high: higher bound of the range
+    :type high: int, float
+    :return: whether the parameter is within the range of (low, high)
+    :rtype: bool
     """
     if low is None and high is None:
         raise ValueError('both low and high bounds are undefined')
@@ -36,20 +43,24 @@ def standardizer(X_train, X_test):
 def standardizer(X_train, X_test):
     """
     normalization function wrapper, z- normalization function
     :param X_train:
     :param X_test:
+    :return: X_train and X_test after the Z-score normalization
+    :rtype: tuple(ndarray, ndarray)
     """
     scaler = StandardScaler().fit(X_train)
-    return scaler.transform(X_train), scaler.transform(X_test)
+    return (scaler.transform(X_train), scaler.transform(X_test))


-def scores_to_lables(pred_scores, outlier_perc=1):
+def scores_to_lables(pred_scores, outlier_perc=0.1):
     """
     turn raw outlier scores to binary labels (0 or 1)
     :param pred_scores: raw outlier scores
     :param outlier_perc: percentage of outliers
+    :return: binary labels (1 stands for outlier)
+    :rtype: int
     """
     threshold = scoreatpercentile(pred_scores, 100 * (1 - outlier_perc))
     pred_labels = (pred_scores > threshold).astype('int')
@@ -58,11 +69,13 @@ def scores_to_lables(pred_scores, outlier_perc=1):

 def precision_n_scores(y, y_pred, n=None):
     """
-    Utlity function to calculate precision@ rank n_train
+    Utility function to calculate precision @ rank n
     :param y: ground truth
     :param y_pred: number of outliers
     :param n: number of outliers, if not defined, infer using ground truth
+    :return: precision at rank n score
+    :rtype: float
     """

     # turn prediction scores into binary labels
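The precision at rank n described above can be written out in a few lines of numpy/scipy, mirroring the percentile thresholding used in scores_to_lables. This is a worked sketch of the metric with made-up numbers, not the pyod function itself:

import numpy as np
from scipy.stats import scoreatpercentile

y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])                   # ground truth, 2 true outliers
y_score = np.array([.1, .2, .1, .3, .2, .1, .4, .2, .9, .35])  # raw outlier scores

outlier_perc = 0.2                                   # fraction flagged as outliers
threshold = scoreatpercentile(y_score, 100 * (1 - outlier_perc))
y_hat = (y_score > threshold).astype('int')          # binary labels, 1 = outlier

# precision @ rank n: fraction of flagged points that are true outliers
precision_at_n = (y_hat & y).sum() / y_hat.sum()
print(y_hat, precision_at_n)                         # 0.5 here: one of the two flags is correct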
