Merge pull request #509 from yzhao062/development

v1.0.9
yzhao062 · Jun 25, 2023 · c0ddda4 · c0ddda4
2 parents c4aecd1 + 430684f
commit c0ddda4
Show file tree

Hide file tree

Showing 16 changed files with 354 additions and 190 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -177,3 +177,4 @@ v<1.0.8>, <03/08/2023> -- Improve clone compatibility (#471).
 v<1.0.8>, <03/08/2023> -- Add QMCD detector (#452).
 v<1.0.8>, <03/08/2023> -- Optimized ECDF and drop Statsmodels dependency (#467).
 v<1.0.9>, <03/19/2023> -- Hot fix for errors in ECOD and COPOD due to the issue of scipy.
+v<1.0.9>, <06/19/2023> -- Further integration of PyThresh.
diff --git a/README.rst b/README.rst
@@ -156,6 +156,7 @@ NeurIPS 2022 paper `ADBench: Anomaly Detection Benchmark Paper <https://www.andr
 * `ADBench Benchmark <#adbench-benchmark>`_
 * `Model Save & Load <#model-save--load>`_
 * `Fast Train with SUOD <#fast-train-with-suod>`_
+* `Thresholding Outlier Scores <#thresholding-outlier-scores>`_
 * `Implemented Algorithms <#implemented-algorithms>`_
 * `Quick Start for Outlier Detection <#quick-start-for-outlier-detection>`_
 * `How to Contribute <#how-to-contribute>`_
@@ -327,7 +328,25 @@ and  `SUOD example <https://github.com/yzhao062/pyod/blob/master/examples/suod_e
     clf = SUOD(base_estimators=detector_list, n_jobs=2, combination='average',
                verbose=False)
 
+----
+
+Thresholding Outlier Scores
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A more data-driven approach can be taken when setting the contamination level.
+By using a thresholding method, guessing an arbitrary value can be replaced
+with tested techniques for separating inliers and outliers. Refer to
+`PyThresh <https://github.com/KulikDM/pythresh>`_ for
+a more in-depth look at thresholding.
+
 
+.. code-block:: python
+
+    from pyod.models.knn import KNN
+    from pyod.models.thresholds import FILTER
+
+    # Set the outlier detection and thresholding methods
+    clf = KNN(contamination=FILTER())
 
 
 ----
@@ -337,7 +356,7 @@ and  `SUOD example <https://github.com/yzhao062/pyod/blob/master/examples/suod_e
 Implemented Algorithms
 ^^^^^^^^^^^^^^^^^^^^^^
 
-PyOD toolkit consists of three major functional groups:
+PyOD toolkit consists of four major functional groups:
 
 **(i) Individual Detection Algorithms** :
 
@@ -411,8 +430,43 @@ Combination          Median            Simple combination by taking the median o
 Combination          Majority Vote     Simple combination by taking the majority vote of the labels (weights can be used)                     2015   [#Aggarwal2015Theoretical]_
 ===================  ================  =====================================================================================================  =====  ========================================
 
-
-**(iii) Utility Functions**:
+**(iii) Outlier Detection Score Thresholding Methods**:
+
+==================================  ================  ================================================================ ====================================================================================================================
+Type                                Abbr              Algorithm                                                        Documentation                                    
+==================================  ================  ================================================================ ====================================================================================================================
+Kernel-Based                        AUCP              Area Under Curve Percentage                                      `AUCP <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.AUCP>`_
+Statistical Moment-Based            BOOT              Bootstrapping                                                    `BOOT <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.BOOT>`_ 
+Normality-Based                     CHAU              Chauvenet's Criterion                                            `CHAU <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.CHAU>`_
+Linear Model                        CLF               Trained Linear Classifier                                        `CLF <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.CLF>`_
+Cluster-Based                       CLUST             Clustering Based                                                 `CLUST <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.CLUST>`_
+Kernel-Based                        CPD               Change Point Detection                                           `CPD <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.CPD>`_
+Transformation-Based                DECOMP            Decomposition                                                    `DECOMP <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.DECOMP>`_
+Normality-Based                     DSN               Distance Shift from Normal                                       `DSN <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.DSN>`_
+Curve-Based                         EB                Elliptical Boundary                                              `EB <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.EB>`_
+Kernel-Based                        FGD               Fixed Gradient Descent                                           `FGD <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.FGD>`_
+Filter-Based                        FILTER            Filtering Based                                                  `FILTER <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.FILTER>`_
+Curve-Based                         FWFM              Full Width at Full Minimum                                       `FWFM <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.FWFM>`_
+Statistical Test-Based              GESD              Generalized Extreme Studentized Deviate                          `GESD <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.GESD>`_
+Filter-Based                        HIST              Histogram Based                                                  `HIST <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.HIST>`_
+Quantile-Based                      IQR               Inter-Quartile Region                                            `IQR <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.IQR>`_
+Statistical Moment-Based            KARCH             Karcher mean (Riemannian Center of Mass)                         `KARCH <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.KARCH>`_
+Statistical Moment-Based            MAD               Median Absolute Deviation                                        `MAD <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.MAD>`_
+Statistical Test-Based              MCST              Monte Carlo Shapiro Tests                                        `MCST <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.MCST>`_
+Ensembles-Based                     META              Meta-model Trained Classifier                                    `META <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.META>`_
+Transformation-Based                MOLL              Friedrichs' Mollifier                                            `MOLL <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.MOLL>`_
+Statistical Test-Based              MTT               Modified Thompson Tau Test                                       `MTT <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.MTT>`_
+Linear Model                        OCSVM             One-Class Support Vector Machine                                 `OCSVM <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.OCSVM>`_
+Quantile-Based                      QMCD              Quasi-Monte Carlo Discrepancy                                    `QMCD <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.QMCD>`_
+Linear Model                        REGR              Regression Based                                                 `REGR <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.REGR>`_
+Neural Networks                     VAE               Variational Autoencoder                                          `VAE <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.VAE>`_
+Curve-Based                         WIND              Topological Winding Number                                       `WIND <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.WIND>`_
+Transformation-Based                YJ                Yeo-Johnson Transformation                                       `YJ <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.YJ>`_
+Normality-Based                     ZSCORE            Z-score                                                          `ZSCORE <https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.thresholds.ZSCORE>`_
+==================================  ================  ================================================================ ====================================================================================================================
+
+
+**(iv) Utility Functions**:
 
 ===================  ======================  =====================================================================================================================================================  ======================================================================================================================================
 Type                 Name                    Function                                                                                                                                               Documentation

diff --git a/docs/about.rst b/docs/about.rst
@@ -5,10 +5,10 @@ About us
 Core Development Team
 ---------------------
 
-Yue Zhao (Ph.D. Student @ Carnegie Mellon University):
+Yue Zhao (Assistant Professor @ USC, Ph.D. @ CMU):
 
 - Initialized the project in 2017
-- `Homepage <https://www.andrew.cmu.edu/user/yuezhao2/>`_
+- `Homepage <https://viterbi-web.usc.edu/~yzhao010/>`_
 - `LinkedIn (Yue Zhao) <https://www.linkedin.com/in/yzhao062/>`_
 
 Zain Nasrullah (Data Scientist at RBC; MSc in Computer Science from University of Toronto):

diff --git a/docs/example.rst b/docs/example.rst
@@ -191,6 +191,45 @@ please navigate to **"/notebooks/Model Combination.ipynb"**
         Combination by AOM ROC:0.9257, precision @ rank n:0.4844
         Combination by MOA ROC:0.9263, precision @ rank n:0.4688
 
+Thresholding Example
+--------------------
+
+
+Full example: `threshold_example.py <https://github.com/yzhao062/pyod/blob/master/examples/threshold_example.py>`_
+
+1. Import models
+
+    .. code-block:: python
+
+        from pyod.models.knn import KNN   # kNN detector
+        from pyod.models.thresholds import FILTER  # Filter thresholder
+
+
+2. Generate sample data with :func:`pyod.utils.data.generate_data`:
+
+    .. code-block:: python
+
+        contamination = 0.1  # proportion of outliers
+        n_train = 200  # number of training points
+        n_test = 100  # number of testing points
+
+        X_train, X_test, y_train, y_test = generate_data(
+            n_train=n_train, n_test=n_test, contamination=contamination)
+
+3. Initialize a :class:`pyod.models.knn.KNN` detector, fit the model, and make
+   the prediction.
+
+    .. code-block:: python
+
+        # train kNN detector and apply FILTER thresholding
+        clf_name = 'KNN'
+        clf = KNN(contamination=FILTER())
+        clf.fit(X_train)
+
+        # get the prediction labels and outlier scores of the training data
+        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+        y_train_scores = clf.decision_scores_  # raw outlier scores
+
 .. rubric:: References
 
 .. bibliography::

diff --git a/docs/pyod.models.rst b/docs/pyod.models.rst
@@ -367,6 +367,15 @@ pyod.models.suod module
     :show-inheritance:
     :inherited-members:
 
+pyod.models.thresholds module
+-----------------------------
+
+.. automodule:: pyod.models.thresholds
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :inherited-members:
+
 pyod.models.vae module
 ----------------------
 

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -7,8 +7,10 @@ matplotlib
 nose
 numpy>=1.19
 numba==0.53 # need to lift this later see github for issue
+pyclustering
 pytest
-pythresh
+pythresh>=0.3.1
+ruptures
 scipy>=1.5.1
 scikit_learn>=0.20.0
 scikit-lego

diff --git a/examples/cd_example.py b/examples/cd_example.py
@@ -15,7 +15,6 @@
 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
 
-import numpy as np
 from pyod.models.cd import CD
 from pyod.utils.data import generate_data
 from pyod.utils.data import evaluate_print
@@ -30,22 +29,22 @@
     X_train, X_test, y_train, y_test = \
         generate_data(n_train=n_train,
                       n_test=n_test,
-                      n_features=2,
+                      n_features=5,
                       contamination=contamination,
                       random_state=42)
 
     # train CD detector
     clf_name = 'CD'
     clf = CD()
-    clf.fit(X_train, y_train)
+    clf.fit(X_train)
 
     # get the prediction labels and outlier scores of the training data
     y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
     y_train_scores = clf.decision_scores_  # raw outlier scores
 
     # get the prediction on the test data
-    y_test_pred = clf.predict(np.append(X_test, y_test.reshape(-1,1), axis=1))  # outlier labels (0 or 1)
-    y_test_scores = clf.decision_function(np.append(X_test, y_test.reshape(-1,1), axis=1))  # outlier scores
+    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+    y_test_scores = clf.decision_function(X_test)  # outlier scores
 
     # evaluate and print the results
     print("\nOn Training Data:")

diff --git a/examples/threshold_example.py b/examples/threshold_example.py
@@ -16,7 +16,7 @@
     os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
 
 from pyod.models.kde import KDE
-from pyod.models.thresholds import ALL
+from pyod.models.thresholds import FILTER
 from pyod.utils.data import generate_data
 from pyod.utils.data import evaluate_print
 from pyod.utils.example import visualize
@@ -36,7 +36,7 @@
 
     # train KDE detector
     clf_name = 'KDE'
-    clf = KDE(contamination=ALL())
+    clf = KDE(contamination=FILTER())
     clf.fit(X_train)
 
     # get the prediction labels and outlier scores of the training data

diff --git a/pyod/models/base.py b/pyod/models/base.py
@@ -173,7 +173,6 @@ def predict(self, X, return_confidence=False):
         # if this is a PyThresh object
         else:
             prediction = self.contamination.eval(pred_score)
-            print(self.contamination)
 
         if return_confidence:
             confidence = self.predict_confidence(X)