Commit aafb470

Merge pull request #485 from yzhao062/development

V1.0.8

yzhao062 committed Mar 8, 2023
2 parents 31db6d1 + 09626cd
Showing 31 changed files with 666 additions and 182 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
@@ -173,3 +173,6 @@ v<1.0.5>, <09/14/2022> -- Add ALAD.
v<1.0.6>, <09/23/2022> -- Update ADBench benchmark for NeurIPS 2022.
v<1.0.6>, <10/23/2022> -- Add KPCA.
v<1.0.7>, <12/14/2022> -- Enable automatic thresholding by pythresh (#454).
+v<1.0.8>, <03/08/2023> -- Improve clone compatibility (#471).
+v<1.0.8>, <03/08/2023> -- Add QMCD detector (#452).
+v<1.0.8>, <03/08/2023> -- Optimize ECDF and drop statsmodels dependency (#467).
4 changes: 3 additions & 1 deletion README.rst
@@ -200,7 +200,6 @@ Alternatively, you could clone and run setup.py file:
* scipy>=1.5.1
* scikit_learn>=0.20.0
* six
-* statsmodels

**Optional Dependencies (see details below)**\ :

@@ -351,6 +350,7 @@ Probabilistic FastABOD Fast Angle-Based Outlier Detection using approximation
Probabilistic COPOD COPOD: Copula-Based Outlier Detection 2020 [#Li2020COPOD]_
Probabilistic MAD Median Absolute Deviation (MAD) 1993 [#Iglewicz1993How]_
Probabilistic SOS Stochastic Outlier Selection 2012 [#Janssens2012Stochastic]_
+Probabilistic QMCD Quasi-Monte Carlo Discrepancy outlier detection 2001 [#Fang2001Wrap]_
Probabilistic KDE Outlier Detection with Kernel Density Functions 2007 [#Latecki2007Outlier]_
Probabilistic Sampling Rapid distance-based outlier detection via sampling 2013 [#Sugiyama2013Rapid]_
Probabilistic GMM Probabilistic Mixture Modeling for Outlier Analysis [#Aggarwal2015Outlier]_ [Ch.2]
@@ -566,6 +566,8 @@ Reference
.. [#Cook1977Detection] Cook, R.D., 1977. Detection of influential observation in linear regression. Technometrics, 19(1), pp.15-18.
+.. [#Fang2001Wrap] Fang, K.T. and Ma, C.X., 2001. Wrap-around L2-discrepancy of random sampling, Latin hypercube and uniform designs. Journal of Complexity, 17(4), pp.608-624.
.. [#Goldstein2012Histogram] Goldstein, M. and Dengel, A., 2012. Histogram-based outlier score (hbos): A fast unsupervised anomaly detection algorithm. In *KI-2012: Poster and Demo Track*\ , pp.59-63.
+.. [#Goodge2022Lunar] Goodge, A., Hooi, B., Ng, S.K. and Ng, W.S., 2022, June. LUNAR: Unifying local outlier detection methods via graph neural networks. In Proceedings of the AAAI Conference on Artificial Intelligence.
7 changes: 6 additions & 1 deletion docs/about.rst
@@ -59,4 +59,9 @@ Michiel Bongaerts (PhD student @ Erasmus Medical Centre Metabolomics & Genetics)
Adam Goodge (PhD Researcher @ National University of Singapore):

- Joined in 2022 (implemented LUNAR)
-- `LinkedIn (Adam Goodge) <https://www.linkedin.com/in/adam-goodge-33908691/>`_
+- `LinkedIn (Adam Goodge) <https://www.linkedin.com/in/adam-goodge-33908691/>`_

+Daniel Kulik (Machine Learning Developer; MSc Student @ University of the Free State):
+
+- Joined in 2022 (implemented integration with PyThresh and more)
+- `LinkedIn (Daniel Kulik) <https://www.linkedin.com/in/daniel-kulik-148256223>`_
1 change: 1 addition & 0 deletions docs/index.rst
@@ -186,6 +186,7 @@ Probabilistic ABOD Angle-Based Outlier Detection
Probabilistic FastABOD Fast Angle-Based Outlier Detection using approximation 2008 :class:`pyod.models.abod.ABOD` :cite:`a-kriegel2008angle`
Probabilistic MAD Median Absolute Deviation (MAD) 1993 :class:`pyod.models.mad.MAD` :cite:`a-iglewicz1993detect`
Probabilistic SOS Stochastic Outlier Selection 2012 :class:`pyod.models.sos.SOS` :cite:`a-janssens2012stochastic`
+Probabilistic QMCD Quasi-Monte Carlo Discrepancy outlier detection 2001 :class:`pyod.models.qmcd.QMCD` :cite:`a-fang2001wrap`
Probabilistic KDE Outlier Detection with Kernel Density Functions 2007 :class:`pyod.models.kde.KDE` :cite:`a-latecki2007outlier`
Probabilistic Sampling Rapid distance-based outlier detection via sampling 2013 :class:`pyod.models.sampling.Sampling` :cite:`a-sugiyama2013rapid`
Probabilistic GMM Probabilistic Mixture Modeling for Outlier Analysis :class:`pyod.models.gmm.GMM` :cite:`a-aggarwal2015outlier` [Ch.2]
1 change: 0 additions & 1 deletion docs/install.rst
@@ -33,7 +33,6 @@ Alternatively, you could clone and run setup.py file:
* scipy>=1.5.1
* scikit_learn>=0.20.0
* six
-* statsmodels


**Optional Dependencies (see details below)**:
9 changes: 9 additions & 0 deletions docs/pyod.models.rst
@@ -294,6 +294,15 @@ pyod.models.pca module
    :show-inheritance:
    :inherited-members:

+pyod.models.qmcd module
+-------------------------
+
+.. automodule:: pyod.models.qmcd
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :inherited-members:
+
pyod.models.rgraph module
-------------------------

1 change: 0 additions & 1 deletion docs/requirements.txt
@@ -15,7 +15,6 @@ scikit-lego
six
sphinx-rtd-theme
sphinxcontrib-bibtex
-statsmodels
suod
tensorflow
torch
11 changes: 11 additions & 0 deletions docs/zreferences.bib
@@ -478,4 +478,15 @@ @article{hoffmann2007kernel
  pages={863--874},
  year={2007},
  publisher={Elsevier}
}

+@article{fang2001wrap,
+  title={Wrap-around L2-discrepancy of random sampling, Latin hypercube and uniform designs},
+  author={Fang, Kai-Tai and Ma, Chang-Xing},
+  journal={Journal of Complexity},
+  volume={17},
+  number={4},
+  pages={608--624},
+  year={2001},
+  publisher={Elsevier}
+}
1 change: 0 additions & 1 deletion environment.yml
@@ -14,5 +14,4 @@ dependencies:
  - pip:
    - suod
    - combo
-    - statsmodels

58 changes: 58 additions & 0 deletions examples/qmcd_example.py
@@ -0,0 +1,58 @@
"""Example of using Quasi-Monte Carlo Discrepancy (QMCD) for
outlier detection
"""
# Author: D Kulik
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import numpy as np
from pyod.models.qmcd import QMCD
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train the QMCD detector (y is ignored in unsupervised learning)
    clf_name = 'QMCD'
    clf = QMCD()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data; pass only the features, never
    # the labels, so the dimensionality matches the training data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)
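The new detector scores samples by quasi-Monte Carlo discrepancy. For context, below is a minimal NumPy sketch of the wrap-around L2-discrepancy from the Fang and Ma (2001) reference added in this PR; the function name is hypothetical, features are assumed to be scaled to the unit hypercube, and pyod's QMCD may derive its per-sample scores differently (scipy.stats.qmc.discrepancy with method='WD' computes the same statistic):

import numpy as np

def wrap_around_l2_discrepancy(X):
    # Squared wrap-around L2-discrepancy (Fang & Ma, 2001):
    # WD^2(X) = -(4/3)^d + (1/n^2) * sum_{i,k} prod_j
    #           [3/2 - |x_ij - x_kj| * (1 - |x_ij - x_kj|)]
    # Assumes X has shape (n, d) with all values in [0, 1].
    n, d = X.shape
    diff = np.abs(X[:, None, :] - X[None, :, :])  # pairwise |x_i - x_k|, shape (n, n, d)
    prod = np.prod(1.5 - diff * (1.0 - diff), axis=2)
    return -(4.0 / 3.0) ** d + prod.sum() / n ** 2

Intuitively, a well-spread sample has low discrepancy, so discrepancy-based detectors flag the points that make the sample look least uniform.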
4 changes: 2 additions & 2 deletions pyod/models/cof.py
@@ -86,8 +86,7 @@ def __init__(self, contamination=0.1, n_neighbors=20, method="fast"):
        else:
            raise TypeError(
                "n_neighbors should be int. Got %s" % type(n_neighbors))
-        self.n_neighbors_ = n_neighbors
-        self.decision_scores_ = None
+        self.n_neighbors = n_neighbors
        self.method = method

    def fit(self, X, y=None):
@@ -108,6 +107,7 @@ def fit(self, X, y=None):
"""
X = check_array(X)
self.n_train_ = X.shape[0]
self.n_neighbors_ = self.n_neighbors

if self.n_neighbors_ >= self.n_train_:
self.n_neighbors_ = self.n_train_ - 1
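This is the clone-compatibility fix referenced in the changelog (#471): scikit-learn's clone() rebuilds an estimator from its __init__ parameters alone, so __init__ must store each hyperparameter under its own name and leave derived attributes such as n_neighbors_ to fit(). A minimal sketch of the behavior this enables, assuming the sklearn-style get_params() API that PyOD detectors expose:

from sklearn.base import clone
from pyod.models.cof import COF

clf = COF(n_neighbors=15)
cloned = clone(clf)  # reconstructs COF(**clf.get_params()), unfitted
assert cloned.get_params()['n_neighbors'] == 15
# n_neighbors_ is created inside fit(), so a clone carries no stale fitted state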
25 changes: 5 additions & 20 deletions pyod/models/copod.py
@@ -14,25 +14,10 @@
from joblib import Parallel, delayed
from scipy.stats import skew
from sklearn.utils import check_array
-from statsmodels.distributions.empirical_distribution import ECDF

from .base import BaseDetector
from .sklearn_base import _partition_estimators
-
-
-def ecdf(X):
-    """Calculated the empirical CDF of a given dataset.
-    Parameters
-    ----------
-    X : numpy array of shape (n_samples, n_features)
-        The training dataset.
-    Returns
-    -------
-    ecdf(X) : float
-        Empirical CDF of X
-    """
-    ecdf = ECDF(X)
-    return ecdf(X)
+from ..utils.stat_models import column_ecdf


def _parallel_ecdf(n_dims, X):
@@ -57,8 +42,8 @@ def _parallel_ecdf(n_dims, X):
    U_l_mat = np.zeros([X.shape[0], n_dims])
    U_r_mat = np.zeros([X.shape[0], n_dims])

    for i in range(n_dims):
-        U_l_mat[:, i] = ecdf(X[:, i])
-        U_r_mat[:, i] = ecdf(X[:, i] * -1)
+        U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
+        U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
    return U_l_mat, U_r_mat

@@ -141,8 +126,8 @@ def decision_function(self, X):
        if hasattr(self, 'X_train'):
            original_size = X.shape[0]
            X = np.concatenate((self.X_train, X), axis=0)
-        self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
-        self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))
+        self.U_l = -1 * np.log(column_ecdf(X))
+        self.U_r = -1 * np.log(column_ecdf(-X))

        skewness = np.sign(skew(X, axis=0))
        self.U_skew = self.U_l * -1 * np.sign(
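The same statsmodels-to-column_ecdf swap appears in ecod.py below. As a functional sketch (an illustration, not pyod's optimized implementation), the column-wise empirical CDF evaluated at the sample points could be written as:

from scipy.stats import rankdata

def column_ecdf_sketch(X):
    # For each column j: ecdf[i, j] = (# of rows k with X[k, j] <= X[i, j]) / n.
    # method='max' lets tied values share the highest rank, matching the
    # "proportion of samples <= x" definition of the empirical CDF.
    return rankdata(X, method='max', axis=0) / X.shape[0]

COPOD and ECOD then take -log of these per-column tail probabilities (the U_l and U_r matrices above), so rare, extreme values contribute large outlier scores.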
25 changes: 5 additions & 20 deletions pyod/models/ecod.py
@@ -15,25 +15,10 @@
from joblib import Parallel, delayed
from scipy.stats import skew
from sklearn.utils import check_array
-from statsmodels.distributions.empirical_distribution import ECDF

from .base import BaseDetector
from .sklearn_base import _partition_estimators
-
-
-def ecdf(X):
-    """Calculated the empirical CDF of a given dataset.
-    Parameters
-    ----------
-    X : numpy array of shape (n_samples, n_features)
-        The training dataset.
-    Returns
-    -------
-    ecdf(X) : float
-        Empirical CDF of X
-    """
-    ecdf = ECDF(X)
-    return ecdf(X)
+from ..utils.stat_models import column_ecdf


def _parallel_ecdf(n_dims, X):
@@ -58,8 +43,8 @@ def _parallel_ecdf(n_dims, X):
    U_l_mat = np.zeros([X.shape[0], n_dims])
    U_r_mat = np.zeros([X.shape[0], n_dims])

    for i in range(n_dims):
-        U_l_mat[:, i] = ecdf(X[:, i])
-        U_r_mat[:, i] = ecdf(X[:, i] * -1)
+        U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
+        U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
    return U_l_mat, U_r_mat

@@ -143,8 +128,8 @@ def decision_function(self, X):
        if hasattr(self, 'X_train'):
            original_size = X.shape[0]
            X = np.concatenate((self.X_train, X), axis=0)
-        self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
-        self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))
+        self.U_l = -1 * np.log(column_ecdf(X))
+        self.U_r = -1 * np.log(column_ecdf(-X))

        skewness = np.sign(skew(X, axis=0))
        self.U_skew = self.U_l * -1 * np.sign(
20 changes: 9 additions & 11 deletions pyod/models/mad.py
@@ -59,13 +59,10 @@ def __init__(self, threshold=3.5):
        # contamination is unneeded since threshold must be
        # decided manually by the user
        super(MAD, self).__init__()
-        self.decision_scores_ = None
        if not isinstance(threshold, (float, int)):
            raise TypeError(
                'threshold must be a number. Got {}'.format(type(threshold)))
-        self.threshold_ = threshold
-        self.median = None
-        self.median_diff = None
+        self.threshold = threshold

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.
@@ -86,8 +83,9 @@ def fit(self, X, y=None):
        X = check_array(X, ensure_2d=False, force_all_finite=False)
        _check_dim(X)
        self._set_n_classes(y)
-        self.median = None  # reset median after each call
-        self.median_diff = None  # reset median_diff after each call
+        self.threshold_ = self.threshold
+        self.median_ = None  # reset median after each call
+        self.median_diff_ = None  # reset median_diff after each call
        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()

@@ -127,10 +125,10 @@ def _mad(self, X):
"""
obs = np.reshape(X, (-1, 1))
# `self.median` will be None only before `fit()` is called
self.median = np.nanmedian(obs) if self.median is None else self.median
diff = np.abs(obs - self.median)
self.median_diff = np.nanmedian(diff) if self.median_diff is None else self.median_diff
return np.nan_to_num(np.ravel(0.6745 * diff / self.median_diff))
self.median_ = np.nanmedian(obs) if self.median_ is None else self.median_
diff = np.abs(obs - self.median_)
self.median_diff_ = np.nanmedian(diff) if self.median_diff_ is None else self.median_diff_
return np.nan_to_num(np.ravel(0.6745 * diff / self.median_diff_))

def _process_decision_scores(self):
"""This overrides PyOD base class function in order to use the
@@ -144,7 +142,7 @@ def _process_decision_scores(self):
        -------
        self
        """
-        self.labels_ = (self.decision_scores_ > self.threshold_).astype('int').ravel()
+        self.labels_ = (self.decision_scores_ > self.threshold).astype('int').ravel()

        # calculate for predict_proba()
        self._mu = np.nanmean(self.decision_scores_)
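The mad.py renames follow the same estimator convention as the cof.py change: hyperparameters keep their public names in __init__, and anything computed during fitting gets a trailing underscore. A toy illustration of the pattern (not PyOD's actual MAD class):

import numpy as np
from sklearn.base import BaseEstimator

class MADLike(BaseEstimator):
    def __init__(self, threshold=3.5):
        self.threshold = threshold  # hyperparameter: stored verbatim

    def fit(self, X):
        obs = np.asarray(X, dtype=float).ravel()
        self.median_ = np.nanmedian(obs)  # fitted state: trailing underscore
        self.median_diff_ = np.nanmedian(np.abs(obs - self.median_))
        # 0.6745 makes the score comparable to a z-score under normality
        scores = 0.6745 * np.abs(obs - self.median_) / self.median_diff_
        self.labels_ = (scores > self.threshold).astype(int)
        return self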