
Commit

Merge b2d6606 into 575ee98
Lucew committed Dec 21, 2022
2 parents 575ee98 + b2d6606 commit b78d99b
Showing 11 changed files with 148 additions and 46 deletions.
1 change: 0 additions & 1 deletion README.rst
@@ -200,7 +200,6 @@ Alternatively, you could clone and run setup.py file:
* scipy>=1.5.1
* scikit_learn>=0.20.0
* six
* statsmodels

**Optional Dependencies (see details below)**\ :

1 change: 0 additions & 1 deletion docs/install.rst
@@ -33,7 +33,6 @@ Alternatively, you could clone and run setup.py file:
* scipy>=1.5.1
* scikit_learn>=0.20.0
* six
* statsmodels


**Optional Dependencies (see details below)**:
1 change: 0 additions & 1 deletion docs/requirements.txt
@@ -15,7 +15,6 @@ scikit-lego
six
sphinx-rtd-theme
sphinxcontrib-bibtex
statsmodels
suod
tensorflow
torch
1 change: 0 additions & 1 deletion environment.yml
@@ -14,5 +14,4 @@ dependencies:
- pip:
- suod
- combo
- statsmodels

25 changes: 5 additions & 20 deletions pyod/models/copod.py
@@ -14,27 +14,12 @@
from joblib import Parallel, delayed
from scipy.stats import skew
from sklearn.utils import check_array
from statsmodels.distributions.empirical_distribution import ECDF
from ..utils.stat_models import column_ecdf

from .base import BaseDetector
from .sklearn_base import _partition_estimators


def ecdf(X):
"""Calculated the empirical CDF of a given dataset.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The training dataset.
Returns
-------
ecdf(X) : float
Empirical CDF of X
"""
ecdf = ECDF(X)
return ecdf(X)


def _parallel_ecdf(n_dims, X):
"""Private method to calculate ecdf in parallel.
Parameters
@@ -57,8 +42,8 @@ def _parallel_ecdf(n_dims, X):
U_r_mat = np.zeros([X.shape[0], n_dims])

for i in range(n_dims):
U_l_mat[:, i] = ecdf(X[:, i])
U_r_mat[:, i] = ecdf(X[:, i] * -1)
U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
return U_l_mat, U_r_mat


@@ -141,8 +126,8 @@ def decision_function(self, X):
if hasattr(self, 'X_train'):
original_size = X.shape[0]
X = np.concatenate((self.X_train, X), axis=0)
self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))
self.U_l = -1 * np.log(column_ecdf(X))
self.U_r = -1 * np.log(column_ecdf(-X))

skewness = np.sign(skew(X, axis=0))
self.U_skew = self.U_l * -1 * np.sign(
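For context, the swap above is behavior-preserving: the new column_ecdf helper returns exactly the values the removed statsmodels-based ecdf produced, which the new test in pyod/test/test_stat_models.py verifies against statsmodels. A minimal sketch of that equivalence check (illustrative only, not part of this commit; it assumes statsmodels is still installed for the comparison):

import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF
from pyod.utils.stat_models import column_ecdf

X = np.random.rand(1000, 10)

# old path: build a statsmodels ECDF per column and evaluate it on that same column
old = np.apply_along_axis(lambda col: ECDF(col)(col), 0, X)

# new path: a single vectorized column-wise pass with no statsmodels dependency
new = column_ecdf(X)

np.testing.assert_array_equal(old, new)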
25 changes: 5 additions & 20 deletions pyod/models/ecod.py
@@ -15,27 +15,12 @@
from joblib import Parallel, delayed
from scipy.stats import skew
from sklearn.utils import check_array
from statsmodels.distributions.empirical_distribution import ECDF
from ..utils.stat_models import column_ecdf

from .base import BaseDetector
from .sklearn_base import _partition_estimators


def ecdf(X):
"""Calculated the empirical CDF of a given dataset.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The training dataset.
Returns
-------
ecdf(X) : float
Empirical CDF of X
"""
ecdf = ECDF(X)
return ecdf(X)


def _parallel_ecdf(n_dims, X):
"""Private method to calculate ecdf in parallel.
Parameters
@@ -58,8 +43,8 @@ def _parallel_ecdf(n_dims, X):
U_r_mat = np.zeros([X.shape[0], n_dims])

for i in range(n_dims):
U_l_mat[:, i] = ecdf(X[:, i])
U_r_mat[:, i] = ecdf(X[:, i] * -1)
U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
return U_l_mat, U_r_mat


@@ -143,8 +128,8 @@ def decision_function(self, X):
if hasattr(self, 'X_train'):
original_size = X.shape[0]
X = np.concatenate((self.X_train, X), axis=0)
self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))
self.U_l = -1 * np.log(column_ecdf(X))
self.U_r = -1 * np.log(column_ecdf(-X))

skewness = np.sign(skew(X, axis=0))
self.U_skew = self.U_l * -1 * np.sign(
9 changes: 9 additions & 0 deletions pyod/test/test_copod_parallel.py
@@ -40,6 +40,10 @@ def setUp(self):
self.clf_ = COPOD(contamination=self.contamination)
self.clf_.fit(self.X_train)

def test_fit(self):
clf = COPOD(contamination=self.contamination, n_jobs=3)
clf.fit(self.X_train[:, :2])

def test_parameters(self):
assert (hasattr(self.clf, 'decision_scores_') and
self.clf.decision_scores_ is not None)
@@ -138,6 +142,11 @@ def test_predict_rank_normalized(self):
assert_array_less(pred_ranks, 1.01)
assert_array_less(-0.1, pred_ranks)

def test_fit_single_feature_multiple_jobs(self):
clf = COPOD(contamination=self.contamination, n_jobs=5)
with assert_raises(ValueError):
clf.fit(self.X_train[:, 0])

# def test_plot(self):
# os, cutoff1, cutoff2 = self.clf.explain_outlier(ind=1)
# assert_array_less(0, os)
9 changes: 9 additions & 0 deletions pyod/test/test_ecod_parallel.py
@@ -40,6 +40,10 @@ def setUp(self):
self.clf_ = ECOD(contamination=self.contamination)
self.clf_.fit(self.X_train)

def test_fit(self):
clf = ECOD(contamination=self.contamination, n_jobs=3)
clf.fit(self.X_train[:, :2])

def test_parameters(self):
assert (hasattr(self.clf, 'decision_scores_') and
self.clf.decision_scores_ is not None)
@@ -138,6 +142,11 @@ def test_predict_rank_normalized(self):
assert_array_less(pred_ranks, 1.01)
assert_array_less(-0.1, pred_ranks)

def test_fit_single_feature_multiple_jobs(self):
clf = ECOD(contamination=self.contamination, n_jobs=5)
with assert_raises(ValueError):
clf.fit(self.X_train[:, 0])

# def test_plot(self):
# os, cutoff1, cutoff2 = self.clf.explain_outlier(ind=1)
# assert_array_less(0, os)
53 changes: 53 additions & 0 deletions pyod/test/test_stat_models.py
@@ -20,6 +20,9 @@
from pyod.utils.stat_models import pairwise_distances_no_broadcast
from pyod.utils.stat_models import wpearsonr
from pyod.utils.stat_models import pearsonr_mat
from pyod.utils.stat_models import column_ecdf
import statsmodels.distributions
import time


class TestStatModels(unittest.TestCase):
@@ -65,6 +68,56 @@ def test_pearsonr_mat(self):
assert (np.min(pear_mat) >= -1)
assert (np.max(pear_mat) <= 1)

def test_njit_probability_reordering(self):
# trigger the njit compiler for one of the functions
# in the stat_models package
column_ecdf(self.mat)

def test_column_ecdf(self):

def ecdf(X):
"""Calculated the empirical CDF of a given dataset using the statsmodels function.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The training dataset.
Returns
-------
ecdf(X) : float
Empirical CDF of X
"""
ecdf = statsmodels.distributions.ECDF(X)
return ecdf(X)

# run a test case that has equal elements per feature column to show that
# this unlikely edge case also works
mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1], [2, 2, 2]])
assert_equal(column_ecdf(mat), np.apply_along_axis(ecdf, 0, mat))

# run the models multiple times for random matrices
new = []
old = []
for _ in range(50):

# create random matrix for testing
mat = np.random.rand(1000, 100)

# execute and measure the time of our own function
t = time.time()
result = column_ecdf(mat)
new.append(time.time() - t)

# execute the statsmodels function and measure execution time
t = time.time()
expected = np.apply_along_axis(ecdf, 0, mat)
old.append(time.time() - t)

# check that the results are equal
assert_equal(result, expected)

print(f'Statsmodels ECDF took {sum(old)/len(old)*1000:0.1f} ms '
f'and own implementation {sum(new)/len(new)*1000:0.1f} ms per run.')

def tearDown(self):
pass

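The timing comparison printed at the end of test_column_ecdf is suppressed by pytest's output capturing; a suggested invocation to see it (assuming pytest is installed and the command is run from the repository root):

python -m pytest pyod/test/test_stat_models.py -k test_column_ecdf -s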
66 changes: 66 additions & 0 deletions pyod/utils/stat_models.py
@@ -183,3 +183,69 @@ def pearsonr_mat(mat, w=None):
pear_mat[cy, cx] = curr_pear

return pear_mat


def column_ecdf(matrix: np.ndarray) -> np.ndarray:
"""
Utility function to compute the column-wise empirical cumulative distribution of a 2D feature matrix,
where the rows are samples and the columns are features per sample. The accumulation is done in the positive
direction of the sample axis.
E.g. for a sample with value probabilities p(0) = 0.3, p(1) = 0.2, p(2) = 0.1, p(6) = 0.4 and the
ECDF E(t) = p(x <= t), the result is
E(-1) = 0, E(0) = 0.3, E(1) = 0.5, E(2) = 0.6, E(3) = 0.6, E(4) = 0.6, E(5) = 0.6, E(6) = 1.
Similar to and tested against:
https://www.statsmodels.org/stable/generated/statsmodels.distributions.empirical_distribution.ECDF.html
Parameters
----------
matrix : numpy array of shape (n_samples, n_features)
The feature matrix for which the column-wise ECDF is computed.
Returns
-------
ecdf : numpy array of shape (n_samples, n_features)
The empirical CDF of each column, evaluated at the corresponding entries of the input matrix.
"""
# check the matrix dimensions
assert len(matrix.shape) == 2, 'Matrix needs to be two dimensional for the ECDF computation.'

# create a probability array the same shape as the feature matrix which we will reorder to build
# the ecdf
probabilities = np.linspace(np.ones(matrix.shape[1])/matrix.shape[0], np.ones(matrix.shape[1]), matrix.shape[0])

# get the sorting indices for a numpy array
sort_idx = np.argsort(matrix, axis=0)

# sort the numpy array, as we need to look for duplicates in the feature values (that would have different
# probabilities if we would just resort the probabilities array)
matrix = np.take_along_axis(matrix, sort_idx, axis=0)

# deal with equal values
ecdf_terminate_equals_inplace(matrix, probabilities)

# return the resorted accumulated probabilities (by reverting the sorting of the input matrix)
# looks a little complicated but is faster this way
reordered_probabilities = np.ones_like(probabilities)
np.put_along_axis(reordered_probabilities, sort_idx, probabilities, axis=0)
return reordered_probabilities


@njit
def ecdf_terminate_equals_inplace(matrix: np.ndarray, probabilities: np.ndarray):
"""
This is a helper function for computing the ECDF of an array. It has been factored out of the original
function so that numba's njit compiler can be used for increased speed, since the computation unfortunately
needs a loop over all rows and columns of the matrix. It acts in place on the probabilities matrix.
Parameters
----------
matrix : a feature matrix where the rows are samples and each column is a feature (expected to be sorted
column-wise in ascending order)
probabilities : a probability matrix that will be used to build the ecdf. It has values between 0 and 1 and
is also sorted column-wise.
Returns
-------
None, the probabilities matrix is modified in place.
"""
for cx in range(probabilities.shape[1]):
for rx in range(probabilities.shape[0]-2, -1, -1):
if matrix[rx, cx] == matrix[rx+1, cx]:
probabilities[rx, cx] = probabilities[rx+1, cx]
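To make the duplicate-termination step concrete, here is a small worked example (illustrative only, not part of this commit). Because E(t) = p(x <= t) counts all samples equal to t, every occurrence of a tied value must receive the probability of its last occurrence in the sorted column, which is what the backward loop above enforces:

import numpy as np
from pyod.utils.stat_models import column_ecdf

# one feature column where the value 1.0 occurs in three of four samples,
# so E(1.0) = 3/4 for every occurrence rather than the naive 1/4, 2/4, 3/4
mat = np.array([[1.0], [1.0], [1.0], [2.0]])
print(column_ecdf(mat).ravel())  # [0.75 0.75 0.75 1.  ]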
3 changes: 1 addition & 2 deletions requirements.txt
@@ -4,5 +4,4 @@ numpy>=1.19
numba>=0.51
scipy>=1.5.1
scikit_learn>=0.20.0
six
statsmodels
six
