
Commit

Merge b2d6606 into 575ee98
Lucew committed Dec 21, 2022
2 parents 575ee98 + b2d6606 commit b78d99b
Showing 11 changed files with 148 additions and 46 deletions.
1 change: 0 additions & 1 deletion README.rst
@@ -200,7 +200,6 @@ Alternatively, you could clone and run setup.py file:
* scipy>=1.5.1
* scikit_learn>=0.20.0
* six
* statsmodels

**Optional Dependencies (see details below)**\ :

1 change: 0 additions & 1 deletion docs/install.rst
@@ -33,7 +33,6 @@ Alternatively, you could clone and run setup.py file:
* scipy>=1.5.1
* scikit_learn>=0.20.0
* six
* statsmodels


**Optional Dependencies (see details below)**:
1 change: 0 additions & 1 deletion docs/requirements.txt
@@ -15,7 +15,6 @@ scikit-lego
six
sphinx-rtd-theme
sphinxcontrib-bibtex
statsmodels
suod
tensorflow
torch
1 change: 0 additions & 1 deletion environment.yml
@@ -14,5 +14,4 @@ dependencies:
- pip:
- suod
- combo
- statsmodels

25 changes: 5 additions & 20 deletions pyod/models/copod.py
@@ -14,27 +14,12 @@
from joblib import Parallel, delayed
from scipy.stats import skew
from sklearn.utils import check_array
from statsmodels.distributions.empirical_distribution import ECDF
from ..utils.stat_models import column_ecdf

from .base import BaseDetector
from .sklearn_base import _partition_estimators


def ecdf(X):
"""Calculated the empirical CDF of a given dataset.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The training dataset.
Returns
-------
ecdf(X) : float
Empirical CDF of X
"""
ecdf = ECDF(X)
return ecdf(X)


def _parallel_ecdf(n_dims, X):
"""Private method to calculate ecdf in parallel.
Parameters
@@ -57,8 +42,8 @@ def _parallel_ecdf(n_dims, X):
U_r_mat = np.zeros([X.shape[0], n_dims])

for i in range(n_dims):
U_l_mat[:, i] = ecdf(X[:, i])
U_r_mat[:, i] = ecdf(X[:, i] * -1)
U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
return U_l_mat, U_r_mat


@@ -141,8 +126,8 @@ def decision_function(self, X):
if hasattr(self, 'X_train'):
original_size = X.shape[0]
X = np.concatenate((self.X_train, X), axis=0)
self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))
self.U_l = -1 * np.log(column_ecdf(X))
self.U_r = -1 * np.log(column_ecdf(-X))

skewness = np.sign(skew(X, axis=0))
self.U_skew = self.U_l * -1 * np.sign(
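For context, the swap above is behavior-preserving: the new column_ecdf helper returns exactly the values the removed statsmodels-based ecdf produced, which the new test in pyod/test/test_stat_models.py verifies against statsmodels. A minimal sketch of that equivalence check (illustrative only, not part of this commit; it assumes statsmodels is still installed for the comparison):

import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF
from pyod.utils.stat_models import column_ecdf

X = np.random.rand(1000, 10)

# old path: build a statsmodels ECDF per column and evaluate it on that same column
old = np.apply_along_axis(lambda col: ECDF(col)(col), 0, X)

# new path: a single vectorized column-wise pass with no statsmodels dependency
new = column_ecdf(X)

np.testing.assert_array_equal(old, new)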
25 changes: 5 additions & 20 deletions pyod/models/ecod.py
@@ -15,27 +15,12 @@
from joblib import Parallel, delayed
from scipy.stats import skew
from sklearn.utils import check_array
from statsmodels.distributions.empirical_distribution import ECDF
from ..utils.stat_models import column_ecdf

from .base import BaseDetector
from .sklearn_base import _partition_estimators


def ecdf(X):
"""Calculated the empirical CDF of a given dataset.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The training dataset.
Returns
-------
ecdf(X) : float
Empirical CDF of X
"""
ecdf = ECDF(X)
return ecdf(X)


def _parallel_ecdf(n_dims, X):
"""Private method to calculate ecdf in parallel.
Parameters
@@ -58,8 +43,8 @@ def _parallel_ecdf(n_dims, X):
U_r_mat = np.zeros([X.shape[0], n_dims])

for i in range(n_dims):
U_l_mat[:, i] = ecdf(X[:, i])
U_r_mat[:, i] = ecdf(X[:, i] * -1)
U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
return U_l_mat, U_r_mat


@@ -143,8 +128,8 @@ def decision_function(self, X):
if hasattr(self, 'X_train'):
original_size = X.shape[0]
X = np.concatenate((self.X_train, X), axis=0)
self.U_l = -1 * np.log(np.apply_along_axis(ecdf, 0, X))
self.U_r = -1 * np.log(np.apply_along_axis(ecdf, 0, -X))
self.U_l = -1 * np.log(column_ecdf(X))
self.U_r = -1 * np.log(column_ecdf(-X))

skewness = np.sign(skew(X, axis=0))
self.U_skew = self.U_l * -1 * np.sign(
9 changes: 9 additions & 0 deletions pyod/test/test_copod_parallel.py
@@ -40,6 +40,10 @@ def setUp(self):
self.clf_ = COPOD(contamination=self.contamination)
self.clf_.fit(self.X_train)

def test_fit(self):
clf = COPOD(contamination=self.contamination, n_jobs=3)
clf.fit(self.X_train[:, :2])

def test_parameters(self):
assert (hasattr(self.clf, 'decision_scores_') and
self.clf.decision_scores_ is not None)
@@ -138,6 +142,11 @@ def test_predict_rank_normalized(self):
assert_array_less(pred_ranks, 1.01)
assert_array_less(-0.1, pred_ranks)

def test_fit_single_feature_multiple_jobs(self):
clf = COPOD(contamination=self.contamination, n_jobs=5)
with assert_raises(ValueError):
clf.fit(self.X_train[:, 0])

# def test_plot(self):
# os, cutoff1, cutoff2 = self.clf.explain_outlier(ind=1)
# assert_array_less(0, os)
9 changes: 9 additions & 0 deletions pyod/test/test_ecod_parallel.py
@@ -40,6 +40,10 @@ def setUp(self):
self.clf_ = ECOD(contamination=self.contamination)
self.clf_.fit(self.X_train)

def test_fit(self):
clf = ECOD(contamination=self.contamination, n_jobs=3)
clf.fit(self.X_train[:, :2])

def test_parameters(self):
assert (hasattr(self.clf, 'decision_scores_') and
self.clf.decision_scores_ is not None)
@@ -138,6 +142,11 @@ def test_predict_rank_normalized(self):
assert_array_less(pred_ranks, 1.01)
assert_array_less(-0.1, pred_ranks)

def test_fit_single_feature_multiple_jobs(self):
clf = ECOD(contamination=self.contamination, n_jobs=5)
with assert_raises(ValueError):
clf.fit(self.X_train[:, 0])

# def test_plot(self):
# os, cutoff1, cutoff2 = self.clf.explain_outlier(ind=1)
# assert_array_less(0, os)
53 changes: 53 additions & 0 deletions pyod/test/test_stat_models.py
@@ -20,6 +20,9 @@
from pyod.utils.stat_models import pairwise_distances_no_broadcast
from pyod.utils.stat_models import wpearsonr
from pyod.utils.stat_models import pearsonr_mat
from pyod.utils.stat_models import column_ecdf
import statsmodels.distributions
import time


class TestStatModels(unittest.TestCase):
@@ -65,6 +68,56 @@ def test_pearsonr_mat(self):
assert (np.min(pear_mat) >= -1)
assert (np.max(pear_mat) <= 1)

def test_njit_probability_reordering(self):
# trigger the njit compiler for one of the functions
# in the stat_models package
column_ecdf(self.mat)

def test_column_ecdf(self):

def ecdf(X):
"""Calculated the empirical CDF of a given dataset using the statsmodels function.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The training dataset.
Returns
-------
ecdf(X) : float
Empirical CDF of X
"""
ecdf = statsmodels.distributions.ECDF(X)
return ecdf(X)

# run a test case that has equal elements per feature column to show that
# this unlikely edge case also works
mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1], [2, 2, 2]])
assert_equal(column_ecdf(mat), np.apply_along_axis(ecdf, 0, mat))

# run the models multiple times for random matrices
new = []
old = []
for _ in range(50):

# create random matrix for testing
mat = np.random.rand(1000, 100)

# execute and measure the time of our own function
t = time.time()
result = column_ecdf(mat)
new.append(time.time() - t)

# execute the statsmodels function and measure execution time
t = time.time()
expected = np.apply_along_axis(ecdf, 0, mat)
old.append(time.time() - t)

# check that the results are equal
assert_equal(result, expected)

print(f'Statsmodels ECDF took {sum(old)/len(old)*1000:0.1f} ms '
f'and own implementation {sum(new)/len(new)*1000:0.1f} ms per run.')

def tearDown(self):
pass

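The timing comparison printed at the end of test_column_ecdf is suppressed by pytest's output capturing; a suggested invocation to see it (assuming pytest is installed and the command is run from the repository root):

python -m pytest pyod/test/test_stat_models.py -k test_column_ecdf -s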
66 changes: 66 additions & 0 deletions pyod/utils/stat_models.py
@@ -183,3 +183,69 @@ def pearsonr_mat(mat, w=None):
pear_mat[cy, cx] = curr_pear

return pear_mat


def column_ecdf(matrix: np.ndarray) -> np.ndarray:
"""
Utility function to compute the column-wise empirical cumulative distribution of a 2D feature matrix,
where the rows are samples and the columns are features per sample. The accumulation is done in the positive
direction of the sample axis.
E.g. for a sample with value probabilities p(0) = 0.3, p(1) = 0.2, p(2) = 0.1, p(6) = 0.4 and the
ECDF E(t) = p(x <= t), the result is
E(-1) = 0, E(0) = 0.3, E(1) = 0.5, E(2) = 0.6, E(3) = 0.6, E(4) = 0.6, E(5) = 0.6, E(6) = 1.
Similar to and tested against:
https://www.statsmodels.org/stable/generated/statsmodels.distributions.empirical_distribution.ECDF.html
Parameters
----------
matrix : numpy array of shape (n_samples, n_features)
The feature matrix for which the column-wise ECDF is computed.
Returns
-------
ecdf : numpy array of shape (n_samples, n_features)
The empirical CDF of each column, evaluated at the corresponding entries of the input matrix.
"""
# check the matrix dimensions
assert len(matrix.shape) == 2, 'Matrix needs to be two dimensional for the ECDF computation.'

# create a probability array the same shape as the feature matrix which we will reorder to build
# the ecdf
probabilities = np.linspace(np.ones(matrix.shape[1])/matrix.shape[0], np.ones(matrix.shape[1]), matrix.shape[0])

# get the sorting indices for a numpy array
sort_idx = np.argsort(matrix, axis=0)

# sort the numpy array, as we need to look for duplicates in the feature values (that would have different
# probabilities if we would just resort the probabilities array)
matrix = np.take_along_axis(matrix, sort_idx, axis=0)

# deal with equal values
ecdf_terminate_equals_inplace(matrix, probabilities)

# return the resorted accumulated probabilities (by reverting the sorting of the input matrix)
# looks a little complicated but is faster this way
reordered_probabilities = np.ones_like(probabilities)
np.put_along_axis(reordered_probabilities, sort_idx, probabilities, axis=0)
return reordered_probabilities


@njit
def ecdf_terminate_equals_inplace(matrix: np.ndarray, probabilities: np.ndarray):
"""
This is a helper function for computing the ECDF of an array. It has been factored out of the original
function so that numba's njit compiler can be used for increased speed, since the computation unfortunately
needs a loop over all rows and columns of the matrix. It acts in place on the probabilities matrix.
Parameters
----------
matrix : a feature matrix where the rows are samples and each column is a feature (expected to be sorted
column-wise in ascending order)
probabilities : a probability matrix that will be used to build the ecdf. It has values between 0 and 1 and
is also sorted column-wise.
Returns
-------
None, the probabilities matrix is modified in place.
"""
for cx in range(probabilities.shape[1]):
for rx in range(probabilities.shape[0]-2, -1, -1):
if matrix[rx, cx] == matrix[rx+1, cx]:
probabilities[rx, cx] = probabilities[rx+1, cx]
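To make the duplicate-termination step concrete, here is a small worked example (illustrative only, not part of this commit). Because E(t) = p(x <= t) counts all samples equal to t, every occurrence of a tied value must receive the probability of its last occurrence in the sorted column, which is what the backward loop above enforces:

import numpy as np
from pyod.utils.stat_models import column_ecdf

# one feature column where the value 1.0 occurs in three of four samples,
# so E(1.0) = 3/4 for every occurrence rather than the naive 1/4, 2/4, 3/4
mat = np.array([[1.0], [1.0], [1.0], [2.0]])
print(column_ecdf(mat).ravel())  # [0.75 0.75 0.75 1.  ]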
3 changes: 1 addition & 2 deletions requirements.txt
@@ -4,5 +4,4 @@ numpy>=1.19
numba>=0.51
scipy>=1.5.1
scikit_learn>=0.20.0
six
statsmodels
six
