Skip to content

Commit

Permalink
Merge 8839606 into 9427947
Browse files Browse the repository at this point in the history
  • Loading branch information
vc1492a committed Sep 17, 2020
2 parents 9427947 + 8839606 commit b5b4fa3
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 20 deletions.
24 changes: 17 additions & 7 deletions PyNomaly/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@

try:
import numba
dynamic_range = numba.prange
except ImportError:
dynamic_range = range
pass

__author__ = 'Valentino Constantinou'
__version__ = '0.3.4'
__version__ = '0.4.0'
__license__ = 'Apache License, Version 2.0'


Expand Down Expand Up @@ -321,6 +323,9 @@ def new_f(*args, **kwds):
},
'progress_bar': {
'type': types[8]
},
'parallel': {
'type': types[9]
}
}
for x in kwds:
Expand All @@ -341,10 +346,10 @@ def new_f(*args, **kwds):
return decorator

@accepts(object, np.ndarray, np.ndarray, np.ndarray, (int, np.integer),
(int, np.integer), list, bool, bool)
(int, np.integer), list, bool, bool, bool)
def __init__(self, data=None, distance_matrix=None, neighbor_matrix=None,
extent=3, n_neighbors=10, cluster_labels=None,
use_numba=False, progress_bar=False) -> None:
use_numba=False, progress_bar=False, parallel=False) -> None:
self.data = data
self.distance_matrix = distance_matrix
self.neighbor_matrix = neighbor_matrix
Expand All @@ -359,6 +364,7 @@ def __init__(self, data=None, distance_matrix=None, neighbor_matrix=None,
self.local_outlier_probabilities = None
self._objects = {}
self.progress_bar = progress_bar
self.parallel = parallel
self.is_fit = False

if self.use_numba is True and 'numba' not in sys.modules:
Expand Down Expand Up @@ -524,7 +530,7 @@ def _compute_distance_and_neighbor_matrix(
"""

for i in range(clust_points_vector.shape[0]):
for j in range(i + 1, clust_points_vector.shape[0]):
for j in dynamic_range(i + 1, clust_points_vector.shape[0]):
p = ((i,), (j,))

diff = clust_points_vector[p[0]] - clust_points_vector[p[1]]
Expand All @@ -546,7 +552,7 @@ def _compute_distance_and_neighbor_matrix(

yield distances, indexes, i

def _distances(self, progress_bar: bool = False) -> None:
def _distances(self, progress_bar: bool = False, parallel: bool = False) -> None:
"""
Provides the distances between each observation and it's closest
neighbors. When input data is provided, calculates the euclidean
Expand All @@ -561,7 +567,11 @@ def _distances(self, progress_bar: bool = False) -> None:
dtype=float)
self.points_vector = self.Validate._data(self.data)
compute = numba.jit(self._compute_distance_and_neighbor_matrix,
cache=True) if self.use_numba else \
cache=False,
parallel=parallel,
nopython=parallel,
nogil=parallel
) if self.use_numba else \
self._compute_distance_and_neighbor_matrix
progress = "="
for cluster_id in set(self._cluster_labels()):
Expand Down Expand Up @@ -754,7 +764,7 @@ def fit(self) -> 'LocalOutlierProbability':

store = self._store()
if self.data is not None:
self._distances(progress_bar=self.progress_bar)
self._distances(progress_bar=self.progress_bar, parallel=self.parallel)
store = self._assign_distances(store)
store = self._ssd(store)
store = self._standard_distances(store)
Expand Down
7 changes: 6 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@ All notable changes to PyNomaly will be documented in this Changelog.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## 0.3.4
## 0.4.0
### Added
- Parallel processing capability through numba just-in-time
compilation was added as an option for computing the Local Outlier
Probability.

### Changed
- Unit tests from using the `sklearn.utils.testing` submodule
to standard Python assertions, as the submodule will be changed
Expand Down
23 changes: 18 additions & 5 deletions examples/numba_speed_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@

t1 = time.time()
scores_numpy = loop.LocalOutlierProbability(
data,
n_neighbors=3,
use_numba=False,
progress_bar=True
data,
n_neighbors=3,
use_numba=False,
progress_bar=True
).fit().local_outlier_probabilities
t2 = time.time()
seconds_no_numba = t2 - t1
Expand All @@ -24,8 +24,21 @@
data,
n_neighbors=3,
use_numba=True,
progress_bar=True
progress_bar=True,
parallel=False
).fit().local_outlier_probabilities
t4 = time.time()
seconds_numba = t4 - t3
print("\nComputation took " + str(seconds_numba) + " seconds with Numba JIT.")

t5 = time.time()
scores_numba_parallel = loop.LocalOutlierProbability(
data,
n_neighbors=3,
use_numba=True,
progress_bar=True,
parallel=True
).fit().local_outlier_probabilities
t6 = time.time()
seconds_numba_parallel = t6 - t5
print("\nComputation took " + str(seconds_numba_parallel) + " seconds with Numba JIT with parallel processing.")
25 changes: 22 additions & 3 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ LoOP is a local density based outlier detection method by Kriegel, Kröger, Schu
scores in the range of [0,1] that are directly interpretable as the probability of a sample being an outlier.

[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![PyPi](https://img.shields.io/badge/pypi-0.3.4-blue.svg)](https://pypi.python.org/pypi/PyNomaly/0.3.4)
[![PyPi](https://img.shields.io/badge/pypi-0.4.0-blue.svg)](https://pypi.python.org/pypi/PyNomaly/0.4.0)
![](https://img.shields.io/pypi/dm/PyNomaly.svg?logoColor=blue)
[![Build Status](https://travis-ci.org/vc1492a/PyNomaly.svg?branch=master)](https://travis-ci.org/vc1492a/PyNomaly)
[![Coverage Status](https://coveralls.io/repos/github/vc1492a/PyNomaly/badge.svg?branch=master)](https://coveralls.io/github/vc1492a/PyNomaly?branch=master)
Expand Down Expand Up @@ -39,13 +39,15 @@ to calculate the Local Outlier Probability of each sample.
- Python 3.5 - 3.8
- numpy >= 1.16.3
- python-utils >= 2.3.0
- (optional) numba >= 0.45.1
- (optional) numba >= 0.51.2
- (optional) SciPy >= 1.5.2

Numba just-in-time (JIT) compiles the function which calculates the Euclidean
distance between observations, providing a reduction in computation time
(significantly when a large number of observations are scored). Numba is not a
requirement and PyNomaly may still be used solely with numpy if desired
(details below).
(details below). When using Numba, [SciPy](https://www.scipy.org/) should
also be installed within the environment.

## Quick Start

Expand Down Expand Up @@ -110,6 +112,23 @@ speed of multiple calls to `LocalOutlierProbability()`, and PyNomaly has been
tested with Numba version 0.45.1. An example of the speed difference that can
be realized by using Numba is available in `examples/numba_speed_diff.py`.

Parallel processing is available when using PyNomaly with Numba -
simply set `parallel=True`:

```python
from PyNomaly import loop
m = loop.LocalOutlierProbability(data, use_numba=True, progress_bar=True, parallel=True).fit()
scores = m.local_outlier_probabilities
print(scores)
```

The benefits of using parallelism will vary depending on the CPU architecture (number of cores,
clock speed, etc.) and the shape of the data processed (number of observations
and features). In some cases, it may be best to use Numba to compile LoOP without
parallelization. Additionally, parallelization only applies to calculation of
distances between observations and thus will not be applied when supplying a
distance matrix from outside PyNomaly (more details below).

You may also choose to print progress bars _with or without_ the use of numba
by passing `progress_bar=True` to the `LocalOutlierProbability()` method as above.

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
setup(
name='PyNomaly',
packages=['PyNomaly'],
version='0.3.4',
version='0.4.0',
description='A Python 3 implementation of LoOP: Local Outlier '
'Probabilities, a local density based outlier detection '
'method providing an outlier score in the range of [0,1].',
author='Valentino Constantinou',
author_email='vc@valentino.io',
url='https://github.com/vc1492a/PyNomaly',
download_url='https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz',
download_url='https://github.com/vc1492a/PyNomaly/archive/0.4.0.tar.gz',
keywords=['outlier', 'anomaly', 'detection', 'machine', 'learning',
'probability'],
classifiers=[],
Expand Down
4 changes: 2 additions & 2 deletions tests/test_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,8 +486,8 @@ def test_data_format() -> None:
with pytest.warns(UserWarning) as record:
clf.fit()

# check that only one warning was raised
assert len(record) == 1
# check that at least one warning was raised
assert len(record) >= 1


def test_missing_values() -> None:
Expand Down

0 comments on commit b5b4fa3

Please sign in to comment.