Improve model docstrings
yzhao062 authored and yuezhao@cs.toronto.edu committed May 27, 2018
1 parent f5f7fa4 commit d33d308
Showing 10 changed files with 68 additions and 35 deletions.
10 changes: 5 additions & 5 deletions pyod/models/abod.py
@@ -59,17 +59,17 @@ def fit(self, X_train):
                                             100 * (1 - self.contamination))
         self.y_pred = (self.decision_scores > self.threshold_).astype('int')

-    def decision_function(self, X_test):
+    def decision_function(self, X):

         if not self._isfitted:
             NotFittedError('Model is not fitted yet')

-        X_test = check_array(X_test)
+        X = check_array(X)
         # initialize the output score
-        pred_score = np.zeros([X_test.shape[0], 1])
+        pred_score = np.zeros([X.shape[0], 1])

-        for i in range(X_test.shape[0]):
-            curr_pt = X_test[i, :]
+        for i in range(X.shape[0]):
+            curr_pt = X[i, :]

             # get the index pairs of the neighbors
             ind = list(range(0, self.n_train))
20 changes: 19 additions & 1 deletion pyod/models/base.py
@@ -1,3 +1,7 @@
+"""
+Abstract base class for outlier detector models
+"""
+
 from abc import ABC, abstractmethod
 import numpy as np

@@ -14,14 +18,28 @@ class BaseDetector(ABC):

     @abstractmethod
     def __init__(self, contamination=0.1):
+        """
+        :param contamination: percentage of outliers, range in (0, 0.5]
+        :type contamination: float
+        """
         self.contamination = contamination
         self.threshold_ = None
         self.decision_scores = None
         self.y_pred = None
         self._isfitted = False

     @abstractmethod
-    def decision_function(self, X_test):
+    def decision_function(self, X):
+        """
+        Anomaly score of X of the base classifiers.
+        The anomaly score of an input sample is computed based on
+        different detector algorithms.
+        For consistency, outliers have larger anomaly scores.
+        :param X: The training input samples. Sparse matrices are accepted
+            only if they are supported by the base estimator.
+        :type X: {array-like, sparse matrix}
+        :return: scores: The anomaly score of the input samples. The higher,
+            the more abnormal.
+        :rtype: array of shape (n_samples,)
+        """
         pass

     @abstractmethod
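To make the documented contract concrete, here is a minimal sketch of a detector built on this interface. It is illustrative only and not part of this commit: it assumes fit and decision_function are the abstract methods a subclass must implement (the hunk above is truncated, so there may be more), and the mean-distance score it uses is made up for the example.

# Illustrative sketch only, not part of commit d33d308.
# Assumes the BaseDetector attributes shown in the diff
# (contamination, decision_scores, threshold_, y_pred, _isfitted).
import numpy as np
from scipy.stats import scoreatpercentile

from pyod.models.base import BaseDetector


class MeanDistanceDetector(BaseDetector):
    """Toy detector: the score is the distance to the training mean."""

    def __init__(self, contamination=0.1):
        super().__init__(contamination=contamination)

    def fit(self, X_train):
        self.center_ = np.mean(X_train, axis=0)
        self.decision_scores = np.linalg.norm(X_train - self.center_, axis=1)
        self.threshold_ = scoreatpercentile(self.decision_scores,
                                            100 * (1 - self.contamination))
        self.y_pred = (self.decision_scores > self.threshold_).astype('int')
        self._isfitted = True
        return self

    def decision_function(self, X):
        # larger score = more abnormal, matching the docstring convention
        return np.linalg.norm(np.asarray(X) - self.center_, axis=1)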
10 changes: 5 additions & 5 deletions pyod/models/glosh.py
@@ -26,14 +26,14 @@ def fit(self, X_train):
         self.threshold = scoreatpercentile(self.scores,
                                            100 * (1 - self.contamination))

-    def decision_function(self, X_test):
+    def decision_function(self, X):

-        X_test = check_array(X_test)
+        X = check_array(X)
         # initialize the outputs
-        pred_score = np.zeros([X_test.shape[0], 1])
+        pred_score = np.zeros([X.shape[0], 1])

-        for i in range(X_test.shape[0]):
-            x_i = X_test[i, :]
+        for i in range(X.shape[0]):
+            x_i = X[i, :]

             x_i = np.asarray(x_i).reshape(1, x_i.shape[0])
             x_comb = np.concatenate((self.X_train, x_i), axis=0)
12 changes: 6 additions & 6 deletions pyod/models/hbos.py
@@ -81,15 +81,15 @@ def fit(self, X_train):
         self.mu = np.mean(self.decision_scores)
         self.sigma = np.std(self.decision_scores)

-    def decision_function(self, X_test):
+    def decision_function(self, X):

-        X_test = check_array(X_test)
-        n_test = X_test.shape[0]
+        X = check_array(X)
+        n_test = X.shape[0]
         out_scores = np.zeros([n_test, self.d])

         for i in range(self.d):
             # find histogram assignments of data points
-            bin_ind = np.digitize(X_test[:, i], self.bin_edges[:, i],
+            bin_ind = np.digitize(X[:, i], self.bin_edges[:, i],
                                   right=False)

             # very important to do scaling. Not necessary to use minmax
@@ -99,7 +99,7 @@ def decision_function(self, X):
             for j in range(n_test):
                 # out sample left
                 if bin_ind[j] == 0:
-                    dist = np.abs(X_test[j, i] - self.bin_edges[0, i])
+                    dist = np.abs(X[j, i] - self.bin_edges[0, i])
                     bin_width = self.bin_edges[1, i] - self.bin_edges[0, i]
                     # assign it to bin 0
                     if dist < bin_width * self.beta:
@@ -109,7 +109,7 @@ def decision_function(self, X):

                 # out sample right
                 elif bin_ind[j] == self.bin_edges.shape[0]:
-                    dist = np.abs(X_test[j, i] - self.bin_edges[-1, i])
+                    dist = np.abs(X[j, i] - self.bin_edges[-1, i])
                     bin_width = self.bin_edges[-1, i] - self.bin_edges[-2, i]
                     # assign it to bin k
                     if dist < bin_width * self.beta:
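The two out-of-range branches above lean on how numpy.digitize indexes points that fall outside the histogram: with right=False, a value below the first edge gets bin index 0 and a value at or above the last edge gets an index equal to the number of edges, which is exactly what the bin_ind[j] == 0 and bin_ind[j] == self.bin_edges.shape[0] checks catch. A small standalone illustration, not taken from hbos.py:

import numpy as np

edges = np.array([0.0, 1.0, 2.0, 3.0])   # bin edges for one feature
x = np.array([-0.4, 0.5, 2.9, 3.2])      # two in-range, two out-of-range points

bin_ind = np.digitize(x, edges, right=False)
print(bin_ind)                           # [0 1 3 4]

# index 0          -> left of the first edge  ("out sample left" branch)
# index len(edges) -> right of the last edge  ("out sample right" branch)
print(bin_ind == 0)                      # [ True False False False]
print(bin_ind == edges.shape[0])         # [False False False  True]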
4 changes: 2 additions & 2 deletions pyod/models/iforest.py
@@ -80,11 +80,11 @@ def fit(self, X_train, y=None, sample_weight=None):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):
         if not self._isfitted:
             NotFittedError('Model is not fitted yet')
         # invert scores. Outliers comes with higher scores
-        return self.detector_.decision_function(X_test) * -1
+        return self.detector_.decision_function(X) * -1

     @property
     def estimators_(self):
10 changes: 5 additions & 5 deletions pyod/models/knn.py
@@ -59,18 +59,18 @@ def fit(self, X_train):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):

         if not self._isfitted:
             NotFittedError('Model is not fitted yet')

-        X_test = check_array(X_test)
+        X = check_array(X)

         # initialize the output score
-        pred_score = np.zeros([X_test.shape[0], 1])
+        pred_score = np.zeros([X.shape[0], 1])

-        for i in range(X_test.shape[0]):
-            x_i = X_test[i, :]
+        for i in range(X.shape[0]):
+            x_i = X[i, :]
             x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

             # get the distance of the current point
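For orientation, the quantity this detector loops over, the distance from each query point to its k-th nearest training point, can be sketched directly with scikit-learn. The snippet below is a rough sketch of that idea rather than the pyod implementation; the data and the choice of n_neighbors=5 are arbitrary.

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(42)
X_train = rng.randn(200, 2)                                  # mostly inliers
X = np.vstack([rng.randn(10, 2),
               rng.uniform(4, 6, size=(3, 2))])              # a few obvious outliers

k = 5
tree = NearestNeighbors(n_neighbors=k).fit(X_train)
dist, _ = tree.kneighbors(X)                                 # shape (n_samples, k)

# outlier score = distance to the k-th nearest training point;
# larger scores indicate more abnormal points, matching pyod's convention
pred_score = dist[:, -1]
print(pred_score.round(3))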
4 changes: 2 additions & 2 deletions pyod/models/lof.py
@@ -53,12 +53,12 @@ def fit(self, X_train, y=None):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):
         if not self._isfitted:
             NotFittedError('Model is not fitted yet')

         # invert scores. Outliers comes with higher scores
-        return self.detector_._decision_function(X_test) * -1
+        return self.detector_._decision_function(X) * -1

     @property
     def negative_outlier_factor_(self):
4 changes: 2 additions & 2 deletions pyod/models/ocsvm.py
@@ -55,11 +55,11 @@ def fit(self, X_train, y=None, sample_weight=None, **params):

         return self

-    def decision_function(self, X_test):
+    def decision_function(self, X):
         if not self._isfitted:
             NotFittedError('Model is not fitted yet')
         # invert scores. Outliers comes with higher scores
-        return self.detector_.decision_function(X_test) * -1
+        return self.detector_.decision_function(X) * -1

     @property
     def support_(self):
2 changes: 2 additions & 0 deletions pyod/utils/load_data.py
@@ -20,8 +20,10 @@ def generate_data(n_train=1000, n_test=500, contamination=0.1,
     :param n_test: number of test points to generate
     :type n_test: int
     :param contamination: percentage of outliers
+    :type contamination: float
     :return: training data and test data (c_train and c_test are colors)
+    :rtype: tuple, (ndarray, ndarray, list, ndarray, ndarray, list)
     """
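As a usage sketch of the return value documented above: the :rtype: line implies a six-element tuple, with the two lists holding plotting colors. The unpacking order below (training arrays and colors first, then test) is an assumption for illustration and is not verified against this revision.

from pyod.utils.load_data import generate_data

# assumed unpacking order, see the note above
X_train, y_train, c_train, X_test, y_test, c_test = generate_data(
    n_train=1000, n_test=500, contamination=0.1)

print(X_train.shape, X_test.shape)  # e.g. (1000, 2) and (500, 2)
print(int(y_train.sum()))           # roughly contamination * n_train outliers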
27 changes: 20 additions & 7 deletions pyod/utils/utility.py
@@ -11,11 +11,18 @@
 def check_parameter_range(para, low=None, high=None):
     """
     check if input parameter is with in the range low and high
     :param para: the input parameter to check
-    :type para:
-    :param low:
-    :param high:
-    :return:
+    :type para: int, float
+    :param low: lower bound of the range
+    :type low: int, float
+    :param high: higher bound of the range
+    :type high: int, float
+    :return: whether the parameter is within the range of (low, high)
+    :rtype: bool
     """
     if low is None and high is None:
         raise ValueError('both low and high bounds are undefined')
@@ -36,20 +43,24 @@ def standardizer(X_train, X_test):
 def standardizer(X_train, X_test):
     """
     normalization function wrapper, z- normalization function
     :param X_train:
     :param X_test:
+    :return: X_train and X_test after the Z-score normalization
+    :rtype: tuple(ndarray, ndarray)
     """
     scaler = StandardScaler().fit(X_train)
-    return scaler.transform(X_train), scaler.transform(X_test)
+    return (scaler.transform(X_train), scaler.transform(X_test))


-def scores_to_lables(pred_scores, outlier_perc=1):
+def scores_to_lables(pred_scores, outlier_perc=0.1):
     """
     turn raw outlier scores to binary labels (0 or 1)
     :param pred_scores: raw outlier scores
     :param outlier_perc: percentage of outliers
+    :return: binary labels (1 stands for outlier)
+    :rtype: int
     """
     threshold = scoreatpercentile(pred_scores, 100 * (1 - outlier_perc))
     pred_labels = (pred_scores > threshold).astype('int')
@@ -58,11 +69,13 @@ def scores_to_lables(pred_scores, outlier_perc=1):

 def precision_n_scores(y, y_pred, n=None):
     """
-    Utlity function to calculate precision@ rank n_train
+    Utility function to calculate precision @ rank n
     :param y: ground truth
     :param y_pred: number of outliers
     :param n: number of outliers, if not defined, infer using ground truth
+    :return: precision at rank n score
+    :rtype: float
     """

     # turn prediction scores into binary labels
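The precision at rank n described above can be written out in a few lines of numpy/scipy, mirroring the percentile thresholding used in scores_to_lables. This is a worked sketch of the metric with made-up numbers, not the pyod function itself:

import numpy as np
from scipy.stats import scoreatpercentile

y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])                   # ground truth, 2 true outliers
y_score = np.array([.1, .2, .1, .3, .2, .1, .4, .2, .9, .35])  # raw outlier scores

outlier_perc = 0.2                                   # fraction flagged as outliers
threshold = scoreatpercentile(y_score, 100 * (1 - outlier_perc))
y_hat = (y_score > threshold).astype('int')          # binary labels, 1 = outlier

# precision @ rank n: fraction of flagged points that are true outliers
precision_at_n = (y_hat & y).sum() / y_hat.sum()
print(y_hat, precision_at_n)                         # 0.5 here: one of the two flags is correct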
