/
xgbod.py
459 lines (360 loc) · 16 KB
/
xgbod.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
# -*- coding: utf-8 -*-
"""XGBOD: Improving Supervised Outlier Detection with Unsupervised
Representation Learning. A semi-supervised outlier detection framework.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause
from __future__ import division
from __future__ import print_function
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.utils import check_array
from sklearn.utils.validation import check_X_y
from sklearn.utils.validation import check_is_fitted
try:
import xgboost
except ImportError:
print('please install xgboost first for running XGBOD')
from xgboost.sklearn import XGBClassifier
from .base import BaseDetector
from .knn import KNN
from .lof import LOF
from .iforest import IForest
from .hbos import HBOS
from .ocsvm import OCSVM
from ..utils.utility import check_parameter
from ..utils.utility import check_detector
from ..utils.utility import standardizer
from ..utils.utility import precision_n_scores
class XGBOD(BaseDetector):
    r"""XGBOD class for outlier detection.

    It first uses the passed in unsupervised outlier detectors to extract
    richer representation of the data and then concatenates the newly
    generated features to the original feature for constructing the augmented
    feature space. An XGBoost classifier is then applied on this augmented
    feature space. Read more in the :cite:`zhao2018xgbod`.

    Parameters
    ----------
    estimator_list : list, optional (default=None)
        The list of pyod detectors passed in for unsupervised learning.
        When None, a predefined pool of KNN, LOF, HBOS, OCSVM and IForest
        detectors is created at fit time.

    standardization_flag_list : list, optional (default=None)
        The list of boolean flags for indicating whether to perform
        standardization for each detector. When None (and
        ``estimator_list`` is given), every detector is fed standardized
        data.

    max_depth : int
        Maximum tree depth for base learners.

    learning_rate : float
        Boosting learning rate (xgb's "eta")

    n_estimators : int
        Number of boosted trees to fit.

    silent : bool
        Whether to print messages while running boosting.

    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).

    booster : string
        Specify which booster to use: gbtree, gblinear or dart.

    n_jobs : int
        Number of parallel threads used to run xgboost. (replaces ``nthread``)

    nthread : int, optional (default=None)
        Forwarded unchanged to the XGBoost classifier; superseded by
        ``n_jobs`` (see above).

    gamma : float
        Minimum loss reduction required to make a further partition on a leaf
        node of the tree.

    min_child_weight : int
        Minimum sum of instance weight(hessian) needed in a child.

    max_delta_step : int
        Maximum delta step we allow each tree's weight estimation to be.

    subsample : float
        Subsample ratio of the training instance.

    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.

    colsample_bylevel : float
        Subsample ratio of columns for each split, in each level.

    reg_alpha : float (xgb's alpha)
        L1 regularization term on weights.

    reg_lambda : float (xgb's lambda)
        L2 regularization term on weights.

    scale_pos_weight : float
        Balancing of positive and negative weights.

    base_score:
        The initial prediction score of all instances, global bias.

    random_state : int
        Random number seed. (replaces seed). Also used to seed the default
        IForest detectors.

    importance_type: string, default "gain"
        The feature importance type for the ``feature_importances_``
        property: either "gain",
        "weight", "cover", "total_gain" or "total_cover".
        Not a named constructor argument; supply it through ``**kwargs``.

    \*\*kwargs : dict, optional
        Keyword arguments for XGBoost Booster object. Full documentation of
        parameters can be found here:
        https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
        Attempting to set a parameter via the constructor args and \*\*kwargs
        dict simultaneously will result in a TypeError.
        Note: \*\*kwargs is unsupported by scikit-learn. We do not
        guarantee that parameters passed via this argument will interact
        properly with scikit-learn.

    Attributes
    ----------
    n_detector_ : int
        The number of unsupervised detectors used.

    clf_ : object
        The XGBoost classifier.

    X_train_add_ : numpy array of shape (n_samples, n_detector_)
        The training outlier scores produced by the unsupervised detectors,
        one column per detector.

    X_train_new_ : numpy array of shape (n_samples, n_features + n_detector_)
        The augmented training data: original features followed by
        ``X_train_add_``.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    labels_ : numpy array of shape (n_samples,)
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is the output of the fitted
        XGBoost classifier's ``predict`` on the augmented training data.
    """

    def __init__(self, estimator_list=None, standardization_flag_list=None,
                 max_depth=3, learning_rate=0.1,
                 n_estimators=100, silent=True,
                 objective="binary:logistic", booster='gbtree',
                 n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, colsample_bytree=1,
                 colsample_bylevel=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                 base_score=0.5, random_state=0,
                 **kwargs):
        super(XGBOD, self).__init__()

        # unsupervised representation-learning stage
        self.estimator_list = estimator_list
        self.standardization_flag_list = standardization_flag_list

        # XGBoost classifier hyper-parameters (forwarded verbatim in fit)
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.silent = silent
        self.objective = objective
        self.booster = booster
        self.n_jobs = n_jobs
        self.nthread = nthread
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.max_delta_step = max_delta_step
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.base_score = base_score
        self.random_state = random_state
        self.kwargs = kwargs

    def _init_detectors(self, X):
        """initialize unsupervised detectors if no predefined detectors is
        provided.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The train data

        Returns
        -------
        estimator_list : list of object
            The initialized list of detectors

        standardization_flag_list : list of boolean
            The list of bool flag to indicate whether standardization is needed
        """
        estimator_list = []
        standardization_flag_list = []

        # predefined range of n_neighbors for KNN and LOF; only values
        # strictly smaller than the number of training samples are kept
        k_range = [k for k in [1, 3, 5, 10, 20, 30, 40, 50]
                   if k < X.shape[0]]
        for k in k_range:
            estimator_list.append(KNN(n_neighbors=k, method='largest'))
            estimator_list.append(LOF(n_neighbors=k))
            # one flag per detector appended above
            standardization_flag_list.extend([True, True])

        # predefined range of number of bins for HBOS
        for n_bins in [5, 10, 15, 20, 25, 30, 50]:
            estimator_list.append(HBOS(n_bins=n_bins))
            standardization_flag_list.append(False)

        # predefined range of nu for one-class svm
        for nu in [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]:
            estimator_list.append(OCSVM(nu=nu))
            standardization_flag_list.append(True)

        # predefined range for number of estimators in isolation forests
        for n in [10, 20, 50, 70, 100, 150, 200]:
            estimator_list.append(
                IForest(n_estimators=n, random_state=self.random_state))
            standardization_flag_list.append(False)

        return estimator_list, standardization_flag_list

    def _validate_estimator(self, X):
        """Resolve and validate the detector list and its flag list.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The train data; only used to build the default detector pool
            when ``estimator_list`` is None.

        Returns
        -------
        n_detector : int
            The number of detectors in ``estimator_list``.

        Raises
        ------
        ValueError
            If ``estimator_list`` and ``standardization_flag_list`` differ
            in length.
        """
        # NOTE: the resolved lists are written back onto the constructor
        # arguments, so after the first fit ``estimator_list`` is no longer
        # None and the default pool is not rebuilt on later fits.
        if self.estimator_list is None:
            self.estimator_list, \
                self.standardization_flag_list = self._init_detectors(X)

        # perform standardization for all detectors by default
        if self.standardization_flag_list is None:
            self.standardization_flag_list = [True] * len(self.estimator_list)

        # validate two lists length
        if len(self.estimator_list) != len(self.standardization_flag_list):
            raise ValueError(
                "estimator_list length ({0}) is not equal "
                "to standardization_flag_list length ({1})".format(
                    len(self.estimator_list),
                    len(self.standardization_flag_list)))

        # validate the estimator list is not empty
        check_parameter(len(self.estimator_list), low=1,
                        param_name='number of estimators',
                        include_left=True, include_right=True)

        for estimator in self.estimator_list:
            check_detector(estimator)

        return len(self.estimator_list)

    def _generate_new_features(self, X):
        """Score X with every fitted detector to build the extra features.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        X_add : numpy array of shape (n_samples, n_detector_)
            Column ``i`` holds ``decision_function`` of detector ``i``,
            computed on standardized data when its flag is set and on the
            raw data otherwise.
        """
        X_add = np.zeros([X.shape[0], self.n_detector_])

        # reuse the scalar kept at fit time for test conversion
        X_norm = self._scalar.transform(X)

        for ind, estimator in enumerate(self.estimator_list):
            X_in = X_norm if self.standardization_flag_list[ind] else X
            X_add[:, ind] = estimator.decision_function(X_in)
        return X_add

    def _augment_features(self, X):
        """Validate X and return it concatenated with the detector scores.

        Shared by ``decision_function`` and ``predict`` so both build the
        classifier input in exactly the same way.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        X_new : numpy array of shape (n_samples, n_features + n_detector_)
            The augmented feature space.
        """
        check_is_fitted(self, ['clf_', 'decision_scores_',
                               'labels_', '_scalar'])
        X = check_array(X)

        # construct the new feature space
        X_add = self._generate_new_features(X)
        return np.concatenate((X, X_add), axis=1)

    def fit(self, X, y):
        """Fit the model using X and y as training data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            Training data.

        y : numpy array of shape (n_samples,)
            The ground truth (binary label)

            - 0 : inliers
            - 1 : outliers

        Returns
        -------
        self : object
        """
        # Validate inputs X and y (check_X_y already validates X)
        X, y = check_X_y(X, y)
        self._set_n_classes(y)
        self.n_detector_ = self._validate_estimator(X)
        self.X_train_add_ = np.zeros([X.shape[0], self.n_detector_])

        # keep the standardization scalar for test conversion
        X_norm, self._scalar = standardizer(X, keep_scalar=True)

        for ind, estimator in enumerate(self.estimator_list):
            X_fit = X_norm if self.standardization_flag_list[ind] else X
            estimator.fit(X_fit)
            self.X_train_add_[:, ind] = estimator.decision_scores_

        # construct the new feature space
        self.X_train_new_ = np.concatenate((X, self.X_train_add_), axis=1)

        # initialize, train, and predict on XGBoost
        self.clf_ = XGBClassifier(max_depth=self.max_depth,
                                  learning_rate=self.learning_rate,
                                  n_estimators=self.n_estimators,
                                  silent=self.silent,
                                  objective=self.objective,
                                  booster=self.booster,
                                  n_jobs=self.n_jobs,
                                  nthread=self.nthread,
                                  gamma=self.gamma,
                                  min_child_weight=self.min_child_weight,
                                  max_delta_step=self.max_delta_step,
                                  subsample=self.subsample,
                                  colsample_bytree=self.colsample_bytree,
                                  colsample_bylevel=self.colsample_bylevel,
                                  reg_alpha=self.reg_alpha,
                                  reg_lambda=self.reg_lambda,
                                  scale_pos_weight=self.scale_pos_weight,
                                  base_score=self.base_score,
                                  random_state=self.random_state,
                                  **self.kwargs)
        self.clf_.fit(self.X_train_new_, y)

        # column 1 of predict_proba is used as the outlier score
        self.decision_scores_ = self.clf_.predict_proba(
            self.X_train_new_)[:, 1]
        self.labels_ = self.clf_.predict(self.X_train_new_).ravel()

        return self

    def decision_function(self, X):
        """Predict raw outlier scores of X using the fitted detector.

        The score is column 1 of the XGBoost classifier's ``predict_proba``
        on the augmented feature space.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The outlier score of the input samples.
        """
        X_new = self._augment_features(X)
        pred_scores = self.clf_.predict_proba(X_new)[:, 1]
        return pred_scores.ravel()

    def predict(self, X):
        """Predict if a particular sample is an outlier or not.
        Calling xgboost `predict` function.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        X_new = self._augment_features(X)
        pred_labels = self.clf_.predict(X_new)
        return pred_labels.ravel()

    def predict_proba(self, X):
        """Predict the probability of a sample being outlier.
        Calling xgboost `predict_proba` function.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        outlier_probability : numpy array of shape (n_samples,)
            For each observation, the outlier probability, ranging
            in [0,1]. Identical to the output of ``decision_function``.
        """
        return self.decision_function(X)

    def fit_predict(self, X, y):
        """Fit the detector on X and y, then return the training labels.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            Training data.

        y : numpy array of shape (n_samples,)
            The ground truth (binary label); passed to ``fit``.

        Returns
        -------
        labels_ : numpy array of shape (n_samples,)
            The ``labels_`` attribute set by ``fit``.
        """
        self.fit(X, y)
        return self.labels_

    def fit_predict_score(self, X, y, scoring='roc_auc_score'):
        """Fit the detector, predict on samples, and evaluate the model by
        predefined metrics, e.g., ROC.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,)
            The ground truth (binary label). Used both to fit the model
            and as the reference when computing the score.

        scoring : str, optional (default='roc_auc_score')
            Evaluation metric:

            - 'roc_auc_score': ROC score
            - 'prc_n_score': Precision @ rank n score

        Returns
        -------
        score : float

        Raises
        ------
        NotImplementedError
            If ``scoring`` is not one of the supported metric names.
        """
        self.fit(X, y)

        if scoring == 'roc_auc_score':
            score = roc_auc_score(y, self.decision_scores_)
        elif scoring == 'prc_n_score':
            score = precision_n_scores(y, self.decision_scores_)
        else:
            raise NotImplementedError('PyOD built-in scoring only supports '
                                      'ROC and Precision @ rank n')

        print("{metric}: {score}".format(metric=scoring, score=score))

        return score