/
data.py
160 lines (114 loc) · 5.06 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# -*- coding: utf-8 -*-
"""Utility functions for manipulating data
"""
# Author: Yue Zhao <yuezhao@cs.toronto.edu>
# License: BSD 2 clause
from __future__ import division
from __future__ import print_function
import numpy as np
from sklearn.utils import column_or_1d
from sklearn.utils import check_random_state
from sklearn.metrics import roc_auc_score
from .utility import precision_n_scores
# Largest value representable by a signed 32-bit integer (2**31 - 1).
MAX_INT = np.iinfo(np.int32).max
def _generate_data(n_inliers, n_outliers, n_features, coef, offset,
                   random_state):
    """Internal function to draw one labelled batch of synthetic samples.

    Inliers are Gaussian: standard-normal noise scaled by ``coef`` and
    shifted by ``offset`` on every feature. Outliers are drawn uniformly
    from ``[-offset, offset)`` on every feature. Inlier rows come first in
    the returned arrays, followed by the outlier rows.

    :param n_inliers: The number of inliers.
    :type n_inliers: int

    :param n_outliers: The number of outliers.
    :type n_outliers: int

    :param n_features: The number of features.
    :type n_features: int

    :param coef: The scale applied to the standard-normal inlier noise.
    :type coef: float

    :param offset: The shift of the inliers and the half-width of the
        uniform range used for the outliers.
    :type offset: int

    :param random_state: The random number generator to draw from.
    :type random_state: RandomState instance.

    :return: X, y where y is 0 for inliers and 1 for outliers.
    :rtype: numpy array of shape (n_samples, n_features) and (n_samples,)
    """
    # The Gaussian draw must precede the uniform draw so that a seeded
    # generator yields the same samples as before.
    gaussian_noise = random_state.randn(n_inliers, n_features)
    inlier_rows = coef * gaussian_noise + offset

    outlier_rows = random_state.uniform(low=-1 * offset, high=offset,
                                        size=(n_outliers, n_features))

    # Stack samples row-wise and build the matching 0/1 label vector.
    X = np.concatenate((inlier_rows, outlier_rows), axis=0)
    y = np.concatenate((np.zeros(n_inliers), np.ones(n_outliers)))
    return X, y
def generate_data(n_train=1000, n_test=500, n_features=2, contamination=0.1,
                  train_only=False, offset=10, random_state=None):
    """Utility function to generate synthesized data.

    Normal data is generated by a Gaussian distribution and outliers are
    generated by a uniform distribution. The train and test sets share the
    same randomly drawn centre and scale.

    :param n_train: The number of training points to generate
    :type n_train: int, optional (default=1000)

    :param n_test: The number of test points to generate
    :type n_test: int, optional (default=500)

    :param n_features: The number of features
    :type n_features: int, optional (default=2)

    :param contamination: The proportion of outliers in each generated
        set; the outlier count is ``int(n * contamination)``.
    :type contamination: float in (0., 0.5), optional (default=0.1)

    :param train_only: If true, generate train data only
    :type train_only: bool, optional (default=False)

    :param offset: Upper bound (exclusive) of the integer offset that is
        drawn at random from ``[0, offset)`` and used as the inlier shift
        and the outlier half-range.
    :type offset: int, optional (default=10)

    :param random_state: If int, random_state is the seed used by the
        random number generator; If RandomState instance, random_state is
        the random number generator; If None, the random number generator
        is the RandomState instance used by `np.random`.
    :type random_state: int, RandomState instance or None,
        optional (default=None)

    :return: ``(X_train, y_train)`` when ``train_only`` is true, otherwise
        ``(X_train, y_train, X_test, y_test)``
    :rtype: tuple of ndarray
    """
    rng = check_random_state(random_state)

    # Draw order is part of the reproducibility contract: offset first,
    # then the scale. The 0.001 keeps the scale strictly positive.
    drawn_offset = rng.randint(low=offset)
    drawn_coef = rng.random_sample() + 0.001

    def _sample(n_total):
        # Split the requested size into outlier / inlier counts and draw.
        n_out = int(n_total * contamination)
        n_in = int(n_total - n_out)
        return _generate_data(n_in, n_out, n_features, drawn_coef,
                              drawn_offset, rng)

    X_train, y_train = _sample(n_train)
    if train_only:
        return X_train, y_train

    X_test, y_test = _sample(n_test)
    return X_train, y_train, X_test, y_test
def get_color_codes(y):
    """Internal function to generate color codes for inliers and outliers.

    Entries equal to 1 (outliers) are coded red ('r'); every other entry
    is coded blue ('b').

    :param y: The binary labels of the ground truth, where 0 is inlier
        and 1 is outlier
    :type y: list, array, numpy array of shape (n_samples,)

    :return: The color codes, e.g. ['r', 'b', ..., 'b']
    :rtype: numpy array of str, shape (n_samples,)
    """
    labels = column_or_1d(y)

    # Start with every sample coloured as an inlier (blue) ...
    colors = np.full(len(labels), 'b', dtype=str)

    # ... then overwrite the positions flagged as outliers with red.
    outlier_positions = np.nonzero(labels == 1)
    colors[outlier_positions] = 'r'
    return colors
def evaluate_print(clf_name, y, y_pred):
    """Utility function for evaluating and printing the results for examples.

    Prints one line containing the ROC AUC (from ``roc_auc_score``) and the
    precision @ rank n (from ``precision_n_scores``), both rounded to four
    decimals. Internal use only.

    :param clf_name: The name of the detector
    :type clf_name: str

    :param y: The ground truth
    :type y: list or array of shape (n_samples,)

    :param y_pred: The predicted outlier scores
    :type y_pred: list or array of shape (n_samples,)
    """
    truth = column_or_1d(y)
    scores = column_or_1d(y_pred)

    # Compute ROC first, then precision @ n, matching the original order.
    roc_value = np.round(roc_auc_score(truth, scores), decimals=4)
    prn_value = np.round(precision_n_scores(truth, scores), decimals=4)

    template = '{clf_name} ROC:{roc}, precision @ rank n:{prn}'
    print(template.format(clf_name=clf_name, roc=roc_value, prn=prn_value))