-
Notifications
You must be signed in to change notification settings - Fork 0
/
goldeneye.py
209 lines (165 loc) · 7.55 KB
/
goldeneye.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
from math import ceil, sqrt
from sklearn.model_selection import train_test_split
from goodness import fidelity, class_probability_ranking
def singletonize(groups):
    """Split every group into one-element groups.

    Example: [[1,2],[3],[4,5,6]] -> [[1],[2],[3],[4],[5],[6]]

    :param groups: list of lists whose members should each get their own list
    :return: list of single-element lists, in the order the members appear
    """
    return [[member] for group in groups for member in group]
def extract_el_from_group(group, el):
    """Split a group into "everything except el" and a singleton group of el.

    :param group: list of elements
    :param el: element to be separated from the group
    :return: two-element list: the group without any occurrence of el,
        followed by the singleton list [el]
    """
    remainder = []
    for member in group:
        if member != el:
            remainder.append(member)
    return [remainder, [el]]
def remove_el_from_group(group, el):
    """Return a copy of a group with every occurrence of an element dropped.

    :param group: list of elements (not modified)
    :param el: element to be removed; compared to each member with ``!=``
    :return: new list holding the members that differ from el
    """
    return list(filter(lambda member: member != el, group))
def grouping(data, model, delta, classname, goodness_fn):
    """Find the optimal grouping of attributes for a dataset.

    Starting from one group that holds every attribute, attributes are set
    aside one at a time for as long as the goodness stays at or above the
    threshold ``naive goodness + delta``. When it would drop below, the
    current group is recorded and the search restarts on the attributes that
    were set aside.

    :param data: the dataset; must provide ``columns`` and
        ``sample(n, replace=True)``
    :param model: trained classification model (only handed to goodness_fn)
    :param delta: sensitivity parameter
    :param classname: name of the column in the dataset with the class labels
    :param goodness_fn: function used to investigate the effect of randomising
        attributes in the dataset; called as
        ``goodness_fn(data, model, groups, pruned)``
    :return: detected optimal groups, naive goodness, inflated data
    """
    # the class column and the predicted-class column are not attributes
    excluded = [classname, 'PClass']
    attributes = [col for col in data.columns if col not in excluded]

    sgm = delta / 5
    variance = sgm ** 2

    # naive (all-singletons) goodness, measured on a resampled dataset whose
    # size is chosen from the target standard deviation sgm with p = 0.5
    p = 0.5
    n = ceil(p * (1 - p) / variance)
    inflated_data = data.sample(n, replace=True)
    nb_goodness = goodness_fn(inflated_data, model,
                              singletonize([attributes]), [])

    # resample again, now sizing the dataset from the measured goodness
    # (at least 1000 rows), so the desired variance level can be reached
    p = max(0.5, nb_goodness)
    n = max(1000, ceil((p * (1 - p)) / variance))
    inflated_data = data.sample(n, replace=True)

    threshold = nb_goodness + delta  # grouping threshold
    found_groups = []                # accumulated attribute groups
    candidate = attributes           # group currently being tested
    set_aside = []                   # attributes removed from the candidate

    while candidate or set_aside:
        candidate_goodness = goodness_fn(
            inflated_data, model,
            [candidate] + singletonize(found_groups), [])

        if not set_aside and candidate_goodness < threshold:
            # already below the threshold before removing anything: the
            # remaining attributes all become singleton groups and the
            # search is over
            found_groups.extend(singletonize([candidate]))
            break

        # goodness obtained when each attribute in turn is split off
        scores = []
        for attr in candidate:
            trial_groups = (extract_el_from_group(candidate, attr) +
                            singletonize(found_groups) + [set_aside])
            scores.append(goodness_fn(inflated_data, model, trial_groups, []))
        best_score = max(scores)

        if len(candidate) == 1 or best_score < threshold:
            # splitting off any attribute drops the goodness below the
            # threshold (or nothing is left to split): record the group and
            # continue with the attributes that were set aside
            found_groups.append(candidate)
            candidate = set_aside
            set_aside = []
        else:
            # goodness stays above the threshold: set aside the attribute
            # whose removal costs the least and keep shrinking the group
            best_idx = scores.index(best_score)
            set_aside.append(candidate[best_idx])
            candidate = [attr for pos, attr in enumerate(candidate)
                         if pos != best_idx]

    return found_groups, nb_goodness, inflated_data
def prune_singletons(data, model, delta, groups, goodness_fn):
    """Prune singleton attributes.

    Iteratively prunes singleton groups from a grouping for as long as the
    goodness of the *cumulatively* pruned grouping stays within ``delta`` of
    the goodness of the unpruned grouping. In each round the singleton whose
    removal keeps the goodness highest is pruned; the loop stops as soon as
    even the best candidate would fall below the threshold.

    Each candidate is scored against the grouping as pruned so far, together
    with the flat list of all attributes pruned so far plus the candidate.
    Scoring every candidate against the original grouping would ignore the
    combined effect of earlier prunings.

    :param data: the dataset (only handed to goodness_fn)
    :param model: trained classification model (only handed to goodness_fn)
    :param delta: sensitivity parameter
    :param groups: detected optimal grouping (list of lists); not modified
    :param goodness_fn: function used to investigate the effect of randomising
        attributes in the dataset; called as
        ``goodness_fn(data, model, groups, pruned)`` where ``pruned`` is a
        flat list of attribute names
    :return: grouping with the pruned singletons removed, flat list of the
        pruned attributes in pruning order
    """
    groups_pruned = groups[:]
    pruned_singletons = []

    # lowest goodness that is still acceptable after pruning
    Delta = goodness_fn(data, model, groups, []) - delta

    # singleton groups are the only pruning candidates
    singletons = [group for group in groups if len(group) == 1]

    while singletons:
        # goodness after additionally pruning each remaining candidate
        goodnesses = [
            goodness_fn(data, model,
                        [group for group in groups_pruned
                         if group != singleton],
                        pruned_singletons + singleton)
            for singleton in singletons]
        max_goodness = max(goodnesses)

        if max_goodness < Delta:
            # even the least harmful candidate costs too much: stop
            break

        # prune the singleton that decreases the goodness the least
        best = singletons.pop(goodnesses.index(max_goodness))
        pruned_singletons.extend(best)
        groups_pruned.remove(best)

    return groups_pruned, pruned_singletons
def goldeneye(data, model, delta=None, classname='Class', goodness_fn=fidelity):
    """Detect optimal groups in a dataset.

    The data is split in half, stratified on the class column. The model is
    fitted on one half; the other half, with the model's predictions stored
    in a ``'PClass'`` column, is used for grouping and pruning.

    :param data: the dataset (a DataFrame containing the column ``classname``)
    :param model: untrained classification model providing ``fit`` and
        ``predict``; it is fitted in place
    :param delta: sensitivity parameter; defaults to ``1 / sqrt(n_rows)``
    :param classname: name of the column in the dataset with the class labels
    :param goodness_fn: function used to investigate the effect of randomising
        attributes in the dataset
    :return: detected groups, goodness with the detected groups, pruned groups,
        goodness with the pruned groups, naive goodness, original accuracy,
        final accuracy, delta
    :raises ValueError: if ``classname`` is not a column of ``data``
    """
    if delta is None:
        delta = 1 / sqrt(data.shape[0])

    if classname not in data.columns:
        raise ValueError("classname not found in dataset")

    # learn model; stratify on the actual class column rather than on a
    # hard-coded column name, so any ``classname`` works
    labels = data[classname]
    X_train, X_test, y_train, y_test = \
        train_test_split(data.drop(classname, axis=1), labels,
                         test_size=0.5, stratify=labels)
    model.fit(X_train, y_train)

    # continue with the held-out half, annotated with predicted classes
    data = X_test.reset_index(drop=True)
    data['PClass'] = model.predict(data)

    # find optimal grouping of the dataset
    groups, nb_goodness, inflated_data = \
        grouping(data, model, delta, classname, goodness_fn)
    goodness = goodness_fn(inflated_data, model, groups, [])

    # prune singletons
    groups_pruned, pruned_singletons = \
        prune_singletons(inflated_data, model, delta, groups, goodness_fn)
    goodness_pruned = \
        goodness_fn(inflated_data, model, groups_pruned, pruned_singletons)

    # these two are always measured with fidelity, whatever goodness_fn is
    acc_original = fidelity(inflated_data, model, [], [])
    acc_final = fidelity(inflated_data, model, groups_pruned, pruned_singletons)

    return groups, goodness, groups_pruned, goodness_pruned, nb_goodness, \
        acc_original, acc_final, delta