#!/usr/bin/python
from ctwc__common import *
import warnings
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
import ctwc__distance_matrix, ctwc__data_handler
# Constants
N_CLUSTERS = 16                     # default cluster count for agglomerative clustering
MIN_DATA_POINTS_LIMIT = 50          # skip SPC clustering below this many unfiltered rows
MIN_CLUSTER_LIMIT = 20.0            # disregard clusters smaller than this
SPC_BINARY_PATH = './SPC/'          # working directory containing the SPC (SW) binary
SPC_BINARY_EXE = './SW'
SPC_TMP_FILES_PREFIX = '__tmp_ctwc'
SPC_CLUSTER_FILE = RESULTS_PATH + "spc_cluster-{0}.pklz"
ALLOW_CACHING = False               # when True, SPC results are cached keyed by an input hash
MIN_SCORE = 8                       # minimal stability: temperature steps a cluster must survive
LARGEST_CLUSTER_IND = 4             # column of the largest cluster's size in SPC's .dg output
# Simulates a symmetric distance matrix with two natural clusters: rows {0, 2, 4}
# and rows {1, 3}. Expected result is (1,0,1,0,1).
sample_dist_matrix = np.array([ [ 0.0, 0.9, 0.1, 0.9, 0.1 ],
                                [ 0.9, 0.0, 0.9, 0.1, 0.9 ],
                                [ 0.1, 0.9, 0.0, 0.9, 0.1 ],
                                [ 0.9, 0.1, 0.9, 0.0, 0.9 ],
                                [ 0.1, 0.9, 0.1, 0.9, 0.0 ]
                                ])
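# SPC numbers its data points consecutively from 1, while rows of the distance
# matrix may have been filtered out; the maps below translate between SPC point
# indices and matrix row indices in both directions.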
__map_spc_to_mat = {}
__map_mat_to_spc = {}
def __spc_prepare_run_file(n_data_points):
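    """Write the .run configuration consumed by the SPC (SW) binary.

    MinTemp/MaxTemp/TempStep define the temperature scan, SWCycles sets the
    number of Swendsen-Wang cycles per temperature, and (as used here)
    Dimension 0 indicates the input is a pairwise-distance file rather than
    coordinate vectors.
    """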
run_file = """
NumberOfPoints: {0}
NumberOfEdges: {2}
DataFile: {1}.dat
MinTemp: 0.0
MaxTemp: 0.5
TempStep: 0.001
OutFile: {1}.out
SWCycles: 2000
KNearestNeighbours: 11
Dimension: 0
MSTree|
NearestNeighbours|
DirectedGrowth|
SaveSuscept|
WriteLables|
WriteCorFile|
""".format(n_data_points, SPC_TMP_FILES_PREFIX, n_data_points**2/2)
INFO(run_file)
with open(SPC_BINARY_PATH + SPC_TMP_FILES_PREFIX + ".run", "w+") as run_f:
run_f.write(run_file)
def __is_row_filtered(dist_mat, row):
    # A row is filtered when every off-diagonal entry holds the INF sentinel
    # (>= 1, i.e. ctwc__distance_matrix.INF_VALUE).
    filtered_indices = dist_mat[row] >= 1
    filtered_indices[row] = True # the diagonal is zero even for filtered rows; ignore it
    return filtered_indices.all()
def __spc_prepare_dat_file(dist_mat):
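    """Write the .dat distance file: one "i j d(i,j)" line for every pair of
    SPC point indices (1-based, skipping filtered rows). Returns the number
    of points written."""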
globals()['__map_spc_to_mat'] = {}
globals()['__map_mat_to_spc'] = {}
lines = []
count = 0
for r in xrange(dist_mat.shape[0]):
if __is_row_filtered(dist_mat, r):
continue
count += 1
__map_spc_to_mat[count] = r
__map_mat_to_spc[r] = count
for r in __map_spc_to_mat.keys():
for c in __map_spc_to_mat.keys():
lines.append("{0} {1} {2}\n".format(r, c, dist_mat[__map_spc_to_mat[r]][__map_spc_to_mat[c]]))
with open(SPC_BINARY_PATH + SPC_TMP_FILES_PREFIX + ".dat", 'w+') as fn:
for line in lines:
fn.write(line)
return count
def __spc_get_non_masked_data_points(dist_mat):
n = dist_mat.shape[0]
tmp = np.zeros(dist_mat.shape)
tmp[dist_mat < ctwc__distance_matrix.INF_VALUE] = 1
np.fill_diagonal(tmp, 0.0)
non_zero_rows = len(set(tmp.nonzero()[0]))
INFO("Non-masked rows: {0}".format(non_zero_rows))
return non_zero_rows
def __spc_prepare_edge_file(dist_mat):
n = dist_mat.shape[0]
lines = []
DEBUG("Size of __map_mat_to_spc is: {0}".format(len(__map_mat_to_spc)))
    # Use SPC's 1-based point indices so the edge file matches the .dat file.
    for r in __map_spc_to_mat.keys():
        for c in __map_spc_to_mat.keys():
            lines.append("{0} {1}\n".format(r, c))
with open(SPC_BINARY_PATH + SPC_TMP_FILES_PREFIX + ".edge", "w+") as edge_f:
for line in lines:
edge_f.write(line)
def __spc_run_and_wait_for_completion():
from subprocess import call
args = [SPC_BINARY_EXE, SPC_TMP_FILES_PREFIX + ".run"]
call(args, cwd=SPC_BINARY_PATH)
def __check_clustering_success(lines):
    # Explicit handling of an edge case: if no linked components formed, every
    # point stays a singleton, the reported largest cluster has size 1, and
    # everything downstream breaks immediately.
    return 1 != int(lines[1].split()[LARGEST_CLUSTER_IND])
def __pick_line_by_most_stable_largest_cluster(lines, lower_threshold=0, upper_threshold=float('Inf')):
NUM_COLUMNS_TO_CHECK = 3
if not __check_clustering_success(lines):
return None, None
top_column = -1
top_score = -1
top_line = None
top_size = -1
def is_valid_size(size):
return size < upper_threshold and size > lower_threshold
for column in xrange(LARGEST_CLUSTER_IND, LARGEST_CLUSTER_IND + NUM_COLUMNS_TO_CHECK):
tmp_line, tmp_score = __pick_line_by_most_stable_largest_cluster_for_column(column, lines, lower_threshold, upper_threshold)
tmp_size = int(tmp_line.split()[column])
        if (top_line is None or # first iteration
                (tmp_score > MIN_SCORE and is_valid_size(tmp_size) and tmp_size > top_size) or
                (tmp_score > MIN_SCORE and tmp_size > top_size)):
top_column = column
top_line = tmp_line
top_score = tmp_score
top_size = tmp_size
return top_line, top_column - LARGEST_CLUSTER_IND
def __pick_line_by_most_stable_largest_cluster_for_column(column, lines, lower_threshold=0, upper_threshold=float('Inf')):
from itertools import groupby
def get_count_per_value(iterator):
return sum(1 for _ in iterator)
def get_counter_list(iterable):
return [ (k, get_count_per_value(g)) for k, g in groupby(iterable) ]
def get_max_counter(counter_list, lower_threshold):
return max(counter_list, key=lambda item: item[1] if item[0] > lower_threshold and item[0] < upper_threshold and item[1] > MIN_SCORE else -1)
def get_line_num_for_max_counter(counter_list, max_counter):
ind = 0
for counter in counter_list:
if counter == max_counter:
break
else:
ind += counter[1]
return ind
cluster_list = [ int(line.split()[column]) for line in lines ]
counter_list = get_counter_list(cluster_list)
max_counter = get_max_counter(counter_list, lower_threshold)
while (max_counter[1] < MIN_SCORE and lower_threshold > 1
or max_counter[0] == cluster_list[0] and len(counter_list) > 1):
lower_threshold /= 2.0
max_counter = get_max_counter(counter_list, lower_threshold)
ind = get_line_num_for_max_counter(counter_list, max_counter)
return lines[ind], max_counter[1]
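# Illustrative sketch of the run counting above: groupby() with no key function
# run-length encodes consecutive equal values, so for a column of largest-
# cluster sizes across temperature steps, e.g.
#   get_counter_list([50, 50, 50, 12, 12])  ->  [(50, 3), (12, 2)]
# the most "stable" cluster size is the one with the longest consecutive run.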
def __spc_parse_temperature_results(non_masked_data_points, cluster_limit):
TEMP_IND = 1
lines = []
with open(SPC_BINARY_PATH + SPC_TMP_FILES_PREFIX + ".out.dg_01", "r") as out_dg_fn:
lines = out_dg_fn.readlines()
DEBUG("SPC output:")
for line in lines:
DEBUG(line)
lower_threshold = MIN_CLUSTER_LIMIT # Disregard clusters smaller than this
upper_threshold = min(non_masked_data_points * 0.99, non_masked_data_points - lower_threshold) # 99%
if cluster_limit > 0:
upper_threshold = min(upper_threshold, cluster_limit - 1)
line, clust_id = __pick_line_by_most_stable_largest_cluster(lines, lower_threshold, upper_threshold)
if line is None:
return 0.0, lines, 0
temperature = float(line.split()[TEMP_IND])
INFO("Most stable temperature: {0}".format(temperature))
return temperature, lines, clust_id
def __spc_get_clusters_by_temperature(t):
TEMP_IND = 1
with open(SPC_BINARY_PATH + SPC_TMP_FILES_PREFIX + ".out.dg_01.lab", "r") as out_dg_lab_fn:
lines = out_dg_lab_fn.readlines()
for line in lines:
if float(line.split()[TEMP_IND]) == t:
break
spc_indices = map(int, line.split()[TEMP_IND + 1:])
return spc_indices
def __spc_get_cluster_members_by_cluster_id(clusters, cluster_id):
clust = []
for i, val in enumerate(clusters):
if val == cluster_id:
            clust.append(__map_spc_to_mat[i + 1]) # SPC point indices are 1-based
return clust
def __spc_clear_temporary_files():
from glob import glob
import os
map(os.remove, glob(SPC_BINARY_PATH + SPC_TMP_FILES_PREFIX + '*'))
def __get_precalculated_spc_file_if_exists(h):
return load_from_file(SPC_CLUSTER_FILE.format(h))
def __calculate_hash_for_data(data, dist_matrix):
if data is None or dist_matrix is None:
return 0
return hash( data.tostring() + str(dist_matrix) )
def __get_precalculated_spc_file_if_exists_for_data(data, dist_matrix):
h = __calculate_hash_for_data(data, dist_matrix)
return __get_precalculated_spc_file_if_exists(h)
def __save_calculated_spc_file_and_hash_for_data(data, dist_matrix, cluster, log):
if not ALLOW_CACHING:
return
DEBUG("Saving calculated SPC cluster to file...")
h = __calculate_hash_for_data(data, dist_matrix)
save_to_file((cluster, log), SPC_CLUSTER_FILE.format(h))
def cluster_rows_spc(data, dist_matrix, cluster_limit):
DEBUG("Checking for cached results...")
cached_results = __get_precalculated_spc_file_if_exists_for_data(data, dist_matrix)
if cached_results is not None:
cached_cluster, log = cached_results
INFO("Using pre-cached SPC results for input")
for line in log:
INFO(line)
return data, cached_cluster, None
DEBUG("Clearing temporary files from previous runs...")
__spc_clear_temporary_files()
DEBUG("Preparing files for SPC run...")
DEBUG("Preparing dat file...")
n_data_points = __spc_prepare_dat_file(dist_matrix)
if n_data_points < MIN_DATA_POINTS_LIMIT:
DEBUG("Too few data points, skipping clustering")
return data, [0], None
DEBUG("Preparing edge file...")
__spc_prepare_edge_file(dist_matrix)
DEBUG("Preparing run file...")
__spc_prepare_run_file(n_data_points)
DEBUG("Starting Super-Paramagnetic clustering...")
__spc_run_and_wait_for_completion()
non_masked_data_points = __spc_get_non_masked_data_points(dist_matrix)
t, log, clust_id = __spc_parse_temperature_results(non_masked_data_points, cluster_limit)
if t == 0.0:
top_cluster = __spc_get_cluster_members_by_cluster_id([ 0 ] * n_data_points, 0)
else:
clusters = __spc_get_clusters_by_temperature(t)
top_cluster = __spc_get_cluster_members_by_cluster_id(clusters, clust_id)
DEBUG("Top cluster: {0}".format(top_cluster))
DEBUG("Saving results in cache...")
__save_calculated_spc_file_and_hash_for_data(data, dist_matrix, top_cluster, log)
DEBUG("Finished Super-Paramagnetic clustering.")
return data, top_cluster, None
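# Minimal usage sketch (illustrative only; assumes the SW binary has been built
# under SPC_BINARY_PATH):
#   dist = np.random.rand(100, 100)
#   dist = (dist + dist.T) / 2.0   # distance matrices here must be symmetric
#   np.fill_diagonal(dist, 0.0)
#   _, top_cluster, _ = cluster_rows_spc(None, dist, cluster_limit=0)
#   # top_cluster lists the matrix row indices of the most stable cluster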
def cluster_rows_agglomerative(data, dist_matrix, n_clusters=N_CLUSTERS):
DEBUG("Starting Agglomerative clustering...")
ag = AgglomerativeClustering(n_clusters=n_clusters if n_clusters < len(dist_matrix) else len(dist_matrix),
linkage="complete",
affinity="precomputed").fit(dist_matrix)
DEBUG("Finished Agglomerative clustering.")
return data, ag.labels_, ag
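# For example, cluster_rows_agglomerative(None, sample_dist_matrix, n_clusters=2)
# should separate rows {0, 2, 4} from rows {1, 3}, matching the block structure
# of the sample matrix above.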
def cluster_rows_dbscan(data, dist_matrix, eps=0.5):
db = DBSCAN(eps=eps, metric="precomputed", min_samples=3).fit(dist_matrix)
return data, db.labels_, db
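# Note: with metric="precomputed", eps is a radius over the supplied distances;
# scikit-learn labels points assigned to no cluster as -1 ("noise").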
def cluster_rows(data, dist_matrix, cluster_limit=0):
return cluster_rows_spc(data, dist_matrix, cluster_limit)
def __test_agglomerative_clustering():
_, labels, ag = cluster_rows_agglomerative(None, sample_dist_matrix, 2)
import ctwc__cluster_rank
ranks_list = ctwc__cluster_rank.get_ranks(ag)
ctwc__cluster_rank.get_nth_top_cluster_base_node(ranks_list)
def __test_dbscan_clustering(data, dist_matrix):
_, labels, ag = cluster_rows_dbscan(None, dist_matrix)
def __test_spc_clustering():
SIZE = 101
z = np.zeros((SIZE, SIZE))
for i in range(0, SIZE):
for j in range(0, SIZE):
z[i][j] = 0.1 if (i+j) % 3 == 0 else 0.8
if i == j:
z[i][j] = 0.0
INFO(z)
_, top_cluster, _ = cluster_rows_spc(None, z, 50)
INFO(top_cluster)
def test():
__test_spc_clustering()
def __inject_row_pattern_to_data(data):
z = np.zeros(data.shape)
for col in range(data.shape[1]): # Flooding to create a false relationship between rows
for row in range(6):
z[row, col] = (col % 2) * 200
for row in range(data.shape[0] - 10, data.shape[0]):
z[row, col] += (1 - (col % 2)) * 200
return data + z
def __inject_col_pattern_to_data(data):
z = np.zeros(data.shape)
for col in range(30):
        for row in range(data.shape[0]): # Flooding to create a false relationship between columns
z[row, col] = data[row, 0] - data[row, col]
return data + z
if __name__ == "__main__":
test()