# This is an implementation of https://arxiv.org/pdf/1604.00989.pdf, a modified
# version of rank-order clustering.
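#
# The per-pair distance computed in calculate_symmetric_dist_row below (the
# modified rank-order distance described in the paper, paraphrased here as a
# reading aid) is:
#
#     D(a, b) = (d(a, b) + d(b, a)) / min(O_a(b), O_b(a))
#
# where O_a(b) is the rank of face b in face a's nearest-neighbor list, and
# d(a, b) counts how many of a's top-O_a(b) neighbors do not appear anywhere
# in b's neighbor list (presence/absence rather than summed ranks). Pairs that
# are not mutual neighbors within the top n_neighbors get a large sentinel
# distance (9999.0).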
import pyflann
import numpy as np
from time import time
# from profilehooks import profile  # only needed if the @profile decorator below is re-enabled
from multiprocessing import Pool
from functools import partial
import json


def build_index(dataset, n_neighbors):
    """
    Build a FLANN k-d tree index over the dataset and return, for every row,
    its n_neighbors approximate nearest neighbors and the corresponding
    distances.
    """
    # Initialize FLANN with Euclidean distance.
    pyflann.set_distance_type(distance_type='euclidean')
    flann = pyflann.FLANN()
    params = flann.build_index(dataset,
                               algorithm='kdtree',
                               trees=4)
    nearest_neighbors, dists = flann.nn_index(dataset, n_neighbors,
                                              checks=params['checks'])
    return nearest_neighbors, dists
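
# NOTE: when the index is queried with the same matrix it was built from, FLANN
# normally returns each point as its own first neighbor (rank 0, distance 0).
# The rank-order code below relies on this convention: ranks start at 1 and
# position 0 of each neighbor list (the face itself) is skipped.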


def create_neighbor_lookup(nearest_neighbors):
    """
    Build a lookup dict: key is the reference face index, value is its array
    of nearest-neighbor indices.
    """
    nn_lookup = {}
    for i in range(nearest_neighbors.shape[0]):
        nn_lookup[i] = nearest_neighbors[i, :]
    return nn_lookup


# @profile
def calculate_symmetric_dist_row(nearest_neighbors, nn_lookup, row_no):
    """
    Calculate the symmetric rank-order distances for one row of the matrix,
    i.e. between face `row_no` and each of its nearest neighbors.
    """
    dist_row = np.zeros([1, nearest_neighbors.shape[1]])
    f1 = nn_lookup[row_no]
    for idx, neighbor in enumerate(f1[1:]):
        # Oi: rank of `neighbor` in row_no's list; Oj: rank of row_no in
        # `neighbor`'s list (ranks start at 1, position 0 is the face itself).
        Oi = idx + 1
        co_neighbor = True
        try:
            row = nn_lookup[neighbor]
            Oj = np.where(row == row_no)[0][0] + 1
        except IndexError:
            # row_no does not appear in neighbor's list: penalize below.
            Oj = nearest_neighbors.shape[1] + 1
            co_neighbor = False
        # dij: how many of row_no's top-Oi neighbors are absent from
        # neighbor's list.
        f11 = set(f1[0:Oi])
        f21 = set(nn_lookup[neighbor])
        dij = len(f11.difference(f21))
        # dji: how many of neighbor's top-Oj neighbors are absent from
        # row_no's list.
        f12 = set(f1)
        f22 = set(nn_lookup[neighbor][0:Oj])
        dji = len(f22.difference(f12))
        if not co_neighbor:
            dist_row[0, Oi] = 9999.0
        else:
            dist_row[0, Oi] = float(dij + dji) / min(Oi, Oj)
    return dist_row
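
# Worked toy example (illustrative numbers, not from any dataset): with
# nn_lookup[0] = [0, 3, 5, 7] and nn_lookup[5] = [5, 9, 0, 2], processing
# row_no=0 and neighbor=5 gives Oi=2 (face 5 is ranked 2nd for face 0) and
# Oj=3 (face 0 is ranked 3rd for face 5). Then dij = |{0, 3} - {5, 9, 0, 2}| = 1
# and dji = |{5, 9, 0} - {0, 3, 5, 7}| = 1, so
# dist_row[0, 2] = (1 + 1) / min(2, 3) = 1.0.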


def calculate_symmetric_dist(app_nearest_neighbors):
    """
    Calculate the symmetric distance matrix, one row per face and one column
    per neighbor rank.
    """
    dist_calc_time = time()
    nn_lookup = create_neighbor_lookup(app_nearest_neighbors)
    d = np.zeros(app_nearest_neighbors.shape)
    func = partial(calculate_symmetric_dist_row, app_nearest_neighbors, nn_lookup)
    with Pool(processes=4) as p:
        results = p.map(func, range(app_nearest_neighbors.shape[0]))
    for row_no, row_val in enumerate(results):
        d[row_no, :] = row_val
    d_time = time() - dist_calc_time
    print("Distance calculation time : {}".format(d_time))
    return d
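
# Note: the matrix returned above has the same shape as app_nearest_neighbors
# (n_faces x n_neighbors). Entry d[i, j] is the distance between face i and its
# j-th nearest neighbor, not between faces i and j; column 0 (the face itself)
# is always 0.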


def aro_clustering(app_nearest_neighbors, distance_matrix, thresh):
    """
    Approximate rank-order clustering. Takes the nearest-neighbors matrix and
    the distance matrix and returns the clusters as a list of sets of face
    indices.
    """
    clusters = []
    # All faces start out unassigned:
    nodes = set(range(distance_matrix.shape[0]))
    plausible_neighbors = create_plausible_neighbor_lookup(
        app_nearest_neighbors,
        distance_matrix,
        thresh)
    while nodes:
        # Get an unassigned node:
        n = nodes.pop()
        # This contains the set of connected nodes:
        group = {n}
        # Build a queue with this node in it:
        queue = [n]
        # Breadth-first traversal of the plausible-neighbor graph:
        while queue:
            n = queue.pop(0)
            neighbors = plausible_neighbors[n]
            # Keep only neighbors we haven't visited yet:
            neighbors = nodes.intersection(neighbors)
            neighbors.difference_update(group)
            # Remove them from the global set of unassigned nodes:
            nodes.difference_update(neighbors)
            # Add the connected neighbors to the group:
            group.update(neighbors)
            # Add the neighbors to the queue to visit them next:
            queue.extend(neighbors)
        # Each connected component is one cluster:
        clusters.append(group)
    return clusters


def create_plausible_neighbor_lookup(app_nearest_neighbors,
                                     distance_matrix,
                                     thresh):
    """
    Create a dictionary where the keys are the row numbers (face indices) and
    the values are the sets of plausible neighbors, i.e. the nearest neighbors
    whose rank-order distance is within the threshold.
    """
    n_vectors = app_nearest_neighbors.shape[0]
    plausible_neighbors = {}
    for i in range(n_vectors):
        within_thresh = np.where(distance_matrix[i, :] <= thresh)[0]
        plausible_neighbors[i] = set(app_nearest_neighbors[i, within_thresh])
        # Alternative (unused): keep only the minimum-distance neighbor(s):
        # min_dist = np.min(distance_matrix[i, 1:])
        # if min_dist <= thresh:
        #     nn_indices = np.where(distance_matrix[i, :] == min_dist)
        #     old_nn_row = app_nearest_neighbors[i, :]
        #     plausible_neighbors[i] = set(list(old_nn_row[nn_indices]))
        # else:
        #     plausible_neighbors[i] = set([])
    return plausible_neighbors


def cluster(descriptor_matrix, n_neighbors=10, thresh=[2]):
    """
    Master function. Takes the descriptor matrix and returns the clusters.
    n_neighbors is the number of nearest neighbors considered and thresh is a
    list of clustering distance thresholds; one clustering is returned per
    threshold.
    """
    app_nearest_neighbors, dists = build_index(descriptor_matrix, n_neighbors)
    distance_matrix = calculate_symmetric_dist(app_nearest_neighbors)
    clusters = []
    for th in thresh:
        clusters_th = aro_clustering(app_nearest_neighbors, distance_matrix, th)
        print("N Clusters: {}, thresh: {}".format(len(clusters_th), th))
        clusters.append({'clusters': clusters_th, 'threshold': th})
    return clusters
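
# Example usage (illustrative sketch; the .npy path and descriptor
# dimensionality are assumptions, not part of this module):
#
#     descriptors = np.load("face_descriptors.npy")   # e.g. shape (n_faces, 128)
#     results = cluster(descriptors, n_neighbors=200, thresh=[1.1])
#     groups = results[0]["clusters"]                  # list of sets of face indices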


if __name__ == '__main__':
    # Demo on random data: 20 "descriptors" of dimension 10.
    descriptor_matrix = np.random.rand(20, 10)
    clusters = cluster(descriptor_matrix, n_neighbors=2)
    clusters_to_be_saved = {}
    for i, cluster_group in enumerate(clusters[0]["clusters"]):
        clusters_to_be_saved[i] = [int(x) for x in cluster_group]
    with open("clusters.json", "w") as f:
        json.dump(clusters_to_be_saved, f)