#!/usr/bin/env python3
################################################################################
# A Python implementation of the Spherical K-Means clustering algorithm.
#
# The SPK-Means algorithm sorts the given text documents into K clusters.
# These clusters are generated as a means of finding patterns in large
# amounts of data.
# The number of clusters, K, is provided as a parameter.
#
# Tested with Python versions: 3.1.3, 3.2.3
#
# Doctest:
# $ python3 -m doctest spkmeans.py
################################################################################
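# The "concept vector" and "txn scheme" terminology used below follows the
# spherical k-means formulation of Dhillon & Modha (2001), "Concept
# Decompositions for Large Sparse Text Data using Clustering".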
# import custom module
from reader import Reader
from vector import Vector
# import global modules
import math
################################################################################
# SPKMeans class
class SPKMeans():
"""
    Contains functions to cluster documents using the Spherical
    K-Means algorithm.
"""
def __init__(self, reader):
"""Sets up the reader and initializes class variables."""
self.reader = reader
self.doc_vecs = None
self.num_docs = 0
self.num_words = 0
def read_docs(self):
"""Reads the documents from files into memory, and report stats."""
self.doc_vecs = self.reader.read()
self.num_docs = len(self.doc_vecs)
self.num_words = len(self.reader.word_list)
self.reader.report()
def cluster(self, k):
"""Run the full SPKMeans algorithm, and return the partitions."""
# k must be at least 2 (otherwise meaningless)
if k < 2:
print("Warning: must use at least 2 partitions. Stopping.")
return None
self.read_docs()
# number of documents must be at least 2 (otherwise meaningless)
if self.num_docs < 2:
print("Warning: must use at least 2 documents. Stopping")
return None
print("Running SPKMeans clustering: {} partitions.".format(k))
        # normalize the document vectors and apply txn-style rescaling
        for doc_v in self.doc_vecs:
            doc_v.normalize()
            doc_v *= math.sqrt(doc_v.get_sum())  # txn
        # (self.txn() below applies the full txn scheme to a single vector)
# create first partition set and concept vectors
partitions = self.randomize_partitions(k)
concepts = self.get_concepts(partitions)
q = self.quality(partitions, concepts)
        dQ = 100        # change in quality since the previous iteration
        qThresh = 0.01  # stop once quality improves by less than this
        t = 0           # iteration counter
        # while the change in quality is large enough, and we are within
        # the iteration cap, compute a better partitioning:
while dQ > qThresh and t < 200:
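            # reassignment step: compare each document to every concept
            # vector using cosine similarity,
            #   cos(x, c) = (x . c) / (||x|| * ||c||),
            # and place it in the partition of the most similar concept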
            partitions = [[] for _ in range(k)]  # k independent empty lists
            for doc_v in self.doc_vecs:  # for each document
                # find the closest concept vector by cosine similarity
                v_norm = doc_v.norm()
                closest = 0
                closest_val = doc_v.dot(concepts[0])
                closest_val /= (v_norm * concepts[0].norm())
                for i in range(1, k):
                    val = doc_v.dot(concepts[i])
                    val /= (v_norm * concepts[i].norm())
                    if val > closest_val:
                        closest = i
                        closest_val = val
                # put the document Vector into the partition associated
                # with the closest concept Vector
                partitions[closest].append(doc_v)
# recalculate concept vectors
concepts = self.get_concepts(partitions)
# compute quality of the new partition
lastQ = q
q = self.quality(partitions, concepts)
dQ = abs(q - lastQ)
t += 1
print("lastQ =",lastQ,"q =",q,"dQ =",dQ)
# return the partitions and associated concept vectors
return partitions, concepts
def quality(self, partitions, concepts):
"""Compute and return the quality of the given partitions."""
q = 0
for i in range(len(partitions)):
sum_v = Vector.sum_vectors(partitions[i])
q += sum_v.dot(concepts[i])
return q
def txn(self, doc_vec_i):
"""
Scales the given document Vector using the TXN scheme.
doc_vec_i[j] = t_ji * g_j * s_i
- (doc i, word j)
t_ji: term weighting component
- depends on count of word j in doc i
g_j: global weighting component
- depends on number of docs containing word j
s_i: normalization component for doc i
= sqrt[sum(j,word_count):(t_ji * g_j)^2]
*** TESTING: ***
###### Set up class:
>>> s = SPKMeans(None)
>>> s.num_words = 5
###### Test vectors:
>>> v = [1,2,3,0,0]
>>> s.txn(v)
>>> print(v)
[0,0,0,0]
>>> v = [0,0,2,3,4]
>>> s.txn(v)
>>> print(v)
[0,0,0,0]
"""
# calculate s_i (normalization component) for vector (i):
# = sqrt[sum(j,word_count):(t_ji * g_j)^2]
sum_v = 0
for j in range(self.num_words):
t_ji = doc_vec_i[j] # term weighting component
g_j = 1 # global weighting component (TODO - ignored)
sum_v += (t_ji*g_j)*(t_ji*g_j) # = math.pow((t_ji*g_j), 2)
s_i = math.sqrt(sum_v)
print(s_i)
# calculate the new vector values for vector (i):
# doc_vec_i[j] = t_ji * g_j * s_i
for j in range(self.num_words):
t_ji = doc_vec_i[j]
g_j = 1
doc_vec_i[j] = t_ji * g_j * s_i
def get_concepts(self, partitions):
"""Returns a set of concept Vectors, one for each partition."""
concepts = []
for p in partitions:
concepts.append(self.compute_concept(p))
return concepts
def compute_concept(self, p):
"""
Computes the Concept Vector of given partition p, and returns said
Concept Vector. Parameter p must be a list containing at least one
document Vector.
"""
        # compute the sum of all vectors in partition p
cv = Vector(self.num_words)
for doc_v in p:
cv += doc_v
# compute the mean vector for partition using the sum vector
cv *= (1/len(p))
        # normalize the mean vector to obtain the concept vector
cv.normalize()
return cv
def randomize_partitions(self, k):
"""
Create a set of k partitions with the documents randomly
distributed to a cluster. Maintains as much distribution equality
between the partitions as possible.
Returns the set of partitions.
TODO: not random; splits up by order docs were scanned in
"""
partitions = []
num_per_partition = int(self.num_docs / k)
for pindex in range(k):
partition_vecs = []
partition_size = num_per_partition
if pindex == k-1:
partition_size = partition_size + (self.num_docs % k)
            for num in range(partition_size):
                doc_index = num + (pindex * num_per_partition)
                partition_vecs.append(self.doc_vecs[doc_index])
partitions.append(partition_vecs)
return partitions
################################################################################
# start here
def main():
# define documents to be used and number of clusters
# TODO - docs and number of clusters as parameters
NUM_CLUSTERS = 2
    docs = ['one.txt', 'two.txt', 'three.txt']
#'long1.txt', 'long2.txt', 'long3.txt']
# set up reader and run SPKMeans clustering
reader = Reader(docs)
p, c = SPKMeans(reader).cluster(NUM_CLUSTERS)
print(p,c)
# guard so that importing this module (e.g. for doctest) does not run main()
if __name__ == '__main__':
    main()