-
Notifications
You must be signed in to change notification settings - Fork 19
/
kmeans.myl
50 lines (37 loc) · 1.82 KB
/
kmeans.myl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
DEF EuclideanDistance(x0, y0, x1, y1):
sqrt(pow(x0 - x1, 2) + pow(y0 - y1, 2));
-- Load some points; assume each point has a unique ID
Point = SCAN(public:adhoc:points);
-- Create some initial cluster centers from the first K points
-- TODO: We should choose these at random somehow...
-- TODO: The cluster count should be expressable as a constant
Centroid = [FROM LIMIT(Point, 3) AS K EMIT id AS cluster_id, x AS x,y AS y];
-- Assign each point to the first cluster
FirstCluster = LIMIT(Centroid, 1);
Kmeans = [FROM Point EMIT Point.id AS id,
*FirstCluster.cluster_id AS cluster_id];
DO
-- Calculate distance from each point to each centroid
Distance = [FROM Point, Centroid
EMIT Point.id AS id,
Centroid.cluster_id AS cluster_id,
EuclideanDistance(Point.x, Centroid.x, Point.y, Centroid.y) AS distance];
-- Choose closest cluster for each point
Closest = [FROM Distance EMIT id, MIN(distance) AS distance];
NewKmeans = [FROM Closest, Distance
WHERE Closest.id == Distance.id AND
ABS(Closest.distance - Distance.distance) < .000001
EMIT Closest.id AS id, MIN(Distance.cluster_id) AS cluster_id];
-- Compute delta from the previous iteration
Delta = DIFF(NewKmeans, Kmeans);
Continue = [FROM Delta EMIT COUNT(id) > 0];
Kmeans = NewKmeans;
-- Update centroids
PointsInCentroid = [FROM Centroid, Kmeans, Point
WHERE Centroid.cluster_id == Kmeans.cluster_id AND
Point.id == Kmeans.id
EMIT Centroid.cluster_id AS cluster_id, Point.x AS x,
Point.y AS y];
Centroid = [FROM PointsInCentroid EMIT cluster_id, avg(x) AS x, avg(y) AS y];
WHILE Continue;
STORE(Kmeans, OUTPUT);