Permalink
Fetching contributors…
Cannot retrieve contributors at this time
51 lines (37 sloc) 1.82 KB
-- Straight-line (Euclidean) distance between the 2-D points (x0, y0) and (x1, y1).
-- Arguments are positional: first point's x and y, then second point's x and y.
DEF EuclideanDistance(x0, y0, x1, y1):
    sqrt(pow(y0 - y1, 2) + pow(x0 - x1, 2));
-- Input relation of points; every point is assumed to carry a unique id.
Point = SCAN(public:adhoc:points);

-- Seed the cluster centers with the first 3 points of the input.
-- TODO: Pick the seeds at random instead.
-- TODO: The cluster count should be expressible as a constant.
Centroid = [FROM LIMIT(Point, 3) AS Seed
            EMIT Seed.id AS cluster_id, Seed.x AS x, Seed.y AS y];

-- Initial assignment: every point starts out in the first cluster.
FirstCluster = LIMIT(Centroid, 1);
Kmeans = [FROM Point AS P
          EMIT P.id AS id, *FirstCluster.cluster_id AS cluster_id];
-- Main k-means loop: assign every point to its nearest centroid, recompute
-- the centroids from the new assignment, and repeat until no assignment changes.
-- Reads Point, Centroid and Kmeans as prepared before the loop; the final
-- (id, cluster_id) assignment is stored at the end.
DO
    -- Calculate distance from each point to each centroid.
    -- EuclideanDistance takes (x0, y0, x1, y1), so the point's coordinates go
    -- first and the centroid's second; interleaving x and y values would
    -- subtract a point's x from its own y instead of from the centroid's x.
    Distance = [FROM Point, Centroid
                EMIT Point.id AS id,
                     Centroid.cluster_id AS cluster_id,
                     EuclideanDistance(Point.x, Point.y, Centroid.x, Centroid.y) AS distance];

    -- Choose closest cluster for each point: first the minimum distance per point...
    Closest = [FROM Distance EMIT id, MIN(distance) AS distance];

    -- ...then the cluster(s) at that distance. The comparison uses a small
    -- tolerance rather than exact equality, and MIN(cluster_id) breaks ties so
    -- that each point ends up with exactly one cluster.
    NewKmeans = [FROM Closest, Distance
                 WHERE Closest.id == Distance.id AND
                       ABS(Closest.distance - Distance.distance) < .000001
                 EMIT Closest.id AS id, MIN(Distance.cluster_id) AS cluster_id];

    -- Compute delta from the previous iteration; keep looping while any
    -- (id, cluster_id) row of the new assignment is absent from the old one.
    Delta = DIFF(NewKmeans, Kmeans);
    Continue = [FROM Delta EMIT COUNT(id) > 0];
    Kmeans = NewKmeans;

    -- Update centroids: each centroid becomes the mean of the points assigned
    -- to it. NOTE: a cluster with no assigned points yields no rows in this
    -- join and therefore drops out of Centroid for subsequent iterations.
    PointsInCentroid = [FROM Centroid, Kmeans, Point
                        WHERE Centroid.cluster_id == Kmeans.cluster_id AND
                              Point.id == Kmeans.id
                        EMIT Centroid.cluster_id AS cluster_id, Point.x AS x,
                             Point.y AS y];
    Centroid = [FROM PointsInCentroid EMIT cluster_id, avg(x) AS x, avg(y) AS y];
WHILE Continue;

-- Persist the final point-to-cluster assignment.
STORE(Kmeans, OUTPUT);