/
kmeans.go
163 lines (149 loc) · 4.32 KB
/
kmeans.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
package cluster
import (
"errors"
"math/rand"
"strconv"
)
// KMeans represents a classification model utilizing K-Means clustering
type KMeans struct {
CENTROIDS [][]float64
clusters int
max_episodes int
}
// NewKMeans creates a new K-Means model with `n` clusters. `Max_eps` represents the maximum
// amount of episodes or iterations the model will train on until convergence.
func NewKMeans(n, max_eps int) *KMeans {
var model KMeans
model.clusters = n
model.max_episodes = max_eps
return &model
}
// Train performs the K-Means clustering algorithm on `data` and generates a classification model.
// An error is returned if dimensions are empty or not consistent. As well, it returns an
// error if centroid calculations fail at any point.
func (model *KMeans) Train(data [][]float64) error {
// Check data dimensions
if len(data) < 1 {
return errors.New("KMeans.Train(): data cannnot be empty")
}
// Initialize first centroid with random point
init_idx := int(rand.Float64() * float64(len(data)))
model.CENTROIDS = append(model.CENTROIDS, data[init_idx])
// Initialize other centroids with probabilities proportional to their distances to the first centroid
for i := 1; i < model.clusters; i++ {
// Calculate dist from points to centroids
dist := make([]float64, len(data))
sum := 0.0
for j := range model.CENTROIDS {
tmp := Euclidean(model.CENTROIDS[j], data)
for k := range tmp {
dist[k] += tmp[k]
sum += tmp[k]
}
}
// Normalize distances
for j := 0; j < len(dist); j++ {
dist[j] /= sum
}
// Choose next centroid given probabilities
p := rand.Float64()
sum = 0.0
found := false
for j := 0; j < len(dist)-1; j++ {
sum += dist[j]
if p < sum {
model.CENTROIDS = append(model.CENTROIDS, data[j])
found = true
break
}
}
if !found {
return errors.New("KMeans.Train(): failed to add centroid")
}
}
// Adjust centroids over iterations until convergence or max_episodes
for iter := 0; iter < model.max_episodes; iter++ {
// Assign point to nearest centroid
points := make(map[int][][]float64)
for j := range data {
// Get centroid idx
dist := Euclidean(data[j], model.CENTROIDS)
min := dist[0]
min_idx := 0
for k := range dist {
if dist[k] < min {
min = dist[k]
min_idx = k
}
}
// Store point in map
points[min_idx] = append(points[min_idx], data[j])
}
// Store previous centroids
previous := make([][]float64, len(model.CENTROIDS))
for j := range model.CENTROIDS {
previous[j] = make([]float64, len(model.CENTROIDS[j]))
copy(previous[j], model.CENTROIDS[j])
}
// Reassign centroids as mean of its own points
for j := range model.CENTROIDS {
mean_x, mean_y := 0.0, 0.0
for k := range points[j] {
mean_x += points[j][k][0] / float64(len(points[j]))
mean_y += points[j][k][1] / float64(len(points[j]))
}
if len(points[j]) == 0 {
model.CENTROIDS[j] = []float64{previous[j][0], previous[j][1]}
} else {
model.CENTROIDS[j] = []float64{mean_x, mean_y}
}
}
// Check if centroids converged
converged := true
for j := range model.CENTROIDS {
if previous[j][0] != model.CENTROIDS[j][0] && previous[j][1] != model.CENTROIDS[j][1] {
converged = false
break
}
}
if converged {
break
}
}
return nil
}
// Evaluate takes in `data` and classifies it according to the trained K-Means model. It
// returns the data along with the centroid points with its correlated labels. An error
// is returned if the model was not yet trained.
func (model *KMeans) Evaluate(data [][]float64) ([][]float64, []string, error) {
var result [][]float64
var label []string
// Check if model has been fit
if len(model.CENTROIDS) < 1 {
return result, label, errors.New("KMeans.Evaluate(): model not yet trained")
}
// Add centroid points
for i := range model.CENTROIDS {
// Add label and point
label = append(label, "Centroid Center")
result = append(result, model.CENTROIDS[i])
}
// Add evalulated data points
for i := range data {
// Get centroid idx
dist := Euclidean(data[i], model.CENTROIDS)
min := dist[0]
min_idx := 0
for j := range dist {
if dist[j] < min {
min = dist[j]
min_idx = j
}
}
// Add label and point
name := "Cluster " + strconv.Itoa(min_idx)
label = append(label, name)
result = append(result, data[i])
}
return result, label, nil
}