-
Notifications
You must be signed in to change notification settings - Fork 13
/
Clustering_MLJ.jl
223 lines (190 loc) · 8.9 KB
/
Clustering_MLJ.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
"Part of [BetaML](https://github.com/sylvaticus/BetaML.jl). Licence is MIT."
# MLJ interface for hard clustering models
import MLJModelInterface # It seems that having done this in the top module is not enought
const MMI = MLJModelInterface # We need to repeat it here
export KMeans, KMedoids
# ------------------------------------------------------------------------------
# Model Structure declarations..
"""
$(TYPEDEF)
The classical KMeans clustering algorithm, from the Beta Machine Learning Toolkit (BetaML).
# Parameters:
$(TYPEDFIELDS)
# Notes:
- data must be numerical
- online fitting (re-fitting with new data) is supported
# Example:
```julia
julia> using MLJ
julia> modelType = @load KMeans pkg = "BetaML"
[ Info: For silent loading, specify `verbosity=0`.
import BetaML ✔
BetaML.Clustering.KMeans
julia> model = modelType()
KMeans(
n_classes = 3,
dist = BetaML.Clustering.var"#25#27"(),
initialisation_strategy = "shuffle",
initial_representatives = nothing,
rng = Random._GLOBAL_RNG())
julia> X, y = @load_iris;
julia> (fitResults, cache, report) = MLJ.fit(model, 0, X);
julia> est_classes = predict(model, fitResults, X)
150-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}:
3
3
3
⋮
1
1
2
```
"""
mutable struct KMeans <: MMI.Unsupervised
"Number of classes to discriminate the data [def: 3]"
n_classes::Int64
"Function to employ as distance. Default to the Euclidean distance. Can be one of the predefined distances (`l1_distance`, `l2_distance`, `l2squared_distance`), `cosine_distance`), any user defined function accepting two vectors and returning a scalar or an anonymous function with the same characteristics. Attention that, contrary to `KMedoids`, the `KMeansClusterer` algorithm is not guaranteed to converge with other distances than the Euclidean one."
dist::Function
"""
The computation method of the vector of the initial representatives.
One of the following:
- "random": randomly in the X space
- "grid": using a grid approach
- "shuffle": selecting randomly within the available points [default]
- "given": using a provided set of initial representatives provided in the `initial_representatives` parameter
"""
initialisation_strategy::String
"Provided (K x D) matrix of initial representatives (useful only with `initialisation_strategy=\"given\"`) [default: `nothing`]"
initial_representatives::Union{Nothing,Matrix{Float64}}
"Random Number Generator [deafult: `Random.GLOBAL_RNG`]"
rng::AbstractRNG
end
# Keyword-based convenience constructor for `KMeans`.
# Fix: the default for `dist` was written `dist = dist=(x,y) -> norm(x-y)`, an
# accidental self-assignment inside the keyword default; the plain anonymous
# Euclidean distance is what is intended (and what `KMedoids` uses).
KMeans(;
    n_classes               = 3,
    dist                    = (x,y) -> norm(x-y),
    initialisation_strategy = "shuffle",
    initial_representatives = nothing,
    rng                     = Random.GLOBAL_RNG,
) = KMeans(n_classes, dist, initialisation_strategy, initial_representatives, rng)
"""
$(TYPEDEF)
# Parameters:
$(TYPEDFIELDS)
The K-medoids clustering algorithm with customisable distance function, from the Beta Machine Learning Toolkit (BetaML).
Similar to K-Means, but the "representatives" (the cetroids) are guaranteed to be one of the training points. The algorithm work with any arbitrary distance measure.
# Notes:
- data must be numerical
- online fitting (re-fitting with new data) is supported
# Example:
```julia
julia> using MLJ
julia> modelType = @load KMedoids pkg = "BetaML"
[ Info: For silent loading, specify `verbosity=0`.
import BetaML ✔
BetaML.Clustering.KMedoids
julia> model = modelType()
KMedoids(
n_classes = 3,
dist = BetaML.Clustering.var"#49#51"(),
initialisation_strategy = "shuffle",
initial_representatives = nothing,
rng = Random._GLOBAL_RNG())
julia> X, y = @load_iris;
julia> (fitResults, cache, report) = MLJ.fit(model, 0, X);
julia> est_classes = predict(model, fitResults, X)
150-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}:
2
3
3
⋮
1
1
1
```
"""
mutable struct KMedoids <: MMI.Unsupervised
"Number of classes to discriminate the data [def: 3]"
n_classes::Int64
"Function to employ as distance. Default to the Euclidean distance. Can be one of the predefined distances (`l1_distance`, `l2_distance`, `l2squared_distance`), `cosine_distance`), any user defined function accepting two vectors and returning a scalar or an anonymous function with the same characteristics."
dist::Function
"""
The computation method of the vector of the initial representatives.
One of the following:
- "random": randomly in the X space
- "grid": using a grid approach
- "shuffle": selecting randomly within the available points [default]
- "given": using a provided set of initial representatives provided in the `initial_representatives` parameter
"""
initialisation_strategy::String
"Provided (K x D) matrix of initial representatives (useful only with `initialisation_strategy=\"given\"`) [default: `nothing`]"
initial_representatives::Union{Nothing,Matrix{Float64}}
"Random Number Generator [deafult: `Random.GLOBAL_RNG`]"
rng::AbstractRNG
end
# Keyword-based convenience constructor for `KMedoids`, forwarding to the
# positional constructor. Defaults mirror those of `KMeans`.
function KMedoids(;
        n_classes               = 3,
        dist                    = (x,y) -> norm(x-y),
        initialisation_strategy = "shuffle",
        initial_representatives = nothing,
        rng                     = Random.GLOBAL_RNG)
    return KMedoids(n_classes, dist, initialisation_strategy, initial_representatives, rng)
end
# ------------------------------------------------------------------------------
# Fit functions...
"""
    MMI.fit(m::Union{KMeans,KMedoids}, verbosity, X)

Fit the BetaML clustering model `m` to the table `X` and return the MLJ
`(fitresult, cache, report)` triple, where `fitresult` is a named tuple with
the assigned `classes`, the cluster `centers` and the `distanceFunction` used.
"""
function MMI.fit(m::Union{KMeans,KMedoids}, verbosity, X)
    x = MMI.matrix(X)  # convert table to matrix
    # Using the low level API here. We could switch to API V2...
    verbosity isa Integer || error("Verbosity must be an integer. Current \"steps\" are 0, 1, 2 and 3.")
    verbosity = Utils.mljverbosity_to_betaml_verbosity(verbosity)
    # Select the underlying BetaML algorithm matching the wrapper type
    if m isa KMeans
        (assignedClasses, representatives) = kmeans(x, m.n_classes, dist=m.dist, initialisation_strategy=m.initialisation_strategy, initial_representatives=m.initial_representatives, rng=m.rng, verbosity=verbosity)
    else
        (assignedClasses, representatives) = kmedoids(x, m.n_classes, dist=m.dist, initialisation_strategy=m.initialisation_strategy, initial_representatives=m.initial_representatives, rng=m.rng, verbosity=verbosity)
    end
    cache  = nothing
    report = nothing
    return ((classes=assignedClasses, centers=representatives, distanceFunction=m.dist), cache, report)
end
# Expose user-friendly fitted parameters (cluster centers and labels).
# Fix: the original referenced the misspelled names `fitesult`/`fitresults`,
# which would throw `UndefVarError` on every call; also access the fitresult
# named tuple by field name instead of positional index for clarity.
MMI.fitted_params(model::Union{KMeans,KMedoids}, fitresult) = (centers=fitresult.centers, cluster_labels=CategoricalArrays.categorical(fitresult.classes))
# ------------------------------------------------------------------------------
# Transform functions...
""" fit(m::KMeans, fitResults, X) - Given a fitted clustering model and some observations, return the distances to each centroids """
function MMI.transform(m::Union{KMeans,KMedoids}, fitResults, X)
x = MMI.matrix(X) # convert table to matrix
(N,D) = size(x)
nCl = size(fitResults.centers,1)
distances = Array{Float64,2}(undef,N,nCl)
for n in 1:N
for c in 1:nCl
distances[n,c] = fitResults.distanceFunction(x[n,:],fitResults[2][c,:])
end
end
return MMI.table(distances)
end
# ------------------------------------------------------------------------------
# Predict functions...
""" predict(m::KMeans, fitResults, X) - Given a fitted clustering model and some observations, predict the class of the observation"""
function MMI.predict(m::Union{KMeans,KMedoids}, fitResults, X)
x = MMI.matrix(X) # convert table to matrix
(N,D) = size(x)
nCl = size(fitResults.centers,1)
distances = MMI.matrix(MMI.transform(m, fitResults, X))
mindist = argmin(distances,dims=2)
assignedClasses = [Tuple(mindist[n,1])[2] for n in 1:N]
return CategoricalArray(assignedClasses,levels=1:nCl)
end
# ------------------------------------------------------------------------------
# Model metadata for registration in MLJ...
# Register KMeans traits (scitypes, weight support, load path) with the MLJ model registry.
MMI.metadata_model(KMeans,
input_scitype = MMI.Table(MMI.Continuous), # scitype of the inputs
output_scitype = MMI.Table(MMI.Continuous), # scitype of the output of `transform`
target_scitype = AbstractArray{<:MMI.Multiclass}, # scitype of the output of `predict`
supports_weights = false, # does the model support sample weights?
load_path = "BetaML.Clustering.KMeans"
)
# Register KMedoids traits (scitypes, weight support, load path) with the MLJ model registry.
MMI.metadata_model(KMedoids,
input_scitype = MMI.Table(MMI.Continuous), # scitype of the inputs
output_scitype = MMI.Table(MMI.Continuous), # scitype of the output of `transform`
target_scitype = AbstractArray{<:MMI.Multiclass}, # scitype of the output of `predict`
supports_weights = false, # does the model support sample weights?
load_path = "BetaML.Clustering.KMedoids"
)