"Part of [BetaML](https://github.com/sylvaticus/BetaML.jl). Licence is MIT."
"""
estep(X,pₖ,mixtures)
E-step: assign the posterior probabilities p(j|xᵢ) and compute the log-likelihood of the parameters given the data (the latter is used only for reporting and for terminating the algorithm).
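A minimal internal-usage sketch (this function is not exported; it assumes the mixtures have already been initialised, e.g. with `init_mixtures!`, and that the remaining `init_mixtures!` keyword options keep their defaults):
```julia
X        = [1.0 10.5; 1.5 missing; 1.8 8.0]
mixtures = [DiagonalGaussian() for i in 1:2]
init_mixtures!(mixtures, X, minimum_variance=0.05, initialisation_strategy="grid")
pₙₖ, lL  = estep(X, fill(0.5, 2), mixtures) # (N x K) posteriors and the log-likelihood
```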
"""
function estep(X,pₖ,mixtures)
(N,D) = size(X)
K = length(mixtures)
Xmask = .! ismissing.(X)
logpₙₖ = zeros(N,K)
lL = 0.0
for n in 1:N
if any(Xmask[n,:]) # if at least one true
Xu = X[n,Xmask[n,:]]
logpx = lse([log(pₖ[k] + 1e-16) + lpdf(mixtures[k],Xu,Xmask[n,:]) for k in 1:K])
lL += logpx
for k in 1:K
logpₙₖ[n,k] = log(pₖ[k] + 1e-16)+lpdf(mixtures[k],Xu,Xmask[n,:])-logpx
end
else
logpₙₖ[n,:] = log.(pₖ)
end
end
pₙₖ = exp.(logpₙₖ)
return (pₙₖ,lL)
end
## The gmm algorithm (Lecture/segment 16.5 of https://www.edx.org/course/machine-learning-with-python-from-linear-models-to)
# Note: the following caveat no longer applies with the numerical trick now implemented:
# - For mixtures with full covariance matrix (i.e. `FullGaussian(μ,σ²)`) the minimum_covariance should NOT be set equal to the minimum_variance, or if the covariance matrix goes too low, it will become singular and not invertible.
"""
gmm(X,K;initial_probmixtures,mixtures,tol,verbosity,minimum_variance,minimum_covariance,initialisation_strategy)
Run the Expectation-Maximisation algorithm to identify K clusters of X data, i.e. employ a Generative Mixture Model as the underlying probabilistic model.
!!! warning
This function is no longer exported. Use one of the various models that use GMM as backend instead.
X can contain missing values in some or all of its dimensions. In such a case, learning is done using only the available data.
Implemented in the log-domain for better numerical accuracy with many dimensions.
# Parameters:
* `X` : A (N x D) matrix of data to cluster
* `K` : Number of clusters wanted
* `initial_probmixtures` : Initial probabilities of the categorical distribution (K x 1) [default: `[]`]
* `mixtures`: An array (of length K) of the mixture to employ (see notes) [def: `[DiagonalGaussian() for i in 1:K]`]
* `tol`: Tolerance to stop the algorithm [default: 10^(-6)]
* `verbosity`: A verbosity parameter regulating the information messages frequency [def: `STD`]
* `minimum_variance`: Minimum variance for the mixtures [default: 0.05]
* `minimum_covariance`: Minimum covariance for the mixtures with full covariance matrix [default: 0]. This should be set differently from `minimum_variance` (see notes).
* `initialisation_strategy`: Mixture initialisation algorithm [def: `kmeans`]
* `maximum_iterations`: Maximum number of iterations [def: `typemax(Int64)`, i.e. ∞]
* `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]
# Returns:
* A named tuple of:
* `pₙₖ`: Matrix of size (N x K) with the probability of each point n to belong to cluster k
* `pₖ`: Probabilities of the categorical distribution (K x 1)
* `mixtures`: Vector (K x 1) of the estimated underlying distributions
* `ϵ`: Vector of the discrepancy (matrix norm) between pₙₖ and the lagged pₙₖ at each iteration
* `lL`: The log-likelihood (without considering the last mixture optimisation)
* `BIC`: The Bayesian Information Criterion (lower is better)
* `AIC`: The Akaike Information Criterion (lower is better)
# Notes:
- The mixtures currently implemented are `SphericalGaussian(μ,σ²)`,`DiagonalGaussian(μ,σ²)` and `FullGaussian(μ,σ²)`
- Reasonable choices for `minimum_variance`/`minimum_covariance` depend on the mixture. For example, 0.25 seems a reasonable value for `SphericalGaussian`, 0.05 seems better for `DiagonalGaussian`, and `FullGaussian` seems to prefer either very low values of variance/covariance (e.g. `(0.05,0.05)`) or very big but similar ones (e.g. `(100,100)`).
- For `initialisation_strategy`, look at the documentation of `init_mixtures!` for the mixture you want. The provided gaussian mixtures support `grid`, `kmeans` or `given`. `grid` is faster (especially if X contains missing values), but `kmeans` often provides better results.
# Resources:
- [Paper describing gmm with missing values](https://doi.org/10.1016/j.csda.2006.10.002)
- [Class notes from MITx 6.86x (Sec 15.9)](https://stackedit.io/viewer#!url=https://github.com/sylvaticus/MITx_6.86x/raw/master/Unit 04 - Unsupervised Learning/Unit 04 - Unsupervised Learning.md)
- [Limitations of gmm](https://www.r-craft.org/r-news/when-not-to-use-gaussian-mixture-model-gmm-clustering/)
# Example:
```julia
julia> clusters = gmm([1 10.5;1.5 0; 1.8 8; 1.7 15; 3.2 40; 0 0; 3.3 38; 0 -2.3; 5.2 -2.4],3,verbosity=HIGH)
```
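Missing data are supported directly; a minimal sketch (the posteriors are in the returned `pₙₖ`):
```julia
julia> X = [1 10.5; 1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4];
julia> clusters = gmm(X,3,mixtures=[SphericalGaussian() for i in 1:3],minimum_variance=0.25,verbosity=NONE);
julia> clusters.pₙₖ # (N x K) posterior probabilities
```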
"""
function gmm(X,K;initial_probmixtures=Float64[],mixtures=[DiagonalGaussian() for i in 1:K],tol=10^(-6),verbosity=STD,minimum_variance=0.05,minimum_covariance=0.0,initialisation_strategy="kmeans",maximum_iterations=typemax(Int64),rng = Random.GLOBAL_RNG)
# TODO: benchmark with this one: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-022-04740-9
if verbosity > STD
@codelocation
end
# debug:
#X = [1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4]
#K = 3
#initial_probmixtures=nothing; tol=0.0001; msgStep=1; minimum_variance=0.25; initialisation_strategy="grid"
#mixtures = [SphericalGaussian() for i in 1:K]
# ---------
X = makematrix(X)
(N,D) = size(X)
pₖ = isempty(initial_probmixtures) ? fill(1/K,K) : copy(initial_probmixtures)
# no longer true with the numerical trick implemented
#if (minimum_variance == minimum_covariance)
# @warn("Setting the minimum_variance equal to the minimum_covariance may lead to singularity problems for mixtures with full covariance matrix.")
#end
msgStepMap = Dict(NONE => 0, LOW=>100, STD=>20, HIGH=>5, FULL=>1)
msgStep = msgStepMap[verbosity]
# Initialisation of the parameters of the mixtures
mixtures = identity.(deepcopy(mixtures)) # `identity.` narrows the container to the common element type; the `deepcopy` avoids modifying the function argument
#mixtures = identity.(mixtures)
init_mixtures!(mixtures,X,minimum_variance=minimum_variance,minimum_covariance=minimum_covariance,initialisation_strategy=initialisation_strategy,rng=rng)
pₙₖ = zeros(Float64,N,K) # The posteriors, i.e. the prob that item n belong to cluster k
ϵ = Float64[]
# Checking dimensions only once (adding @inbounds afterwards doesn't change anything,
# but it is still good to provide a nice informative message)
if size(pₖ,1) != K || length(mixtures) != K
error("Error in the dimensions of the inputs. Please check them.")
end
# finding empty/non_empty values
#Xmask = .! ismissing.(X)
lL = -Inf
iter = 1
while(true)
oldlL = lL
# E Step: assigning the posterior prob p(j|xi) and computing the log-Likelihood of the parameters given the set of data
pₙₖlagged = copy(pₙₖ)
pₙₖ, lL = estep(X,pₖ,mixtures)
push!(ϵ,norm(pₙₖlagged - pₙₖ))
# M step: find parameters that maximise the likelihood
# Updating the probabilities of the different mixtures
nₖ = sum(pₙₖ,dims=1)'
n = sum(nₖ)
pₖ = nₖ ./ n
update_parameters!(mixtures, X, pₙₖ; minimum_variance=minimum_variance,minimum_covariance=minimum_covariance)
# Information. Note the likelihood is without accounting for the new μ and σ
if msgStep != 0 && (length(ϵ) % msgStep == 0 || length(ϵ) == 1)
println("Iter. $(length(ϵ)):\tVar. of the post $(ϵ[end]) \t Log-likelihood $(lL)")
end
# Closing conditions. Note that the log-likelihood is the one computed before updating μ and σ
if ((lL - oldlL) <= (tol * abs(lL))) || (iter >= maximum_iterations)
npars = npar(mixtures) + (K-1)
#BIC = lL - (1/2) * npars * log(N)
BICv = bic(lL,npars,N)
AICv = aic(lL,npars)
#if (ϵ[end] < tol)
return (pₙₖ=pₙₖ,pₖ=pₖ,mixtures=mixtures,ϵ=ϵ,lL=lL,BIC=BICv,AIC=AICv)
else
iter += 1
end
end # end while loop
end # end function
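# Usage sketch (not part of the library API): since `gmm` returns both BIC and AIC,
# the number of clusters can be selected by minimising one of them, e.g.:
#   outs  = [gmm(X,K,verbosity=NONE) for K in 2:5]
#   Kbest = (2:5)[argmin([o.BIC for o in outs])]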
# Avi v2..
"""
$(TYPEDEF)
Hyperparameters for GMM clusters and other GMM-based algorithms
## Parameters:
$(FIELDS)
"""
mutable struct GaussianMixture_hp <: BetaMLHyperParametersSet
"Number of mixtures (latent classes) to consider [def: 3]"
n_classes::Int64
"Initial probabilities of the categorical distribution (n_classes x 1) [default: `[]`]"
initial_probmixtures::Vector{Float64}
"""An array (of length `n_classes`) of the mixtures to employ (see the [`?GMM`](@ref GMM) module).
Each mixture object can be provided with or without its parameters (e.g. mean and variance for the gaussian ones). Fully qualified mixtures are useful only if the `initialisation_strategy` parameter is set to \"given\".
This parameter can also be given simply as a _type_. In this case it is automatically extended to a vector of `n_classes` mixtures of the specified type.
Note that mixing of different mixture types is not currently supported and that currently implemented mixtures are `SphericalGaussian`, `DiagonalGaussian` and `FullGaussian`.
[def: `DiagonalGaussian`]"""
mixtures::Union{Type,Vector{<: AbstractMixture}}
"Tolerance to stop the algorithm [default: 10^(-6)]"
tol::Float64
"Minimum variance for the mixtures [default: 0.05]"
minimum_variance::Float64
"Minimum covariance for the mixtures with full covariance matrix [default: 0]. This should be set different than minimum_variance."
minimum_covariance::Float64
"""
The computation method of the vector of the initial mixtures.
One of the following:
- "grid": using a grid approach
- "given": using the mixture provided in the fully qualified `mixtures` parameter
- "kmeans": use first kmeans (itself initialised with a "grid" strategy) to set the initial mixture centers [default]
Note that currently "random" and "shuffle" initialisations are not supported in gmm-based algorithms.
"""
initialisation_strategy::String
"Maximum number of iterations [def: 5000]"
maximum_iterations::Int64
"""
The method - and its parameters - to employ for hyperparameters autotuning.
See [`SuccessiveHalvingSearch`](@ref) for the default method (suitable for the GMM-based regressors)
To implement automatic hyperparameter tuning during the (first) `fit!` call, simply set `autotune=true` and optionally change the default `tunemethod` options (including the parameter ranges, the resources to employ and the loss function to adopt).
"""
tunemethod::AutoTuneMethod
function GaussianMixture_hp(;
n_classes::Union{Nothing,Int64} = nothing, # def: 3
initial_probmixtures::Vector{Float64} = Float64[],
mixtures::Union{Type,Vector{<: AbstractMixture},Nothing} = nothing, # DiagonalGaussian
tol::Float64 = 10^(-6),
minimum_variance::Float64 = 0.05,
minimum_covariance::Float64 = 0.0,
initialisation_strategy::String = "kmeans",
maximum_iterations::Int64 = 5000,
tunemethod::AutoTuneMethod = SuccessiveHalvingSearch(
hpranges = Dict("n_classes" =>[2,3,4,5,6],
"mixtures"=>[SphericalGaussian,DiagonalGaussian,FullGaussian],
"initialisation_strategy"=>["grid","kmeans"],
"minimum_covariance"=>[0.01,0.02],
"minimum_variance"=>[0.05,0.07,0.1])
,multithreads=true)
)
if isnothing(mixtures) && isnothing(n_classes)
n_classes = 3
mixtures = [DiagonalGaussian() for i in 1:3]
elseif isnothing(mixtures) && !isnothing(n_classes)
mixtures = [DiagonalGaussian() for i in 1:n_classes]
elseif typeof(mixtures) <: UnionAll && isnothing(n_classes)
n_classes = 3
mixtures = [mixtures() for i in 1:n_classes]
elseif typeof(mixtures) <: UnionAll && !isnothing(n_classes)
mixtures = [mixtures() for i in 1:n_classes]
elseif typeof(mixtures) <: AbstractVector && isnothing(n_classes)
n_classes = length(mixtures)
elseif typeof(mixtures) <: AbstractVector && !isnothing(n_classes)
n_classes == length(mixtures) || error("The length of the mixtures vector must be equal to the number of classes")
end
return new(n_classes,initial_probmixtures,mixtures,tol,minimum_variance,minimum_covariance,initialisation_strategy,maximum_iterations,tunemethod)
end
end
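# Construction sketch based on the inner-constructor logic above: `mixtures` may also be
# given as a bare type, which is then expanded to a vector of `n_classes` mixtures, e.g.:
#   hp = GaussianMixture_hp(n_classes=4, mixtures=SphericalGaussian, minimum_variance=0.25)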
Base.@kwdef mutable struct GMMCluster_lp <: BetaMLLearnableParametersSet
mixtures::Union{Type,Vector{<: AbstractMixture}} = AbstractMixture[] # note: set at model construction, as it shares its name with the hyperparameter
initial_probmixtures::Vector{Float64} = Float64[]
#probRecords::Union{Nothing,Matrix{Float64}} = nothing
end
"""
$(TYPEDEF)
Assign class probabilities to records (i.e. _soft_ clustering) assuming a probabilistic generative model of observed data using mixtures.
For the parameters see [`?GaussianMixture_hp`](@ref GaussianMixture_hp) and [`?BML_options`](@ref BML_options).
# Notes:
- Data must be numerical
- Mixtures can be user defined: see the [`?GMM`](@ref GMM) module documentation for a discussion on provided vs custom mixtures.
- Online fitting (re-fitting with new data) is supported by setting the old learned mixtures as the starting values
- The model is fitted using an Expectation-Maximisation (EM) algorithm that supports missing data and is implemented in the log-domain for better numerical accuracy with many dimensions
# Example:
```julia
julia> using BetaML
julia> X = [1.1 10.1; 0.9 9.8; 10.0 1.1; 12.1 0.8; 0.8 9.8];
julia> mod = GaussianMixtureClusterer(n_classes=2)
GaussianMixtureClusterer - A Generative Mixture Model (unfitted)
julia> prob_belong_classes = fit!(mod,X)
Iter. 1: Var. of the post 2.15612140465882 Log-likelihood -29.06452054772657
5×2 Matrix{Float64}:
1.0 0.0
1.0 0.0
0.0 1.0
0.0 1.0
1.0 0.0
julia> new_probs = fit!(mod,[11 0.9])
Iter. 1: Var. of the post 1.0 Log-likelihood -1.3312256125240092
1×2 Matrix{Float64}:
0.0 1.0
julia> info(mod)
Dict{String, Any} with 6 entries:
"xndims" => 2
"error" => [1.0, 0.0, 0.0]
"AIC" => 15.7843
"fitted_records" => 6
"lL" => 1.10786
"BIC" => -2.21571
julia> parameters(mod)
BetaML.GMM.GMMCluster_lp (a BetaMLLearnableParametersSet struct)
- mixtures: DiagonalGaussian{Float64}[DiagonalGaussian{Float64}([0.9333333333333332, 9.9], [0.05, 0.05]), DiagonalGaussian{Float64}([11.05, 0.9500000000000001], [0.05, 0.05])]
- initial_probmixtures: [0.0, 1.0]
```
"""
mutable struct GaussianMixtureClusterer <: BetaMLUnsupervisedModel
hpar::GaussianMixture_hp
opt::BML_options
par::Union{Nothing,GMMCluster_lp}
cres::Union{Nothing,Matrix{Float64}}
fitted::Bool
info::Dict{String,Any}
end
function GaussianMixtureClusterer(;kwargs...)
m = GaussianMixtureClusterer(GaussianMixture_hp(),BML_options(),GMMCluster_lp(),nothing,false,Dict{String,Any}())
thisobjfields = fieldnames(nonmissingtype(typeof(m)))
for (kw,kwv) in kwargs
found = false
for f in thisobjfields
fobj = getproperty(m,f)
if kw in fieldnames(typeof(fobj))
setproperty!(fobj,kw,kwv)
found = true
end
end
found || error("Keyword \"$kw\" is not part of this model.")
end
# Special correction for GaussianMixture_hp
kwkeys = keys(kwargs)
if !in(:mixtures,kwkeys) && !in(:n_classes,kwkeys)
m.hpar.n_classes = 3
m.hpar.mixtures = [DiagonalGaussian() for i in 1:3]
elseif !in(:mixtures,kwkeys) && in(:n_classes,kwkeys)
m.hpar.mixtures = [DiagonalGaussian() for i in 1:kwargs[:n_classes]]
elseif typeof(kwargs[:mixtures]) <: UnionAll && !in(:n_classes,kwkeys)
m.hpar.n_classes = 3
m.hpar.mixtures = [kwargs[:mixtures]() for i in 1:3]
elseif typeof(kwargs[:mixtures]) <: UnionAll && in(:n_classes,kwkeys)
m.hpar.mixtures = [kwargs[:mixtures]() for i in 1:kwargs[:n_classes]]
elseif typeof(kwargs[:mixtures]) <: AbstractVector && !in(:n_classes,kwkeys)
m.hpar.n_classes = length(kwargs[:mixtures])
elseif typeof(kwargs[:mixtures]) <: AbstractVector && in(:n_classes,kwkeys)
kwargs[:n_classes] == length(kwargs[:mixtures]) || error("The length of the mixtures vector must be equal to the number of classes")
end
return m
end
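# Keyword arguments are routed to whichever sub-structure (hyperparameters or options)
# owns a field with that name, e.g. (a sketch; `verbosity` is a `BML_options` field):
#   m = GaussianMixtureClusterer(n_classes=2, verbosity=NONE) # n_classes → hpar, verbosity → opt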
"""
$(TYPEDSIGNATURES)
Fit the [`GaussianMixtureClusterer`](@ref) model to data
# Notes:
- re-fitting is a complete new fitting, but it starts from the mixtures computed in the previous fitting(s); see the sketch below
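A sketch of re-fitting (the second `fit!` call restarts the EM algorithm from the learned mixtures, using `initialisation_strategy="given"` internally):
```julia
julia> mod = GaussianMixtureClusterer(n_classes=2, verbosity=NONE);
julia> fit!(mod, [1.1 10.1; 0.9 9.8; 10.0 1.1; 12.1 0.8]);
julia> fit!(mod, [0.8 9.8; 11.0 0.9]); # re-fit with new data
```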
"""
function fit!(m::GaussianMixtureClusterer,x)
# Parameter alias..
K = m.hpar.n_classes
initial_probmixtures = m.hpar.initial_probmixtures
mixtures = m.hpar.mixtures
if typeof(mixtures) <: UnionAll
mixtures = [mixtures() for i in 1:K]
end
tol = m.hpar.tol
minimum_variance = m.hpar.minimum_variance
minimum_covariance = m.hpar.minimum_covariance
initialisation_strategy = m.hpar.initialisation_strategy
maximum_iterations = m.hpar.maximum_iterations
cache = m.opt.cache
verbosity = m.opt.verbosity
rng = m.opt.rng
if m.fitted
verbosity >= HIGH && @info "Continuing training of a pre-fitted model"
gmmOut = gmm(x,K;initial_probmixtures=m.par.initial_probmixtures,mixtures=m.par.mixtures,tol=tol,verbosity=verbosity,minimum_variance=minimum_variance,minimum_covariance=minimum_covariance,initialisation_strategy="given",maximum_iterations=maximum_iterations,rng = rng)
else
gmmOut = gmm(x,K;initial_probmixtures=initial_probmixtures,mixtures=mixtures,tol=tol,verbosity=verbosity,minimum_variance=minimum_variance,minimum_covariance=minimum_covariance,initialisation_strategy=initialisation_strategy,maximum_iterations=maximum_iterations,rng = rng)
end
probRecords = gmmOut.pₙₖ
m.par = GMMCluster_lp(mixtures = gmmOut.mixtures, initial_probmixtures=makecolvector(gmmOut.pₖ))
m.cres = cache ? probRecords : nothing
m.info["error"] = gmmOut.ϵ
m.info["lL"] = gmmOut.lL
m.info["BIC"] = gmmOut.BIC
m.info["AIC"] = gmmOut.AIC
m.info["fitted_records"] = get(m.info,"fitted_records",0) + size(x,1)
m.info["xndims"] = size(x,2)
m.fitted=true
return cache ? m.cres : nothing
end
"""
$(TYPEDSIGNATURES)
Predict the class probabilities associated with new data, using the mixtures computed by fitting a [`GaussianMixtureClusterer`](@ref) model.
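A minimal sketch:
```julia
julia> mod = GaussianMixtureClusterer(n_classes=2, verbosity=NONE);
julia> fit!(mod, [1.1 10.1; 0.9 9.8; 10.0 1.1; 12.1 0.8]);
julia> predict(mod, [1.0 9.9]) # 1×2 matrix of class probabilities
```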
"""
function predict(m::GaussianMixtureClusterer,X)
X = makematrix(X)
mixtures = m.par.mixtures
initial_probmixtures = m.par.initial_probmixtures
probRecords, lL = estep(X,initial_probmixtures,mixtures)
return probRecords
end
function show(io::IO, ::MIME"text/plain", m::GaussianMixtureClusterer)
if m.fitted == false
print(io,"GaussianMixtureClusterer - A Generative Mixture Model (unfitted)")
else
print(io,"GaussianMixtureClusterer - A Generative Mixture Model (fitted on $(m.info["fitted_records"]) records)")
end
end
function show(io::IO, m::GaussianMixtureClusterer)
m.opt.descr != "" && println(io,m.opt.descr)
if m.fitted == false
print(io,"GaussianMixtureClusterer - A $(m.hpar.n_classes)-classes Generative Mixture Model (unfitted)")
else
print(io,"GaussianMixtureClusterer - A $(m.hpar.n_classes)-classes Generative Mixture Model(fitted on $(m.info["fitted_records"]) records)")
println(io,m.info)
println(io,"Mixtures:")
println(io,m.par.mixtures)
println(io,"Probability of each mixture:")
println(io,m.par.initial_probmixtures)
end
end