
Commit

Merge 21563b1 into eb5eb7f
theogf committed Nov 22, 2021
2 parents eb5eb7f + 21563b1 commit 0df4f94
Showing 41 changed files with 188 additions and 190 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/ci.yml
@@ -25,8 +25,6 @@ jobs:
- 'nightly'
os:
- ubuntu-latest
- macOS-latest
- windows-latest
arch:
- x64
steps:
20 changes: 13 additions & 7 deletions docs/examples/heteroscedastic.jl
@@ -5,7 +5,7 @@ using AugmentedGaussianProcesses
using Distributions
using LinearAlgebra
using Plots
default(lw=3.0, msw=0.0)
default(; lw=3.0, msw=0.0)
# using CairoMakie

# ## Model generated data
@@ -30,8 +30,8 @@ g = rand(MvNormal(μ₀ * ones(N), K))
y = f + σ .* randn(N); # We finally sample the output
# We can visualize the data:
n_sig = 2 # Number of std. dev. around the mean
plot(x, f, ribbon = n_sig * σ, lab= "p(y|f,g)") # Mean and std. dev. of y
scatter!(x, y, alpha=0.5, msw=0.0, lab="y") # Observation samples
plot(x, f; ribbon=n_sig * σ, lab="p(y|f,g)") # Mean and std. dev. of y
scatter!(x, y; alpha=0.5, msw=0.0, lab="y") # Observation samples

# ## Model creation and training
# We will now use the augmented model to infer both `f` and `g`
@@ -55,12 +55,18 @@ train!(model, 100);
y_m, y_σ = proba_y(model, x_test);
# Let's first look at the difference between the latent `f` and `g`
plot(x, [f, g]; label=["f" "g"])
plot!(x_test, [f_m, g_m]; ribbon=[n_sig * f_σ n_sig * g_σ], lab=["f_pred" "g_pred"], legend=true)
plot!(
x_test,
[f_m, g_m];
ribbon=[n_sig * f_σ n_sig * g_σ],
lab=["f_pred" "g_pred"],
legend=true,
)
# But it's more interesting to compare the predictive probability of `y` directly:
plot(x, f; ribbon = n_sig * σ, lab="p(y|f,g)")
plot!(x_test, y_m, ribbon = n_sig * sqrt.(y_σ), lab="p(y|f,g) pred")
plot(x, f; ribbon=n_sig * σ, lab="p(y|f,g)")
plot!(x_test, y_m; ribbon=n_sig * sqrt.(y_σ), lab="p(y|f,g) pred")
scatter!(x, y; lab="y", alpha=0.2)
# Or to explore the heteroscedasticity itself, we can look at the residuals
scatter(x, (f - y).^2; yaxis=:log, lab="residuals",alpha=0.2)
scatter(x, (f - y) .^ 2; yaxis=:log, lab="residuals", alpha=0.2)
plot!(x, σ .^ 2; lab="true σ²(x)")
plot!(x_test, y_σ; lab="predicted σ²(x)")
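The model construction step itself is in a collapsed part of this file. As a rough, hypothetical sketch of how such a heteroscedastic model is typically assembled with this package — the kernel choice and the constructor argument order below are assumptions, not the file's actual code — reusing x, y and x_test from the script above:

using AugmentedGaussianProcesses  # kernels assumed re-exported; otherwise add `using KernelFunctions`

kernel = SqExponentialKernel()                       # assumed kernel choice
likelihood = HeteroscedasticLikelihood()             # noise variance driven by the second latent g
model = VGP(x, y, kernel, likelihood, AnalyticVI())  # assumed argument order
train!(model, 100)
y_m, y_σ = proba_y(model, x_test)                    # predictive mean and variance of y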
10 changes: 5 additions & 5 deletions src/ComplementaryDistributions/lap_transf_dist.jl
@@ -25,12 +25,12 @@ end
function _check_f(f)
return true # TODO Add tests for complete monotonicity / PDR
end
_gradf(d::LaplaceTransformDistribution, x::Real) = first(ForwardDiff.gradient(d.f, [x]))
_gradf(d::LaplaceTransformDistribution, x::Real) = only(ForwardDiff.gradient(d.f, [x]))
function _gradlogf(d::LaplaceTransformDistribution, x::Real)
return first(ForwardDiff.gradient(log d.f, [x]))
return only(ForwardDiff.gradient(log d.f, [x]))
end
function _hessianlogf(d::LaplaceTransformDistribution, x::Real)
return first(ForwardDiff.hessian(log d.f, [x]))
return only(ForwardDiff.hessian(log d.f, [x]))
end

Distributions.pdf(dist::LaplaceTransformDistribution, x::Real) = apply_f(dist, x)
@@ -42,7 +42,7 @@ function Distributions.var(dist::LaplaceTransformDistribution)
end

function Random.rand(dist::LaplaceTransformDistribution)
return first(rand(dist, 1))
return only(rand(dist, 1))
end
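The first → only swaps in this file (and throughout the commit) are not purely cosmetic: only requires the collection to hold exactly one element and throws otherwise, so a shape mistake fails loudly instead of being silently truncated. A quick illustration:

first([1.0, 2.0])  # 1.0 — silently ignores the extra element
only([1.0])        # 1.0
only([1.0, 2.0])   # throws ArgumentError: collection has multiple elements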

# Sampling from Ridout (09)
@@ -173,7 +173,7 @@ function laptrans(
k += 1
t = t - (apply_F(dist, t) - u[i]) / apply_f(dist, t)
if t < x_L || t > x_U
t = 0.5 * (x_L + x_U)
t = (x_L + x_U) / 2
end
if apply_F(dist, t) <= u[i]
x_L = t
16 changes: 8 additions & 8 deletions src/ComplementaryDistributions/polyagamma.jl
@@ -34,7 +34,7 @@ struct PolyaGamma{Tc,A} <: Distributions.ContinuousUnivariateDistribution
end
end

Statistics.mean(d::PolyaGamma) = d.b / (2 * d.c) * tanh(0.5 * d.c)
Statistics.mean(d::PolyaGamma) = d.b / (2 * d.c) * tanh(d.c / 2)

function PolyaGamma(b::Int, c::T; nmax::Int=10, trunc::Int=200) where {T<:Real}
return PolyaGamma{T}(b, c, trunc, nmax)
@@ -59,17 +59,17 @@ end
function a(n::Int, x::Real)
k = (n + 0.5) * π
if x > __TRUNC
return k * exp(-0.5 * k^2 * x)
return k * exp(-k^2 * x / 2)
elseif x > 0
expnt = -1.5 * (log(0.5 * π) + log(x)) + log(k) - 2.0 * (n + 0.5)^2 / x
expnt = -1.5 * (log(π / 2) + log(x)) + log(k) - 2 * (n + 0.5)^2 / x
return exp(expnt)
end
end

function mass_texpon(z::Real)
t = __TRUNC

fz = 0.125 * π^2 + 0.5 * z^2
fz = 0.125 * π^2 + z^2 / 2
b = sqrt(1.0 / t) * (t * z - 1)
a = sqrt(1.0 / t) * (t * z + 1) * -1.0

@@ -99,13 +99,13 @@ function rtigauss(rng::AbstractRNG, z::Real)
end
x = 1 + e1 * t
x = t / x^2
alpha = exp(-0.5 * z^2 * x)
alpha = exp(-z^2 * x / 2)
end
else
mu = 1.0 / z
while (x > t)
y = randn(rng)^2
half_mu = 0.5 * mu
half_mu = mu / 2
mu_Y = mu * y
x = mu + half_mu * mu_Y - half_mu * sqrt(4 * mu_Y + mu_Y^2)
if rand(rng) > mu / (mu + x)
@@ -122,10 +122,10 @@ end

function draw_like_devroye(rng::AbstractRNG, c::Real)
# Change the parameter.
c = 0.5 * abs(c)
c = abs(c) / 2

# Now sample 0.25 * J^*(1, Z := Z/2).
fz = 0.125 * π^2 + 0.5 * c^2
fz = 0.125 * π^2 + c^2 / 2
# ... Problems with large Z? Try using q_over_p.
# double p = 0.5 * __PI * exp(-1.0 * fz * __TRUNC) / fz;
# double q = 2 * exp(-1.0 * Z) * pigauss(__TRUNC, Z);
18 changes: 9 additions & 9 deletions src/functions/KLdivergences.jl
@@ -14,7 +14,7 @@ function GaussianKL(
Σ::Symmetric{T,Matrix{T}},
K::Cholesky{T,Matrix{T}},
) where {T<:Real}
return 0.5 * (logdet(K) - logdet(Σ) + tr(K \ Σ) + invquad(K, μ - μ₀) - length(μ))
return (logdet(K) - logdet(Σ) + tr(K \ Σ) + invquad(K, μ - μ₀) - length(μ)) / 2
end

function GaussianKL(
@@ -24,7 +24,7 @@ function GaussianKL(
K::AbstractMatrix{T},
) where {T<:Real}
K
return 0.5 * (logdet(K) - logdet(Σ) + tr(K \ Σ) + dot(μ - μ₀, K \ (μ - μ₀)) - length(μ))
return (logdet(K) - logdet(Σ) + tr(K \ Σ) + dot(μ - μ₀, K \ (μ - μ₀)) - length(μ)) / 2
end
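For reference, both methods implement the closed form KL(N(μ, Σ) ‖ N(μ₀, K)) = ½[logdet K − logdet Σ + tr(K⁻¹Σ) + (μ − μ₀)ᵀK⁻¹(μ − μ₀) − d]. A standalone sanity check against a Monte Carlo estimate — it uses only Distributions and LinearAlgebra, not the package's invquad helper:

using Distributions, LinearAlgebra, Random

Random.seed!(1)
d = 3
μ, μ₀ = randn(d), randn(d)
A, B = randn(d, d), randn(d, d)
Σ = Symmetric(A * A' + I)   # "posterior" covariance
K = Symmetric(B * B' + I)   # "prior" covariance

closed_form = (logdet(K) - logdet(Σ) + tr(K \ Σ) + dot(μ - μ₀, K \ (μ - μ₀)) - d) / 2

q, p = MvNormal(μ, Σ), MvNormal(μ₀, K)
xs = rand(q, 100_000)
mc_estimate = mean(logpdf(q, xs) - logpdf(p, xs))   # E_q[log q - log p]

isapprox(closed_form, mc_estimate; rtol=0.05)       # true up to Monte Carlo noise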

extraKL(::AbstractGPModel{T}, ::Any) where {T} = zero(T)
@@ -42,13 +42,13 @@ function extraKL(model::OnlineSVGP{T}, state) where {T}
κₐμ = kernel_mat.κₐ * mean(gp)
KLₐ = prev_gp.prev𝓛ₐ
KLₐ +=
-0.5 * sum(
-sum(
trace_ABt.(
Ref(prev_gp.invDₐ),
[kernel_mat.K̃ₐ, kernel_mat.κₐ * cov(gp) * transpose(kernel_mat.κₐ)],
),
)
KLₐ += dot(prev_gp.prevη₁, κₐμ) - 0.5 * dot(κₐμ, prev_gp.invDₐ * κₐμ)
) / 2
KLₐ += dot(prev_gp.prevη₁, κₐμ) - dot(κₐμ, prev_gp.invDₐ * κₐμ) / 2
return KLₐ
end
end
@@ -94,7 +94,7 @@ end
KL(q(ω)||p(ω)), where q(ω) = PG(b,c) and p(ω) = PG(b,0). θ = 𝑬[ω]
"""
function PolyaGammaKL(b, c, θ)
return dot(b, logcosh.(0.5 * c)) - 0.5 * dot(abs2.(c), θ)
return dot(b, logcosh.(c / 2)) - dot(abs2.(c), θ) / 2
end
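In scalar form this is KL(PG(b, c) ‖ PG(b, 0)) = b·logcosh(c/2) − c²θ/2 with θ = E[ω]. A small illustration, taking θ from the Pólya-Gamma mean b/(2c)·tanh(c/2) defined in polyagamma.jl and a local logcosh assumed to match the package helper:

logcosh(x) = abs(x) + log1p(exp(-2abs(x))) - log(2)        # numerically stable log(cosh(x))
pg_mean(b, c) = iszero(c) ? b / 4 : b / (2c) * tanh(c / 2)
pg_kl(b, c) = b * logcosh(c / 2) - abs2(c) * pg_mean(b, c) / 2

pg_kl(1.0, 0.0)  # 0.0 — with c = 0, q(ω) coincides with the prior PG(1, 0)
pg_kl(1.0, 2.0)  # ≈ 0.053 — tilting with c ≠ 0 moves q away from the prior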

"""
@@ -104,10 +104,10 @@ Entropy of GIG variables with parameters a,b and p and omitting the derivative d
"""
function GIGEntropy(a, b, p)
sqrt_ab = sqrt.(a .* b)
return 0.5 * (sum(log, a) - sum(log, b)) +
return (sum(log, a) - sum(log, b)) / 2 +
mapreduce((p, s) -> log(2 * besselk(p, s)), +, p, sqrt_ab) +
sum(
0.5 * sqrt_ab ./ besselk.(p, sqrt_ab) .*
sqrt_ab ./ besselk.(p, sqrt_ab) .*
(besselk.(p + 1, sqrt_ab) + besselk.(p - 1, sqrt_ab)),
)
) / 2
end
6 changes: 3 additions & 3 deletions src/hyperparameter/autotuning.jl
@@ -226,7 +226,7 @@ end

## Return the derivative of the KL divergence between the posterior and the GP prior ##
function hyperparameter_KL_gradient(J::AbstractMatrix, A::AbstractMatrix)
return 0.5 * trace_ABt(J, A)
return trace_ABt(J, A) / 2
end
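This derivative, and several gradients above and below, rely on trace_ABt(A, B) = tr(A·Bᵀ), which reduces to an elementwise sum so the matrix product never has to be formed. A quick check with a local definition assumed to match the package helper:

using LinearAlgebra

trace_ABt(A, B) = sum(A .* B)            # local stand-in, assumed equivalent to the package helper

J, A = randn(4, 4), randn(4, 4)
trace_ABt(J, A) ≈ tr(J * transpose(A))   # true
trace_ABt(J, A) / 2                      # what hyperparameter_KL_gradient(J, A) returns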

function hyperparameter_gradient_function(gp::LatentGP{T}, ::AbstractVector) where {T}
@@ -376,7 +376,7 @@ function hyperparameter_online_gradient(
)
ιₐ = (Jab - gp.κₐ * Jmm) / pr_cov(gp)
trace_term =
-0.5 * sum(
-sum(
trace_ABt.(
[gp.invDₐ],
[
@@ -386,7 +386,7 @@
-gp.κₐ * transpose(Jab),
],
),
)
) / 2
term_1 = dot(gp.prevη₁, ιₐ * mean(gp))
term_2 = -dot(ιₐ * mean(gp), gp.invDₐ * gp.κₐ * mean(gp))
return trace_term + term_1 + term_2
4 changes: 2 additions & 2 deletions src/inference/analytic.jl
@@ -37,10 +37,10 @@ function analytic_updates(m::GP{T}, state, y) where {T}
f = getf(m)
l = likelihood(m)
K = state.kernel_matrices.K
f.post.Σ = K + first(l.σ²) * I
f.post.Σ = K + only(l.σ²) * I
f.post.α .= cov(f) \ (y - pr_mean(f, input(m.data)))
if !isnothing(l.opt_noise)
g = 0.5 * (norm(mean(f), 2) - tr(inv(cov(f))))
g = (norm(mean(f), 2) - tr(inv(cov(f)))) / 2
state_σ², Δlogσ² = Optimisers.apply(
l.opt_noise, state.local_vars.state_σ², l.σ², g .* l.σ²
)
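For context on the update above: with Gaussian noise σ², the exact GP posterior uses Σ = K + σ²I and α = Σ⁻¹(y − m), and the predictive mean at test inputs is k_*ᵀα. A standalone sketch on synthetic data — KernelFunctions and the kernelmatrix calls are assumptions of this sketch, not code from this file:

using LinearAlgebra, KernelFunctions

x = rand(20)
y = sin.(2π .* x) .+ 0.1 .* randn(20)
k = SqExponentialKernel()
K = kernelmatrix(k, x)
σ² = 1e-2
α = (K + σ² * I) \ y                      # zero prior mean assumed
x_test = range(0, 1; length=5)
μ_pred = kernelmatrix(k, x_test, x) * α   # predictive mean k_*ᵀ α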
12 changes: 6 additions & 6 deletions src/inference/analyticVI.jl
@@ -119,7 +119,7 @@ function local_updates!(
μ::Tuple{<:AbstractVector{T}},
diagΣ::Tuple{<:AbstractVector{T}},
) where {T}
return local_updates!(local_vars, l, y, first(μ), first(diagΣ))
return local_updates!(local_vars, l, y, only(μ), only(diagΣ))
end

# Coordinate ascent updates on the natural parameters ##
@@ -135,7 +135,7 @@ function natural_gradient!(
) where {T}
K = kernel_matrices.K
gp.post.η₁ .= ∇E_μ .+ K \ pr_mean(gp, X)
gp.post.η₂ .= -Symmetric(Diagonal(∇E_Σ) + 0.5 * inv(K))
gp.post.η₂ .= -Symmetric(Diagonal(∇E_Σ) + inv(K) / 2)
return gp
end

@@ -168,15 +168,15 @@ function ∇η₁(
return transpose(κ) * (ρ * ∇μ) + (K \ μ₀) - η₁
end

# Gradient of on the second natural parameter η₂ = -0.5Σ⁻¹
# Gradient on the second natural parameter η₂ = -1/2Σ⁻¹
function ∇η₂(
θ::AbstractVector{T},
ρ::Real,
κ::AbstractMatrix{<:Real},
K::Cholesky{T,Matrix{T}},
η₂::Symmetric{T,Matrix{T}},
) where {T<:Real}
return -(ρκdiagθκ(ρ, κ, θ) + 0.5 * inv(K)) - η₂
return -(ρκdiagθκ(ρ, κ, θ) + inv(K) / 2) - η₂
end

# Natural gradient for the ONLINE model (OSVGP) #
@@ -198,7 +198,7 @@ function natural_gradient!(
invDₐ = previous_gp.invDₐ
gp.post.η₁ = K \ pr_mean(gp, Z) + transpose(κ) * ∇E_μ + transpose(κₐ) * prevη₁
gp.post.η₂ =
-Symmetric(ρκdiagθκ(1.0, κ, ∇E_Σ) + 0.5 * transpose(κₐ) * invDₐ * κₐ + 0.5 * inv(K))
-Symmetric(ρκdiagθκ(1.0, κ, ∇E_Σ) + transpose(κₐ) * invDₐ * κₐ / 2 + inv(K) / 2)
return gp
end

@@ -304,5 +304,5 @@ function expec_loglikelihood(
diagΣ::Tuple{<:AbstractVector{T}},
state,
) where {T}
return expec_loglikelihood(l, i, y, first(μ), first(diagΣ), state)
return expec_loglikelihood(l, i, y, only(μ), only(diagΣ), state)
end
2 changes: 1 addition & 1 deletion src/inference/gibbssampling.jl
@@ -44,7 +44,7 @@ end
function sample_local!(
local_vars, l::AbstractLikelihood, y, f::Tuple{<:AbstractVector{T}}
) where {T}
return sample_local!(local_vars, l, y, first(f))
return sample_local!(local_vars, l, y, only(f))
end

function sample_global!(
5 changes: 3 additions & 2 deletions src/inference/inference.jl
@@ -11,6 +11,7 @@ post_process!(::AbstractGPModel) = nothing

# Utils to iterate over inference objects
Base.length(::AbstractInference) = 1
Base.size(::AbstractInference) = (1,)

Base.iterate(i::AbstractInference) = (i, nothing)
Base.iterate(::AbstractInference, ::Any) = nothing
@@ -22,13 +23,13 @@ conv_crit(i::AbstractInference) = i.ϵ

## Conversion from natural to standard distribution parameters ##
function global_update!(gp::AbstractLatent)
gp.post.Σ .= -0.5 * inv(nat2(gp))
gp.post.Σ .= -inv(nat2(gp)) / 2
return gp.post.μ .= cov(gp) * nat1(gp)
end

## For the online case, the size may vary and in-place updates are not valid
function global_update!(gp::OnlineVarLatent)
gp.post.Σ = -0.5 * inv(nat2(gp))
gp.post.Σ = -inv(nat2(gp)) / 2
return gp.post.μ = cov(gp) * nat1(gp)
end
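Both methods invert the Gaussian natural parameterisation η₁ = Σ⁻¹μ, η₂ = −½Σ⁻¹, which gives Σ = −½η₂⁻¹ and μ = Ση₁. A small round-trip check:

using LinearAlgebra

μ = randn(3)
A = randn(3, 3)
Σ = Symmetric(A * A' + I)

η₁ = Σ \ μ              # Σ⁻¹μ
η₂ = -inv(Σ) / 2        # -Σ⁻¹/2

Σ_rec = -inv(η₂) / 2    # recovers Σ, as in global_update!
μ_rec = Σ_rec * η₁      # recovers μ

Σ_rec ≈ Σ && μ_rec ≈ μ  # true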

6 changes: 3 additions & 3 deletions src/inference/numericalVI.jl
@@ -128,7 +128,7 @@ function classical_gradient!(
opt_state,
) where {T<:Real}
K = kernel_matrices.K
opt_state.∇η₂ .= Diagonal(∇E_Σ) - 0.5 * (inv(K) - inv(cov(gp)))
opt_state.∇η₂ .= Diagonal(∇E_Σ) - (inv(K) - inv(cov(gp))) / 2
opt_state.∇η₁ .= ∇E_μ - K \ (mean(gp) - pr_mean(gp, X))
return opt_state
end
@@ -144,7 +144,7 @@ function classical_gradient!(
) where {T<:Real}
K = kernel_matrices.K
κ = kernel_matrices.κ
opt_state.∇η₂ .= ρκdiagθκ(ρ(i), κ, ∇E_Σ) - 0.5 * (inv(K) - inv(cov(gp)))
opt_state.∇η₂ .= ρκdiagθκ(ρ(i), κ, ∇E_Σ) - (inv(K) - inv(cov(gp))) / 2
opt_state.∇η₁ .= ρ(i) * transpose(κ) * ∇E_μ - K \ (mean(gp) - pr_mean(gp, Z))
return opt_state
end
@@ -187,7 +187,7 @@ function expec_loglikelihood(
μ::Tuple{<:AbstractVector{T}},
Σ::Tuple{<:AbstractVector{T}},
) where {T}
return expec_loglikelihood(l, i, y, first(μ), first(Σ))
return expec_loglikelihood(l, i, y, only(μ), only(Σ))
end

function ELBO(m::AbstractGPModel{T,L,<:NumericalVI}, state, y) where {T,L}
6 changes: 4 additions & 2 deletions src/inference/quadratureVI.jl
@@ -90,10 +90,12 @@ end
function expec_loglikelihood(
l::AbstractLikelihood, i::QuadratureVI, y, μ::AbstractVector, diagΣ::AbstractVector
)
return mapreduce(apply_quad, :+, y, μ, diagΣ, i, l)
return mapreduce(+, y, μ, diagΣ) do y, μ, σ²
apply_quad(i, y, μ, σ², l)
end
end

function apply_quad(y::Real, μ::Real, σ²::Real, i::QuadratureVI, l::AbstractLikelihood)
function apply_quad(i::QuadratureVI, y::Real, μ::Real, σ²::Real, l::AbstractLikelihood)
xs = i.nodes * sqrt(σ²) .+ μ
return dot(i.weights, loglikelihood.(Ref(l), y, xs))
# return mapreduce((w, x) -> w * Distributions.loglikelihood(l, y, x), +, i.weights, xs)# loglikelihood.(l, y, x))
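Each apply_quad call above approximates E_{f ∼ N(μ, σ²)}[log p(y | f)] by Gauss–Hermite quadrature. A standalone sketch of that approximation — FastGaussQuadrature, the Bernoulli-logit likelihood, and the explicit √2 and 1/√π factors are assumptions of this sketch; the package precomputes i.nodes and i.weights, which may already absorb those factors:

using FastGaussQuadrature, LinearAlgebra

loglik(y, f) = -log1p(exp(-y * f))           # Bernoulli-logit log-likelihood, y ∈ {-1, +1}

function expected_loglik(y, μ, σ²; n=20)
    t, w = gausshermite(n)                   # rule for ∫ e^{-x²} g(x) dx
    fs = μ .+ sqrt(2 * σ²) .* t              # change of variables to f ~ N(μ, σ²)
    return dot(w, loglik.(y, fs)) / sqrt(π)
end

expected_loglik(1, 0.5, 0.3)                 # ≈ E[log p(y = 1 | f)] with f ~ N(0.5, 0.3)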
2 changes: 1 addition & 1 deletion src/inference/sampling.jl
@@ -41,7 +41,7 @@ function ∇logprior(gp::AbstractLatent, f)
end

function logprior(gp::AbstractLatent, f)
return -0.5 * logdet(pr_cov(gp)) - 0.5 * invquad(pr_cov(gp), f) # Remove μ₀ temp
return -logdet(pr_cov(gp)) / 2 - invquad(pr_cov(gp), f) / 2 # Remove μ₀ temp
end

function store_variables!(i::SamplingInference{T}, fs) where {T}
