
Commit

Merge 21563b1 into eb5eb7f
theogf committed Nov 22, 2021
2 parents eb5eb7f + 21563b1 commit 0df4f94
Showing 41 changed files with 188 additions and 190 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/ci.yml
@@ -25,8 +25,6 @@ jobs:
- 'nightly'
os:
- ubuntu-latest
- macOS-latest
- windows-latest
arch:
- x64
steps:
20 changes: 13 additions & 7 deletions docs/examples/heteroscedastic.jl
@@ -5,7 +5,7 @@ using AugmentedGaussianProcesses
using Distributions
using LinearAlgebra
using Plots
default(lw=3.0, msw=0.0)
default(; lw=3.0, msw=0.0)
# using CairoMakie

# ## Model generated data
@@ -30,8 +30,8 @@ g = rand(MvNormal(μ₀ * ones(N), K))
y = f + σ .* randn(N); # We finally sample the output
# We can visualize the data:
n_sig = 2 # Number of std. dev. around the mean
plot(x, f, ribbon = n_sig * σ, lab= "p(y|f,g)") # Mean and std. dev. of y
scatter!(x, y, alpha=0.5, msw=0.0, lab="y") # Observation samples
plot(x, f; ribbon=n_sig * σ, lab="p(y|f,g)") # Mean and std. dev. of y
scatter!(x, y; alpha=0.5, msw=0.0, lab="y") # Observation samples

# ## Model creation and training
# We will now use the augmented model to infer both `f` and `g`
@@ -55,12 +55,18 @@ train!(model, 100);
y_m, y_σ = proba_y(model, x_test);
# Let's first look at the difference between the latent `f` and `g`
plot(x, [f, g]; label=["f" "g"])
plot!(x_test, [f_m, g_m]; ribbon=[n_sig * f_σ n_sig * g_σ], lab=["f_pred" "g_pred"], legend=true)
plot!(
x_test,
[f_m, g_m];
ribbon=[n_sig * f_σ n_sig * g_σ],
lab=["f_pred" "g_pred"],
legend=true,
)
# But it's more interesting to compare the predictive probability of `y` directly:
plot(x, f; ribbon = n_sig * σ, lab="p(y|f,g)")
plot!(x_test, y_m, ribbon = n_sig * sqrt.(y_σ), lab="p(y|f,g) pred")
plot(x, f; ribbon=n_sig * σ, lab="p(y|f,g)")
plot!(x_test, y_m; ribbon=n_sig * sqrt.(y_σ), lab="p(y|f,g) pred")
scatter!(x, y; lab="y", alpha=0.2)
# Or to explore the heteroscedasticity itself, we can look at the residuals
scatter(x, (f - y).^2; yaxis=:log, lab="residuals",alpha=0.2)
scatter(x, (f - y) .^ 2; yaxis=:log, lab="residuals", alpha=0.2)
plot!(x, σ .^ 2; lab="true σ²(x)")
plot!(x_test, y_σ; lab="predicted σ²(x)")
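The model construction step itself is in a collapsed part of this file. As a rough, hypothetical sketch of how such a heteroscedastic model is typically assembled with this package — the kernel choice and the constructor argument order below are assumptions, not the file's actual code — reusing x, y and x_test from the script above:

using AugmentedGaussianProcesses  # kernels assumed re-exported; otherwise add `using KernelFunctions`

kernel = SqExponentialKernel()                       # assumed kernel choice
likelihood = HeteroscedasticLikelihood()             # noise variance driven by the second latent g
model = VGP(x, y, kernel, likelihood, AnalyticVI())  # assumed argument order
train!(model, 100)
y_m, y_σ = proba_y(model, x_test)                    # predictive mean and variance of y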
10 changes: 5 additions & 5 deletions src/ComplementaryDistributions/lap_transf_dist.jl
@@ -25,12 +25,12 @@ end
function _check_f(f)
return true # TODO Add tests for complete monotonicity / PDR
end
_gradf(d::LaplaceTransformDistribution, x::Real) = first(ForwardDiff.gradient(d.f, [x]))
_gradf(d::LaplaceTransformDistribution, x::Real) = only(ForwardDiff.gradient(d.f, [x]))
function _gradlogf(d::LaplaceTransformDistribution, x::Real)
return first(ForwardDiff.gradient(log d.f, [x]))
return only(ForwardDiff.gradient(log d.f, [x]))
end
function _hessianlogf(d::LaplaceTransformDistribution, x::Real)
return first(ForwardDiff.hessian(log d.f, [x]))
return only(ForwardDiff.hessian(log d.f, [x]))
end

Distributions.pdf(dist::LaplaceTransformDistribution, x::Real) = apply_f(dist, x)
@@ -42,7 +42,7 @@ function Distributions.var(dist::LaplaceTransformDistribution)
end

function Random.rand(dist::LaplaceTransformDistribution)
return first(rand(dist, 1))
return only(rand(dist, 1))
end
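The first → only swaps in this file (and throughout the commit) are not purely cosmetic: only requires the collection to hold exactly one element and throws otherwise, so a shape mistake fails loudly instead of being silently truncated. A quick illustration:

first([1.0, 2.0])  # 1.0 — silently ignores the extra element
only([1.0])        # 1.0
only([1.0, 2.0])   # throws ArgumentError: collection has multiple elements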

# Sampling from Ridout (09)
@@ -173,7 +173,7 @@ function laptrans(
k += 1
t = t - (apply_F(dist, t) - u[i]) / apply_f(dist, t)
if t < x_L || t > x_U
t = 0.5 * (x_L + x_U)
t = (x_L + x_U) / 2
end
if apply_F(dist, t) <= u[i]
x_L = t
16 changes: 8 additions & 8 deletions src/ComplementaryDistributions/polyagamma.jl
@@ -34,7 +34,7 @@ struct PolyaGamma{Tc,A} <: Distributions.ContinuousUnivariateDistribution
end
end

Statistics.mean(d::PolyaGamma) = d.b / (2 * d.c) * tanh(0.5 * d.c)
Statistics.mean(d::PolyaGamma) = d.b / (2 * d.c) * tanh(d.c / 2)

function PolyaGamma(b::Int, c::T; nmax::Int=10, trunc::Int=200) where {T<:Real}
return PolyaGamma{T}(b, c, trunc, nmax)
@@ -59,17 +59,17 @@ end
function a(n::Int, x::Real)
k = (n + 0.5) * π
if x > __TRUNC
return k * exp(-0.5 * k^2 * x)
return k * exp(-k^2 * x / 2)
elseif x > 0
expnt = -1.5 * (log(0.5 * π) + log(x)) + log(k) - 2.0 * (n + 0.5)^2 / x
expnt = -1.5 * (log(π / 2) + log(x)) + log(k) - 2 * (n + 0.5)^2 / x
return exp(expnt)
end
end

function mass_texpon(z::Real)
t = __TRUNC

fz = 0.125 * π^2 + 0.5 * z^2
fz = 0.125 * π^2 + z^2 / 2
b = sqrt(1.0 / t) * (t * z - 1)
a = sqrt(1.0 / t) * (t * z + 1) * -1.0

@@ -99,13 +99,13 @@ function rtigauss(rng::AbstractRNG, z::Real)
end
x = 1 + e1 * t
x = t / x^2
alpha = exp(-0.5 * z^2 * x)
alpha = exp(-z^2 * x / 2)
end
else
mu = 1.0 / z
while (x > t)
y = randn(rng)^2
half_mu = 0.5 * mu
half_mu = mu / 2
mu_Y = mu * y
x = mu + half_mu * mu_Y - half_mu * sqrt(4 * mu_Y + mu_Y^2)
if rand(rng) > mu / (mu + x)
@@ -122,10 +122,10 @@ end

function draw_like_devroye(rng::AbstractRNG, c::Real)
# Change the parameter.
c = 0.5 * abs(c)
c = abs(c) / 2

# Now sample 0.25 * J^*(1, Z := Z/2).
fz = 0.125 * π^2 + 0.5 * c^2
fz = 0.125 * π^2 + c^2 / 2
# ... Problems with large Z? Try using q_over_p.
# double p = 0.5 * __PI * exp(-1.0 * fz * __TRUNC) / fz;
# double q = 2 * exp(-1.0 * Z) * pigauss(__TRUNC, Z);
18 changes: 9 additions & 9 deletions src/functions/KLdivergences.jl
@@ -14,7 +14,7 @@ function GaussianKL(
Σ::Symmetric{T,Matrix{T}},
K::Cholesky{T,Matrix{T}},
) where {T<:Real}
return 0.5 * (logdet(K) - logdet(Σ) + tr(K \ Σ) + invquad(K, μ - μ₀) - length(μ))
return (logdet(K) - logdet(Σ) + tr(K \ Σ) + invquad(K, μ - μ₀) - length(μ)) / 2
end

function GaussianKL(
@@ -24,7 +24,7 @@ function GaussianKL(
K::AbstractMatrix{T},
) where {T<:Real}
K
return 0.5 * (logdet(K) - logdet(Σ) + tr(K \ Σ) + dot(μ - μ₀, K \ (μ - μ₀)) - length(μ))
return (logdet(K) - logdet(Σ) + tr(K \ Σ) + dot(μ - μ₀, K \ (μ - μ₀)) - length(μ)) / 2
end
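For reference, both methods implement the closed form KL(N(μ, Σ) ‖ N(μ₀, K)) = ½[logdet K − logdet Σ + tr(K⁻¹Σ) + (μ − μ₀)ᵀK⁻¹(μ − μ₀) − d]. A standalone sanity check against a Monte Carlo estimate — it uses only Distributions and LinearAlgebra, not the package's invquad helper:

using Distributions, LinearAlgebra, Random

Random.seed!(1)
d = 3
μ, μ₀ = randn(d), randn(d)
A, B = randn(d, d), randn(d, d)
Σ = Symmetric(A * A' + I)   # "posterior" covariance
K = Symmetric(B * B' + I)   # "prior" covariance

closed_form = (logdet(K) - logdet(Σ) + tr(K \ Σ) + dot(μ - μ₀, K \ (μ - μ₀)) - d) / 2

q, p = MvNormal(μ, Σ), MvNormal(μ₀, K)
xs = rand(q, 100_000)
mc_estimate = mean(logpdf(q, xs) - logpdf(p, xs))   # E_q[log q - log p]

isapprox(closed_form, mc_estimate; rtol=0.05)       # true up to Monte Carlo noise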

extraKL(::AbstractGPModel{T}, ::Any) where {T} = zero(T)
@@ -42,13 +42,13 @@ function extraKL(model::OnlineSVGP{T}, state) where {T}
κₐμ = kernel_mat.κₐ * mean(gp)
KLₐ = prev_gp.prev𝓛ₐ
KLₐ +=
-0.5 * sum(
-sum(
trace_ABt.(
Ref(prev_gp.invDₐ),
[kernel_mat.K̃ₐ, kernel_mat.κₐ * cov(gp) * transpose(kernel_mat.κₐ)],
),
)
KLₐ += dot(prev_gp.prevη₁, κₐμ) - 0.5 * dot(κₐμ, prev_gp.invDₐ * κₐμ)
) / 2
KLₐ += dot(prev_gp.prevη₁, κₐμ) - dot(κₐμ, prev_gp.invDₐ * κₐμ) / 2
return KLₐ
end
end
@@ -94,7 +94,7 @@ end
KL(q(ω)||p(ω)), where q(ω) = PG(b,c) and p(ω) = PG(b,0). θ = 𝑬[ω]
"""
function PolyaGammaKL(b, c, θ)
return dot(b, logcosh.(0.5 * c)) - 0.5 * dot(abs2.(c), θ)
return dot(b, logcosh.(c / 2)) - dot(abs2.(c), θ) / 2
end
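In scalar form this is KL(PG(b, c) ‖ PG(b, 0)) = b·logcosh(c/2) − c²θ/2 with θ = E[ω]. A small illustration, taking θ from the Pólya-Gamma mean b/(2c)·tanh(c/2) defined in polyagamma.jl and a local logcosh assumed to match the package helper:

logcosh(x) = abs(x) + log1p(exp(-2abs(x))) - log(2)        # numerically stable log(cosh(x))
pg_mean(b, c) = iszero(c) ? b / 4 : b / (2c) * tanh(c / 2)
pg_kl(b, c) = b * logcosh(c / 2) - abs2(c) * pg_mean(b, c) / 2

pg_kl(1.0, 0.0)  # 0.0 — with c = 0, q(ω) coincides with the prior PG(1, 0)
pg_kl(1.0, 2.0)  # ≈ 0.053 — tilting with c ≠ 0 moves q away from the prior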

"""
@@ -104,10 +104,10 @@ Entropy of GIG variables with parameters a,b and p and omitting the derivative d
"""
function GIGEntropy(a, b, p)
sqrt_ab = sqrt.(a .* b)
return 0.5 * (sum(log, a) - sum(log, b)) +
return (sum(log, a) - sum(log, b)) / 2 +
mapreduce((p, s) -> log(2 * besselk(p, s)), +, p, sqrt_ab) +
sum(
0.5 * sqrt_ab ./ besselk.(p, sqrt_ab) .*
sqrt_ab ./ besselk.(p, sqrt_ab) .*
(besselk.(p + 1, sqrt_ab) + besselk.(p - 1, sqrt_ab)),
)
) / 2
end
6 changes: 3 additions & 3 deletions src/hyperparameter/autotuning.jl
@@ -226,7 +226,7 @@ end

## Return the derivative of the KL divergence between the posterior and the GP prior ##
function hyperparameter_KL_gradient(J::AbstractMatrix, A::AbstractMatrix)
return 0.5 * trace_ABt(J, A)
return trace_ABt(J, A) / 2
end
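This derivative, and several gradients above and below, rely on trace_ABt(A, B) = tr(A·Bᵀ), which reduces to an elementwise sum so the matrix product never has to be formed. A quick check with a local definition assumed to match the package helper:

using LinearAlgebra

trace_ABt(A, B) = sum(A .* B)            # local stand-in, assumed equivalent to the package helper

J, A = randn(4, 4), randn(4, 4)
trace_ABt(J, A) ≈ tr(J * transpose(A))   # true
trace_ABt(J, A) / 2                      # what hyperparameter_KL_gradient(J, A) returns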

function hyperparameter_gradient_function(gp::LatentGP{T}, ::AbstractVector) where {T}
@@ -376,7 +376,7 @@ function hyperparameter_online_gradient(
)
ιₐ = (Jab - gp.κₐ * Jmm) / pr_cov(gp)
trace_term =
-0.5 * sum(
-sum(
trace_ABt.(
[gp.invDₐ],
[
@@ -386,7 +386,7 @@
-gp.κₐ * transpose(Jab),
],
),
)
) / 2
term_1 = dot(gp.prevη₁, ιₐ * mean(gp))
term_2 = -dot(ιₐ * mean(gp), gp.invDₐ * gp.κₐ * mean(gp))
return trace_term + term_1 + term_2
4 changes: 2 additions & 2 deletions src/inference/analytic.jl
@@ -37,10 +37,10 @@ function analytic_updates(m::GP{T}, state, y) where {T}
f = getf(m)
l = likelihood(m)
K = state.kernel_matrices.K
f.post.Σ = K + first(l.σ²) * I
f.post.Σ = K + only(l.σ²) * I
f.post.α .= cov(f) \ (y - pr_mean(f, input(m.data)))
if !isnothing(l.opt_noise)
g = 0.5 * (norm(mean(f), 2) - tr(inv(cov(f))))
g = (norm(mean(f), 2) - tr(inv(cov(f)))) / 2
state_σ², Δlogσ² = Optimisers.apply(
l.opt_noise, state.local_vars.state_σ², l.σ², g .* l.σ²
)
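For context on the update above: with Gaussian noise σ², the exact GP posterior uses Σ = K + σ²I and α = Σ⁻¹(y − m), and the predictive mean at test inputs is k_*ᵀα. A standalone sketch on synthetic data — KernelFunctions and the kernelmatrix calls are assumptions of this sketch, not code from this file:

using LinearAlgebra, KernelFunctions

x = rand(20)
y = sin.(2π .* x) .+ 0.1 .* randn(20)
k = SqExponentialKernel()
K = kernelmatrix(k, x)
σ² = 1e-2
α = (K + σ² * I) \ y                      # zero prior mean assumed
x_test = range(0, 1; length=5)
μ_pred = kernelmatrix(k, x_test, x) * α   # predictive mean k_*ᵀ α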
12 changes: 6 additions & 6 deletions src/inference/analyticVI.jl
@@ -119,7 +119,7 @@ function local_updates!(
μ::Tuple{<:AbstractVector{T}},
diagΣ::Tuple{<:AbstractVector{T}},
) where {T}
return local_updates!(local_vars, l, y, first(μ), first(diagΣ))
return local_updates!(local_vars, l, y, only(μ), only(diagΣ))
end

# Coordinate ascent updates on the natural parameters ##
@@ -135,7 +135,7 @@ function natural_gradient!(
) where {T}
K = kernel_matrices.K
gp.post.η₁ .= ∇E_μ .+ K \ pr_mean(gp, X)
gp.post.η₂ .= -Symmetric(Diagonal(∇E_Σ) + 0.5 * inv(K))
gp.post.η₂ .= -Symmetric(Diagonal(∇E_Σ) + inv(K) / 2)
return gp
end

@@ -168,15 +168,15 @@ function ∇η₁(
return transpose(κ) * (ρ * ∇μ) + (K \ μ₀) - η₁
end

# Gradient of on the second natural parameter η₂ = -0.5Σ⁻¹
# Gradient on the second natural parameter η₂ = -1/2Σ⁻¹
function ∇η₂(
θ::AbstractVector{T},
ρ::Real,
κ::AbstractMatrix{<:Real},
K::Cholesky{T,Matrix{T}},
η₂::Symmetric{T,Matrix{T}},
) where {T<:Real}
return -(ρκdiagθκ(ρ, κ, θ) + 0.5 * inv(K)) - η₂
return -(ρκdiagθκ(ρ, κ, θ) + inv(K) / 2) - η₂
end

# Natural gradient for the ONLINE model (OSVGP) #
@@ -198,7 +198,7 @@ function natural_gradient!(
invDₐ = previous_gp.invDₐ
gp.post.η₁ = K \ pr_mean(gp, Z) + transpose(κ) * ∇E_μ + transpose(κₐ) * prevη₁
gp.post.η₂ =
-Symmetric(ρκdiagθκ(1.0, κ, ∇E_Σ) + 0.5 * transpose(κₐ) * invDₐ * κₐ + 0.5 * inv(K))
-Symmetric(ρκdiagθκ(1.0, κ, ∇E_Σ) + transpose(κₐ) * invDₐ * κₐ / 2 + inv(K) / 2)
return gp
end

@@ -304,5 +304,5 @@ function expec_loglikelihood(
diagΣ::Tuple{<:AbstractVector{T}},
state,
) where {T}
return expec_loglikelihood(l, i, y, first(μ), first(diagΣ), state)
return expec_loglikelihood(l, i, y, only(μ), only(diagΣ), state)
end
2 changes: 1 addition & 1 deletion src/inference/gibbssampling.jl
@@ -44,7 +44,7 @@ end
function sample_local!(
local_vars, l::AbstractLikelihood, y, f::Tuple{<:AbstractVector{T}}
) where {T}
return sample_local!(local_vars, l, y, first(f))
return sample_local!(local_vars, l, y, only(f))
end

function sample_global!(
5 changes: 3 additions & 2 deletions src/inference/inference.jl
@@ -11,6 +11,7 @@ post_process!(::AbstractGPModel) = nothing

# Utils to iterate over inference objects
Base.length(::AbstractInference) = 1
Base.size(::AbstractInference) = (1,)

Base.iterate(i::AbstractInference) = (i, nothing)
Base.iterate(::AbstractInference, ::Any) = nothing
@@ -22,13 +23,13 @@ conv_crit(i::AbstractInference) = i.ϵ

## Conversion from natural to standard distribution parameters ##
function global_update!(gp::AbstractLatent)
gp.post.Σ .= -0.5 * inv(nat2(gp))
gp.post.Σ .= -inv(nat2(gp)) / 2
return gp.post.μ .= cov(gp) * nat1(gp)
end

## For the online case, the size may vary and in-place updates are not valid
function global_update!(gp::OnlineVarLatent)
gp.post.Σ = -0.5 * inv(nat2(gp))
gp.post.Σ = -inv(nat2(gp)) / 2
return gp.post.μ = cov(gp) * nat1(gp)
end
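Both methods invert the Gaussian natural parameterisation η₁ = Σ⁻¹μ, η₂ = −½Σ⁻¹, which gives Σ = −½η₂⁻¹ and μ = Ση₁. A small round-trip check:

using LinearAlgebra

μ = randn(3)
A = randn(3, 3)
Σ = Symmetric(A * A' + I)

η₁ = Σ \ μ              # Σ⁻¹μ
η₂ = -inv(Σ) / 2        # -Σ⁻¹/2

Σ_rec = -inv(η₂) / 2    # recovers Σ, as in global_update!
μ_rec = Σ_rec * η₁      # recovers μ

Σ_rec ≈ Σ && μ_rec ≈ μ  # true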

6 changes: 3 additions & 3 deletions src/inference/numericalVI.jl
@@ -128,7 +128,7 @@ function classical_gradient!(
opt_state,
) where {T<:Real}
K = kernel_matrices.K
opt_state.∇η₂ .= Diagonal(∇E_Σ) - 0.5 * (inv(K) - inv(cov(gp)))
opt_state.∇η₂ .= Diagonal(∇E_Σ) - (inv(K) - inv(cov(gp))) / 2
opt_state.∇η₁ .= ∇E_μ - K \ (mean(gp) - pr_mean(gp, X))
return opt_state
end
@@ -144,7 +144,7 @@ function classical_gradient!(
) where {T<:Real}
K = kernel_matrices.K
κ = kernel_matrices.κ
opt_state.∇η₂ .= ρκdiagθκ(ρ(i), κ, ∇E_Σ) - 0.5 * (inv(K) - inv(cov(gp)))
opt_state.∇η₂ .= ρκdiagθκ(ρ(i), κ, ∇E_Σ) - (inv(K) - inv(cov(gp))) / 2
opt_state.∇η₁ .= ρ(i) * transpose(κ) * ∇E_μ - K \ (mean(gp) - pr_mean(gp, Z))
return opt_state
end
@@ -187,7 +187,7 @@ function expec_loglikelihood(
μ::Tuple{<:AbstractVector{T}},
Σ::Tuple{<:AbstractVector{T}},
) where {T}
return expec_loglikelihood(l, i, y, first(μ), first(Σ))
return expec_loglikelihood(l, i, y, only(μ), only(Σ))
end

function ELBO(m::AbstractGPModel{T,L,<:NumericalVI}, state, y) where {T,L}
6 changes: 4 additions & 2 deletions src/inference/quadratureVI.jl
@@ -90,10 +90,12 @@ end
function expec_loglikelihood(
l::AbstractLikelihood, i::QuadratureVI, y, μ::AbstractVector, diagΣ::AbstractVector
)
return mapreduce(apply_quad, :+, y, μ, diagΣ, i, l)
return mapreduce(+, y, μ, diagΣ) do y, μ, σ²
apply_quad(i, y, μ, σ², l)
end
end

function apply_quad(y::Real, μ::Real, σ²::Real, i::QuadratureVI, l::AbstractLikelihood)
function apply_quad(i::QuadratureVI, y::Real, μ::Real, σ²::Real, l::AbstractLikelihood)
xs = i.nodes * sqrt(σ²) .+ μ
return dot(i.weights, loglikelihood.(Ref(l), y, xs))
# return mapreduce((w, x) -> w * Distributions.loglikelihood(l, y, x), +, i.weights, xs)# loglikelihood.(l, y, x))
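Each apply_quad call above approximates E_{f ∼ N(μ, σ²)}[log p(y | f)] by Gauss–Hermite quadrature. A standalone sketch of that approximation — FastGaussQuadrature, the Bernoulli-logit likelihood, and the explicit √2 and 1/√π factors are assumptions of this sketch; the package precomputes i.nodes and i.weights, which may already absorb those factors:

using FastGaussQuadrature, LinearAlgebra

loglik(y, f) = -log1p(exp(-y * f))           # Bernoulli-logit log-likelihood, y ∈ {-1, +1}

function expected_loglik(y, μ, σ²; n=20)
    t, w = gausshermite(n)                   # rule for ∫ e^{-x²} g(x) dx
    fs = μ .+ sqrt(2 * σ²) .* t              # change of variables to f ~ N(μ, σ²)
    return dot(w, loglik.(y, fs)) / sqrt(π)
end

expected_loglik(1, 0.5, 0.3)                 # ≈ E[log p(y = 1 | f)] with f ~ N(0.5, 0.3)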
2 changes: 1 addition & 1 deletion src/inference/sampling.jl
@@ -41,7 +41,7 @@ function ∇logprior(gp::AbstractLatent, f)
end

function logprior(gp::AbstractLatent, f)
return -0.5 * logdet(pr_cov(gp)) - 0.5 * invquad(pr_cov(gp), f) # Remove μ₀ temp
return -logdet(pr_cov(gp)) / 2 - invquad(pr_cov(gp), f) / 2 # Remove μ₀ temp
end

function store_variables!(i::SamplingInference{T}, fs) where {T}
