In [1]:
using Distributions
using Turing
using JSON

In [2]:
# Switch the working directory to the experiment folder, then run the data script.
# NOTE(review): `LDA.data.jl` presumably defines `ldadata`, which later cells read
# (keys "doc", "w", "M", "V" are used below) — confirm against that file.
# The trailing `;` keeps the notebook from echoing the value returned by `include`.
cd("/home/kai/projects/Turing-exps/amazon-talk")
include("LDA.data.jl");

In [3]:
# Turing model for LDA in which the per-observation assignment `z` is summed out
# rather than sampled (see the observation loop).
#
# Arguments, as they are used by the body:
#   K     - number of `phi` vectors drawn (in LDA terms presumably the topics)
#   V     - length of the probability vector handed to `Categorical`
#           (presumably the vocabulary size)
#   M     - number of `theta` vectors drawn (presumably one per document)
#   N     - number of observations in `w`
#   w     - observed values; `w[n]` is modelled as a draw from `Categorical`
#   doc   - `doc[n]` selects which `theta` entry belongs to observation `n`
#   beta  - parameter of the `Dirichlet` prior on every `phi[k]`
#   alpha - parameter of the `Dirichlet` prior on every `theta[m]`
#           (assumed to have length K so the dot product below is well-formed — TODO confirm)
@model ldamodel(K, V, M, N, w, doc, beta, alpha) = begin
  # One Dirichlet(alpha)-distributed vector per m in 1:M.
  # NOTE(review): the abstract `Real` element type is presumably there so the sampler
  # can store non-Float64 number types — confirm against the Turing version in use.
  theta = Vector{Vector{Real}}(M)
  for m = 1:M
    theta[m] ~ Dirichlet(alpha)
  end

  # One Dirichlet(beta)-distributed vector per k in 1:K.
  phi = Vector{Vector{Real}}(K)
  for k = 1:K
    phi[k] ~ Dirichlet(beta)
  end

  for n = 1:N
    # Marginalize z: for every i in 1:V, collect the i-th component of each phi[k]
    # and take its dot product with theta[doc[n]], i.e. sum_k phi[k][i] * theta[doc[n]][k].
    # The resulting length-V vector is the Categorical distribution of w[n].
    phi_dot_theta = [dot(map(p -> p[i], phi), theta[doc[n]]) for i = 1:V]
    w[n] ~ Categorical(phi_dot_theta)
  end

end

ldamodel (generic function with 9 methods)

In [4]:
# Run HMC on the model instantiated with `ldadata`.
# Sampler settings: 250 iterations and step size 0.1 (both echoed in the run output);
# the final argument 3 is presumably the number of leapfrog steps — TODO confirm.
samples = let model = ldamodel(data=ldadata), sampler = HMC(250, 0.1, 3)
    sample(model, sampler)
end

[Turing]:  Assume - `theta` is a parameter
  in @~(::Any, ::Any) at compiler.jl:49
[Turing]:  Assume - `phi` is a parameter
  in @~(::Any, ::Any) at compiler.jl:49
[Turing]:  Observe - `w` is an observation
  in @~(::Any, ::Any) at compiler.jl:28
[HMC] Done with accept rate = 0.0.


[HMC] Sampling...  0%  ETA: 0:10:45
  ϵ:  0.1
  α:  1.0
[HMC] Sampling...  3%  ETA: 0:01:54
  ϵ:  0.1
  α:  1.0
[HMC] Sampling...  6%  ETA: 0:01:21
  ϵ:  0.1
  α:  1.0
[HMC] Sampling...  8%  ETA: 0:01:05
  ϵ:  0.1
  α:  0.8910633600461504
[HMC] Sampling... 11%  ETA: 0:00:56
  ϵ:  0.1
  α:  1.0
[HMC] Sampling... 14%  ETA: 0:00:51
  ϵ:  0.1
  α:  0.9801858140614681
[HMC] Sampling... 16%  ETA: 0:00:47
  ϵ:  0.1
  α:  0.7713359831105696
[HMC] Sampling... 19%  ETA: 0:00:45
  ϵ:  0.1
  α:  0.9493506890446585
[HMC] Sampling... 22%  ETA: 0:00:42
  ϵ:  0.1
  α:  1.0
[HMC] Sampling... 24%

Object of type "Turing.Chain"

Iterations = 1:250
Thinning interval = 1
Chains = 1
Samples per chain = 250

[0.118375 0.881625 … 0.671565 0.328435; 0.116266 0.883734 … 0.582399 0.417601; … ; 0.524472 0.475528 … 0.0459957 0.954004; 0.603701 0.396299 … 0.0445284 0.955472]

In [5]:
# Convert phi from array-of-array to matrix.
# `phiarr` is the mean over the sampled `phi` values; `phiarr[k]` is one vector.
# Row k of `phi` is `phiarr[k]`. The comprehension takes its dimensions from `phiarr`
# itself, so it works for any number of vectors instead of exactly two.
# Requires `phiarr` to be non-empty and all its vectors to share the same length.
phiarr = mean(samples[:phi])
phi = [phiarr[k][v] for k in eachindex(phiarr), v in eachindex(phiarr[1])]

2×5 Array{Float64,2}:
 0.132134  0.0904896  3.69149e-6  0.421433   0.355939   
 0.208727  0.206974   0.513495    0.0706376  0.000166274

In [6]:
# Convert theta from array-of-array to matrix.
# `thetaarr` is the mean over the sampled `theta` values; `thetaarr[m]` is one vector.
# Row m of `theta` is `thetaarr[m]`. Building the matrix in a single comprehension
# avoids re-copying the accumulated matrix for every row and removes the hard-coded
# column count of 2, so any vector length is handled.
# Requires `thetaarr` to be non-empty and all its vectors to share the same length.
thetaarr = mean(samples[:theta])
theta = [thetaarr[m][k] for m in eachindex(thetaarr), k in eachindex(thetaarr[1])]

25×2 Array{Float64,2}:
 0.37563   0.62437 
 0.688316  0.311684
 0.540454  0.459546
 0.592479  0.407521
 0.347451  0.652549
 0.792279  0.207721
 0.553344  0.446656
 0.214449  0.785551
 0.793985  0.206015
 0.337101  0.662899
 0.781692  0.218308
 0.573008  0.426992
 0.358833  0.641167
 0.798589  0.201411
 0.144629  0.855371
 0.822587  0.177413
 0.354986  0.645014
 0.339862  0.660138
 0.459212  0.540788
 0.411284  0.588716
 0.312996  0.687004
 0.37578   0.62422 
 0.624204  0.375796
 0.463228  0.536772
 0.462539  0.537461

In [7]:
# Build a vector storing lengths of docs for vis.
# `doclist[n]` is the document id of the n-th entry, so counting how often each id
# occurs gives the length of each document.
doclist = ldadata["doc"]
docldict = Dict{Int,Int}()
for d in doclist
    docldict[d] = get(docldict, d, 0) + 1
end
# Ids in 1:M that never occur in `doclist` get length 0 instead of raising a KeyError.
docls = map(i -> get(docldict, i, 0), 1:ldadata["M"])

25-element Array{Int64,1}:
 10
 11
 17
 14
 11
  7
  5
 16
  7
 16
  6
  4
  9
 11
 11
 10
  9
 10
  9
  9
 10
  8
 10
 21
 11

In [8]:
# Build a vector of word frequencies for vis.
# `wordlist[n]` is the word id of the n-th entry; count the occurrences of each id.
wordlist = ldadata["w"]
freqdict = Dict{Int,Int}()
for t in wordlist
    freqdict[t] = get(freqdict, t, 0) + 1
end
# Ids in 1:V that never occur in `wordlist` get frequency 0 instead of raising a KeyError.
freq = map(i -> get(freqdict, i, 0), 1:ldadata["V"])

5-element Array{Int64,1}:
 51
 43
 71
 55
 42

In [9]:
# Save result for vis.
# Collect the transposed matrices, the document lengths, the vocabulary ids 1:V and
# the term frequencies under the keys the visualisation reads.
# NOTE(review): the transposes are presumably there to get the nesting the consumer
# expects once JSON serialises the matrices — confirm against the reader of this file.
ldaresult = Dict(
    "topic_term_dists" => phi',
    "doc_topic_dist" => theta',
    "doc_lengths" => docls,
    "vocab" => 1:ldadata["V"],
    "term_frequency" => freq
)

# Function-first form of `open`: the stream is closed once the write has finished.
open(io -> JSON.print(io, ldaresult),
     "/home/kai/projects/Turing-exps/amazon-talk/LDA.result.json", "w")