## Plotting library

This is not going to be needed much so it can be skipped.

In [None]:
# Plotting front-end; the `gr()` call below selects the GR backend of Plots.
using Plots
gr()

## PDMP libraries + JLD and co

Takes 10 seconds to load in Julia 0.6; this is due to ApproxFun.

In [None]:
# Load PDMP and report how long the load took (wall-clock seconds).
print("loading PDMP... ")
ta = time()
using PDMP
println("[done in $(round(time()-ta,1))s]")

# Same again for the remaining packages; `ta` is reset so each timing is independent.
print("loading other packages... ")
ta = time()
using JLD
println("[done in $(round(time()-ta,1))s]")

# Conditional printing helpers: `s` is written to stdout only when the flag `b`
# is true. Both always return `nothing`.
function cprint(s, b)
    if b
        print(s)
    end
    return nothing
end

function cprintln(s, b)
    if b
        println(s)
    end
    return nothing
end
;

## Loading & prepping of data

Note: the scaling (or absence thereof) of the data changes the scale of the RMSE. One has to be careful about that before comparing with "benchmark data".

In [None]:
verb = true   # set to false to silence the progress messages of this cell

cprint("reading and preparing data... ", verb)
ta = time()

# This is the Movielens 1M dataset, read as three flat vectors that are indexed
# together downstream: entry k is the rating raw_rates[k] at (rows[k], cols[k]).
rows      = vec(readdlm("data/rows.csv",  Int))
cols      = vec(readdlm("data/cols.csv",  Int))
raw_rates = vec(readdlm("data/rates.csv", Float64))

# Spread of the raw ratings. Deliberately not named `range`: a global with that
# name shadows `Base.range` for every script later `include`d into this session.
rate_span = maximum(raw_rates) - minimum(raw_rates)

# centre and scale the rates (zero mean, spread of one)
nrm_rates = (raw_rates .- mean(raw_rates)) ./ rate_span

# scaling as per Salakhutdinov & Mnih: maps the ratings onto [0, 1]
pmf_rates = (raw_rates .- minimum(raw_rates)) ./ rate_span

cprintln("[done in $(round(time()-ta,1))s]", verb)
;

### Picking one to go with

In [None]:
# Use the [0, 1]-scaled ratings from here on; assign `nrm_rates` instead to work
# with the centred variant.
rates = pmf_rates;

## Splitting train & test

95% of the ratings are used for training, the remaining 5% for testing.

In [None]:
# Random train/test split over the rating indices. The permutation is not
# seeded, so the split differs from one run to the next.
nfull      = length(rates)
ntrain     = round(Int, 0.95*nfull)   # 95% of the ratings go to training
ntest      = nfull - ntrain           # size of the held-out set (used by the RMSE cells)
mask       = randperm(nfull)
train_mask = mask[1:ntrain]
test_mask  = mask[(ntrain+1):end]
;

## Computation of base sigmas

* $\sigma_R = 0.5$ (orig paper)
* $\sigma_U, \sigma_V$ set as per https://pymc-devs.github.io/pymc3/notebooks/pmf-pymc.html

In [None]:
# Per-row and per-column (presumably per-user and per-movie) rating variances on
# the training set, averaged into the prior scales base_sigma_u / base_sigma_v.

"""
    _rating_moments(idx, vals, n)

Accumulate, for each group in `1:n`, the count, the sum and the sum of squares
of the entries of `vals`, where `idx[k]` is the group that `vals[k]` belongs to.
Return the three length-`n` vectors `(count, sum, sumsq)`.
"""
function _rating_moments(idx, vals, n)
    cnt, s1, s2 = zeros(n), zeros(n), zeros(n)
    for (g, r) in zip(idx, vals)
        cnt[g] += 1
        s1[g]  += r
        s2[g]  += r^2
    end
    return cnt, s1, s2
end

rs = rates[train_mask]

nU = maximum(rows)
nV = maximum(cols)

# `rs[k]` is the rating stored at position `train_mask[k]`, so the row/column
# indices must be taken through the same mask to stay aligned with it.
cU, sU, s2U = _rating_moments(rows[train_mask], rs, nU)
cV, sV, s2V = _rating_moments(cols[train_mask], rs, nV)

# population variance E[r^2] - E[r]^2; groups without any training rating
# evaluate to 0/0 = NaN and are filtered out below
vU = (s2U ./ cU) - (sU ./ cU).^2
vV = (s2V ./ cV) - (sV ./ cV).^2

# clamp round-off (including slightly negative values) so that sqrt is safe
vU[vU.<1e-10]=0.0
vV[vV.<1e-10]=0.0


base_sigma_r = 0.5 # Salakhutdinov & Mnih
# https://pymc-devs.github.io/pymc3/notebooks/pmf-pymc.html
# mean standard deviation over the groups that have at least one training rating
base_sigma_u = mean(sqrt.(vU[.~isnan.(vU)]))
base_sigma_v = mean(sqrt.(vV[.~isnan.(vV)]))

println(base_sigma_r)
println(base_sigma_u)
println(base_sigma_v)

### Studying the 0 vector

In [None]:
# RMSE of the all-zero predictor: root mean square of the ratings themselves,
# first on the training set, then on the test set. Each sum is divided by the
# length of its own mask, so no separately defined counter is needed.
println(sqrt(sum(abs2, rates[train_mask]) / length(train_mask)))
println(sqrt(sum(abs2, rates[test_mask])  / length(test_mask)))

In [None]:
# Load the RMSE helper script (presumably defines `pmf_rmse`, called in the cells below).
include("pmf_rmse.jl");

## SVD

In [None]:
d = 30   # number of singular triplets kept, i.e. the latent dimension

# Training rating matrix with explicit nU x nV dimensions. Without them `sparse`
# sizes the matrix from the largest indices present in the training sample, which
# can be smaller than (nU, nV) when the highest-indexed row or column only has
# ratings in the test split; the factor vector would then not match the nU, nV
# that are passed to `pmf_rmse`.
spmat = sparse(rows[train_mask], cols[train_mask], rates[train_mask], nU, nV)
S = svds(spmat, nsv=d)[1]

# Stack the factors into one flat vector: the d entries of each row of U in turn
# (S.U' is d x nU), followed by the d entries of each column of Vt (d x nV).
# NOTE(review): the singular values are not folded into either factor -- confirm
# that this is what `pmf_rmse` expects.
xSVD  = [vec(S.U');vec(S.Vt)]
;

In [None]:
# training error
println(pmf_rmse(rows[train_mask],cols[train_mask],rates[train_mask], nU, nV, d, xSVD))
println(pmf_rmse(rows[test_mask], cols[test_mask], rates[test_mask],  nU, nV, d, xSVD))

## LBPS runs

In [None]:
# Load the LBPS driver script (presumably defines `pmf_lbps`, called further down).
include("pmf_lbps.jl")

In [None]:
# Training triplets (row index, column index, rating) bundled for the sampler.
data = let tm = train_mask
    Dict(
        "ROWS"  => rows[tm],
        "COLS"  => cols[tm],
        "RATES" => rates[tm]
    )
end;

In [None]:
# Hyper-parameters of the LBPS experiment.
# Note: this rebinds `sU` and `sV` (vectors of per-group sums in the sigma cell)
# to the scalar prior scales.
d          = 30                                  # dimension of latent space
sU, sV, sR = base_sigma_u, base_sigma_v, 0.5
lr, mn, mt = 0.01, 5000, Inf                     # refreshment rate, max #events, max time

# experiment tag built from the settings above
en = "d$d-sU$(round(sU,2))-sV$(round(sV,2))-sR$sR-lr$lr-mn$mn-mt$mt"

# draw x0 from spherical priors: nU blocks of d entries with scale sU,
# followed by nV blocks of d entries with scale sV
x0 = Float64[]
sizehint!(x0, d*(nU + nV))
for k in 1:nU
    append!(x0, sU*randn(d))
end
for k in 1:nV
    append!(x0, sV*randn(d))
end

# Settings handed to `pmf_lbps` alongside the data; the values are of mixed
# types, hence the explicit `Any` value type.
lbpsparams = Dict{String,Any}(
    "EXPNAME"    => en,  # name of the experiment
    "LATENT_D"   => d,   # dimension of latent space
    "SIGMA_U"    => sU,  # scale used for the first nU blocks of x0
    "SIGMA_V"    => sV,  # scale used for the last nV blocks of x0
    "SIGMA_R"    => sR,  # sigma_R from the notes above
    "X0"         => x0,  # starting point
    "LAMBDAREF"  => lr,  # refreshment rate
    "MAXNEVENTS" => mn,  # maximum number of events to generate
    "MAXT"       => mt   # maximum time
)

# run the sampler and record the elapsed wall-clock time, rounded to 1/100 s
ta      = time()
results = pmf_lbps(data, lbpsparams)
simtime = round(time()-ta,2)

# ------------------------------------

# Path average of the sampled trajectory; indexed below as one entry per latent
# block, the first nU entries followed by the remaining ones.
pm  = pathmean(results["ALL_EVLIST"])
pmu = pm[1:nU]
pmv = pm[nU+1:end]

# Flatten the per-block means into a single vector laid out like x0. The block
# width is the latent dimension `d` rather than a literal 30, so this keeps
# working when `d` is changed (assumes each pm[i] has length d -- TODO confirm).
xx = similar(x0)
for i in eachindex(pm)
    xx[((i-1)*d+1):(i*d)] = pm[i]
end

# test-set RMSE of the path-mean estimate
rmse = pmf_rmse(rows[test_mask], cols[test_mask], rates[test_mask],  nU, nV, d, xx)

# -------------------------------------

# Append one summary line (experiment tag, run time, test RMSE) to results.dat,
# echoing it to stdout first.
open("results.dat","a") do io
    line = "$en : $simtime s : $rmse\n"
    print(line)
    write(io, line)
end
;

## HMC territory

In [None]:
include("pmf_ll.jl")

# Build the pair (ll, gll) passed to `hmc` below, from the training data and the
# same sR, sU, sV, d as the LBPS run.
ll, gll = pmf_ll(
    rows[train_mask], cols[train_mask], rates[train_mask],
    nU, nV, sR, sU, sV, d)

## HMC runs

In [None]:
# Load the HMC script (presumably defines `hmc`, called in the next cell).
include("hmc.jl")

In [None]:
# Time an HMC run started from the same x0 as the LBPS run; prints the elapsed seconds.
ta = time()
samples = hmc(ll, gll, x0;
              steps    = 100,
              burnin   = 5,
              stepsize = 0.01);
print(time()-ta)

In [None]:
# Average of the HMC draws (assumes `samples` is a collection of equally-sized
# vectors -- TODO confirm against hmc.jl).
ss = sum(samples)/length(samples);

In [None]:
# Test-set RMSE of the HMC sample mean `ss`.
pmf_rmse(rows[test_mask], cols[test_mask], rates[test_mask],  nU, nV, d, ss)