In [1]:
using DataFrames, TableOperations, Tables, Random
using Recommender: Movielens100k, load_dataset, ratio_split, ItemkNN, evaluate_u2i, PrecisionAtK, RecallAtK, NDCG, fit!, predict_u2i

┌ Info: Precompiling Recommender [f1dcbcfb-0b1d-45c0-883e-2edcfc1d4c23]
└ @ Base loading.jl:1317


In [2]:
# Create the MovieLens 100k dataset object.
ml100k = Movielens100k()
# NOTE(review): presumably fetches the raw dataset files to a local directory
# when they are not already present — confirm against Recommender.jl.
download(ml100k)
# Load the dataset as three tables (ratings, users, movies). The trailing `;`
# suppresses the cell output.
rating, user, movie = load_dataset(ml100k);

In [3]:
# Binarize the feedback in a single pipeline: keep only rows whose rating is
# 4 or higher, then overwrite the `:rating` column with the constant 1.
rating = rating |>
    TableOperations.filter(row -> Tables.getcolumn(row, :rating) >= 4) |>
    TableOperations.transform(Dict(:rating => (_ -> 1)))

TableOperations.Transforms{false, TableOperations.Filter{var"#1#2", CSV.File{false}}, Dict{Symbol, var"#3#4"}}(TableOperations.Filter{var"#1#2", CSV.File{false}}(var"#1#2"(), CSV.File("/Users/keisuke.yanagi/workspace/Recommender.jl/src/dataset/../../dataset/movielens100k/u.data"):
Size: 100000 x 4
Tables.Schema:
 :userid     Int64
 :movieid    Int64
 :rating     Int64
 :timestamp  Int64), Dict(:rating => var"#3#4"()))

In [4]:
# Lookup table from movie id to movie title, built in one pass over `movie`.
# Building it from a generator lets Julia infer the key/value types instead of
# the `Dict{Any, Any}` that a bare `Dict()` filled in a loop would produce.
movie2title = Dict(row[:movieid] => row[:movie_title] for row in Tables.rows(movie))
# Replace every movie id in `rating` with the corresponding title.
rating = rating |> TableOperations.transform(Dict(:movieid => (x -> movie2title[x])))

TableOperations.Transforms{false, TableOperations.Transforms{false, TableOperations.Filter{var"#1#2", CSV.File{false}}, Dict{Symbol, var"#3#4"}}, Dict{Symbol, var"#5#6"}}(TableOperations.Transforms{false, TableOperations.Filter{var"#1#2", CSV.File{false}}, Dict{Symbol, var"#3#4"}}(TableOperations.Filter{var"#1#2", CSV.File{false}}(var"#1#2"(), CSV.File("/Users/keisuke.yanagi/workspace/Recommender.jl/src/dataset/../../dataset/movielens100k/u.data"):
Size: 100000 x 4
Tables.Schema:
 :userid     Int64
 :movieid    Int64
 :rating     Int64
 :timestamp  Int64), Dict(:rating => var"#3#4"())), Dict(:movieid => var"#5#6"()))

In [5]:
# Seed the global RNG before splitting (presumably so that `ratio_split`
# produces the same partition on every run — confirm it uses the global RNG).
Random.seed!(1234);

# Hold out a test set with ratio 0.8, then split the remainder again with the
# same ratio into training and validation tables.
train_valid_table, test_table = ratio_split(rating, 0.8)
train_table, valid_table = ratio_split(train_valid_table, 0.8)

# Row counts of the three splits as a tuple: (train, valid, test).
map(tbl -> length(Tables.rows(tbl)), (train_table, valid_table, test_table))


(35440, 8860, 11075)

In [6]:
# Ranking metrics, each constructed with the argument 10 (presumably the
# length of the recommendation list they are computed on).
prec10 = PrecisionAtK(10)
recall10 = RecallAtK(10)
ndcg10 = NDCG(10)
# Collected in one vector so they can be handed to `evaluate_u2i` together.
metrics = [prec10, recall10, ndcg10]

3-element Vector{Recommender.AbstractMetric}:
 PrecisionAtK(10)
 RecallAtK(10)
 NDCG(10)

In [7]:
using TreeParzen

In [8]:
# Hyperparameter search space for ItemkNN, expressed with TreeParzen's `HP`
# distributions. Each key matches a key read later from the sampled `params`.
space = Dict(
    # Quantized uniform over [10, 500] with step 1; sampled values are floats
    # and are converted to Int before being passed to the model.
    :topk=>HP.QuantUniform(:topk, 10., 500., 1.),
    # Log-uniform; the bounds are given as log(1e-3) and log(1e3).
    :shrink=>HP.LogUniform(:shrink, log(1e-3), log(1e3)),
    # Nested choice: each option fixes the weighting scheme together with the
    # `weighting_at_inference` flag; only the :bm25 option searches over the flag.
    :weighting=>HP.Choice(:weighting, 
        [
            Dict(:weighting=>:dummy, :weighting_at_inference=>false),
            Dict(:weighting=>:tfidf, :weighting_at_inference=>false),
            Dict(:weighting=>:bm25, :weighting_at_inference=>HP.Choice(:weighting_at_inference, [true, false]))
        ]
    ),
    # Boolean switch passed as the last ItemkNN constructor argument.
    :normalize=>HP.Choice(:normalize, [true, false])
)

Dict{Symbol, TreeParzen.Types.AbstractDelayed} with 4 entries:
  :weighting => Choice(Param(:weighting, RandIndex(3)), Dict{Symbol, Any}[Dict(…
  :topk      => QuantUniform(:topk, QuantUniform(10.0, 500.0, 1.0))
  :normalize => Choice(Param(:normalize, RandIndex(2)), Bool[1, 0])
  :shrink    => LogUniform(:shrink, LogUniform(-6.90776, 6.90776))

In [9]:
# Objective function for the hyperparameter search.
#
# Builds an ItemkNN model from the sampled `params`, evaluates it on the
# validation split (trained on `train_table`, scored against `valid_table`,
# using the notebook globals `train_table`, `valid_table` and `metrics`),
# prints the parameters with the scores, and returns the negated last score
# so that a minimizer effectively maximizes it.
#
# NOTE(review): the objective is `result[end]`, i.e. whichever metric comes
# last in the returned result. In the recorded output the last field printed
# is PrecisionAtK(10), although `metrics` lists NDCG(10) last — confirm which
# metric is meant to be optimized.
function invert_output(params)
    # `topk` is sampled as a float (e.g. 326.0) and must be an Int for the model.
    knn = ItemkNN(
        Int(params[:topk]),
        params[:shrink],
        params[:weighting][:weighting],
        params[:weighting][:weighting_at_inference],
        params[:normalize],
    )
    result = evaluate_u2i(
        knn, train_table, valid_table, metrics, 10;
        col_user=:userid, col_item=:movieid, col_rating=:rating, drop_history=true,
    )
    @show params, result
    return -result[end]
end

invert_output (generic function with 1 method)

In [10]:
# Run the TreeParzen search: 20 trials of `invert_output` over `space`.
# `best` receives a Dict of the selected parameter values.
# NOTE(review): `logging_interval=-1` presumably turns off periodic progress
# logging — confirm against the TreeParzen documentation.
best = fmin(invert_output, space, 20; logging_interval=-1)

(params, result) = (Dict{Symbol, Any}(:weighting => Dict{Symbol, Any}(:weighting => :bm25, :weighting_at_inference => false), :topk => 326.0, :normalize => false, :shrink => 4.881008704229105), (NDCG(10) = 0.18504434322533067, RecallAtK(10) = 0.1773478885693898, PrecisionAtK(10) = 0.13555555555555507))
(params, result) = (Dict{Symbol, Any}(:weighting => Dict{Symbol, Any}(:weighting => :dummy, :weighting_at_inference => false), :topk => 482.0, :normalize => true, :shrink => 693.233059620366), (NDCG(10) = 0.16872632219578537, RecallAtK(10) = 0.15231583884427402, PrecisionAtK(10) = 0.11611111111111079))
(params, result) = (Dict{Symbol, Any}(:weighting => Dict{Symbol, Any}(:weighting => :bm25, :weighting_at_inference => true), :topk => 210.0, :normalize => true, :shrink => 0.027243650448104388), (NDCG(10) = 0.17794004484727266, RecallAtK(10) = 0.16963180670598263, PrecisionAtK(10) = 0.13044444444444414))
(params, result) = (Dict{Symbol, Any}(:weighting => Dict{Symbol, Any}(:weighting => :d

┌ Info: fmin: 20 / 20 trials carried out
└ @ TreeParzen.API /Users/keisuke.yanagi/.julia/packages/TreeParzen/Iw2mh/src/API.jl:176
┌ Info: Successfully completed fmin 
└ @ TreeParzen.API /Users/keisuke.yanagi/.julia/packages/TreeParzen/Iw2mh/src/API.jl:231


Dict{Symbol, Any} with 4 entries:
  :weighting => Dict{Symbol, Any}(:weighting=>:bm25, :weighting_at_inference=>f…
  :topk      => 376.0
  :normalize => false
  :shrink    => 0.00117581

In [12]:
# Rebuild ItemkNN from the best hyperparameters found by the search
# (`topk` comes back as a float and is converted to Int).
best_model = ItemkNN(
    Int(best[:topk]),
    best[:shrink],
    best[:weighting][:weighting],
    best[:weighting][:weighting_at_inference],
    best[:normalize],
)
# Final evaluation: train+validation data as the training table, scored
# against the held-out test table.
evaluate_u2i(
    best_model, train_valid_table, test_table, metrics, 10;
    col_user=:userid, col_item=:movieid, col_rating=:rating, drop_history=true,
)

(NDCG(10) = 0.27437213729458143,
 RecallAtK(10) = 0.22154089430674984,
 PrecisionAtK(10) = 0.21141304347826015,)