In [1]:
using DataFrames, TableOperations, Tables, Random, CSV
using Recommenders: Movielens100k, load_dataset, ratio_split, ItemkNN, evaluate_u2i, PrecisionAtK, RecallAtK, NDCG

In [2]:
# Handle for the MovieLens 100k dataset; later cells read files from `ml100k.dataset_dir`.
ml100k = Movielens100k()
# Fetch the dataset files so that `ua.base` / `ua.test` exist under `ml100k.dataset_dir`.
# NOTE(review): `download` is not in the explicit `using Recommenders: ...` list above, so
# this presumably dispatches to a method Recommenders adds to `Base.download` — confirm.
download(ml100k)

In [3]:
# Load the predefined "ua" split shipped with MovieLens 100k. Both files are read as
# tab-delimited text, with the four column names supplied explicitly.
train_valid_table = let path = joinpath(ml100k.dataset_dir, "ua.base")
    CSV.File(path; delim = "\t", header = [:userid, :movieid, :rating, :timestamp])
end

# Held-out test ratings, read with the same layout as the training file.
test_table = let path = joinpath(ml100k.dataset_dir, "ua.test")
    CSV.File(path; delim = "\t", header = [:userid, :movieid, :rating, :timestamp])
end

9430-element CSV.File{false}:
 CSV.Row: (userid = 1, movieid = 20, rating = 4, timestamp = 887431883)
 CSV.Row: (userid = 1, movieid = 33, rating = 4, timestamp = 878542699)
 CSV.Row: (userid = 1, movieid = 61, rating = 4, timestamp = 878542420)
 CSV.Row: (userid = 1, movieid = 117, rating = 3, timestamp = 874965739)
 CSV.Row: (userid = 1, movieid = 155, rating = 2, timestamp = 878542201)
 CSV.Row: (userid = 1, movieid = 160, rating = 4, timestamp = 875072547)
 CSV.Row: (userid = 1, movieid = 171, rating = 5, timestamp = 889751711)
 CSV.Row: (userid = 1, movieid = 189, rating = 3, timestamp = 888732928)
 CSV.Row: (userid = 1, movieid = 202, rating = 5, timestamp = 875072442)
 CSV.Row: (userid = 1, movieid = 265, rating = 4, timestamp = 878542441)
 CSV.Row: (userid = 2, movieid = 13, rating = 4, timestamp = 888551922)
 CSV.Row: (userid = 2, movieid = 50, rating = 5, timestamp = 888552084)
 CSV.Row: (userid = 2, movieid = 251, rating = 5, timestamp = 888552084)
 ⋮
 CSV.Row: (userid = 942

In [4]:
# Turn explicit ratings into implicit feedback: keep only interactions rated 4 or higher,
# then overwrite the rating column with the constant 1.
# Note: the tables are rebound in place, so running this cell a second time would filter
# on the already-binarised ratings (all 1) and drop every row.
train_valid_table = train_valid_table |>
    TableOperations.filter(row -> Tables.getcolumn(row, :rating) >= 4) |>
    TableOperations.transform(Dict(:rating => _ -> 1))

# Apply the identical threshold and binarisation to the test interactions.
test_table = test_table |>
    TableOperations.filter(row -> Tables.getcolumn(row, :rating) >= 4) |>
    TableOperations.transform(Dict(:rating => _ -> 1))

TableOperations.Transforms{false, TableOperations.Filter{var"#5#6", CSV.File{false}}, Dict{Symbol, var"#7#8"}}(TableOperations.Filter{var"#5#6", CSV.File{false}}(var"#5#6"(), CSV.File("/Users/keisuke.yanagi/workspace/Recommenders.jl/src/dataset/../../dataset/movielens100k/ua.test"):
Size: 9430 x 4
Tables.Schema:
 :userid     Int64
 :movieid    Int64
 :rating     Int64
 :timestamp  Int64), Dict(:rating => var"#7#8"()))

In [5]:
# Seed the default RNG before splitting so the shuffled train/validation partition, and
# every validation score computed from it, can be reproduced after a kernel restart.
# NOTE(review): assumes `ratio_split` draws from Julia's default RNG — confirm.
Random.seed!(1234)

# Split the training interactions into a fitting part and a validation part used for
# hyperparameter tuning. 0.8 is the ratio handed to `ratio_split` (presumably the share
# kept for training — confirm against the Recommenders documentation).
train_table, valid_table = ratio_split(train_valid_table, 0.8)

((userid = [525, 586, 825, 892, 290, 474, 75, 10, 276, 942  …  807, 65, 73, 321, 115, 164, 271, 381, 630, 864], movieid = [288, 117, 237, 174, 274, 495, 56, 273, 1098, 31  …  498, 660, 100, 607, 23, 282, 661, 898, 12, 729], rating = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1], timestamp = [881085217, 884057578, 880931932, 886608616, 880731874, 887927728, 884051921, 877888613, 880913684, 891283517  …  892529150, 879216880, 888626120, 879440109, 881171348, 889401927, 885848373, 892697869, 885667854, 888889035]), (userid = [823, 303, 38, 177, 505, 909, 933, 764, 91, 452  …  249, 436, 94, 184, 85, 119, 416, 391, 257, 748], movieid = [95, 484, 420, 92, 328, 170, 186, 864, 31, 185  …  93, 83, 436, 517, 27, 655, 204, 705, 181, 176], rating = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1], timestamp = [878439257, 879466966, 892429347, 882142295, 888631175, 891920276, 874938563, 876243232, 891438875, 875264355  …  879640194, 887770115, 891721815, 889909409, 

In [6]:
# Ranking metrics, each constructed with 10, that every evaluation below reports.
prec10, recall10, ndcg10 = PrecisionAtK(10), RecallAtK(10), NDCG(10)
metrics = [prec10, recall10, ndcg10]

3-element Vector{Recommenders.AbstractMetric}:
 PrecisionAtK(10)
 RecallAtK(10)
 NDCG(10)

In [7]:
using TreeParzen

In [8]:
# Search space handed to TreeParzen's `fmin` for tuning the ItemkNN hyperparameters.
space = let
    # Two-way choice between `true` and `false`, registered under `label`.
    on_off(label) = HP.Choice(label, [true, false])

    # Each option carries both the weighting scheme and its inference-time flag; only the
    # :bm25 option leaves that flag open as a nested choice.
    weighting_options = [
        Dict(:weighting => :dummy, :weighting_at_inference => false),
        Dict(:weighting => :tfidf, :weighting_at_inference => false),
        Dict(
            :weighting => :bm25,
            :weighting_at_inference => on_off(:weighting_at_inference),
        ),
    ]

    Dict(
        # Quantised to whole numbers in [10, 500], but still delivered as a Float64.
        :topk => HP.QuantUniform(:topk, 10.0, 500.0, 1.0),
        # Log-uniform between 1e-3 and 1e3.
        :shrink => HP.LogUniform(:shrink, log(1e-3), log(1e3)),
        :weighting => HP.Choice(:weighting, weighting_options),
        :normalize => on_off(:normalize),
        :normalize_similarity => on_off(:normalize_similarity),
    )
end

Dict{Symbol, TreeParzen.Types.AbstractDelayed} with 5 entries:
  :weighting            => Choice(Param(:weighting, RandIndex(3)), Dict{Symbol,…
  :topk                 => QuantUniform(:topk, QuantUniform(10.0, 500.0, 1.0))
  :normalize            => Choice(Param(:normalize, RandIndex(2)), Bool[1, 0])
  :normalize_similarity => Choice(Param(:normalize_similarity, RandIndex(2)), B…
  :shrink               => LogUniform(:shrink, LogUniform(-6.90776, 6.90776))

In [9]:
# Objective for TreeParzen's `fmin`: build an ItemkNN model from one sampled `params`
# dictionary, score it on the validation split, and return the negated score so that
# minimising this function maximises the metric.
#
# Reads the globals `train_table`, `valid_table` and `metrics`.
#
# NOTE(review): the score is taken positionally as the last entry of `result`. In the
# printed trial output the last entry is PrecisionAtK(10), even though `metrics` lists
# NDCG last — confirm which metric is meant to drive the search.
function invert_output(params)
    # `:topk` is sampled as a Float64 (e.g. 386.0); convert it to an Int for the model.
    k = convert(Int, params[:topk])
    # Nested dictionary holding the weighting scheme and its inference-time flag.
    weighting = params[:weighting]
    model = ItemkNN(
        k,
        params[:shrink],
        weighting[:weighting],
        weighting[:weighting_at_inference],
        params[:normalize],
        params[:normalize_similarity],
    )
    result = evaluate_u2i(
        model,
        train_table,
        valid_table,
        metrics,
        10;
        col_user = :userid,
        col_item = :movieid,
        col_rating = :rating,
        drop_history = true,
    )
    # Print every trial's parameters together with its scores.
    @show params, result
    return -result[end]
end

invert_output (generic function with 1 method)

In [10]:
# Run 500 TreeParzen trials minimising `invert_output` over `space`; `best` receives the
# winning parameter dictionary.
# NOTE(review): `logging_interval = -1` presumably disables intermediate progress logs —
# confirm against the TreeParzen documentation.
best = fmin(invert_output, space, 500; logging_interval = -1)

(params, result) = (Dict{Symbol, Any}(:weighting => Dict{Symbol, Any}(:weighting => :tfidf, :weighting_at_inference => false), :topk => 386.0, :normalize => false, :normalize_similarity => false, :shrink => 0.013025093105008942), (NDCG(10) = 0.18718737584647194, RecallAtK(10) = 0.1560602770456402, PrecisionAtK(10) = 0.1384790011350733))
(params, result) = (Dict{Symbol, Any}(:weighting => Dict{Symbol, Any}(:weighting => :dummy, :weighting_at_inference => false), :topk => 386.0, :normalize => true, :normalize_similarity => false, :shrink => 13.524870797933351), (NDCG(10) = 0.19107577407530452, RecallAtK(10) = 0.15361457175156976, PrecisionAtK(10) = 0.14313280363223568))
(params, result) = (Dict{Symbol, Any}(:weighting => Dict{Symbol, Any}(:weighting => :bm25, :weighting_at_inference => false), :topk => 308.0, :normalize => true, :normalize_similarity => true, :shrink => 0.05717054403232302), (NDCG(10) = 0.21293492203615158, RecallAtK(10) = 0.1731128915176947, PrecisionAtK(10) = 0.1611804

┌ Info: fmin: 500 / 500 trials carried out
└ @ TreeParzen.API /Users/keisuke.yanagi/.julia/packages/TreeParzen/Iw2mh/src/API.jl:176
┌ Info: Successfully completed fmin 
└ @ TreeParzen.API /Users/keisuke.yanagi/.julia/packages/TreeParzen/Iw2mh/src/API.jl:231


Dict{Symbol, Any} with 5 entries:
  :weighting            => Dict{Symbol, Any}(:weighting=>:bm25, :weighting_at_i…
  :topk                 => 387.0
  :normalize            => false
  :normalize_similarity => false
  :shrink               => 564.461

In [11]:
# Rebuild ItemkNN from the best parameters found by the search. `:topk` comes back as a
# Float64, so it is converted to an Int first.
best_model = let weighting = best[:weighting]
    ItemkNN(
        convert(Int, best[:topk]),
        best[:shrink],
        weighting[:weighting],
        weighting[:weighting_at_inference],
        best[:normalize],
        best[:normalize_similarity],
    )
end

# Final check: evaluate with the full train+validation table as training data and the
# held-out test table as the evaluation target.
evaluate_u2i(
    best_model,
    train_valid_table,
    test_table,
    metrics,
    10;
    col_user = :userid,
    col_item = :movieid,
    col_rating = :rating,
    drop_history = true,
)

(NDCG(10) = 0.22302330722933578,
 RecallAtK(10) = 0.23896154107610218,
 PrecisionAtK(10) = 0.13875802997858616,)