In [1]:
using JLD
using PyPlot
using StatsBase
using Distributions
using VBMatrixFactorization

In [12]:
# load the script with auxilliary functions
include("mil_util.jl");



In [11]:
# list all available data
# CHANGE THE FOLDER TO YOUR LOCAL FOLDER
mil_path = "/home/vit/Dropbox/vyzkum/cisco/data/milproblems";
readdir(mil_path)

20-element Array{String,1}:
 "BrownCreeper.jld"    
 "CorelAfrican.jld"    
 "CorelBeach.jld"      
 "Elephant.jld"        
 "Fox.jld"             
 "Musk1.jld"           
 "Musk2.jld"           
 "Mutagenesis1.jld"    
 "Mutagenesis2.jld"    
 "Newsgroups1.jld"     
 "Newsgroups2.jld"     
 "Newsgroups3.jld"     
 "Protein.jld"         
 "Tiger.jld"           
 "UCSBBreastCancer.jld"
 "Web1.jld"            
 "Web2.jld"            
 "Web3.jld"            
 "Web4.jld"            
 "WinterWren.jld"      

In [13]:
# load a dataset
dataset_name = "BrownCreeper"
data = load(string(mil_path, "/", dataset_name, ".jld"));

In [78]:
@time mer, eer, fp, fn, n0, n1 = validate(0.3, data, 200, "sparse", 2, eps = 1e-3, scale_y = true)

Extracted 123 negative and 41 positive bags.
Factorization finished after 200 iterations, eps = 1.1442912429024088
Factorization finished after 200 iterations, eps = 1.1249628082148653
  3.687087 seconds (11.79 M allocations: 1.786 GB, 14.32% gc time)


(0.2786458333333333,0.26804993252361675,74,33,228,156)

In [81]:
inputs = Dict()
inputs["p_vec"] =  [0.01]
inputs["nclass_iter"] = 10
inputs["niter"] = 200
inputs["eps"] = 1e-3
inputs["solver"] = "basic"
inputs["H"] = 1
inputs["scale_y"] = true
inputs["dataset_name"] = dataset_name

"BrownCreeper"

In [83]:
res_mat = validate_dataset(data, inputs)

p = 0.01
n = 1
Extracted 1 negative and 4 positive bags.
Factorization finished after 15 iterations, eps = 0.0006142311236804268
Factorization finished after 33 iterations, eps = 0.0008921507365281547
n = 2
Extracted 3 negative and 2 positive bags.
Factorization finished after 7 iterations, eps = 0.0006682471443614969
Factorization finished after 29 iterations, eps = 0.0009547282557933324
n = 3
n = 4
Extracted 3 negative and 2 positive bags.
Factorization finished after 7 iterations, eps = 0.0005240072709072365
Factorization finished after 10 iterations, eps = 0.000811888171857013




n = 5
Extracted 3 negative and 2 positive bags.
Factorization finished after 6 iterations, eps = 0.0004706167307214116
Factorization finished after 14 iterations, eps = 0.0006261701411991878
n = 6
Extracted 4 negative and 1 positive bags.
Factorization finished after 6 iterations, eps = 0.00034597448105610053
Factorization finished after 56 iterations, eps = 0.0009689004729153521
n = 7
Extracted 3 negative and 2 positive bags.
Factorization finished after 6 iterations, eps = 0.0007611958734774195
Factorization finished after 14 iterations, eps = 0.0008183279432778816
n = 8
Extracted 3 negative and 2 positive bags.
Factorization finished after 8 iterations, eps = 0.0009710091179917549
Factorization finished after 12 iterations, eps = 0.0009617004755944889
n = 9
Extracted 4 negative and 1 positive bags.
Factorization finished after 11 iterations, eps = 0.0006897770879836985
Factorization finished after 19 iterations, eps = 0.0008687079242295716
n = 10
Extracted 4 negative and 1 positive 

10×7 Array{Float64,2}:
 0.01   0.349908   0.33883   132.0   58.0  350.0  193.0
 0.01   0.300184   0.260124  140.0   23.0  348.0  195.0
 0.01  -1.0       -1.0        -1.0   -1.0   -1.0   -1.0
 0.01   0.320442   0.27931   148.0   26.0  348.0  195.0
 0.01   0.342541   0.334881  126.0   60.0  348.0  195.0
 0.01   0.3407     0.324296  133.0   52.0  347.0  196.0
 0.01   0.335175   0.286295  160.0   22.0  348.0  195.0
 0.01   0.478821   0.55168   102.0  158.0  348.0  195.0
 0.01   0.388582   0.441687   87.0  124.0  347.0  196.0
 0.01   0.296501   0.261961  134.0   27.0  347.0  196.0

In [208]:
# save the outputs and inputs
output_path = "/home/vit/Dropbox/vyzkum/cisco/data/vbmf_classification"
fname = string(dataset_name, "_", inputs["solver"], "_", inputs["H"], "_", inputs["nclass_iter"])
save("$output_path/$fname.jld", "res_mat", res_mat, "inputs", inputs)

In [209]:
# test their loading
output = load("$output_path/$fname.jld")

Dict{String,Any} with 2 entries:
  "res_mat" => [0.01 0.311234 … 347.0 196.0; 0.02 0.453532 … 345.0 193.0; … ; 0…
  "inputs"  => Dict{Any,Any}(Pair{Any,Any}("niter",200),Pair{Any,Any}("eps",0.0…

In [103]:
# evaluate how many columns of H are needed for better results
Hvec = [1; 2; 3; 5; 7; 10]
output_path = "/home/vit/Dropbox/vyzkum/cisco/data/vbmf_classification"

inputs = Dict()
inputs["p_vec"] =  [0.05]
inputs["nclass_iter"] = 50
inputs["niter"] = 200
inputs["eps"] = 1e-3
inputs["solver"] = "basic"
inputs["H"] = 1
inputs["scale_y"] = true
inputs["dataset_name"] = dataset_name

save_path = string(output_path, "/Htest/$(inputs["p_vec"][1])")
mkpath(save_path)

for H in Hvec
    inputs["H"] = H
    println("H = $H")
    res_mat = validate_dataset(data, inputs, verb = false)
    fname = string("$(inputs["p_vec"][1])_$H")
    save("$save_path/$fname.jld", "res_mat", res_mat, "inputs", inputs)
end

H = 1
p = 0.05
n = 1
n = 2
n = 3
n = 4
n = 5
n = 6
n = 7
n = 8
n = 9
n = 10
n = 11
n = 12
n = 13
n = 14
n = 15
n = 16
n = 17
n = 18
n = 19
n = 20
n = 21
n = 22
n = 23
n = 24
n = 25
n = 26
n = 27
n = 28
n = 29
n = 30
n = 31
n = 32
n = 33
n = 34
n = 35
n = 36
n = 37
n = 38
n = 39
n = 40
n = 41
n = 42
n = 43
n = 44
n = 45
n = 46
n = 47
n = 48
n = 49
n = 50
H = 2
p = 0.05
n = 1
n = 2
n = 3
n = 4
n = 5
n = 6
n = 7
n = 8
n = 9
n = 10
n = 11
n = 12
n = 13
n = 14
n = 15
n = 16
n = 17
n = 18
n = 19
n = 20
n = 21
n = 22
n = 23
n = 24
n = 25
n = 26
n = 27
n = 28
n = 29
n = 30
n = 31
n = 32
n = 33
n = 34
n = 35
n = 36
n = 37
n = 38
n = 39
n = 40
n = 41
n = 42
n = 43
n = 44
n = 45
n = 46
n = 47
n = 48
n = 49
n = 50
H = 3
p = 0.05
n = 1
n = 2
n = 3
n = 4
n = 5
n = 6
n = 7
n = 8
n = 9
n = 10
n = 11
n = 12
n = 13
n = 14
n = 15
n = 16
n = 17
n = 18
n = 19
n = 20
n = 21
n = 22
n = 23
n = 24
n = 25
n = 26
n = 27
n = 28
n = 29
n = 30
n = 31
n = 32
n = 33
n = 34
n = 35
n = 36
n = 37
n = 38
n = 39
n = 40
n 

In [107]:
save_path = string(output_path, "/Htest/0.1")
flist = readdir(save_path)
for f in flist
    res = load(string(save_path, "/", f))
    table_summary(res)
end


Mean classsification error, basic solver, dataset BrownCreeper, H = 1, 50 samples: 
 
 perc. of known labels | error rate | EER | false pos. | false neg. | neg. samples | pos. samples 
------------------------------------------------------------------------------------------------------
        0.10                0.331    0.311     120.8       42.6          316.5         177.5 

Mean classsification error, basic solver, dataset BrownCreeper, H = 10, 50 samples: 
 
 perc. of known labels | error rate | EER | false pos. | false neg. | neg. samples | pos. samples 
------------------------------------------------------------------------------------------------------
        0.10                0.294    0.282     102.2       42.8          316.9         177.1 

Mean classsification error, basic solver, dataset BrownCreeper, H = 2, 50 samples: 
 
 perc. of known labels | error rate | EER | false pos. | false neg. | neg. samples | pos. samples 
-----------------------------------------------