From 686b8cce3f52f1803819d61757e30092634a7dfb Mon Sep 17 00:00:00 2001
From: Samuel Jenkins
Date: Mon, 7 Jul 2014 11:05:27 +1000
Subject: [PATCH] Added GBLearner to ML API. Added system tests for GBLearner.

---
 REQUIRE             |   1 +
 src/gb_learner.jl   |   4 +-
 src/ml.jl           |   7 ++-
 test/test_system.jl | 145 ++++++++++++++++++++++++++++++--------------
 4 files changed, 107 insertions(+), 50 deletions(-)

diff --git a/REQUIRE b/REQUIRE
index 1964555..58cc9e9 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -2,3 +2,4 @@ julia 0.3-
 FactCheck 0.1-
 DecisionTree 0.3-
 DataStructures 0.2-
+GLM 0.4-
diff --git a/src/gb_learner.jl b/src/gb_learner.jl
index 478a47d..65a9239 100644
--- a/src/gb_learner.jl
+++ b/src/gb_learner.jl
@@ -19,8 +19,8 @@ type GBL <: GradientBoost
   learner
 
   function GBL(learner, loss_function=GaussianLoss(),
-    sampling_rate=0.5, learning_rate=0.1,
-    num_iterations=10)
+    sampling_rate=0.8, learning_rate=0.1,
+    num_iterations=100)
 
     new(loss_function, sampling_rate, learning_rate, num_iterations, learner)
   end
diff --git a/src/ml.jl b/src/ml.jl
index ba70765..bc9b2b0 100644
--- a/src/ml.jl
+++ b/src/ml.jl
@@ -4,14 +4,19 @@ module ML
 importall GradientBoost.LossFunctions
 importall GradientBoost.GB
 importall GradientBoost.GBDecisionTree
+importall GradientBoost.GBLearner
 
 export GBProblem,
        fit!,
        predict!,
+       LossFunction,
        GaussianLoss,
        LaplaceLoss,
        BernoulliLoss,
-       GBDT
+       GBDT,
+       GBL,
+       learner_fit,
+       learner_predict
 
 # Gradient boosting problem.
diff --git a/test/test_system.jl b/test/test_system.jl
index 7517e85..84f0031 100644
--- a/test/test_system.jl
+++ b/test/test_system.jl
@@ -2,19 +2,24 @@ module TestSystem
 
 using FactCheck
+
 importall GradientBoost.Util
 importall GradientBoost.ML
 
+import GLM: fit, predict, LinearModel
+
 # Experiment on GBProblem
 #
 # gbp_func is a function that returns an instantiated GBProblem.
 # score_func is a function that takes predictions, actual and returns as score.
-function experiment(gbp_func, score_func, num_experiments, instances, labels)
-  scores = Array(Float64, num_experiments)
+# baseline_func is a function that takes labels and returns a baseline score.
+function experiment(gbp_func, score_func, baseline_func,
+  num_experiments, instances, labels)
+
+  scores = Array(Float64, num_experiments)
   for i = 1:num_experiments
     # Obtain training and test set
-    (train_ind, test_ind) = holdout(size(instances, 1), 0.3)
+    (train_ind, test_ind) = holdout(size(instances, 1), 0.2)
     train_instances = instances[train_ind, :]
     test_instances = instances[test_ind, :]
     train_labels = labels[train_ind]
@@ -30,22 +35,65 @@ function experiment(gbp_func, score_func, num_experiments, instances, labels)
     scores[i] = score
   end
 
+  # Sanity check: the mean score should not exceed the baseline.
+ baseline = baseline_func(labels) + @fact mean(scores) <= baseline => true + scores end +# Error rate +function err_rate(predictions, actual) + 1.0 - mean(predictions .== actual) +end +function baseline_err_rate(labels) + prop_ones = sum(labels) / length(labels) + baseline = 1.0 - max(prop_ones, 1 - prop_ones) +end + +# Mean squared error +function mse(predictions, actual) + mean((actual .- predictions) .^ 2.0) +end +function baseline_mse(labels) + label_mean = mean(labels) + baseline = mse(label_mean, labels) +end + +# Mean absolute deviation +function mad(predictions, actual) + mean(abs(actual .- predictions)) +end +function baseline_mad(labels) + label_median = median(labels) + baseline = mad(label_median, labels) +end + +num_experiments = 10 + +function ML.learner_fit(lf::LossFunction, + learner::Type{LinearModel}, instances, labels) + + model = fit(learner, instances, labels) +end +function ML.learner_predict(lf::LossFunction, + learner::Type{LinearModel}, model, instances) + + predict(model, instances) +end + facts("System tests") do context("iris dataset is handled by GBDT") do # Get data - dataset = readcsv(joinpath(dirname(@__FILE__), "iris.csv")) - instances = dataset[:, 1:(end-1)] - labels = dataset[:, end] + iris = readcsv(joinpath(dirname(@__FILE__), "iris.csv")) + instances = iris[:, 1:(end-1)] + labels = iris[:, end] # Convert data to required format. instances = convert(Matrix{Float64}, instances) labels = [species == "setosa" ? 1.0 : 0.0 for species in labels] - # Train and test multiple times - num_experiments = 5 + # Train and test multiple times (GBDT) function gbp_func() gbdt = GBDT( BernoulliLoss(), @@ -55,32 +103,24 @@ facts("System tests") do ) gbp = GBProblem(gbdt, :class) end - function score_func(predictions, actual) - mean(predictions .== actual) * 100.0 - end - scores = experiment( - gbp_func, score_func, num_experiments, instances, labels + experiment( + gbp_func, err_rate, baseline_err_rate, num_experiments, instances, labels ) - - # Sanity check, accuracy should be greater or equal to baseline - prop_ones = sum(labels) / length(labels) - baseline = max(prop_ones, 1 - prop_ones) * 100.0 - @fact mean(scores) >= baseline => true end - context("mtcars dataset is handled by GBDT") do + context("mtcars dataset is handled") do # Get data - dataset = readcsv(joinpath(dirname(@__FILE__), "mtcars.csv")) - instances = dataset[:, 2:end] - labels = dataset[:, 1] + mtcars = readcsv(joinpath(dirname(@__FILE__), "mtcars.csv")) + instances = mtcars[:, 2:end] + labels = mtcars[:, 1] # Convert data to required format. 
instances = convert(Matrix{Float64}, instances) labels = convert(Vector{Float64}, labels) # Train and test multiple times (MSE) - num_experiments = 5 - function gbp_mse_func() + gbp_mse_funcs = Function[] + function mse_gbdt_func() gbdt = GBDT( GaussianLoss(), 0.6, @@ -89,21 +129,27 @@ facts("System tests") do ) gbp = GBProblem(gbdt, :regression) end - function mse(predictions, actual) - mean((actual .- predictions) .^ 2.0) + push!(gbp_mse_funcs, mse_gbdt_func) + function mse_gbl_func() + gbl = GBL( + LinearModel, + GaussianLoss(), + 0.8, + 0.1, + 100, + ) + gbp = GBProblem(gbl, :regression) + end + push!(gbp_mse_funcs, mse_gbl_func) + for i = 1:length(gbp_mse_funcs) + experiment( + gbp_mse_funcs[i], mse, baseline_mse, num_experiments, instances, labels + ) end - scores = experiment( - gbp_mse_func, mse, num_experiments, instances, labels - ) - - # Sanity check, MSE should be smaller or equal to baseline - label_mean = mean(labels) - baseline = mse(label_mean, labels) - @fact mean(scores) <= baseline => true # Train and test multiple times (MAD) - num_experiments = 5 - function gbp_mad_func() + gbp_mad_funcs = Function[] + function mad_gbdt_func() gbdt = GBDT( LaplaceLoss(), 0.6, @@ -112,18 +158,23 @@ facts("System tests") do ) gbp = GBProblem(gbdt, :regression) end - function mad(predictions, actual) - mean(abs(actual .- predictions)) + push!(gbp_mad_funcs, mad_gbdt_func) + function mad_gbl_func() + gbl = GBL( + LinearModel, + LaplaceLoss(), + 0.8, + 0.1, + 100, + ) + gbp = GBProblem(gbl, :regression) + end + push!(gbp_mad_funcs, mad_gbl_func) + for i = 1:length(gbp_mad_funcs) + experiment( + gbp_mad_funcs[i], mad, baseline_mad, num_experiments, instances, labels + ) end - scores = experiment( - gbp_mad_func, mad, num_experiments, instances, labels - ) - - # Sanity check, MAD should be smaller or equal to baseline - label_median = median(labels) - baseline = mad(label_median, labels) - - @fact mean(scores) <= baseline => true end end
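
Example usage (editor's note, not part of the patch): once learner_fit and learner_predict are defined for an external learner, GBL boosts it through the same GBProblem API as GBDT. The sketch below mirrors the GLM LinearModel integration added in test/test_system.jl; the synthetic regression data, the standalone-script setting, and the exact fit!/predict! call signatures are assumptions for illustration rather than code taken from the patch.

# A minimal sketch of boosting GLM linear models via the new GBL type.
importall GradientBoost.ML
import GLM: fit, predict, LinearModel

# Teach the ML API how to fit and predict with a GLM linear model,
# mirroring the definitions added to the system tests.
function ML.learner_fit(lf::LossFunction,
  learner::Type{LinearModel}, instances, labels)

  fit(learner, instances, labels)
end
function ML.learner_predict(lf::LossFunction,
  learner::Type{LinearModel}, model, instances)

  predict(model, instances)
end

# Illustrative regression data (assumed, not from the tests).
instances = rand(100, 3)
labels = instances * [2.0, -1.0, 0.5] + 0.1 * randn(100)

# Boosted linear models using the new GBL defaults from this patch:
# sampling_rate=0.8, learning_rate=0.1, num_iterations=100.
gbl = GBL(LinearModel, GaussianLoss())
gbp = GBProblem(gbl, :regression)

# fit!/predict! are exported by src/ml.jl; the argument order shown here
# is assumed to match how the system tests drive GBProblem.
fit!(gbp, instances, labels)
predictions = predict!(gbp, instances)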