Commit

Added GBLearner to ML API. Added system tests for GBLearner.
svs14 committed Jul 7, 2014
1 parent ece6028 commit 686b8cc
Showing 4 changed files with 107 additions and 50 deletions.
1 change: 1 addition & 0 deletions REQUIRE
@@ -2,3 +2,4 @@ julia 0.3-
 FactCheck 0.1-
 DecisionTree 0.3-
 DataStructures 0.2-
+GLM 0.4-
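The new GLM lower bound exists because the system tests below wrap GLM's LinearModel as a GBL base learner. As a hedged aside, under Julia 0.3's package manager the dependency would be satisfied with:

Pkg.add("GLM")  # Pkg honors the GLM 0.4- lower bound in REQUIRE when resolving versions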
4 changes: 2 additions & 2 deletions src/gb_learner.jl
@@ -19,8 +19,8 @@ type GBL <: GradientBoost
   learner

   function GBL(learner, loss_function=GaussianLoss(),
-    sampling_rate=0.5, learning_rate=0.1,
-    num_iterations=10)
+    sampling_rate=0.8, learning_rate=0.1,
+    num_iterations=100)

     new(loss_function, sampling_rate, learning_rate, num_iterations, learner)
   end
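A minimal usage sketch of the revised defaults, assuming only the constructor shown above (the wrapped learner is the sole required positional argument; GBL and GaussianLoss are exported by the ML module changed next):

using GradientBoost.ML
import GLM: LinearModel

# After this commit the defaults are GaussianLoss(), sampling_rate=0.8,
# learning_rate=0.1, num_iterations=100, so these two calls are equivalent:
gbl = GBL(LinearModel)
gbl = GBL(LinearModel, GaussianLoss(), 0.8, 0.1, 100)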
Expand Down
7 changes: 6 additions & 1 deletion src/ml.jl
@@ -4,14 +4,19 @@ module ML
 importall GradientBoost.LossFunctions
 importall GradientBoost.GB
 importall GradientBoost.GBDecisionTree
+importall GradientBoost.GBLearner

 export GBProblem,
        fit!,
        predict!,
        LossFunction,
        GaussianLoss,
        LaplaceLoss,
        BernoulliLoss,
-       GBDT
+       GBDT,
+       GBL,
+       learner_fit,
+       learner_predict
+

 # Gradient boosting problem.
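The newly exported learner_fit and learner_predict form the two-method contract GBL dispatches on; the test file below implements it for GLM's LinearModel. A sketch for a hypothetical base learner (MyMeanLearner is invented here for illustration and is not part of the commit):

importall GradientBoost.ML

type MyMeanLearner end  # hypothetical learner that predicts the label mean

function ML.learner_fit(lf::LossFunction,
    learner::Type{MyMeanLearner}, instances, labels)
  mean(labels)  # whatever is returned is stored opaquely as the model
end
function ML.learner_predict(lf::LossFunction,
    learner::Type{MyMeanLearner}, model, instances)
  fill(model, size(instances, 1))  # one prediction per instance row
end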
145 changes: 98 additions & 47 deletions test/test_system.jl
@@ -2,19 +2,24 @@
 module TestSystem

 using FactCheck

 importall GradientBoost.Util
 importall GradientBoost.ML

+import GLM: fit, predict, LinearModel
+
 # Experiment on GBProblem
 #
 # gbp_func is a function that returns an instantiated GBProblem.
 # score_func is a function that takes predictions and actuals and returns a score.
-function experiment(gbp_func, score_func, num_experiments, instances, labels)
-  scores = Array(Float64, num_experiments)
+# baseline_func is a function that takes labels.
+function experiment(gbp_func, score_func, baseline_func,
+    num_experiments, instances, labels)
+
+  scores = Array(Float64, num_experiments)
   for i = 1:num_experiments
     # Obtain training and test set
-    (train_ind, test_ind) = holdout(size(instances, 1), 0.3)
+    (train_ind, test_ind) = holdout(size(instances, 1), 0.2)
     train_instances = instances[train_ind, :]
     test_instances = instances[test_ind, :]
     train_labels = labels[train_ind]
@@ -30,22 +35,65 @@ function experiment(gbp_func, score_func, num_experiments, instances, labels)
     scores[i] = score
   end

+  # Sanity check: the mean score should not exceed the baseline.
+  baseline = baseline_func(labels)
+  @fact mean(scores) <= baseline => true
+
   scores
 end

+# Error rate
+function err_rate(predictions, actual)
+  1.0 - mean(predictions .== actual)
+end
+function baseline_err_rate(labels)
+  prop_ones = sum(labels) / length(labels)
+  baseline = 1.0 - max(prop_ones, 1 - prop_ones)
+end
+
+# Mean squared error
+function mse(predictions, actual)
+  mean((actual .- predictions) .^ 2.0)
+end
+function baseline_mse(labels)
+  label_mean = mean(labels)
+  baseline = mse(label_mean, labels)
+end
+
+# Mean absolute deviation
+function mad(predictions, actual)
+  mean(abs(actual .- predictions))
+end
+function baseline_mad(labels)
+  label_median = median(labels)
+  baseline = mad(label_median, labels)
+end
+
+num_experiments = 10
+
+function ML.learner_fit(lf::LossFunction,
+    learner::Type{LinearModel}, instances, labels)
+
+  model = fit(learner, instances, labels)
+end
+function ML.learner_predict(lf::LossFunction,
+    learner::Type{LinearModel}, model, instances)
+
+  predict(model, instances)
+end
+
 facts("System tests") do
   context("iris dataset is handled by GBDT") do
     # Get data
-    dataset = readcsv(joinpath(dirname(@__FILE__), "iris.csv"))
-    instances = dataset[:, 1:(end-1)]
-    labels = dataset[:, end]
+    iris = readcsv(joinpath(dirname(@__FILE__), "iris.csv"))
+    instances = iris[:, 1:(end-1)]
+    labels = iris[:, end]

     # Convert data to required format.
     instances = convert(Matrix{Float64}, instances)
     labels = [species == "setosa" ? 1.0 : 0.0 for species in labels]

-    # Train and test multiple times
-    num_experiments = 5
+    # Train and test multiple times (GBDT)
     function gbp_func()
       gbdt = GBDT(
         BernoulliLoss(),
@@ -55,32 +103,24 @@ facts("System tests") do
       )
       gbp = GBProblem(gbdt, :class)
     end
-    function score_func(predictions, actual)
-      mean(predictions .== actual) * 100.0
-    end
-    scores = experiment(
-      gbp_func, score_func, num_experiments, instances, labels
+    experiment(
+      gbp_func, err_rate, baseline_err_rate, num_experiments, instances, labels
     )
-
-    # Sanity check, accuracy should be greater or equal to baseline
-    prop_ones = sum(labels) / length(labels)
-    baseline = max(prop_ones, 1 - prop_ones) * 100.0
-    @fact mean(scores) >= baseline => true
   end

-  context("mtcars dataset is handled by GBDT") do
+  context("mtcars dataset is handled") do
     # Get data
-    dataset = readcsv(joinpath(dirname(@__FILE__), "mtcars.csv"))
-    instances = dataset[:, 2:end]
-    labels = dataset[:, 1]
+    mtcars = readcsv(joinpath(dirname(@__FILE__), "mtcars.csv"))
+    instances = mtcars[:, 2:end]
+    labels = mtcars[:, 1]

     # Convert data to required format.
     instances = convert(Matrix{Float64}, instances)
     labels = convert(Vector{Float64}, labels)

     # Train and test multiple times (MSE)
-    num_experiments = 5
-    function gbp_mse_func()
+    gbp_mse_funcs = Function[]
+    function mse_gbdt_func()
       gbdt = GBDT(
         GaussianLoss(),
         0.6,
@@ -89,21 +129,27 @@ facts("System tests") do
       )
       gbp = GBProblem(gbdt, :regression)
     end
-    function mse(predictions, actual)
-      mean((actual .- predictions) .^ 2.0)
+    push!(gbp_mse_funcs, mse_gbdt_func)
+    function mse_gbl_func()
+      gbl = GBL(
+        LinearModel,
+        GaussianLoss(),
+        0.8,
+        0.1,
+        100,
+      )
+      gbp = GBProblem(gbl, :regression)
     end
+    push!(gbp_mse_funcs, mse_gbl_func)
+    for i = 1:length(gbp_mse_funcs)
+      experiment(
+        gbp_mse_funcs[i], mse, baseline_mse, num_experiments, instances, labels
+      )
+    end
-    scores = experiment(
-      gbp_mse_func, mse, num_experiments, instances, labels
-    )
-
-    # Sanity check, MSE should be smaller or equal to baseline
-    label_mean = mean(labels)
-    baseline = mse(label_mean, labels)
-    @fact mean(scores) <= baseline => true

     # Train and test multiple times (MAD)
-    num_experiments = 5
-    function gbp_mad_func()
+    gbp_mad_funcs = Function[]
+    function mad_gbdt_func()
       gbdt = GBDT(
         LaplaceLoss(),
         0.6,
@@ -112,18 +158,23 @@ facts("System tests") do
       )
       gbp = GBProblem(gbdt, :regression)
     end
-    function mad(predictions, actual)
-      mean(abs(actual .- predictions))
+    push!(gbp_mad_funcs, mad_gbdt_func)
+    function mad_gbl_func()
+      gbl = GBL(
+        LinearModel,
+        LaplaceLoss(),
+        0.8,
+        0.1,
+        100,
+      )
+      gbp = GBProblem(gbl, :regression)
     end
+    push!(gbp_mad_funcs, mad_gbl_func)
+    for i = 1:length(gbp_mad_funcs)
+      experiment(
+        gbp_mad_funcs[i], mad, baseline_mad, num_experiments, instances, labels
+      )
+    end
-    scores = experiment(
-      gbp_mad_func, mad, num_experiments, instances, labels
-    )
-
-    # Sanity check, MAD should be smaller or equal to baseline
-    label_median = median(labels)
-    baseline = mad(label_median, labels)
-
-    @fact mean(scores) <= baseline => true
   end
 end

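Taken together, a sketch of the end-to-end flow the new mtcars tests exercise: boosted linear regression through the ML API. It assumes the ML.learner_fit/ML.learner_predict methods for LinearModel defined in test/test_system.jl above are already loaded, and it infers the fit!/predict! call shapes from the module's exports rather than from this diff:

importall GradientBoost.ML
import GLM: fit, predict, LinearModel

# The LinearModel learner_fit/learner_predict extensions from the test file
# above must be defined before GBL can drive a GLM learner.
mtcars = readcsv("mtcars.csv")  # mpg in column 1, predictors in the rest
instances = convert(Matrix{Float64}, mtcars[:, 2:end])
labels = convert(Vector{Float64}, mtcars[:, 1])

gbl = GBL(LinearModel, GaussianLoss(), 0.8, 0.1, 100)  # the new defaults, spelled out
gbp = GBProblem(gbl, :regression)
fit!(gbp, instances, labels)
predictions = predict!(gbp, instances)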