Commit

Added GBLearner to ML API. Added system tests for GBLearner.
svs14 committed Jul 7, 2014
1 parent ece6028 commit 686b8cc
Showing 4 changed files with 107 additions and 50 deletions.
1 change: 1 addition & 0 deletions REQUIRE
@@ -2,3 +2,4 @@ julia 0.3-
 FactCheck 0.1-
 DecisionTree 0.3-
 DataStructures 0.2-
+GLM 0.4-
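The new GLM lower bound exists because the system tests below wrap GLM's LinearModel as a GBL base learner. As a hedged aside, under Julia 0.3's package manager the dependency would be satisfied with:

Pkg.add("GLM")  # Pkg honors the GLM 0.4- lower bound in REQUIRE when resolving versions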
4 changes: 2 additions & 2 deletions src/gb_learner.jl
@@ -19,8 +19,8 @@ type GBL <: GradientBoost
   learner

   function GBL(learner, loss_function=GaussianLoss(),
-    sampling_rate=0.5, learning_rate=0.1,
-    num_iterations=10)
+    sampling_rate=0.8, learning_rate=0.1,
+    num_iterations=100)

     new(loss_function, sampling_rate, learning_rate, num_iterations, learner)
   end
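A minimal usage sketch of the revised defaults, assuming only the constructor shown above (the wrapped learner is the sole required positional argument; GBL and GaussianLoss are exported by the ML module changed next):

using GradientBoost.ML
import GLM: LinearModel

# After this commit the defaults are GaussianLoss(), sampling_rate=0.8,
# learning_rate=0.1, num_iterations=100, so these two calls are equivalent:
gbl = GBL(LinearModel)
gbl = GBL(LinearModel, GaussianLoss(), 0.8, 0.1, 100)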
Expand Down
7 changes: 6 additions & 1 deletion src/ml.jl
@@ -4,14 +4,19 @@ module ML
 importall GradientBoost.LossFunctions
 importall GradientBoost.GB
 importall GradientBoost.GBDecisionTree
+importall GradientBoost.GBLearner

 export GBProblem,
        fit!,
        predict!,
        LossFunction,
        GaussianLoss,
        LaplaceLoss,
        BernoulliLoss,
-       GBDT
+       GBDT,
+       GBL,
+       learner_fit,
+       learner_predict
+

 # Gradient boosting problem.
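The newly exported learner_fit and learner_predict form the two-method contract GBL dispatches on; the test file below implements it for GLM's LinearModel. A sketch for a hypothetical base learner (MyMeanLearner is invented here for illustration and is not part of the commit):

importall GradientBoost.ML

type MyMeanLearner end  # hypothetical learner that predicts the label mean

function ML.learner_fit(lf::LossFunction,
    learner::Type{MyMeanLearner}, instances, labels)
  mean(labels)  # whatever is returned is stored opaquely as the model
end
function ML.learner_predict(lf::LossFunction,
    learner::Type{MyMeanLearner}, model, instances)
  fill(model, size(instances, 1))  # one prediction per instance row
end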
145 changes: 98 additions & 47 deletions test/test_system.jl
@@ -2,19 +2,24 @@
 module TestSystem

 using FactCheck

 importall GradientBoost.Util
 importall GradientBoost.ML

+import GLM: fit, predict, LinearModel
+
 # Experiment on GBProblem
 #
 # gbp_func is a function that returns an instantiated GBProblem.
 # score_func is a function that takes predictions and actuals and returns a score.
-function experiment(gbp_func, score_func, num_experiments, instances, labels)
-  scores = Array(Float64, num_experiments)
+# baseline_func is a function that takes labels.
+function experiment(gbp_func, score_func, baseline_func,
+    num_experiments, instances, labels)
+
+  scores = Array(Float64, num_experiments)
   for i = 1:num_experiments
     # Obtain training and test set
-    (train_ind, test_ind) = holdout(size(instances, 1), 0.3)
+    (train_ind, test_ind) = holdout(size(instances, 1), 0.2)
     train_instances = instances[train_ind, :]
     test_instances = instances[test_ind, :]
     train_labels = labels[train_ind]
@@ -30,22 +35,65 @@ function experiment(gbp_func, score_func, num_experiments, instances, labels)
     scores[i] = score
   end

+  # Sanity check: the mean score should not exceed the baseline.
+  baseline = baseline_func(labels)
+  @fact mean(scores) <= baseline => true
+
   scores
 end

+# Error rate
+function err_rate(predictions, actual)
+  1.0 - mean(predictions .== actual)
+end
+function baseline_err_rate(labels)
+  prop_ones = sum(labels) / length(labels)
+  baseline = 1.0 - max(prop_ones, 1 - prop_ones)
+end
+
+# Mean squared error
+function mse(predictions, actual)
+  mean((actual .- predictions) .^ 2.0)
+end
+function baseline_mse(labels)
+  label_mean = mean(labels)
+  baseline = mse(label_mean, labels)
+end
+
+# Mean absolute deviation
+function mad(predictions, actual)
+  mean(abs(actual .- predictions))
+end
+function baseline_mad(labels)
+  label_median = median(labels)
+  baseline = mad(label_median, labels)
+end
+
+num_experiments = 10
+
+function ML.learner_fit(lf::LossFunction,
+    learner::Type{LinearModel}, instances, labels)
+
+  model = fit(learner, instances, labels)
+end
+function ML.learner_predict(lf::LossFunction,
+    learner::Type{LinearModel}, model, instances)
+
+  predict(model, instances)
+end
+
 facts("System tests") do
   context("iris dataset is handled by GBDT") do
     # Get data
-    dataset = readcsv(joinpath(dirname(@__FILE__), "iris.csv"))
-    instances = dataset[:, 1:(end-1)]
-    labels = dataset[:, end]
+    iris = readcsv(joinpath(dirname(@__FILE__), "iris.csv"))
+    instances = iris[:, 1:(end-1)]
+    labels = iris[:, end]

     # Convert data to required format.
     instances = convert(Matrix{Float64}, instances)
     labels = [species == "setosa" ? 1.0 : 0.0 for species in labels]

-    # Train and test multiple times
-    num_experiments = 5
+    # Train and test multiple times (GBDT)
     function gbp_func()
       gbdt = GBDT(
         BernoulliLoss(),
@@ -55,32 +103,24 @@ facts("System tests") do
       )
       gbp = GBProblem(gbdt, :class)
     end
-    function score_func(predictions, actual)
-      mean(predictions .== actual) * 100.0
-    end
-    scores = experiment(
-      gbp_func, score_func, num_experiments, instances, labels
+    experiment(
+      gbp_func, err_rate, baseline_err_rate, num_experiments, instances, labels
     )
-
-    # Sanity check, accuracy should be greater or equal to baseline
-    prop_ones = sum(labels) / length(labels)
-    baseline = max(prop_ones, 1 - prop_ones) * 100.0
-    @fact mean(scores) >= baseline => true
   end

-  context("mtcars dataset is handled by GBDT") do
+  context("mtcars dataset is handled") do
     # Get data
-    dataset = readcsv(joinpath(dirname(@__FILE__), "mtcars.csv"))
-    instances = dataset[:, 2:end]
-    labels = dataset[:, 1]
+    mtcars = readcsv(joinpath(dirname(@__FILE__), "mtcars.csv"))
+    instances = mtcars[:, 2:end]
+    labels = mtcars[:, 1]

     # Convert data to required format.
     instances = convert(Matrix{Float64}, instances)
     labels = convert(Vector{Float64}, labels)

     # Train and test multiple times (MSE)
-    num_experiments = 5
-    function gbp_mse_func()
+    gbp_mse_funcs = Function[]
+    function mse_gbdt_func()
       gbdt = GBDT(
         GaussianLoss(),
         0.6,
@@ -89,21 +129,27 @@ facts("System tests") do
       )
       gbp = GBProblem(gbdt, :regression)
     end
-    function mse(predictions, actual)
-      mean((actual .- predictions) .^ 2.0)
+    push!(gbp_mse_funcs, mse_gbdt_func)
+    function mse_gbl_func()
+      gbl = GBL(
+        LinearModel,
+        GaussianLoss(),
+        0.8,
+        0.1,
+        100,
+      )
+      gbp = GBProblem(gbl, :regression)
     end
+    push!(gbp_mse_funcs, mse_gbl_func)
+    for i = 1:length(gbp_mse_funcs)
+      experiment(
+        gbp_mse_funcs[i], mse, baseline_mse, num_experiments, instances, labels
+      )
+    end
-    scores = experiment(
-      gbp_mse_func, mse, num_experiments, instances, labels
-    )
-
-    # Sanity check, MSE should be smaller or equal to baseline
-    label_mean = mean(labels)
-    baseline = mse(label_mean, labels)
-    @fact mean(scores) <= baseline => true

     # Train and test multiple times (MAD)
-    num_experiments = 5
-    function gbp_mad_func()
+    gbp_mad_funcs = Function[]
+    function mad_gbdt_func()
       gbdt = GBDT(
         LaplaceLoss(),
         0.6,
@@ -112,18 +158,23 @@ facts("System tests") do
       )
       gbp = GBProblem(gbdt, :regression)
     end
-    function mad(predictions, actual)
-      mean(abs(actual .- predictions))
+    push!(gbp_mad_funcs, mad_gbdt_func)
+    function mad_gbl_func()
+      gbl = GBL(
+        LinearModel,
+        LaplaceLoss(),
+        0.8,
+        0.1,
+        100,
+      )
+      gbp = GBProblem(gbl, :regression)
     end
+    push!(gbp_mad_funcs, mad_gbl_func)
+    for i = 1:length(gbp_mad_funcs)
+      experiment(
+        gbp_mad_funcs[i], mad, baseline_mad, num_experiments, instances, labels
+      )
+    end
-    scores = experiment(
-      gbp_mad_func, mad, num_experiments, instances, labels
-    )
-
-    # Sanity check, MAD should be smaller or equal to baseline
-    label_median = median(labels)
-    baseline = mad(label_median, labels)
-
-    @fact mean(scores) <= baseline => true
   end
 end

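Taken together, a sketch of the end-to-end flow the new mtcars tests exercise: boosted linear regression through the ML API. It assumes the ML.learner_fit/ML.learner_predict methods for LinearModel defined in test/test_system.jl above are already loaded, and it infers the fit!/predict! call shapes from the module's exports rather than from this diff:

importall GradientBoost.ML
import GLM: fit, predict, LinearModel

# The LinearModel learner_fit/learner_predict extensions from the test file
# above must be defined before GBL can drive a GLM learner.
mtcars = readcsv("mtcars.csv")  # mpg in column 1, predictors in the rest
instances = convert(Matrix{Float64}, mtcars[:, 2:end])
labels = convert(Vector{Float64}, mtcars[:, 1])

gbl = GBL(LinearModel, GaussianLoss(), 0.8, 0.1, 100)  # the new defaults, spelled out
gbp = GBProblem(gbl, :regression)
fit!(gbp, instances, labels)
predictions = predict!(gbp, instances)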