Gradient Boosting algorithm. GB Learner and DT. Loss functions.
svs14 committed Jul 5, 2014
1 parent 01b6c0a commit 3fc5099
Showing 14 changed files with 973 additions and 6 deletions.
4 changes: 3 additions & 1 deletion FUTUREWORK.md
@@ -2,7 +2,9 @@

Listed below are the key features that are to be developed:

- Monitoring callback (early termination)
- GB-specific grid-search
- Variable importance
- Multi-class handling
- More loss functions
- Optimize codebase (ongoing)
2 changes: 2 additions & 0 deletions REQUIRE
@@ -1,2 +1,4 @@
julia 0.3-
FactCheck 0.1-
DecisionTree 0.3-
DataStructures 0.2-
7 changes: 6 additions & 1 deletion src/GradientBoost.jl
@@ -1,5 +1,10 @@
module GradientBoost

# Load source files
include("util.jl")
include("loss.jl")
include("gb.jl")
include("gb_learner.jl")
include("gb_dt.jl")

end # module
111 changes: 111 additions & 0 deletions src/gb.jl
@@ -0,0 +1,111 @@
# Gradient boosting.
module GB

importall GradientBoost.Util
importall GradientBoost.LossFunctions

export GradientBoost,
GBModel,
stochastic_gradient_boost,
fit,
predict,
build_base_func,
create_sample_indices


# Gradient boost algorithm.
abstract GradientBoost

# Gradient boost model.
type GBModel
learning_rate::FloatingPoint
base_funcs::Vector{Function}
end

# Perform stochastic gradient boost.
#
# @param gb Gradient boosting algorithm.
# @param instances Instances.
# @param labels Labels.
# @return Gradient boost model.
function stochastic_gradient_boost(gb::GradientBoost, instances, labels)
# Initialize base functions and pseudo-labels
num_iterations = gb.num_iterations
base_funcs = Array(Function, num_iterations+1)
base_funcs[1] = (instances) ->
fill(minimizing_scalar(gb.loss_function, labels), size(instances, 1))

# Build base functions
stage_base_func = base_funcs[1]
psuedo = labels
for iter_ind = 2:num_iterations+1
# Update residuals
prev_func_pred = stage_base_func(instances)
psuedo = negative_gradient(
gb.loss_function,
psuedo,
gb.learning_rate .* prev_func_pred
)

# Sample instances
stage_sample_ind = create_sample_indices(gb, instances, labels)

# Add optimal base function to ensemble
stage_base_func = build_base_func(
gb,
instances[stage_sample_ind, :],
labels[stage_sample_ind],
prev_func_pred[stage_sample_ind],
psuedo[stage_sample_ind]
)
base_funcs[iter_ind] = stage_base_func
end

# Return model
return GBModel(gb.learning_rate, base_funcs)
end
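
# Illustration (not part of this commit): for a squared-error ("Gaussian") loss
# L(y, f) = (y - f)^2 / 2, the negative gradient with respect to f is the
# residual y - f, which is what the pseudo-label update above computes at each
# stage. A minimal sketch of such a definition; the actual one lives in
# loss.jl, which is not shown in this diff:
#
#   negative_gradient(lf::GaussianLoss, y, y_pred) = y .- y_pred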

function fit(gb::GradientBoost, instances, labels)
stochastic_gradient_boost(gb, instances, labels)
end
function predict(gb_model::GBModel, instances)
outputs = zeros(size(instances, 1))
for i = 1:length(gb_model.base_funcs)
outputs .+= gb_model.learning_rate .* gb_model.base_funcs[i](instances)
end
return outputs
end

# Build base (basis) function for gradient boosting algorithm.
#
# @param gb Gradient boosting algorithm.
# @param instances Instances.
# @param labels Labels.
# @param prev_func_pred Previous base function's predictions.
# @param psuedo Pseudo-labels (pseudo-response).
# @return Function of form (instances) -> predictions.
function build_base_func(
gb::GradientBoost,
instances,
labels,
prev_func_pred,
psuedo)

err_must_be_overriden()
end

# Default sample method for gradient boosting algorithms.
# By default, it is sampling without replacement.
#
# @param gb Gradient boosting algorithm.
# @param instances Instances.
# @param labels Labels.
# @return Sample indices.
function create_sample_indices(gb::GradientBoost, instances, labels)
n = size(instances, 1)
prop = gb.sampling_rate

ind = randperm(n)[1:int(prop * n)]
end
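
# Illustration (not part of this commit): with 100 instances and a sampling
# rate of 0.5, this yields 50 distinct row indices, i.e. subsampling without
# replacement. A hypothetical check, assuming the GBDT type from gb_dt.jl:
#
#   gbdt = GBDT(GaussianLoss(), 0.5, 0.1, 10)
#   ind = create_sample_indices(gbdt, rand(100, 3), rand(100))
#   length(ind)         # => 50
#   length(unique(ind)) # => 50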

end # module
129 changes: 129 additions & 0 deletions src/gb_dt.jl
@@ -0,0 +1,129 @@
# Gradient Boosted Decision Trees
module GBDecisionTree

using DecisionTree
using DataStructures

importall GradientBoost.GB
importall GradientBoost.LossFunctions

export GBDT,
build_base_func

# Gradient boosted decision tree algorithm.
type GBDT <: GradientBoost
loss_function::LossFunction
sampling_rate::FloatingPoint
learning_rate::FloatingPoint
num_iterations::Int
tree_options::Dict

function GBDT(loss_function=GaussianLoss(),
sampling_rate=0.5, learning_rate=0.01,
num_iterations=100, tree_options=Dict())

default_options = {
:maxlabels => 5,
:nsubfeatures => 0
}
options = merge(default_options, tree_options)
new(loss_function, sampling_rate, learning_rate, num_iterations, options)
end
end
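
# Example usage (a sketch, not part of this commit; assumes a numeric feature
# matrix `instances` of size n x d and a length-n label vector `labels`, with
# fit/predict from the GB module in gb.jl in scope):
#
#   gbdt = GBDT(GaussianLoss(), 0.6, 0.1, 100)  # loss, sampling_rate, learning_rate, num_iterations
#   model = fit(gbdt, instances, labels)
#   predictions = predict(model, instances)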

function build_base_func(
gb::GBDT,
instances,
labels,
prev_func_pred,
psuedo)

# Train learner
model = build_tree(
psuedo, instances,
gb.tree_options[:maxlabels],
gb.tree_options[:nsubfeatures]
)
psuedo_pred = apply_tree(model, instances)

# Update regions (leaves)
# NOTE(svs14): Trees are immutable,
# override leaves by having node-to-val mapping.
inst_node_index = InstanceNodeIndex(model, instances)
function val_func(node)
inst_ind = inst_node_index.n2i[node]
val = fit_best_constant(gb.loss_function,
labels[inst_ind],
psuedo[inst_ind],
psuedo_pred[inst_ind],
prev_func_pred[inst_ind]
)
# If the loss function is Gaussian, we don't need to change values.
if typeof(gb.loss_function) <: GaussianLoss
val = node.majority
end
val
end
n2v = Dict{Leaf, Any}()
update_regions!(n2v, model, val_func)

# Prediction function
function pred(instances)
num_instances = size(instances, 1)
predictions = [
n2v[instance_to_node(model, instances[i,:])]
for i in 1:num_instances
]
end

# Produce function that delegates prediction to model
return (instances) -> pred(instances)
end

# DT Helper Functions

type InstanceNodeIndex
i2n::Vector{Leaf}
n2i::DefaultDict{Leaf, Vector{Int}}

function InstanceNodeIndex(tree::Union(Leaf,Node), instances)
num_instances = size(instances, 1)
i2n = Array(Leaf, num_instances)
n2i = DefaultDict(Leaf, Vector{Int}, () -> Int[])

for i = 1:num_instances
node = instance_to_node(tree, instances[i,:])
i2n[i] = node
push!(n2i[node], i)
end

new(i2n, n2i)
end
end

# Returns respective node of instance.
function instance_to_node(tree::Node, instance)
# Code retrofitted from DecisionTree.jl
features = instance
if tree.featval == nothing
return instance_to_node(tree.left, features)
elseif features[tree.featid] < tree.featval
return instance_to_node(tree.left, features)
else
return instance_to_node(tree.right, features)
end
end
function instance_to_node(leaf::Leaf, instance)
return leaf
end

# Update region by having updated leaf value encoded
# in a leaf-to-value mapping.
function update_regions!(n2v::Dict{Leaf, Any}, node::Node, val_func::Function)
update_regions!(n2v, node.left, val_func)
update_regions!(n2v, node.right, val_func)
end
function update_regions!(n2v::Dict{Leaf, Any}, leaf::Leaf, val_func::Function)
n2v[leaf] = val_func(leaf)
end

end # module
71 changes: 71 additions & 0 deletions src/gb_learner.jl
@@ -0,0 +1,71 @@
# Gradient Boosted Learner
module GBLearner

export GBL,
build_base_func,
learner_fit,
learner_predict

importall GradientBoost.GB
importall GradientBoost.LossFunctions

# Gradient boosted base learner algorithm.
type GBL <: GradientBoost
loss_function::LossFunction
sampling_rate::FloatingPoint
learning_rate::FloatingPoint
num_iterations::Int
learner

function GBL(learner, loss_function=GaussianLoss(),
sampling_rate=0.5, learning_rate=0.1,
num_iterations=10)

new(loss_function, sampling_rate, learning_rate, num_iterations, learner)
end
end

function build_base_func(
gb::GBL,
instances,
labels,
prev_func_pred,
psuedo)

# Train learner
lf = gb.loss_function
learner = gb.learner
model = learner_fit(lf, learner, instances, psuedo)
psuedo_pred = learner_predict(lf, learner, model, instances)
model_const =
fit_best_constant(lf, labels, psuedo, psuedo_pred, prev_func_pred)

# Produce function that delegates prediction to model
return (instances) ->
model_const .* learner_predict(lf, learner, model, instances)
end

# Fits base learner.
# The learner must be instantiated within this function.
#
# @param lf Loss function (typically, this is not used).
# @param learner Base learner.
# @param instances Instances.
# @param labels Labels.
# @return Model.
function learner_fit(lf::LossFunction, learner, instances, labels)
error("This function must be implemented by $(learner) for $(lf)")
end

# Predicts on base learner.
#
# @param lf Loss function (typically, this is not used).
# @param learner Base learner.
# @param model Model produced by base learner.
# @param instances Instances.
# @return Predictions.
function learner_predict(lf::LossFunction, learner, model, instances)
error("This function must be implemented by $(learner) for $(lf)")
end
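
# Example (a sketch, not part of this commit): a hypothetical implementation of
# these two hooks for a regression-tree base learner, reusing the same
# DecisionTree.jl calls that gb_dt.jl already makes. `TreeLearner` is a made-up
# marker type used only for dispatch:
#
#   using DecisionTree
#
#   type TreeLearner end
#
#   function learner_fit(lf::LossFunction, learner::TreeLearner, instances, labels)
#     build_tree(labels, instances, 5, 0)
#   end
#
#   function learner_predict(lf::LossFunction, learner::TreeLearner, model, instances)
#     apply_tree(model, instances)
#   end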

end # module