Gradient Boosting algorithm. GB Learner and DT. Loss functions.
svs14 committed Jul 5, 2014
1 parent 01b6c0a commit 3fc5099
Showing 14 changed files with 973 additions and 6 deletions.
4 changes: 3 additions & 1 deletion FUTUREWORK.md
@@ -2,7 +2,9 @@

Listed below are the key features that are to be developed:

- Monitoring callback (early termination)
- GB-specific grid-search
- Variable importance
- Multi-class handling
- More loss functions
- Optimize codebase (ongoing)
2 changes: 2 additions & 0 deletions REQUIRE
@@ -1,2 +1,4 @@
julia 0.3-
FactCheck 0.1-
DecisionTree 0.3-
DataStructures 0.2-
7 changes: 6 additions & 1 deletion src/GradientBoost.jl
@@ -1,5 +1,10 @@
module GradientBoost

# Load source files
include("util.jl")
include("loss.jl")
include("gb.jl")
include("gb_learner.jl")
include("gb_dt.jl")

end # module
111 changes: 111 additions & 0 deletions src/gb.jl
@@ -0,0 +1,111 @@
# Gradient boosting.
module GB

importall GradientBoost.Util
importall GradientBoost.LossFunctions

export GradientBoost,
GBModel,
stochastic_gradient_boost,
fit,
predict,
build_base_func,
create_sample_indices


# Gradient boost algorithm.
abstract GradientBoost

# Gradient boost model.
type GBModel
learning_rate::FloatingPoint
base_funcs::Vector{Function}
end

# Perform stochastic gradient boost.
#
# @param gb Gradient boosting algorithm.
# @param instances Instances.
# @param labels Labels.
# @return Gradient boost model.
function stochastic_gradient_boost(gb::GradientBoost, instances, labels)
# Initialize base functions and pseudo-labels
num_iterations = gb.num_iterations
base_funcs = Array(Function, num_iterations+1)
base_funcs[1] = (instances) ->
fill(minimizing_scalar(gb.loss_function, labels), size(instances, 1))

# Build base functions
stage_base_func = base_funcs[1]
psuedo = labels
for iter_ind = 2:num_iterations+1
# Update residuals
prev_func_pred = stage_base_func(instances)
psuedo = negative_gradient(
gb.loss_function,
psuedo,
gb.learning_rate .* prev_func_pred
)

# Sample instances
stage_sample_ind = create_sample_indices(gb, instances, labels)

# Add optimal base function to ensemble
stage_base_func = build_base_func(
gb,
instances[stage_sample_ind, :],
labels[stage_sample_ind],
prev_func_pred[stage_sample_ind],
psuedo[stage_sample_ind]
)
base_funcs[iter_ind] = stage_base_func
end

# Return model
return GBModel(gb.learning_rate, base_funcs)
end
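
# Illustration (not part of this commit): for a squared-error ("Gaussian") loss
# L(y, f) = (y - f)^2 / 2, the negative gradient with respect to f is the
# residual y - f, which is what the pseudo-label update above computes at each
# stage. A minimal sketch of such a definition; the actual one lives in
# loss.jl, which is not shown in this diff:
#
#   negative_gradient(lf::GaussianLoss, y, y_pred) = y .- y_pred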

function fit(gb::GradientBoost, instances, labels)
stochastic_gradient_boost(gb, instances, labels)
end
function predict(gb_model::GBModel, instances)
outputs = zeros(size(instances, 1))
for i = 1:length(gb_model.base_funcs)
outputs .+= gb_model.learning_rate .* gb_model.base_funcs[i](instances)
end
return outputs
end

# Build base (basis) function for gradient boosting algorithm.
#
# @param gb Gradient boosting algorithm.
# @param instances Instances.
# @param labels Labels.
# @param prev_func_pred Previous base function's predictions.
# @param psuedo Pseudo-labels (pseudo-response).
# @return Function of form (instances) -> predictions.
function build_base_func(
gb::GradientBoost,
instances,
labels,
prev_func_pred,
psuedo)

err_must_be_overriden()
end

# Default sample method for gradient boosting algorithms.
# By default, it is sampling without replacement.
#
# @param gb Gradient boosting algorithm.
# @param instances Instances.
# @param labels Labels.
# @return Sample indices.
function create_sample_indices(gb::GradientBoost, instances, labels)
n = size(instances, 1)
prop = gb.sampling_rate

ind = randperm(n)[1:int(prop * n)]
end
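
# Illustration (not part of this commit): with 100 instances and a sampling
# rate of 0.5, this yields 50 distinct row indices, i.e. subsampling without
# replacement. A hypothetical check, assuming the GBDT type from gb_dt.jl:
#
#   gbdt = GBDT(GaussianLoss(), 0.5, 0.1, 10)
#   ind = create_sample_indices(gbdt, rand(100, 3), rand(100))
#   length(ind)         # => 50
#   length(unique(ind)) # => 50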

end # module
129 changes: 129 additions & 0 deletions src/gb_dt.jl
@@ -0,0 +1,129 @@
# Gradient Boosted Decision Trees
module GBDecisionTree

using DecisionTree
using DataStructures

importall GradientBoost.GB
importall GradientBoost.LossFunctions

export GBDT,
build_base_func

# Gradient boosted decision tree algorithm.
type GBDT <: GradientBoost
loss_function::LossFunction
sampling_rate::FloatingPoint
learning_rate::FloatingPoint
num_iterations::Int
tree_options::Dict

function GBDT(loss_function=GaussianLoss(),
sampling_rate=0.5, learning_rate=0.01,
num_iterations=100, tree_options=Dict())

default_options = {
:maxlabels => 5,
:nsubfeatures => 0
}
options = merge(default_options, tree_options)
new(loss_function, sampling_rate, learning_rate, num_iterations, options)
end
end
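
# Example usage (a sketch, not part of this commit; assumes a numeric feature
# matrix `instances` of size n x d and a length-n label vector `labels`, with
# fit/predict from the GB module in gb.jl in scope):
#
#   gbdt = GBDT(GaussianLoss(), 0.6, 0.1, 100)  # loss, sampling_rate, learning_rate, num_iterations
#   model = fit(gbdt, instances, labels)
#   predictions = predict(model, instances)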

function build_base_func(
gb::GBDT,
instances,
labels,
prev_func_pred,
psuedo)

# Train learner
model = build_tree(
psuedo, instances,
gb.tree_options[:maxlabels],
gb.tree_options[:nsubfeatures]
)
psuedo_pred = apply_tree(model, instances)

# Update regions (leaves)
# NOTE(svs14): Trees are immutable,
# override leaves by having node-to-val mapping.
inst_node_index = InstanceNodeIndex(model, instances)
function val_func(node)
inst_ind = inst_node_index.n2i[node]
val = fit_best_constant(gb.loss_function,
labels[inst_ind],
psuedo[inst_ind],
psuedo_pred[inst_ind],
prev_func_pred[inst_ind]
)
# If the loss function is Gaussian, we don't need to change values.
if typeof(gb.loss_function) <: GaussianLoss
val = node.majority
end
val
end
n2v = Dict{Leaf, Any}()
update_regions!(n2v, model, val_func)

# Prediction function
function pred(instances)
num_instances = size(instances, 1)
predictions = [
n2v[instance_to_node(model, instances[i,:])]
for i in 1:num_instances
]
end

# Produce function that delegates prediction to model
return (instances) -> pred(instances)
end

# DT Helper Functions

type InstanceNodeIndex
i2n::Vector{Leaf}
n2i::DefaultDict{Leaf, Vector{Int}}

function InstanceNodeIndex(tree::Union(Leaf,Node), instances)
num_instances = size(instances, 1)
i2n = Array(Leaf, num_instances)
n2i = DefaultDict(Leaf, Vector{Int}, () -> Int[])

for i = 1:num_instances
node = instance_to_node(tree, instances[i,:])
i2n[i] = node
push!(n2i[node], i)
end

new(i2n, n2i)
end
end

# Returns respective node of instance.
function instance_to_node(tree::Node, instance)
# Code retrofitted from DecisionTree.jl
features = instance
if tree.featval == nothing
return instance_to_node(tree.left, features)
elseif features[tree.featid] < tree.featval
return instance_to_node(tree.left, features)
else
return instance_to_node(tree.right, features)
end
end
function instance_to_node(leaf::Leaf, instance)
return leaf
end

# Update region by having updated leaf value encoded
# in a leaf-to-value mapping.
function update_regions!(n2v::Dict{Leaf, Any}, node::Node, val_func::Function)
update_regions!(n2v, node.left, val_func)
update_regions!(n2v, node.right, val_func)
end
function update_regions!(n2v::Dict{Leaf, Any}, leaf::Leaf, val_func::Function)
n2v[leaf] = val_func(leaf)
end

end # module
71 changes: 71 additions & 0 deletions src/gb_learner.jl
@@ -0,0 +1,71 @@
# Gradient Boosted Learner
module GBLearner

export GBL,
build_base_func,
learner_fit,
learner_predict

importall GradientBoost.GB
importall GradientBoost.LossFunctions

# Gradient boosted base learner algorithm.
type GBL <: GradientBoost
loss_function::LossFunction
sampling_rate::FloatingPoint
learning_rate::FloatingPoint
num_iterations::Int
learner

function GBL(learner, loss_function=GaussianLoss(),
sampling_rate=0.5, learning_rate=0.1,
num_iterations=10)

new(loss_function, sampling_rate, learning_rate, num_iterations, learner)
end
end

function build_base_func(
gb::GBL,
instances,
labels,
prev_func_pred,
psuedo)

# Train learner
lf = gb.loss_function
learner = gb.learner
model = learner_fit(lf, learner, instances, psuedo)
psuedo_pred = learner_predict(lf, learner, model, instances)
model_const =
fit_best_constant(lf, labels, psuedo, psuedo_pred, prev_func_pred)

# Produce function that delegates prediction to model
return (instances) ->
model_const .* learner_predict(lf, learner, model, instances)
end

# Fits base learner.
# The learner must be instantiated within this function.
#
# @param lf Loss function (typically, this is not used).
# @param learner Base learner.
# @param instances Instances.
# @param labels Labels.
# @return Model.
function learner_fit(lf::LossFunction, learner, instances, labels)
error("This function must be implemented by $(learner) for $(lf)")
end

# Predicts on base learner.
#
# @param lf Loss function (typically, this is not used).
# @param learner Base learner.
# @param model Model produced by base learner.
# @param instances Instances.
# @return Predictions.
function learner_predict(lf::LossFunction, learner, model, instances)
error("This function must be implemented by $(learner) for $(lf)")
end
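
# Example (a sketch, not part of this commit): a hypothetical implementation of
# these two hooks for a regression-tree base learner, reusing the same
# DecisionTree.jl calls that gb_dt.jl already makes. `TreeLearner` is a made-up
# marker type used only for dispatch:
#
#   using DecisionTree
#
#   type TreeLearner end
#
#   function learner_fit(lf::LossFunction, learner::TreeLearner, instances, labels)
#     build_tree(labels, instances, 5, 0)
#   end
#
#   function learner_predict(lf::LossFunction, learner::TreeLearner, model, instances)
#     apply_tree(model, instances)
#   end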

end # module