allow line search steps that do not meet strong Wolfe condition
yixuan committed May 24, 2023
1 parent e902ae3 commit 6803e70
Showing 2 changed files with 146 additions and 41 deletions.
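For context: the strong Wolfe condition named in the commit message combines the sufficient decrease (Armijo) condition with a curvature bound. In the notation of the two files below, where dg_init = g0^T d is the directional derivative at the starting point and the LBFGSParam fields ftol and wolfe play the roles of c1 and c2 (a textbook-form sketch, not code from this commit):

    f(x0 + α*d) <= f(x0) + c1 * α * g0^T d          (sufficient decrease)
    |grad f(x0 + α*d)^T d| <= c2 * |g0^T d|          (curvature), 0 < c1 < c2 < 1

Previously, exhausting param.max_linesearch iterations without meeting both conditions raised an exception; after this commit, the line search instead returns the best step size found that at least satisfies sufficient decrease.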
44 changes: 43 additions & 1 deletion include/LBFGSpp/LineSearchMoreThuente.h
@@ -245,6 +245,10 @@ class LineSearchMoreThuente
Scalar I_lo = Scalar(0), I_hi = std::numeric_limits<Scalar>::infinity();
Scalar fI_lo = Scalar(0), fI_hi = std::numeric_limits<Scalar>::infinity();
Scalar gI_lo = (Scalar(1) - param.ftol) * dg_init, gI_hi = std::numeric_limits<Scalar>::infinity();
// We also need to save x and grad for step=I_lo, since we want to return the best
// step size along the path when the strong Wolfe condition is not met
Vector x_lo = xp, grad_lo = grad;
Scalar fx_lo = fx_init, dg_lo = dg_init;

// Function value and gradient at the current step size
x.noalias() = xp + step * drt;
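A note on the bookkeeping introduced above: x_lo and grad_lo are updated with swap() rather than copy assignment. For dynamically sized Eigen vectors, swap() exchanges the underlying buffers in constant time, and the stale values handed back to x and grad are harmless because the next trial step overwrites them. A minimal self-contained sketch of the pattern (illustrative only, not from this commit):

#include <Eigen/Core>
#include <iostream>

int main()
{
    Eigen::VectorXd x(3), x_lo(3);
    x << 1, 2, 3;     // current trial point, found to be the best so far
    x_lo << 0, 0, 0;  // previous best point
    // O(1) buffer exchange: x_lo takes the new best; x receives the old
    // best, which the next x = xp + step * drt will overwrite anyway
    x_lo.swap(x);
    std::cout << "new best: " << x_lo.transpose() << std::endl;  // prints: 1 2 3
    return 0;
}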
@@ -311,6 +315,11 @@
I_lo = step;
fI_lo = ft;
gI_lo = gt;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);
fx_lo = fx;
dg_lo = dg;

// std::cout << "Case 2: new step = " << new_step << std::endl;
}
@@ -326,6 +335,11 @@
I_lo = step;
fI_lo = ft;
gI_lo = gt;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);
fx_lo = fx;
dg_lo = dg;

// std::cout << "Case 3: new step = " << new_step << std::endl;
}
@@ -343,6 +357,10 @@
{
// std::cout << "** Maximum step size reached\n\n";
// std::cout << "========================= Leaving line search =========================\n\n";

// Move {x, grad}_lo back before returning
x.swap(x_lo);
grad.swap(grad_lo);
return;
}
// Otherwise, recompute x and fx based on new_step
@@ -392,8 +410,32 @@
}
}

// If we have used up all line search iterations, then the strong Wolfe condition
// is not met. We choose not to raise an exception (unless no step satisfying the
// sufficient decrease condition is found), but to return the best step size so far
if (iter >= param.max_linesearch)
- throw std::runtime_error("the line search routine reached the maximum number of iterations");
{
// throw std::runtime_error("the line search routine reached the maximum number of iterations");

// First test whether the last step is better than I_lo
// If yes, return the last step
const Scalar ft = fx - fx_init - step * test_decr;
if (ft <= fI_lo)
return;

// Then the best step size so far is I_lo, but it needs to be positive
if (I_lo <= Scalar(0))
throw std::runtime_error("the line search routine is unable to sufficiently decrease the function value");

// Return everything with _lo
step = I_lo;
fx = fx_lo;
dg = dg_lo;
// Move {x, grad}_lo back
x.swap(x_lo);
grad.swap(grad_lo);
return;
}
}
};
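The fallback above reads naturally through the shifted function that the More-Thuente bracketing works with. Assuming test_decr = param.ftol * dg_init (consistent with gI_lo being initialized to (1 - param.ftol) * dg_init, the derivative of this function at zero), ft and fI_lo are values of

    ψ(α) = f(x0 + α*d) - f(x0) - α * test_decr,

and ψ(α) <= 0 holds exactly when α satisfies the sufficient decrease condition. Comparing ft against fI_lo therefore picks whichever step decreases the objective most after discounting the Armijo margin, and the I_lo <= 0 check rejects the degenerate case where no positive step ever achieved sufficient decrease.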

143 changes: 103 additions & 40 deletions include/LBFGSpp/LineSearchNocedalWright.h
@@ -23,6 +41,41 @@ class LineSearchNocedalWright
private:
using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;

// Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
// the function, and use the minimizer of the fitted quadratic to
// estimate the minimum step size
static Scalar quad_interp(const Scalar& step_lo, const Scalar& step_hi,
const Scalar& fx_lo, const Scalar& fx_hi, const Scalar& dg_lo)
{
using std::abs;

// polynomial: p (x) = c0*(x - step)² + c1
// conditions: p (step_hi) = fx_hi
// p (step_lo) = fx_lo
// p'(step_lo) = dg_lo

// We allow fx_hi to be Inf, so first compute a candidate for step size,
// and test whether NaN occurs
const Scalar fdiff = fx_hi - fx_lo;
const Scalar sdiff = step_hi - step_lo;
const Scalar smid = (step_hi + step_lo) / Scalar(2);
Scalar step_candid = fdiff * step_lo - smid * sdiff * dg_lo;
step_candid = step_candid / (fdiff - sdiff * dg_lo);

// In some cases the interpolation is not a good choice
// This includes (a) NaN values; (b) too close to the end points; (c) outside the interval
// In such cases, a bisection search is used
const bool candid_nan = !(std::isfinite(step_candid));
const Scalar end_dist = std::min(abs(step_candid - step_lo), abs(step_candid - step_hi));
const bool near_end = end_dist < Scalar(0.01) * abs(sdiff);
const bool bisect = candid_nan ||
(step_candid <= std::min(step_lo, step_hi)) ||
(step_candid >= std::max(step_lo, step_hi)) ||
near_end;
const Scalar step = bisect ? smid : step_candid;
return step;
}
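For reference, the closed form in quad_interp follows from short algebra in the function's own variable names: with p(x) = c0*(x - s)² + c1, the interpolation conditions give

    p'(step_lo) = 2*c0*(step_lo - s) = dg_lo
    p(step_hi) - p(step_lo) = c0*sdiff*(step_hi + step_lo - 2*s) = fdiff

Dividing the second equation by the first eliminates c0, and solving for the vertex s gives

    s = (fdiff*step_lo - smid*sdiff*dg_lo) / (fdiff - sdiff*dg_lo),

which is exactly step_candid. When fx_hi is Inf the quotient evaluates to NaN, which is why the std::isfinite guard routes that case to bisection.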

public:
///
/// Line search by Nocedal and Wright (2006).
@@ -73,7 +108,7 @@ class LineSearchNocedalWright
// Save the function value at the current x
const Scalar fx_init = fx;
// Projection of gradient on the search direction
- const Scalar dg_init = grad.dot(drt);
const Scalar dg_init = dg;
// Make sure d points to a descent direction
if (dg_init > Scalar(0))
throw std::logic_error("the moving direction increases the objective function value");
@@ -84,9 +119,19 @@
// Ends of the line search range (step_lo > step_hi is allowed)
Scalar step_hi, fx_hi, dg_hi;
Scalar step_lo = Scalar(0), fx_lo = fx_init, dg_lo = dg_init;
// We also need to save x and grad for step=step_lo, since we want to return the best
// step size along the path when the strong Wolfe condition is not met
Vector x_lo = xp, grad_lo = grad;

// STEP 1: Bracketing Phase
// Find a range guaranteed to contain a step size satisfying the strong Wolfe condition.
// The bracketing phase exits if one of the following conditions is satisfied:
// (1) Current step violates the sufficient decrease condition
// (2) Current fx >= previous fx
// (3) Current dg >= 0
// (4) Strong Wolfe condition is met
//
// (4) terminates the whole line search, and (1)-(3) go to the zoom phase
//
// See also:
// "Numerical Optimization", "Algorithm 3.5 (Line Search Algorithm)".
@@ -96,16 +141,12 @@
// Evaluate the current step size
x.noalias() = xp + step * drt;
fx = f(x, grad);

- iter++;
- if (iter >= param.max_linesearch)
- throw std::runtime_error("the line search routine reached the maximum number of iterations");

- const Scalar dg = grad.dot(drt);
dg = grad.dot(drt);

// Test the sufficient decrease condition
if (fx - fx_init > step * test_decr || (Scalar(0) < step_lo && fx >= fx_lo))
{
// Case (1) and (2)
step_hi = step;
fx_hi = fx;
dg_hi = dg;
@@ -115,18 +156,40 @@

// Test the curvature condition
if (std::abs(dg) <= test_curv)
- return;
return; // Case (4)

step_hi = step_lo;
fx_hi = fx_lo;
dg_hi = dg_lo;
step_lo = step;
fx_lo = fx;
dg_lo = dg;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);

if (dg >= Scalar(0))
break;
break; // Case (3)

iter++;
// If we have used up all line search iterations in the bracketing phase,
// it means every new step size satisfied the sufficient decrease condition
// and further decreased the objective function. The strong Wolfe condition is
// not met, but we choose not to raise an exception; instead, we exit the
// line search with the most recent step size, which has the smallest
// objective function value observed during the line search
if (iter >= param.max_linesearch)
{
// throw std::runtime_error("the line search routine reached the maximum number of iterations");

// At this point we can guarantee that {step, fx, dg}=={step, fx, dg}_lo
// But we need to move {x, grad}_lo back before returning
x.swap(x_lo);
grad.swap(grad_lo);
return;
}

// If we are still in the loop, it means we can expand the current step size
step *= expansion;
}
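To make the four exit cases concrete, here is a self-contained toy version of this bracketing loop on the 1-D function phi(a) = (a - 2)² along the search ray (an illustrative sketch only; phi, c1, c2 and the constants are hypothetical stand-ins, not LBFGSpp code):

#include <cmath>
#include <cstdio>

int main()
{
    auto phi  = [](double a) { return (a - 2.0) * (a - 2.0); };  // f along the ray
    auto dphi = [](double a) { return 2.0 * (a - 2.0); };        // its slope

    const double c1 = 1e-4;      // sufficient decrease parameter (ftol)
    const double c2 = 0.1;       // curvature parameter (wolfe)
    const double expansion = 2.0;
    const double phi0 = phi(0.0), dphi0 = dphi(0.0);  // dphi0 < 0: descent

    double step_lo = 0.0, phi_lo = phi0;
    double step = 1.0;
    for (int iter = 0; iter < 20; iter++)
    {
        const double ft = phi(step), gt = dphi(step);
        // Cases (1)-(2): sufficient decrease violated, or no progress over step_lo
        if (ft - phi0 > c1 * step * dphi0 || (step_lo > 0.0 && ft >= phi_lo))
        {
            std::printf("zoom on [%g, %g]\n", step_lo, step);
            return 0;
        }
        // Case (4): strong Wolfe condition met, line search is done
        if (std::abs(gt) <= -c2 * dphi0)
        {
            std::printf("strong Wolfe met at step = %g\n", step);
            return 0;
        }
        const double prev_lo = step_lo;
        step_lo = step;
        phi_lo = ft;
        // Case (3): slope turned non-negative, bracket with reversed ends
        if (gt >= 0.0)
        {
            std::printf("zoom on [%g, %g]\n", step_lo, prev_lo);
            return 0;
        }
        step *= expansion;  // all tests passed: expand and try again
    }
    std::printf("budget exhausted; best step so far = %g\n", step_lo);
    return 0;
}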

@@ -135,48 +198,23 @@
// contain a valid strong Wolfe step value, this method
// finds such a value.
//
// If step_lo > 0, then step_lo is, among all step sizes generated so far and
// satisfying the sufficient decrease condition, the one giving the smallest
// objective function value.
//
// See also:
// "Numerical Optimization", "Algorithm 3.6 (Zoom)".
for (;;)
{
- // Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
- // the function, and the fitted quadratic function is used to
- // estimate the minimum
- //
- // polynomial: p (x) = c0*(x - step)² + c1
- // conditions: p (step_hi) = fx_hi
- // p (step_lo) = fx_lo
- // p'(step_lo) = dg_lo

- // We allow fx_hi to be Inf, so first compute a candidate for step size,
- // and test whether NaN occurs
- const Scalar fdiff = fx_hi - fx_lo;
- const Scalar sdiff = step_hi - step_lo;
- const Scalar smid = (step_hi + step_lo) / Scalar(2);
- Scalar step_candid = fdiff * step_lo - smid * sdiff * dg_lo;
- step_candid = step_candid / (fdiff - sdiff * dg_lo);

- // In some cases the interpolation is not a good choice
- // This includes (a) NaN values; (b) too close to the end points; (c) outside the interval
- // In such cases, a bisection search is used
- const bool candid_nan = !(std::isfinite(step_candid));
- const Scalar end_dist = std::min(std::abs(step_candid - step_lo), std::abs(step_candid - step_hi));
- const bool near_end = end_dist < Scalar(0.01) * std::abs(sdiff);
- const bool bisect = candid_nan ||
- (step_candid <= std::min(step_lo, step_hi)) ||
- (step_candid >= std::max(step_lo, step_hi)) ||
- near_end;
- step = bisect ? smid : step_candid;
step = quad_interp(step_lo, step_hi, fx_lo, fx_hi, dg_lo);

// Evaluate the current step size
x.noalias() = xp + step * drt;
fx = f(x, grad);

- iter++;
- if (iter >= param.max_linesearch)
- throw std::runtime_error("the line search routine reached the maximum number of iterations");

- const Scalar dg = grad.dot(drt);
dg = grad.dot(drt);

// Test the sufficient decrease condition
if (fx - fx_init > step * test_decr || fx >= fx_lo)
@@ -204,9 +242,34 @@
if (step == step_lo)
throw std::runtime_error("the line search routine failed, possibly due to insufficient numeric precision");

// If we reach here, the current step satisfies the sufficient decrease condition
step_lo = step;
fx_lo = fx;
dg_lo = dg;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);
}

iter++;
// If we have used up all line search iterations in the zoom phase,
// then the strong Wolfe condition is not met. We choose not to raise an
// exception (unless no step satisfying the sufficient decrease condition
// is found), but to return the best step size so far, i.e., step_lo
if (iter >= param.max_linesearch)
{
// throw std::runtime_error("the line search routine reached the maximum number of iterations");
if (step_lo <= Scalar(0))
throw std::runtime_error("the line search routine failed, unable to sufficiently decrease the function value");

// Return everything with _lo
step = step_lo;
fx = fx_lo;
dg = dg_lo;
// Move {x, grad}_lo back
x.swap(x_lo);
grad.swap(grad_lo);
return;
}
}
}
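Finally, a minimal usage sketch of where the new behavior shows up (illustrative only; it assumes LBFGSpp's documented solver API with LBFGSParam and LBFGSSolver, and a simple quadratic objective, none of which appears in this diff):

#include <Eigen/Core>
#include <LBFGS.h>
#include <iostream>

// f(x) = 0.5 * ||x||^2, with gradient x
double quad(const Eigen::VectorXd& x, Eigen::VectorXd& grad)
{
    grad = x;
    return 0.5 * x.squaredNorm();
}

int main()
{
    LBFGSpp::LBFGSParam<double> param;
    // With a tight budget, the pre-commit code could throw
    // "the line search routine reached the maximum number of iterations";
    // after this commit, the search returns the best step satisfying
    // sufficient decrease instead, and optimization continues.
    param.max_linesearch = 5;

    LBFGSpp::LBFGSSolver<double, LBFGSpp::LineSearchNocedalWright> solver(param);
    Eigen::VectorXd x = Eigen::VectorXd::Constant(10, 4.0);
    double fx;
    const int niter = solver.minimize(quad, x, fx);
    std::cout << niter << " iterations, f(x*) = " << fx << std::endl;
    return 0;
}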
