allow line search steps that do not meet strong Wolfe condition
yixuan committed May 24, 2023
1 parent e902ae3 commit 6803e70
Showing 2 changed files with 146 additions and 41 deletions.
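For context: the strong Wolfe condition named in the commit message combines the sufficient decrease (Armijo) condition with a curvature bound. In the notation of the two files below, where dg_init = g0^T d is the directional derivative at the starting point and the LBFGSParam fields ftol and wolfe play the roles of c1 and c2 (a textbook-form sketch, not code from this commit):

    f(x0 + α*d) <= f(x0) + c1 * α * g0^T d          (sufficient decrease)
    |grad f(x0 + α*d)^T d| <= c2 * |g0^T d|          (curvature), 0 < c1 < c2 < 1

Previously, exhausting param.max_linesearch iterations without meeting both conditions raised an exception; after this commit, the line search instead returns the best step size found that at least satisfies sufficient decrease.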
44 changes: 43 additions & 1 deletion include/LBFGSpp/LineSearchMoreThuente.h
@@ -245,6 +245,10 @@ class LineSearchMoreThuente
Scalar I_lo = Scalar(0), I_hi = std::numeric_limits<Scalar>::infinity();
Scalar fI_lo = Scalar(0), fI_hi = std::numeric_limits<Scalar>::infinity();
Scalar gI_lo = (Scalar(1) - param.ftol) * dg_init, gI_hi = std::numeric_limits<Scalar>::infinity();
// We also need to save x and grad for step=I_lo, since we want to return the best
// step size along the path when the strong Wolfe condition is not met
Vector x_lo = xp, grad_lo = grad;
Scalar fx_lo = fx_init, dg_lo = dg_init;

// Function value and gradient at the current step size
x.noalias() = xp + step * drt;
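A note on the bookkeeping introduced above: x_lo and grad_lo are updated with swap() rather than copy assignment. For dynamically sized Eigen vectors, swap() exchanges the underlying buffers in constant time, and the stale values handed back to x and grad are harmless because the next trial step overwrites them. A minimal self-contained sketch of the pattern (illustrative only, not from this commit):

#include <Eigen/Core>
#include <iostream>

int main()
{
    Eigen::VectorXd x(3), x_lo(3);
    x << 1, 2, 3;     // current trial point, found to be the best so far
    x_lo << 0, 0, 0;  // previous best point
    // O(1) buffer exchange: x_lo takes the new best; x receives the old
    // best, which the next x = xp + step * drt will overwrite anyway
    x_lo.swap(x);
    std::cout << "new best: " << x_lo.transpose() << std::endl;  // prints: 1 2 3
    return 0;
}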
@@ -311,6 +315,11 @@
I_lo = step;
fI_lo = ft;
gI_lo = gt;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);
fx_lo = fx;
dg_lo = dg;

// std::cout << "Case 2: new step = " << new_step << std::endl;
}
@@ -326,6 +335,11 @@
I_lo = step;
fI_lo = ft;
gI_lo = gt;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);
fx_lo = fx;
dg_lo = dg;

// std::cout << "Case 3: new step = " << new_step << std::endl;
}
@@ -343,6 +357,10 @@
{
// std::cout << "** Maximum step size reached\n\n";
// std::cout << "========================= Leaving line search =========================\n\n";

// Move {x, grad}_lo back before returning
x.swap(x_lo);
grad.swap(grad_lo);
return;
}
// Otherwise, recompute x and fx based on new_step
@@ -392,8 +410,32 @@
}
}

// If we have used up all line search iterations, then the strong Wolfe condition
// is not met. We choose not to raise an exception (unless no step satisfying the
// sufficient decrease condition is found), but to return the best step size so far
if (iter >= param.max_linesearch)
- throw std::runtime_error("the line search routine reached the maximum number of iterations");
{
// throw std::runtime_error("the line search routine reached the maximum number of iterations");

// First test whether the last step is better than I_lo
// If yes, return the last step
const Scalar ft = fx - fx_init - step * test_decr;
if (ft <= fI_lo)
return;

// Then the best step size so far is I_lo, but it needs to be positive
if (I_lo <= Scalar(0))
throw std::runtime_error("the line search routine is unable to sufficiently decrease the function value");

// Return everything with _lo
step = I_lo;
fx = fx_lo;
dg = dg_lo;
// Move {x, grad}_lo back
x.swap(x_lo);
grad.swap(grad_lo);
return;
}
}
};
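The fallback above reads naturally through the shifted function that the More-Thuente bracketing works with. Assuming test_decr = param.ftol * dg_init (consistent with gI_lo being initialized to (1 - param.ftol) * dg_init, the derivative of this function at zero), ft and fI_lo are values of

    ψ(α) = f(x0 + α*d) - f(x0) - α * test_decr,

and ψ(α) <= 0 holds exactly when α satisfies the sufficient decrease condition. Comparing ft against fI_lo therefore picks whichever step decreases the objective most after discounting the Armijo margin, and the I_lo <= 0 check rejects the degenerate case where no positive step ever achieved sufficient decrease.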

143 changes: 103 additions & 40 deletions include/LBFGSpp/LineSearchNocedalWright.h
@@ -23,6 +41,41 @@ class LineSearchNocedalWright
private:
using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;

// Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
// the function, and use the minimizer of the fitted quadratic to
// estimate the minimum step size
static Scalar quad_interp(const Scalar& step_lo, const Scalar& step_hi,
const Scalar& fx_lo, const Scalar& fx_hi, const Scalar& dg_lo)
{
using std::abs;

// polynomial: p (x) = c0*(x - step)² + c1
// conditions: p (step_hi) = fx_hi
// p (step_lo) = fx_lo
// p'(step_lo) = dg_lo

// We allow fx_hi to be Inf, so first compute a candidate for step size,
// and test whether NaN occurs
const Scalar fdiff = fx_hi - fx_lo;
const Scalar sdiff = step_hi - step_lo;
const Scalar smid = (step_hi + step_lo) / Scalar(2);
Scalar step_candid = fdiff * step_lo - smid * sdiff * dg_lo;
step_candid = step_candid / (fdiff - sdiff * dg_lo);

// In some cases the interpolation is not a good choice
// This includes (a) NaN values; (b) too close to the end points; (c) outside the interval
// In such cases, a bisection search is used
const bool candid_nan = !(std::isfinite(step_candid));
const Scalar end_dist = std::min(abs(step_candid - step_lo), abs(step_candid - step_hi));
const bool near_end = end_dist < Scalar(0.01) * abs(sdiff);
const bool bisect = candid_nan ||
(step_candid <= std::min(step_lo, step_hi)) ||
(step_candid >= std::max(step_lo, step_hi)) ||
near_end;
const Scalar step = bisect ? smid : step_candid;
return step;
}
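For reference, the closed form in quad_interp follows from short algebra in the function's own variable names: with p(x) = c0*(x - s)² + c1, the interpolation conditions give

    p'(step_lo) = 2*c0*(step_lo - s) = dg_lo
    p(step_hi) - p(step_lo) = c0*sdiff*(step_hi + step_lo - 2*s) = fdiff

Dividing the second equation by the first eliminates c0, and solving for the vertex s gives

    s = (fdiff*step_lo - smid*sdiff*dg_lo) / (fdiff - sdiff*dg_lo),

which is exactly step_candid. When fx_hi is Inf the quotient evaluates to NaN, which is why the std::isfinite guard routes that case to bisection.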

public:
///
/// Line search by Nocedal and Wright (2006).
@@ -73,7 +108,7 @@ class LineSearchNocedalWright
// Save the function value at the current x
const Scalar fx_init = fx;
// Projection of gradient on the search direction
- const Scalar dg_init = grad.dot(drt);
const Scalar dg_init = dg;
// Make sure d points to a descent direction
if (dg_init > Scalar(0))
throw std::logic_error("the moving direction increases the objective function value");
@@ -84,9 +119,19 @@
// Ends of the line search range (step_lo > step_hi is allowed)
Scalar step_hi, fx_hi, dg_hi;
Scalar step_lo = Scalar(0), fx_lo = fx_init, dg_lo = dg_init;
// We also need to save x and grad for step=step_lo, since we want to return the best
// step size along the path when the strong Wolfe condition is not met
Vector x_lo = xp, grad_lo = grad;

// STEP 1: Bracketing Phase
// Find a range guaranteed to contain a step size satisfying the strong Wolfe condition.
// The bracketing phase exits if one of the following conditions is satisfied:
// (1) Current step violates the sufficient decrease condition
// (2) Current fx >= previous fx
// (3) Current dg >= 0
// (4) Strong Wolfe condition is met
//
// (4) terminates the whole line search, and (1)-(3) go to the zoom phase
//
// See also:
// "Numerical Optimization", "Algorithm 3.5 (Line Search Algorithm)".
@@ -96,16 +141,12 @@
// Evaluate the current step size
x.noalias() = xp + step * drt;
fx = f(x, grad);

- iter++;
- if (iter >= param.max_linesearch)
- throw std::runtime_error("the line search routine reached the maximum number of iterations");

- const Scalar dg = grad.dot(drt);
dg = grad.dot(drt);

// Test the sufficient decrease condition
if (fx - fx_init > step * test_decr || (Scalar(0) < step_lo && fx >= fx_lo))
{
// Case (1) and (2)
step_hi = step;
fx_hi = fx;
dg_hi = dg;
@@ -115,18 +156,40 @@

// Test the curvature condition
if (std::abs(dg) <= test_curv)
- return;
return; // Case (4)

step_hi = step_lo;
fx_hi = fx_lo;
dg_hi = dg_lo;
step_lo = step;
fx_lo = fx;
dg_lo = dg;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);

if (dg >= Scalar(0))
break;
break; // Case (3)

iter++;
// If we have used up all line search iterations in the bracketing phase,
// it means every new step size satisfied the sufficient decrease condition
// and further decreased the objective function. The strong Wolfe condition is
// not met, but we choose not to raise an exception; instead, we exit the
// line search with the most recent step size, which has the smallest
// objective function value observed during the line search
if (iter >= param.max_linesearch)
{
// throw std::runtime_error("the line search routine reached the maximum number of iterations");

// At this point we can guarantee that {step, fx, dg}=={step, fx, dg}_lo
// But we need to move {x, grad}_lo back before returning
x.swap(x_lo);
grad.swap(grad_lo);
return;
}

// If we are still in the loop, it means we can expand the current step size
step *= expansion;
}
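To make the four exit cases concrete, here is a self-contained toy version of this bracketing loop on the 1-D function phi(a) = (a - 2)² along the search ray (an illustrative sketch only; phi, c1, c2 and the constants are hypothetical stand-ins, not LBFGSpp code):

#include <cmath>
#include <cstdio>

int main()
{
    auto phi  = [](double a) { return (a - 2.0) * (a - 2.0); };  // f along the ray
    auto dphi = [](double a) { return 2.0 * (a - 2.0); };        // its slope

    const double c1 = 1e-4;      // sufficient decrease parameter (ftol)
    const double c2 = 0.1;       // curvature parameter (wolfe)
    const double expansion = 2.0;
    const double phi0 = phi(0.0), dphi0 = dphi(0.0);  // dphi0 < 0: descent

    double step_lo = 0.0, phi_lo = phi0;
    double step = 1.0;
    for (int iter = 0; iter < 20; iter++)
    {
        const double ft = phi(step), gt = dphi(step);
        // Cases (1)-(2): sufficient decrease violated, or no progress over step_lo
        if (ft - phi0 > c1 * step * dphi0 || (step_lo > 0.0 && ft >= phi_lo))
        {
            std::printf("zoom on [%g, %g]\n", step_lo, step);
            return 0;
        }
        // Case (4): strong Wolfe condition met, line search is done
        if (std::abs(gt) <= -c2 * dphi0)
        {
            std::printf("strong Wolfe met at step = %g\n", step);
            return 0;
        }
        const double prev_lo = step_lo;
        step_lo = step;
        phi_lo = ft;
        // Case (3): slope turned non-negative, bracket with reversed ends
        if (gt >= 0.0)
        {
            std::printf("zoom on [%g, %g]\n", step_lo, prev_lo);
            return 0;
        }
        step *= expansion;  // all tests passed: expand and try again
    }
    std::printf("budget exhausted; best step so far = %g\n", step_lo);
    return 0;
}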

@@ -135,48 +198,23 @@
// contain a valid strong Wolfe step value, this method
// finds such a value.
//
// If step_lo > 0, then step_lo is, among all step sizes generated so far and
// satisfying the sufficient decrease condition, the one giving the smallest
// objective function value.
//
// See also:
// "Numerical Optimization", "Algorithm 3.6 (Zoom)".
for (;;)
{
- // Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
- // the function, and the fitted quadratic function is used to
- // estimate the minimum
- //
- // polynomial: p (x) = c0*(x - step)² + c1
- // conditions: p (step_hi) = fx_hi
- // p (step_lo) = fx_lo
- // p'(step_lo) = dg_lo

- // We allow fx_hi to be Inf, so first compute a candidate for step size,
- // and test whether NaN occurs
- const Scalar fdiff = fx_hi - fx_lo;
- const Scalar sdiff = step_hi - step_lo;
- const Scalar smid = (step_hi + step_lo) / Scalar(2);
- Scalar step_candid = fdiff * step_lo - smid * sdiff * dg_lo;
- step_candid = step_candid / (fdiff - sdiff * dg_lo);

- // In some cases the interpolation is not a good choice
- // This includes (a) NaN values; (b) too close to the end points; (c) outside the interval
- // In such cases, a bisection search is used
- const bool candid_nan = !(std::isfinite(step_candid));
- const Scalar end_dist = std::min(std::abs(step_candid - step_lo), std::abs(step_candid - step_hi));
- const bool near_end = end_dist < Scalar(0.01) * std::abs(sdiff);
- const bool bisect = candid_nan ||
- (step_candid <= std::min(step_lo, step_hi)) ||
- (step_candid >= std::max(step_lo, step_hi)) ||
- near_end;
- step = bisect ? smid : step_candid;
step = quad_interp(step_lo, step_hi, fx_lo, fx_hi, dg_lo);

// Evaluate the current step size
x.noalias() = xp + step * drt;
fx = f(x, grad);

- iter++;
- if (iter >= param.max_linesearch)
- throw std::runtime_error("the line search routine reached the maximum number of iterations");

- const Scalar dg = grad.dot(drt);
dg = grad.dot(drt);

// Test the sufficient decrease condition
if (fx - fx_init > step * test_decr || fx >= fx_lo)
@@ -204,9 +242,34 @@
if (step == step_lo)
throw std::runtime_error("the line search routine failed, possibly due to insufficient numeric precision");

// If we reach here, the current step satisfies the sufficient decrease condition
step_lo = step;
fx_lo = fx;
dg_lo = dg;
// Move x and grad to x_lo and grad_lo, respectively
x_lo.swap(x);
grad_lo.swap(grad);
}

iter++;
// If we have used up all line search iterations in the zoom phase,
// then the strong Wolfe condition is not met. We choose not to raise an
// exception (unless no step satisfying the sufficient decrease condition
// is found), but to return the best step size so far, i.e., step_lo
if (iter >= param.max_linesearch)
{
// throw std::runtime_error("the line search routine reached the maximum number of iterations");
if (step_lo <= Scalar(0))
throw std::runtime_error("the line search routine failed, unable to sufficiently decrease the function value");

// Return everything with _lo
step = step_lo;
fx = fx_lo;
dg = dg_lo;
// Move {x, grad}_lo back
x.swap(x_lo);
grad.swap(grad_lo);
return;
}
}
}
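Finally, a minimal usage sketch of where the new behavior shows up (illustrative only; it assumes LBFGSpp's documented solver API with LBFGSParam and LBFGSSolver, and a simple quadratic objective, none of which appears in this diff):

#include <Eigen/Core>
#include <LBFGS.h>
#include <iostream>

// f(x) = 0.5 * ||x||^2, with gradient x
double quad(const Eigen::VectorXd& x, Eigen::VectorXd& grad)
{
    grad = x;
    return 0.5 * x.squaredNorm();
}

int main()
{
    LBFGSpp::LBFGSParam<double> param;
    // With a tight budget, the pre-commit code could throw
    // "the line search routine reached the maximum number of iterations";
    // after this commit, the search returns the best step satisfying
    // sufficient decrease instead, and optimization continues.
    param.max_linesearch = 5;

    LBFGSpp::LBFGSSolver<double, LBFGSpp::LineSearchNocedalWright> solver(param);
    Eigen::VectorXd x = Eigen::VectorXd::Constant(10, 4.0);
    double fx;
    const int niter = solver.minimize(quad, x, fx);
    std::cout << niter << " iterations, f(x*) = " << fx << std::endl;
    return 0;
}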
