multiply_grads

zheyuye · Jul 7, 2020 · 007f07e · 007f07e
1 parent b8c85bb
commit 007f07e
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 26 deletions.
diff --git a/scripts/pretraining/run_electra.py b/scripts/pretraining/run_electra.py
@@ -448,7 +448,7 @@ def train(args):
         # We need to change the ratio to be
         #  \sum_{n=1}^N g_n / loss_denom  -->  clip to args.max_grad_norm  * N / loss_denom
         total_norm, ratio, is_finite = clip_grad_global_norm(
-            params, args.max_grad_norm, loss_denom / num_samples_per_update)
+            params, args.max_grad_norm * num_samples_per_update / loss_denom) 
         total_norm = total_norm / (num_samples_per_update / loss_denom)
         trainer.update(num_samples_per_update / loss_denom, ignore_stale_grad=True)
         step_num += 1

diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
@@ -23,7 +23,7 @@
 from gluonnlp.models import get_backbone
 from gluonnlp.utils.misc import grouper, repeat, set_seed, parse_ctx, logging_config, count_parameters
 from gluonnlp.initializer import TruncNorm
-from gluonnlp.utils.parameter import clip_grad_global_norm
+from gluonnlp.utils.parameter import clip_grad_global_norm, multiply_grads
 
 mx.npx.set_np()
 
@@ -565,15 +565,19 @@ def train(args):
                                         for ele in answerable_loss_l]).asnumpy()
         # update
         trainer.allreduce_grads()
-        # Here, the accumulated gradients are
-        # \sum_{n=1}^N g_n / loss_denom
-        # Thus, in order to clip the average gradient
-        #   \frac{1}{N} \sum_{n=1}^N      -->  clip to args.max_grad_norm
-        # We need to change the ratio to be
-        #  \sum_{n=1}^N g_n / loss_denom  -->  clip to args.max_grad_norm  * N / loss_denom
-        total_norm, ratio, is_finite = clip_grad_global_norm(
-            params, args.max_grad_norm, loss_denom / num_samples_per_update)
-        total_norm = total_norm / (num_samples_per_update / loss_denom)
+
+        if args.max_grad_norm > 0:
+            # Here, the accumulated gradients are
+            # \sum_{n=1}^N g_n / loss_denom
+            # Thus, in order to clip the average gradient
+            #   \frac{1}{N} \sum_{n=1}^N g_n  -->  clip to args.max_grad_norm
+            # We need to change the ratio to be
+            #  \sum_{n=1}^N g_n / loss_denom  -->  clip to args.max_grad_norm  * N / loss_denom
+            total_norm, ratio, is_finite = clip_grad_global_norm(
+                params, args.max_grad_norm * num_samples_per_update / loss_denom)
+            total_norm = total_norm / (num_samples_per_update / loss_denom)
+        else:
+            total_norm, is_finite = multiply_grads(params, loss_denom / num_samples_per_update)
 
         trainer.update(num_samples_per_update / loss_denom, ignore_stale_grad=True)
         if args.num_accumulated != 1:

diff --git a/src/gluonnlp/utils/parameter.py b/src/gluonnlp/utils/parameter.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Utility functions for trainer and parameters."""
-__all__ = ['grad_global_norm', 'clip_grad_global_norm']
+__all__ = ['grad_global_norm', 'clip_grad_global_norm', 'multiply_grads']
 
 
 import warnings
@@ -94,8 +94,7 @@ def grad_global_norm(parameters: Iterable[Parameter]) -> float:
 
 
 def clip_grad_global_norm(parameters: Iterable[Parameter],
-                          max_grad_norm: float, 
-                          multiplier: float = 1.0,
+                          max_norm: float,
                           check_isfinite: bool = True) -> Tuple[float, float, bool]:
     """Rescales gradients of parameters so that the sum of their 2-norm is smaller than `max_norm`.
     If gradients exist for more than one context for a parameter, user needs to explicitly call
@@ -124,13 +123,10 @@ def clip_grad_global_norm(parameters: Iterable[Parameter],
     ----------
     parameters
         The list of parameters to calculate the norm
-    max_grad_norm
+    max_norm
         If the gradient norm is larger than max_norm, it will be clipped to have max_norm
-    multiplier
-        Constant multiplier to scale the gradient
     check_isfinite
          If True, check whether the total_norm is finite (not nan or inf).
-
     Returns
     -------
     total_norm
@@ -143,22 +139,50 @@ def clip_grad_global_norm(parameters: Iterable[Parameter],
     """
     total_norm = grad_global_norm(parameters)
     is_finite = bool(np.isfinite(total_norm))
-    if max_grad_norm > 0:
-        ratio = np.maximum(1, total_norm / (max_grad_norm / multiplier))
-        scale = 1 / ratio
-    else:
-        scale = multiplier
-        ratio = float('nan')
-
+    ratio = np.maximum(1, total_norm / max_norm)
     if check_isfinite and not is_finite:
         warnings.warn(
             UserWarning('nan or inf is detected. Clipping results will be undefined.'
                         ' Thus, skip clipping'),
             stacklevel=2)
         return total_norm, ratio, is_finite
-
+    scale = 1 / ratio
     for p in parameters:
         if p.grad_req != 'null':
             for arr in p.list_grad():
                 arr *= scale
     return total_norm, ratio, is_finite
+
+
+def multiply_grads(parameters: Iterable[Parameter],
+                   scale: float,
+                   check_isfinite: bool = True) -> Tuple[float, bool]:
+    """Multiply the gradients of the parameters by a constant scale, in place.
+
+    Parameters
+    ----------
+    parameters
+        The parameters whose gradients will be rescaled
+    scale
+        The constant multiplier applied to every gradient array
+    check_isfinite
+        If True, check whether the total_norm is finite (not nan or inf).
+        When it is not finite, a warning is issued and the gradients are left untouched.
+
+    Returns
+    -------
+    total_norm
+        The value returned by `grad_global_norm`, computed before the scaling is applied
+    is_finite
+        Whether the total norm is finite
+    """
+    # Materialize once: the parameters are traversed twice (norm, then scaling),
+    # so a one-shot iterator would otherwise be exhausted before the scaling loop.
+    parameters = list(parameters)
+    total_norm = grad_global_norm(parameters)
+    is_finite = bool(np.isfinite(total_norm))
+    if check_isfinite and not is_finite:
+        warnings.warn(
+            UserWarning('nan or inf is detected. Scaling results will be undefined.'
+                        ' Thus, skip scaling'),
+            stacklevel=2)
+        return total_norm, is_finite
+    for p in parameters:
+        # Parameters with grad_req == 'null' are skipped, as they are not scaled.
+        if p.grad_req != 'null':
+            for arr in p.list_grad():
+                arr *= scale
+    return total_norm, is_finite