Fix progress bar to update the total steps with trainer.max_steps (NVIDIA#8499)

Signed-off-by: Abhishree <abhishreetm@gmail.com> Signed-off-by: Zeeshan Patel <zeeshanp@berkeley.edu>
zpx01 · Mar 8, 2024 · 64f11b8 · 64f11b8
1 parent a5a680d
commit 64f11b8
Showing 1 changed file with 3 additions and 6 deletions.
diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py
@@ -1463,12 +1463,9 @@ def init_train_tqdm(self):
         return self.bar
 
     def on_train_epoch_start(self, trainer, *_):
-        if trainer.max_steps > 0 and (trainer.ckpt_path is not None):
-            # while resuming from a ckpt use trainer.max_steps as the total for progress bar as trainer.num_training_batches
-            # is truncated to max_steps - step being resumed at
-            num_training_batches = trainer.max_steps
-        else:
-            num_training_batches = trainer.num_training_batches
+        # Use trainer.max_steps as the progress bar total: since https://github.com/NVIDIA/NeMo/pull/8426, len(dataloader)
+        # (i.e. trainer.num_training_batches) reports the total number of micro batches rather than the number of global batches.
+        num_training_batches = trainer.max_steps
         self.train_progress_bar.reset(num_training_batches)
         self.train_progress_bar.initial = 0
         self.train_progress_bar.set_description(f"Epoch {trainer.current_epoch}")