Skip to content

Commit

Permalink
Hold failed upload metrics and upload with next upload metrics (#8513)
Browse files: browse the repository at this point in the history
Co-authored-by: hassaanfarooq01 <hassaanfarooq01@gmail.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
  • Loading branch information
3 people committed Mar 1, 2024
1 parent 33fff69 commit 31cf94e
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 0 deletions.
8 changes: 8 additions & 0 deletions ultralytics/hub/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(self, identifier):
"heartbeat": 300.0,
} # rate limits (seconds)
self.metrics_queue = {} # holds metrics for each epoch until upload
self.metrics_upload_failed_queue = {} # holds metrics for each epoch if upload failed
self.timers = {} # holds timers in ultralytics/utils/callbacks/hub.py

# Parse input
Expand Down Expand Up @@ -234,6 +235,9 @@ def retry_request():
self._show_upload_progress(progress_total, response)

if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES:
# If this request was a metrics upload, clear the failed-upload queue now that it has succeeded
if kwargs.get("metrics"):
self.metrics_upload_failed_queue = {}
return response # Success, no need to retry

if i == 0:
Expand All @@ -249,6 +253,10 @@ def retry_request():

time.sleep(2**i) # Exponential backoff for retries

# If a metrics upload request still has no response after all retries, hold its metrics for the next upload
if response is None and kwargs.get("metrics"):
self.metrics_upload_failed_queue.update(kwargs.get("metrics", None))

return response

if thread:
Expand Down
5 changes: 5 additions & 0 deletions ultralytics/utils/callbacks/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ def on_fit_epoch_end(trainer):
all_plots = {**all_plots, **model_info_for_loggers(trainer)}

session.metrics_queue[trainer.epoch] = json.dumps(all_plots)

# If any metrics fail to upload, add them to the queue to attempt uploading again.
if session.metrics_upload_failed_queue:
session.metrics_queue.update(session.metrics_upload_failed_queue)

if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
session.upload_metrics()
session.timers["metrics"] = time() # reset timer
Expand Down

0 comments on commit 31cf94e

Please sign in to comment.