Skip to content

Commit

Permalink
mgr/volumes: unregister job upon async threads exception
Browse files Browse the repository at this point in the history
If an async thread hits a temporary exception, the job is
never unregistered and is therefore skipped by the async
threads on subsequent scans.

Patrick hit this in Nautilus when one of the purge threads
raised an exception while trying to log a message. The trash
entry was never picked up again by the purge threads.

Fixes: http://tracker.ceph.com/issues/44293
Signed-off-by: Venky Shankar <vshankar@redhat.com>
  • Loading branch information
vshankar committed Feb 27, 2020
1 parent f83f38a commit 46476ef
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions src/pybind/mgr/volumes/fs/async_job.py
Expand Up @@ -28,6 +28,7 @@ def run(self):
thread_name = thread_id.getName()

while retries < JobThread.MAX_RETRIES_ON_EXCEPTION:
vol_job = None
try:
# fetch next job to execute
with self.async_job.lock:
Expand All @@ -40,10 +41,6 @@ def run(self):

# execute the job (outside lock)
self.async_job.execute_job(vol_job[0], vol_job[1], should_cancel=lambda: thread_id.should_cancel())

# when done, unregister the job
with self.async_job.lock:
self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id)
retries = 0
except NotImplementedException:
raise
Expand All @@ -56,7 +53,12 @@ def run(self):
exc_type, exc_value, exc_traceback = sys.exc_info()
log.warning("traceback: {0}".format("".join(
traceback.format_exception(exc_type, exc_value, exc_traceback))))
time.sleep(1)
finally:
# when done, unregister the job
if vol_job:
with self.async_job.lock:
self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id)
time.sleep(1)
log.error("thread [{0}] reached exception limit, bailing out...".format(thread_name))
self.vc.cluster_log("thread {0} bailing out due to exception".format(thread_name))

Expand Down

0 comments on commit 46476ef

Please sign in to comment.