Skip to content

Commit e609df2

Browse files
authored
Merge pull request noironetworks#271 from noironetworks/fix-stuck-aim-aid
Fix deadlocks
2 parents 21f8e3e + 0c3e2b6 commit e609df2

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

aim/agent/aid/service.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
DAEMON_LOOP_MAX_RETRIES = 5
4747
HB_LOOP_MAX_WAIT = 60
4848
HB_LOOP_MAX_RETRY = 10
49+
DEADLOCK_TIME = 300
4950

5051
logging.register_options(aim_cfg.CONF)
5152

@@ -111,6 +112,7 @@ def __init__(self, conf):
111112
self.events = event_handler.EventHandler().initialize(
112113
self.conf_manager)
113114
self.max_down_time = 4 * self.report_interval
115+
self.daemon_loop_time = time.time()
114116

115117
def daemon_loop(self):
116118
# Serve tenants the very first time regardless of the events received
@@ -201,6 +203,7 @@ def _reconciliation_cycle(self, serve=True):
201203
LOG.info("%s removing tenant from AID %s" %
202204
(universe.name, tenant))
203205
universe.cleanup_state(aim_ctx, tenant)
206+
self.daemon_loop_time = time.time()
204207

205208
def _spawn_heartbeat_loop(self):
206209
utils.spawn_thread(self._heartbeat_loop)
@@ -210,6 +213,14 @@ def _heartbeat_loop(self):
210213
start_time = time.time()
211214
aim_ctx = context.AimContext(store=api.get_store())
212215
self._send_heartbeat(aim_ctx)
216+
# REVISIT: This code should be removed once we've
217+
# removed all the locking in AID.
218+
if start_time > self.daemon_loop_time:
219+
down_time = start_time - self.daemon_loop_time
220+
if down_time > DEADLOCK_TIME:
221+
utils.perform_harakiri(LOG, "Agent has been down for %s "
222+
"seconds." % down_time)
223+
213224
utils.wait_for_next_cycle(start_time, self.report_interval,
214225
LOG, readable_caller='AID-HB',
215226
notify_exceeding_timeout=False)

0 commit comments

Comments
 (0)