46
46
DAEMON_LOOP_MAX_RETRIES = 5
47
47
HB_LOOP_MAX_WAIT = 60
48
48
HB_LOOP_MAX_RETRY = 10
49
+ DEADLOCK_TIME = 300
49
50
50
51
logging .register_options (aim_cfg .CONF )
51
52
@@ -111,6 +112,7 @@ def __init__(self, conf):
111
112
self .events = event_handler .EventHandler ().initialize (
112
113
self .conf_manager )
113
114
self .max_down_time = 4 * self .report_interval
115
+ self .daemon_loop_time = time .time ()
114
116
115
117
def daemon_loop (self ):
116
118
# Serve tenants the very first time regardless of the events received
@@ -201,6 +203,7 @@ def _reconciliation_cycle(self, serve=True):
201
203
LOG .info ("%s removing tenant from AID %s" %
202
204
(universe .name , tenant ))
203
205
universe .cleanup_state (aim_ctx , tenant )
206
+ self .daemon_loop_time = time .time ()
204
207
205
208
def _spawn_heartbeat_loop (self ):
206
209
utils .spawn_thread (self ._heartbeat_loop )
@@ -210,6 +213,14 @@ def _heartbeat_loop(self):
210
213
start_time = time .time ()
211
214
aim_ctx = context .AimContext (store = api .get_store ())
212
215
self ._send_heartbeat (aim_ctx )
216
+ # REVISIT: This code should be removed once we've
217
+ # removed all the locking in AID.
218
+ if start_time > self .daemon_loop_time :
219
+ down_time = start_time - self .daemon_loop_time
220
+ if down_time > DEADLOCK_TIME :
221
+ utils .perform_harakiri (LOG , "Agent has been down for %s "
222
+ "seconds." % down_time )
223
+
213
224
utils .wait_for_next_cycle (start_time , self .report_interval ,
214
225
LOG , readable_caller = 'AID-HB' ,
215
226
notify_exceeding_timeout = False )
0 commit comments