-
Notifications
You must be signed in to change notification settings - Fork 0
/
watchdog.py
136 lines (112 loc) · 4.08 KB
/
watchdog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Copyright IBM Corp, All Rights Reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
import time
import logging
from threading import Thread
from modules import host_handler, cluster_handler
from common import LOG_LEVEL, log_handler, NETWORK_STATUS_RUNNING
from mongoengine import connect
import os
# MongoDB connection settings; each one can be overridden through the
# environment variable of the same name.
MONGODB_HOST = os.getenv('MONGODB_HOST', 'mongo')
MONGODB_DB = os.getenv('MONGODB_DB', 'dev')
MONGODB_PORT = int(os.getenv('MONGODB_PORT', 27017))
MONGODB_USERNAME = os.getenv('MONGODB_USERNAME', '')
MONGODB_PASSWORD = os.getenv('MONGODB_PASSWORD', '')

# Register the mongoengine connection used by the handlers below.
# MONGODB_PORT is passed explicitly: it used to be parsed from the
# environment but never handed to connect(), so a non-default port was
# silently ignored.
connect(MONGODB_DB, host=MONGODB_HOST, port=MONGODB_PORT,
        username=MONGODB_USERNAME, password=MONGODB_PASSWORD,
        connect=False, tz_aware=True)

# Module logger, configured with the project-wide level and handler.
logger = logging.getLogger(__name__)
logger.setLevel(LOG_LEVEL)
logger.addHandler(log_handler)
def network_check_health(net_id, retries=60, period=5):
    """
    Check the health of one chain and reset it if it stays unhealthy.

    Only a chain whose status equals NETWORK_STATUS_RUNNING is checked.
    Its health is refreshed up to `retries` times, sleeping `period`
    seconds after every failed attempt. When no attempt succeeds, the
    chain is reset, but only if it is free (its "user_id" is the empty
    string).

    :param net_id: id of the chain
    :param retries: how many failed refreshes before the chain is treated
        as unhealthy
    :param period: seconds to wait between two retries
    :return: None
    """
    net = cluster_handler.get_by_id(net_id)
    if not net:
        logger.warning("Not find chain {}".format(net_id))
        return
    if net.get("status") != NETWORK_STATUS_RUNNING:  # check running one
        return

    net_name = net.get("name")
    logger.debug("Chain {}/{}: checking health".format(net_name, net_id))

    # free or used by user, then check its health
    for i in range(retries):
        if cluster_handler.refresh_health(net_id):  # chain is healthy
            return
        logger.debug("Health Check {}: cluster {}/{} is unhealthy!".format(
            i, net_name, net_id))
        time.sleep(period)

    logger.warning("Chain {}/{} is unhealthy!".format(net_name, net_id))

    # Re-read the chain: the retries above can take a long time, so it may
    # have been handed to a user or removed in the meantime. A missing
    # chain must not crash the check with an AttributeError.
    latest = cluster_handler.get_by_id(net_id)
    if not latest:
        logger.warning("Chain {}/{} not found any more, skip reset".format(
            net_name, net_id))
        return

    # only reset free chains
    if latest.get("user_id") == "":
        logger.info("Timeout....resetting free unhealthy chain {}/{}".format(
            net_name, net_id))
        cluster_handler.reset_free_one(net_id)
def host_check_networks(host_id):
    """
    Check the health of the running chains on behalf of a host.

    Every running chain is checked in its own thread. The caller waits at
    most 15 seconds for a check before moving on to the next chain; a
    check that needs longer keeps running in its thread.

    :param host_id: id of the host being checked
    :return: None
    """
    host = host_handler.get_by_id(host_id)
    if not host:
        # Same guard as network_check_health uses for a missing chain;
        # without it the attribute access below raises AttributeError.
        logger.warning("Not find host {}".format(host_id))
        return
    logger.debug("Host {}/{}: checking chains".format(
        host.name, host_id))

    # NOTE(review): the filter selects every running chain and does not
    # restrict the result to host_id -- confirm this is intended.
    clusters = cluster_handler.list(filter_data={
        "status": "running"})
    for c in clusters:  # concurrent health check is safe for multi-chains
        checker = Thread(target=network_check_health, args=(c.get("id"),))
        checker.start()
        checker.join(timeout=15)
def host_check_fillup(host_id):
    """
    Fill up one host when its auto-fill option is enabled.

    :param host_id: id of the host to check
    :return: None
    """
    host = host_handler.get_by_id(host_id)
    if not host:
        # The host may be gone by the time this runs; skip it instead of
        # failing on the attribute access below.
        logger.warning("Not find host {}".format(host_id))
        return
    if host.autofill:
        # Reuse the host fetched above rather than looking it up a second
        # time only to read its name.
        logger.info("Host {}/{}: checking auto-fillup".format(
            host.name, host_id))
        host_handler.fillup(host_id)
def host_check(host_id, retries=3, period=3):
    """
    Run check on specific host.

    The host status is refreshed up to `retries` times. On the first
    successful refresh the chains are checked, then, after waiting
    `period` seconds, the auto-fillup check runs and the function stops.
    Each failed refresh is followed by a `period` second wait.

    :param host_id: id of the checked host
    :param retries: how many retries before thinking it's inactive
    :param period: retry wait, in seconds
    :return: None
    """
    for _ in range(retries):
        if host_handler.refresh_status(host_id):  # host is active
            host = host_handler.get_by_id(host_id)
            if not host:
                # Do not let a vanished host break the check while
                # building the log message.
                logger.warning("Not find host {}".format(host_id))
                return
            logger.debug("Host {}/{} is active, start checking".format(
                host.name, host_id))
            host_check_networks(host_id)
            time.sleep(period)
            host_check_fillup(host_id)
            break
        time.sleep(period)
    else:
        # No refresh succeeded: report it instead of giving up silently.
        logger.warning("Host {} is not active after {} checks, skip".format(
            host_id, retries))
def watch_run(period=15):
    """
    Check every host again and again, forever.

    In each round every listed host is checked in its own thread; the loop
    waits at most twice `period` seconds for a host before moving on to
    the next one, then sleeps `period` seconds before the next round.

    :param period: Wait period between two checking, in seconds
    :return: never returns
    """
    join_timeout = 2 * period
    while True:
        logger.info("Watchdog run checks with period = %d s", period)
        all_hosts = list(host_handler.list())
        host_count = len(all_hosts)
        logger.info("Found {} hosts".format(host_count))
        for entry in all_hosts:  # operating on different host is safe
            checker = Thread(target=host_check, args=(entry.get("id"),))
            checker.start()
            checker.join(timeout=join_timeout)
        time.sleep(period)
# Script entry point: start the watchdog loop with its default period.
if __name__ == "__main__":
    watch_run()