xorbitsai · UranusSeven · Jul 6, 2023 · Jul 5, 2023
diff --git a/plexar/core/resource.py b/plexar/core/resource.py
@@ -0,0 +1,46 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Dict
+
+from xorbits._mars import resource
+
+
+@dataclass
+class ResourceStatus:
+    """Snapshot of one compute device (the CPU or a single GPU) on a node."""
+
+    # NOTE(review): gather_node_info() fills this with a usage figure
+    # (cpu_percent / 100, gpu_usage / 100), not remaining capacity -- confirm
+    # which meaning consumers expect.
+    available: float
+    # Device count: cpu_count() for the CPU entry, 1 for each GPU entry.
+    total: float
+    # Available memory as reported by the resource module (unit not shown here).
+    memory_available: float
+    # Total memory as reported by the resource module (unit not shown here).
+    memory_total: float
+
+
+def gather_node_info() -> Dict[str, ResourceStatus]:
+    """Snapshot the resources of the CPU and of each CUDA card on this node.
+
+    Returns:
+        A dict keyed by ``"cpu"`` and ``"gpu-<index>"`` whose values are
+        ``ResourceStatus`` records built from the ``resource`` module.
+    """
+    host_memory = resource.virtual_memory()
+    # NOTE(review): ``available`` receives a usage figure (percent / 100) for
+    # both the CPU and the GPUs -- confirm that is what consumers expect.
+    statuses = {
+        "cpu": ResourceStatus(
+            available=resource.cpu_percent() / 100.0,
+            total=resource.cpu_count(),
+            memory_available=host_memory.available,
+            memory_total=host_memory.total,
+        )
+    }
+    # One entry per CUDA card, numbered in the order the cards are reported.
+    statuses.update(
+        (
+            f"gpu-{card_index}",
+            ResourceStatus(
+                available=card.gpu_usage / 100.0,
+                total=1,
+                memory_available=card.fb_mem_info.available,
+                memory_total=card.fb_mem_info.total,
+            ),
+        )
+        for card_index, card in enumerate(resource.cuda_card_stats())
+    )
+    return statuses
diff --git a/plexar/core/service.py b/plexar/core/service.py
@@ -12,17 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
+import time
+from dataclasses import dataclass
 from logging import getLogger
 from typing import Callable, Dict, List, Optional, Tuple
 
 import xoscar as xo
 
-from plexar.core import ModelActor
-from plexar.model import ModelSpec
+from ..core import ModelActor
+from ..model import ModelSpec
+from .resource import ResourceStatus, gather_node_info
 
 logger = getLogger(__name__)
 
 
+# Seconds without a status report after which the supervisor drops a worker.
+DEFAULT_NODE_DEAD_TIMEOUT = 30
+# Seconds a worker sleeps between two consecutive status reports.
+DEFAULT_NODE_CHECK_INTERVAL = 1
+
+
 def log(func: Callable):
     # TODO: support non-async function
     import time
@@ -41,16 +49,29 @@ async def wrapped(*args, **kwargs):
     return wrapped
 
 
+@dataclass
+class WorkerStatus:
+    """Latest resource report the supervisor holds for one worker."""
+
+    # time.time() value captured when the supervisor stored the report.
+    update_time: float
+    # Per-device records keyed by device name, as sent by the worker.
+    status: Dict[str, ResourceStatus]
+
+
 class SupervisorActor(xo.Actor):
     def __init__(self):
+        """Create the empty worker, model and worker-status registries."""
         super().__init__()
+        # worker address -> reference to the WorkerActor at that address
         self._worker_address_to_worker: Dict[str, xo.ActorRefType[WorkerActor]] = {}
+        # model uid -> WorkerActor reference
         self._model_uid_to_worker: Dict[str, xo.ActorRefType[WorkerActor]] = {}
+        # worker address -> most recent WorkerStatus reported by that worker
+        self._worker_status: Dict[str, WorkerStatus] = {}
 
     @classmethod
     def uid(cls) -> str:
+        """Return the fixed actor uid under which the supervisor is looked up."""
         return "plexar_supervisor"
 
+    async def __post_create__(self):
+        """Start the background loop that drops workers that stopped reporting."""
+        dead_node_checker = self._check_dead_nodes()
+        self._check_dead_nodes_task = asyncio.create_task(dead_node_checker)
+
+    async def __pre_destroy__(self):
+        """Cancel the dead-node checker task created in ``__post_create__``."""
+        checker_task = self._check_dead_nodes_task
+        checker_task.cancel()
+
     async def _choose_worker(self) -> xo.ActorRefType["WorkerActor"]:
         # TODO: better allocation strategy.
         min_running_model_count = None
@@ -93,6 +114,14 @@ async def launch_builtin_model(
 
         return model_ref
 
+    async def _check_dead_nodes(self):
+        """Periodically drop workers whose last status report is too old.
+
+        Runs until the task is cancelled. A worker counts as dead when more
+        than ``DEFAULT_NODE_DEAD_TIMEOUT`` seconds have passed since its last
+        report; it is then removed from both the status registry and the
+        worker registry.
+        """
+        while True:
+            now = time.time()
+            # Collect the addresses first: popping from the dict while
+            # iterating over it raises "RuntimeError: dictionary changed size
+            # during iteration" and would kill this task on the first eviction.
+            dead_addresses = [
+                address
+                for address, status in self._worker_status.items()
+                if now - status.update_time > DEFAULT_NODE_DEAD_TIMEOUT
+            ]
+            for address in dead_addresses:
+                self._worker_status.pop(address, None)
+                # A worker that was evicted and later resumed reporting is
+                # only present in ``_worker_status``, so tolerate a missing key.
+                self._worker_address_to_worker.pop(address, None)
+            await asyncio.sleep(5)
+
     @log
     async def terminate_model(self, model_uid: str):
         assert model_uid in self._model_uid_to_worker
@@ -122,11 +151,20 @@ async def add_worker(self, worker_address: str):
         worker_ref = await xo.actor_ref(address=worker_address, uid=WorkerActor.uid())
         self._worker_address_to_worker[worker_address] = worker_ref
 
+    @log
+    async def report_worker_status(
+        self, worker_address: str, status: Dict[str, ResourceStatus]
+    ):
+        """Store a worker's latest resource report, stamped with the current time."""
+        received_at = time.time()
+        snapshot = WorkerStatus(update_time=received_at, status=status)
+        self._worker_status[worker_address] = snapshot
+
 
 class WorkerActor(xo.Actor):
     def __init__(self, supervisor_address: str):
+        """Remember the supervisor address and create empty model registries."""
         super().__init__()
         self._supervisor_address = supervisor_address
+        # Resolved in __post_create__ through xo.actor_ref().
+        self._supervisor_ref = None
+        # model uid -> reference to the ModelActor
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
+        # model uid -> ModelSpec
         self._model_uid_to_model_spec: Dict[str, ModelSpec] = {}
 
@@ -135,10 +173,14 @@ def uid(cls) -> str:
         return "plexar_worker"
 
     async def __post_create__(self):
+        """Register with the supervisor, then start the periodic status reports."""
-        supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
+        self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
             address=self._supervisor_address, uid=SupervisorActor.uid()
         )
-        await supervisor_ref.add_worker(self.address)
+        await self._supervisor_ref.add_worker(self.address)
+        # Keep the task handle so that __pre_destroy__ can cancel it.
+        self._upload_task = asyncio.create_task(self._periodical_report_status())
+
+    async def __pre_destroy__(self):
+        """Cancel the periodic status-report task created in ``__post_create__``."""
+        reporter_task = self._upload_task
+        reporter_task.cancel()
 
     async def get_model_count(self) -> int:
+        """Return the number of models currently registered on this worker."""
         return len(self._model_uid_to_model)
@@ -206,3 +248,27 @@ async def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
         assert model_uid in self._model_uid_to_model
 
         return self._model_uid_to_model[model_uid]
+
+    async def report_status(self):
+        """Gather node info in a separate thread and send it to the supervisor."""
+        node_info = await asyncio.to_thread(gather_node_info)
+        supervisor = self._supervisor_ref
+        await supervisor.report_worker_status(self.address, node_info)
+
+    async def _periodical_report_status(self):
+        """Report status every ``DEFAULT_NODE_CHECK_INTERVAL`` seconds.
+
+        The loop ends when the task is cancelled or when the default executor
+        has been shut down. Any other failure is logged and retried after the
+        usual interval, so a transient error does not stop the reports.
+        """
+        while True:
+            try:
+                await self.report_status()
+            except asyncio.CancelledError:  # pragma: no cover
+                break
+            except RuntimeError as ex:  # pragma: no cover
+                if "cannot schedule new futures" in str(ex):
+                    # when atexit is triggered, the default pool might be shutdown
+                    # and to_thread will fail; nothing more can be reported
+                    break
+                # Unrelated RuntimeErrors must not silently stop the reports.
+                logger.error(f"Failed to upload node info: {ex}")
+            except Exception as ex:  # pragma: no cover
+                logger.error(f"Failed to upload node info: {ex}")
+            try:
+                await asyncio.sleep(DEFAULT_NODE_CHECK_INTERVAL)
+            except asyncio.CancelledError:  # pragma: no cover
+                break
diff --git a/setup.cfg b/setup.cfg
@@ -25,6 +25,7 @@ include_package_data = True
 packages = find:
 install_requires =
     xoscar
+    xorbits
     gradio
     click
     tqdm