From b5a817166b0e3528e6f00ee12ac0cde6c3321ff2 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 6 Feb 2024 12:06:04 +0100 Subject: [PATCH 01/39] add cpu monitoring --- pyproject.toml | 56 +++++++++++++++++++++++++----- src/dvclive/live.py | 13 ++++++- src/dvclive/system_metrics.py | 65 +++++++++++++++++++++++++++++++++++ src/dvclive/utils.py | 12 ++++++- 4 files changed, 136 insertions(+), 10 deletions(-) create mode 100644 src/dvclive/system_metrics.py diff --git a/pyproject.toml b/pyproject.toml index c2e4f16e..7e06166e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "gto", "ruamel.yaml", "scmrepo", + "psutil", ] [project.optional-dependencies] @@ -53,11 +54,7 @@ tests = [ "dvclive[image,plots,markdown]", "ipython", ] -dev = [ - "dvclive[all,tests]", - "mypy>=1.1.1", - "types-PyYAML", -] +dev = ["dvclive[all,tests]", "mypy>=1.1.1", "types-PyYAML"] mmcv = ["mmcv"] tf = ["tensorflow"] xgb = ["xgboost"] @@ -68,7 +65,7 @@ fastai = ["fastai"] lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"] optuna = ["optuna"] all = [ - "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]" + "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]", ] [project.urls] @@ -136,11 +133,54 @@ ignore-words-list = "fpr" [tool.ruff] ignore = ["N818", "UP006", "UP007", "UP035", "UP038", "B905", "PGH003"] -select = ["F", "E", "W", "C90", "N", "UP", "YTT", "S", "BLE", "B", "A", "C4", "T10", "EXE", "ISC", "INP", "PIE", "T20", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TID", "TCH", "INT", "ARG", "PGH", "PL", "TRY", "NPY", "RUF"] +select = [ + "F", + "E", + "W", + "C90", + "N", + "UP", + "YTT", + "S", + "BLE", + "B", + "A", + "C4", + "T10", + "EXE", + "ISC", + "INP", + "PIE", + "T20", + "PT", + "Q", + "RSE", + "RET", + "SLF", + "SIM", + "TID", + "TCH", + "INT", + "ARG", + "PGH", + "PL", + "TRY", + "NPY", + "RUF", +] [tool.ruff.per-file-ignores] "noxfile.py" = ["D", "PTH"] -"tests/*" = ["S101", "INP001", "SLF001", "ARG001", "ARG002", "ARG005", "PLR2004", "NPY002"] +"tests/*" = [ + "S101", + "INP001", + "SLF001", + "ARG001", + "ARG002", + "ARG005", + "PLR2004", + "NPY002", +] [tool.ruff.pylint] max-args = 10 diff --git a/src/dvclive/live.py b/src/dvclive/live.py index ce8d5357..abac3443 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -8,7 +8,7 @@ import shutil import tempfile from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Union, TYPE_CHECKING +from typing import Any, Dict, List, Optional, Set, Callable, Union, TYPE_CHECKING if TYPE_CHECKING: import numpy as np @@ -37,6 +37,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio +from .system_metrics import CPUMetricsCallback from .utils import ( StrPath, catch_and_warn, @@ -75,6 +76,7 @@ def __init__( cache_images: bool = False, exp_name: Optional[str] = None, exp_message: Optional[str] = None, + callbacks: Optional[List[Callable]] = None, ): self.summary: Dict[str, Any] = {} @@ -120,6 +122,10 @@ def __init__( self._dvc_studio_config: Dict[str, Any] = {} self._init_studio() + self._callbacks = callbacks or [CPUMetricsCallback()] + for callback in self._callbacks: + callback(self) + def _init_resume(self): self._read_params() self.summary = self.read_latest() @@ -589,6 +595,11 @@ def end(self): # If next_step called before end, don't want to update step number if "step" in self.summary: self.step = self.summary["step"] + + # Kill all independent threads that can be found in callbacks + for callback in self._callbacks: + callback.end() + self.sync() if self._inside_dvc_exp and self._dvc_repo: diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py new file mode 100644 index 00000000..63e09aa6 --- /dev/null +++ b/src/dvclive/system_metrics.py @@ -0,0 +1,65 @@ +import logging +from typing import Dict, Union + +import psutil +from statistics import mean +from threading import Event, Thread + +from .utils import append_dict + +logger = logging.getLogger("dvclive") +MEGABYTES_DIVIDER = 1024.0**2 + + +class CPUMetricsCallback: + def __init__( + self, + duration: Union[int, float] = 30, + interval: Union[int, float] = 2.0, + plot: bool = True, + ): + self.duration = duration + self.interval = interval + self.plot = plot + + def __call__(self, live): + self._live = live + self._shutdown_event = Event() + Thread( + target=self.monitoring_loop, + ).start() + + def monitoring_loop(self): + while not self._shutdown_event.is_set(): + self._cpu_metrics = {} + for _ in range(int(self.duration // self.interval)): + last_cpu_metrics = {} + try: + last_cpu_metrics = get_cpus_metrics() + except psutil.Error: + logger.exception("Failed to monitor CPU metrics:") + self._cpu_metrics = append_dict(self._cpu_metrics, last_cpu_metrics) + + for metric_name, metric_values in self._cpu_metrics.items(): + self._live.log_metric( + metric_name, mean(metric_values), timestamp=True, plot=self.plot + ) + self._shutdown_event.wait(self.interval) + if self._shutdown_event.is_set(): + break + + def end(self): + self._shutdown_event.set() + + +def get_cpus_metrics() -> Dict[str, Union[float, int]]: + ram_info = psutil.virtual_memory() + io_info = psutil.disk_io_counters() + return { + "system/cpu/usage_percent": psutil.cpu_percent(), + "system/cpu/ram_usage_percent": ram_info.percent, + "system/io/read_speed_MB": io_info.read_bytes + / (io_info.read_time * MEGABYTES_DIVIDER), + "system/io/write_speed_MB": io_info.write_bytes + / (io_info.write_time * MEGABYTES_DIVIDER), + } diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py index a168de9e..5c23f490 100644 --- a/src/dvclive/utils.py +++ b/src/dvclive/utils.py @@ -6,7 +6,7 @@ import shutil from pathlib import Path from platform import uname -from typing import Union, List, Dict, TYPE_CHECKING +from typing import Any, Union, List, Dict, TYPE_CHECKING import webbrowser from .error import InvalidDataTypeError @@ -245,3 +245,13 @@ def convert_datapoints_to_list_of_dicts( # Raise an error if the input is not a supported type raise InvalidDataTypeError("datapoints", type(datapoints)) + + +def append_dict(previous_values: Dict[str, List[Any]], new_value: Dict[str, Any]): + return { + **previous_values, + **{ + metric_name: [*previous_values.get(metric_name, []), value] + for metric_name, value in new_value.items() + }, + } From e3b654cfb793d2d76f563becb877e17b7af139f2 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 6 Feb 2024 16:39:20 +0100 Subject: [PATCH 02/39] add unit tests and more cpu metrics --- src/dvclive/system_metrics.py | 31 +++++++-- tests/test_system_metrics.py | 126 ++++++++++++++++++++++++++++++++++ tests/test_utils.py | 24 ++++++- 3 files changed, 174 insertions(+), 7 deletions(-) create mode 100644 tests/test_system_metrics.py diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index 63e09aa6..44f4c0cb 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -10,6 +10,8 @@ logger = logging.getLogger("dvclive") MEGABYTES_DIVIDER = 1024.0**2 +MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30 + class CPUMetricsCallback: def __init__( @@ -21,6 +23,7 @@ def __init__( self.duration = duration self.interval = interval self.plot = plot + self._no_plot_metrics = ["system/cpu/count"] def __call__(self, live): self._live = live @@ -39,14 +42,18 @@ def monitoring_loop(self): except psutil.Error: logger.exception("Failed to monitor CPU metrics:") self._cpu_metrics = append_dict(self._cpu_metrics, last_cpu_metrics) - + self._shutdown_event.wait(self.interval) + if self._shutdown_event.is_set(): + break for metric_name, metric_values in self._cpu_metrics.items(): self._live.log_metric( - metric_name, mean(metric_values), timestamp=True, plot=self.plot + metric_name, + mean(metric_values), + timestamp=True, + plot=self.plot + if metric_name not in self._no_plot_metrics + else False, ) - self._shutdown_event.wait(self.interval) - if self._shutdown_event.is_set(): - break def end(self): self._shutdown_event.set() @@ -55,8 +62,20 @@ def end(self): def get_cpus_metrics() -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() io_info = psutil.disk_io_counters() + nb_cpus = psutil.cpu_count() + cpus_percent = psutil.cpu_percent(percpu=True) return { - "system/cpu/usage_percent": psutil.cpu_percent(), + "system/cpu/usage_avg_percent": mean(cpus_percent), + "system/cpu/usage_max_percent": max(cpus_percent), + "system/cpu/count": nb_cpus, + "system/cpu/parallelism_percent": len( + [ + percent + for percent in cpus_percent + if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE + ] + ) + / nb_cpus, "system/cpu/ram_usage_percent": ram_info.percent, "system/io/read_speed_MB": io_info.read_bytes / (io_info.read_time * MEGABYTES_DIVIDER), diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py new file mode 100644 index 00000000..cfe9fcd9 --- /dev/null +++ b/tests/test_system_metrics.py @@ -0,0 +1,126 @@ +import time + +import pytest + +from dvclive import Live +from dvclive.system_metrics import CPUMetricsCallback, get_cpus_metrics +from dvclive.utils import parse_metrics + + +def mock_psutil(mocker): + mocker.patch( + "dvclive.system_metrics.psutil.cpu_percent", + return_value=[10, 20, 30, 40, 50, 60], + ) + mocker.patch("dvclive.system_metrics.psutil.cpu_count", return_value=6) + + mocked_virtual_memory = mocker.MagicMock() + mocked_virtual_memory.percent = 20 + + mocked_disk_io_counters = mocker.MagicMock() + mocked_disk_io_counters.read_bytes = 2 * 1024**2 + mocked_disk_io_counters.read_time = 1 + mocked_disk_io_counters.write_bytes = 2 * 1024**2 + mocked_disk_io_counters.write_time = 1 + + mocking_dict = { + "virtual_memory": mocked_virtual_memory, + "disk_io_counters": mocked_disk_io_counters, + } + for function_name, return_value in mocking_dict.items(): + mocker.patch( + f"dvclive.system_metrics.psutil.{function_name}", + return_value=return_value, + ) + + +@pytest.mark.parametrize( + ("metric_name"), + [ + "system/cpu/usage_avg_percent", + "system/cpu/usage_max_percent", + "system/cpu/count", + "system/cpu/parallelism_percent", + "system/cpu/ram_usage_percent", + "system/io/read_speed_MB", + "system/io/write_speed_MB", + ], +) +def test_get_cpus_metrics(mocker, metric_name): + mock_psutil(mocker) + metrics = get_cpus_metrics() + assert metric_name in metrics + + +@pytest.mark.parametrize( + ("duration", "interval", "plot"), + [ + (1, 0.5, True), + (2.0, 1, True), + ], +) +def test_cpumetricscallback_with_plot(duration, interval, plot): + with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live: + time.sleep(duration * 2) + live.next_step() + time.sleep(duration * 2 + 0.1) # allow the thread to finish + history, latest = parse_metrics(live) + dir_path = live.dir + + assert "system" in latest + assert "cpu" in latest["system"] + assert "usage_avg_percent" in latest["system"]["cpu"] + assert "usage_max_percent" in latest["system"]["cpu"] + assert "count" in latest["system"]["cpu"] + assert "parallelism_percent" in latest["system"]["cpu"] + assert "ram_usage_percent" in latest["system"]["cpu"] + assert "io" in latest["system"] + assert "read_speed_MB" in latest["system"]["io"] + assert "write_speed_MB" in latest["system"]["io"] + + prefix = f"{dir_path}/plots/metrics/system" + assert f"{prefix}/cpu/ram_usage_percent.tsv" in history + assert f"{prefix}/cpu/usage_avg_percent.tsv" in history + assert f"{prefix}/cpu/usage_max_percent.tsv" in history + assert f"{prefix}/cpu/parallelism_percent.tsv" in history + assert f"{prefix}/io/write_speed_MB.tsv" in history + assert f"{prefix}/io/read_speed_MB.tsv" in history + assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4 + + assert f"{prefix}/cpu/count.tsv" not in history # no plot for count + + +@pytest.mark.parametrize( + ("duration", "interval", "plot"), + [ + (1, 0.5, False), + (2.0, 1, False), + ], +) +def test_cpumetricscallback_without_plot(duration, interval, plot): + with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live: + time.sleep(duration * 2) + live.next_step() + time.sleep(duration * 2 + 0.1) # allow the thread to finish + history, latest = parse_metrics(live) + dir_path = live.dir + + assert "system" in latest + assert "cpu" in latest["system"] + assert "usage_avg_percent" in latest["system"]["cpu"] + assert "usage_max_percent" in latest["system"]["cpu"] + assert "count" in latest["system"]["cpu"] + assert "parallelism_percent" in latest["system"]["cpu"] + assert "ram_usage_percent" in latest["system"]["cpu"] + assert "io" in latest["system"] + assert "read_speed_MB" in latest["system"]["io"] + assert "write_speed_MB" in latest["system"]["io"] + + prefix = f"{dir_path}/plots/metrics/system" + assert f"{prefix}/cpu/usage_avg_percent.tsv" not in history + assert f"{prefix}/cpu/usage_max_percent.tsv" not in history + assert f"{prefix}/cpu/count.tsv" not in history + assert f"{prefix}/cpu/parallelism_percent.tsv" not in history + assert f"{prefix}/cpu/ram_usage_percent.tsv" not in history + assert f"{prefix}/cpu/write_speed_MB.tsv" not in history + assert f"{prefix}/cpu/read_speed_MB.tsv" not in history diff --git a/tests/test_utils.py b/tests/test_utils.py index 2b5d56ca..ab782a29 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,11 @@ import pandas as pd import pytest -from dvclive.utils import standardize_metric_name, convert_datapoints_to_list_of_dicts +from dvclive.utils import ( + standardize_metric_name, + convert_datapoints_to_list_of_dicts, + append_dict, +) from dvclive.error import InvalidDataTypeError @@ -45,3 +49,21 @@ def test_unsupported_format(): convert_datapoints_to_list_of_dicts("unsupported data format") assert "not supported type" in str(exc_info.value) + + +@pytest.mark.parametrize( + ("aggregate", "new_metrics", "result"), + [ + ({"a": [1], "b": [0.8]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [0.8, 1.6]}), + ({"a": [1]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [1.6]}), + ( + {"a": [1], "b": [0.8], "c": [2]}, + {"a": 2, "b": 1.6}, + {"a": [1, 2], "b": [0.8, 1.6], "c": [2]}, + ), + ({}, {"a": 2, "b": 1.6}, {"a": [2], "b": [1.6]}), + ({}, {}, {}), + ], +) +def test_append_dict(aggregate, new_metrics, result): + assert append_dict(aggregate, new_metrics) == result From e6cff3254875eb11a2464fc2d55edbc8a64aa21a Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 6 Feb 2024 16:44:01 +0100 Subject: [PATCH 03/39] change default value for callback --- src/dvclive/system_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index 44f4c0cb..3ffe4a42 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -16,8 +16,8 @@ class CPUMetricsCallback: def __init__( self, - duration: Union[int, float] = 30, - interval: Union[int, float] = 2.0, + duration: Union[int, float] = 5, + interval: Union[int, float] = 1.0, plot: bool = True, ): self.duration = duration From 8663011531007078a88599e325fa9261e2ea656e Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 6 Feb 2024 16:48:04 +0100 Subject: [PATCH 04/39] uses a percentage value for cpu parallelism --- src/dvclive/system_metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index 3ffe4a42..7bac6456 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -75,6 +75,7 @@ def get_cpus_metrics() -> Dict[str, Union[float, int]]: if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE ] ) + * 100 / nb_cpus, "system/cpu/ram_usage_percent": ram_info.percent, "system/io/read_speed_MB": io_info.read_bytes From 134675027e3caec95c4742ee0cf83e02df207699 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 6 Feb 2024 17:01:47 +0100 Subject: [PATCH 05/39] add ram total --- src/dvclive/system_metrics.py | 2 ++ tests/test_system_metrics.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index 7bac6456..00f0658b 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -9,6 +9,7 @@ logger = logging.getLogger("dvclive") MEGABYTES_DIVIDER = 1024.0**2 +GIGABYTES_DIVIDER = 1024.0**3 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30 @@ -78,6 +79,7 @@ def get_cpus_metrics() -> Dict[str, Union[float, int]]: * 100 / nb_cpus, "system/cpu/ram_usage_percent": ram_info.percent, + "system/cpu/ram_total_GB": ram_info.total / GIGABYTES_DIVIDER, "system/io/read_speed_MB": io_info.read_bytes / (io_info.read_time * MEGABYTES_DIVIDER), "system/io/write_speed_MB": io_info.write_bytes diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index cfe9fcd9..e5e8ed6d 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -16,6 +16,7 @@ def mock_psutil(mocker): mocked_virtual_memory = mocker.MagicMock() mocked_virtual_memory.percent = 20 + mocked_virtual_memory.total = 4 * 1024**3 mocked_disk_io_counters = mocker.MagicMock() mocked_disk_io_counters.read_bytes = 2 * 1024**2 @@ -42,6 +43,7 @@ def mock_psutil(mocker): "system/cpu/count", "system/cpu/parallelism_percent", "system/cpu/ram_usage_percent", + "system/cpu/ram_total_GB", "system/io/read_speed_MB", "system/io/write_speed_MB", ], @@ -74,15 +76,17 @@ def test_cpumetricscallback_with_plot(duration, interval, plot): assert "count" in latest["system"]["cpu"] assert "parallelism_percent" in latest["system"]["cpu"] assert "ram_usage_percent" in latest["system"]["cpu"] + assert "ram_total_GB" in latest["system"]["cpu"] assert "io" in latest["system"] assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] prefix = f"{dir_path}/plots/metrics/system" - assert f"{prefix}/cpu/ram_usage_percent.tsv" in history assert f"{prefix}/cpu/usage_avg_percent.tsv" in history assert f"{prefix}/cpu/usage_max_percent.tsv" in history assert f"{prefix}/cpu/parallelism_percent.tsv" in history + assert f"{prefix}/cpu/ram_usage_percent.tsv" in history + assert f"{prefix}/cpu/ram_total_GB.tsv" in history assert f"{prefix}/io/write_speed_MB.tsv" in history assert f"{prefix}/io/read_speed_MB.tsv" in history assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4 @@ -112,6 +116,7 @@ def test_cpumetricscallback_without_plot(duration, interval, plot): assert "count" in latest["system"]["cpu"] assert "parallelism_percent" in latest["system"]["cpu"] assert "ram_usage_percent" in latest["system"]["cpu"] + assert "ram_total_GB" in latest["system"]["cpu"] assert "io" in latest["system"] assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] @@ -122,5 +127,6 @@ def test_cpumetricscallback_without_plot(duration, interval, plot): assert f"{prefix}/cpu/count.tsv" not in history assert f"{prefix}/cpu/parallelism_percent.tsv" not in history assert f"{prefix}/cpu/ram_usage_percent.tsv" not in history + assert f"{prefix}/cpu/ram_total_GB.tsv" not in history assert f"{prefix}/cpu/write_speed_MB.tsv" not in history assert f"{prefix}/cpu/read_speed_MB.tsv" not in history From 5f90bea9262bb9cb830cb3471344b480e9cd2b7f Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 6 Feb 2024 17:06:01 +0100 Subject: [PATCH 06/39] remove total ram measure from plots --- src/dvclive/system_metrics.py | 2 +- tests/test_system_metrics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index 00f0658b..b9b27161 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -24,7 +24,7 @@ def __init__( self.duration = duration self.interval = interval self.plot = plot - self._no_plot_metrics = ["system/cpu/count"] + self._no_plot_metrics = ["system/cpu/count", "system/cpu/ram_total_GB"] def __call__(self, live): self._live = live diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index e5e8ed6d..ae7c24b7 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -86,12 +86,12 @@ def test_cpumetricscallback_with_plot(duration, interval, plot): assert f"{prefix}/cpu/usage_max_percent.tsv" in history assert f"{prefix}/cpu/parallelism_percent.tsv" in history assert f"{prefix}/cpu/ram_usage_percent.tsv" in history - assert f"{prefix}/cpu/ram_total_GB.tsv" in history assert f"{prefix}/io/write_speed_MB.tsv" in history assert f"{prefix}/io/read_speed_MB.tsv" in history assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4 assert f"{prefix}/cpu/count.tsv" not in history # no plot for count + assert f"{prefix}/cpu/ram_total_GB.tsv" not in history @pytest.mark.parametrize( From 2ac50f2e5be954d03108259dcd78f3d64c587179 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Wed, 7 Feb 2024 09:15:18 +0100 Subject: [PATCH 07/39] update pyproject.toml --- pyproject.toml | 56 ++++++++------------------------------------------ 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7e06166e..d5fb01b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,11 @@ tests = [ "dvclive[image,plots,markdown]", "ipython", ] -dev = ["dvclive[all,tests]", "mypy>=1.1.1", "types-PyYAML"] +dev = [ + "dvclive[all,tests]", + "mypy>=1.1.1", + "types-PyYAML", +] mmcv = ["mmcv"] tf = ["tensorflow"] xgb = ["xgboost"] @@ -65,7 +69,7 @@ fastai = ["fastai"] lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"] optuna = ["optuna"] all = [ - "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]", + "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]" ] [project.urls] @@ -133,54 +137,12 @@ ignore-words-list = "fpr" [tool.ruff] ignore = ["N818", "UP006", "UP007", "UP035", "UP038", "B905", "PGH003"] -select = [ - "F", - "E", - "W", - "C90", - "N", - "UP", - "YTT", - "S", - "BLE", - "B", - "A", - "C4", - "T10", - "EXE", - "ISC", - "INP", - "PIE", - "T20", - "PT", - "Q", - "RSE", - "RET", - "SLF", - "SIM", - "TID", - "TCH", - "INT", - "ARG", - "PGH", - "PL", - "TRY", - "NPY", - "RUF", -] +select = ["F", "E", "W", "C90", "N", "UP", "YTT", "S", "BLE", "B", "A", "C4", "T10", "EXE", "ISC", "INP", "PIE", "T20", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TID", "TCH", "INT", "ARG", "PGH", "PL", "TRY", "NPY", "RUF"] [tool.ruff.per-file-ignores] "noxfile.py" = ["D", "PTH"] -"tests/*" = [ - "S101", - "INP001", - "SLF001", - "ARG001", - "ARG002", - "ARG005", - "PLR2004", - "NPY002", -] +"tests/*" = ["S101", "INP001", "SLF001", "ARG001", "ARG002", "ARG005", "PLR2004", "NPY002"] [tool.ruff.pylint] max-args = 10 + From 5c0a2882c81d8696fb68b6841c8e5355978522d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 7 Feb 2024 08:15:37 +0000 Subject: [PATCH 08/39] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d5fb01b1..d6f939df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -145,4 +145,3 @@ select = ["F", "E", "W", "C90", "N", "UP", "YTT", "S", "BLE", "B", "A", "C4", "T [tool.ruff.pylint] max-args = 10 - From 30315bb327ee9da076c64b65ae88d0b09b4f4a63 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Wed, 7 Feb 2024 17:07:06 +0100 Subject: [PATCH 09/39] add tmpdir to metrics tests --- tests/test_system_metrics.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index ae7c24b7..61c0ed63 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -61,13 +61,16 @@ def test_get_cpus_metrics(mocker, metric_name): (2.0, 1, True), ], ) -def test_cpumetricscallback_with_plot(duration, interval, plot): - with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live: +def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir): + with Live( + tmpdir, + save_dvc_exp=False, + callbacks=[CPUMetricsCallback(duration, interval, plot)], + ) as live: time.sleep(duration * 2) live.next_step() time.sleep(duration * 2 + 0.1) # allow the thread to finish history, latest = parse_metrics(live) - dir_path = live.dir assert "system" in latest assert "cpu" in latest["system"] @@ -81,7 +84,7 @@ def test_cpumetricscallback_with_plot(duration, interval, plot): assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] - prefix = f"{dir_path}/plots/metrics/system" + prefix = f"{tmpdir}/plots/metrics/system" assert f"{prefix}/cpu/usage_avg_percent.tsv" in history assert f"{prefix}/cpu/usage_max_percent.tsv" in history assert f"{prefix}/cpu/parallelism_percent.tsv" in history @@ -101,13 +104,16 @@ def test_cpumetricscallback_with_plot(duration, interval, plot): (2.0, 1, False), ], ) -def test_cpumetricscallback_without_plot(duration, interval, plot): - with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live: +def test_cpumetricscallback_without_plot(duration, interval, plot, tmpdir): + with Live( + tmpdir, + save_dvc_exp=False, + callbacks=[CPUMetricsCallback(duration, interval, plot)], + ) as live: time.sleep(duration * 2) live.next_step() time.sleep(duration * 2 + 0.1) # allow the thread to finish history, latest = parse_metrics(live) - dir_path = live.dir assert "system" in latest assert "cpu" in latest["system"] @@ -121,7 +127,7 @@ def test_cpumetricscallback_without_plot(duration, interval, plot): assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] - prefix = f"{dir_path}/plots/metrics/system" + prefix = f"{tmpdir}/plots/metrics/system" assert f"{prefix}/cpu/usage_avg_percent.tsv" not in history assert f"{prefix}/cpu/usage_max_percent.tsv" not in history assert f"{prefix}/cpu/count.tsv" not in history From 40686c7ea0a90cd710b78b865fc8056eb30dff50 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Wed, 7 Feb 2024 17:31:54 +0100 Subject: [PATCH 10/39] default to no monitoring callbacks --- src/dvclive/live.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 7ebcc289..beeba117 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -51,7 +51,6 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio -from .system_metrics import CPUMetricsCallback from .utils import ( StrPath, catch_and_warn, @@ -136,7 +135,7 @@ def __init__( self._dvc_studio_config: Dict[str, Any] = {} self._init_studio() - self._callbacks = callbacks or [CPUMetricsCallback()] + self._callbacks = callbacks or [] for callback in self._callbacks: callback(self) From 4b9814486bf02856159d7967833c8f5e8bacf987 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 9 Feb 2024 11:57:07 +0100 Subject: [PATCH 11/39] fix tmp_dir for test on windows and macos --- tests/test_system_metrics.py | 49 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index 61c0ed63..819ebd5e 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -61,16 +61,16 @@ def test_get_cpus_metrics(mocker, metric_name): (2.0, 1, True), ], ) -def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir): +def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot): with Live( - tmpdir, + tmp_dir, save_dvc_exp=False, callbacks=[CPUMetricsCallback(duration, interval, plot)], ) as live: time.sleep(duration * 2) live.next_step() time.sleep(duration * 2 + 0.1) # allow the thread to finish - history, latest = parse_metrics(live) + timeseries, latest = parse_metrics(live) assert "system" in latest assert "cpu" in latest["system"] @@ -84,17 +84,17 @@ def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir): assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] - prefix = f"{tmpdir}/plots/metrics/system" - assert f"{prefix}/cpu/usage_avg_percent.tsv" in history - assert f"{prefix}/cpu/usage_max_percent.tsv" in history - assert f"{prefix}/cpu/parallelism_percent.tsv" in history - assert f"{prefix}/cpu/ram_usage_percent.tsv" in history - assert f"{prefix}/io/write_speed_MB.tsv" in history - assert f"{prefix}/io/read_speed_MB.tsv" in history - assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4 + assert any("system/cpu/usage_avg_percent.tsv" in key for key in timeseries) + assert any("system/cpu/usage_max_percent.tsv" in key for key in timeseries) + assert any("system/cpu/parallelism_percent.tsv" in key for key in timeseries) + assert any("system/cpu/ram_usage_percent.tsv" in key for key in timeseries) + assert any("system/io/write_speed_MB.tsv" in key for key in timeseries) + assert any("system/io/read_speed_MB.tsv" in key for key in timeseries) + assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key) - assert f"{prefix}/cpu/count.tsv" not in history # no plot for count - assert f"{prefix}/cpu/ram_total_GB.tsv" not in history + # not plot for fix values + assert all("system/cpu/count.tsv" not in key for key in timeseries) + assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries) @pytest.mark.parametrize( @@ -104,16 +104,16 @@ def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir): (2.0, 1, False), ], ) -def test_cpumetricscallback_without_plot(duration, interval, plot, tmpdir): +def test_cpumetricscallback_without_plot(tmp_dir, duration, interval, plot): with Live( - tmpdir, + tmp_dir, save_dvc_exp=False, callbacks=[CPUMetricsCallback(duration, interval, plot)], ) as live: time.sleep(duration * 2) live.next_step() time.sleep(duration * 2 + 0.1) # allow the thread to finish - history, latest = parse_metrics(live) + timeseries, latest = parse_metrics(live) assert "system" in latest assert "cpu" in latest["system"] @@ -127,12 +127,11 @@ def test_cpumetricscallback_without_plot(duration, interval, plot, tmpdir): assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] - prefix = f"{tmpdir}/plots/metrics/system" - assert f"{prefix}/cpu/usage_avg_percent.tsv" not in history - assert f"{prefix}/cpu/usage_max_percent.tsv" not in history - assert f"{prefix}/cpu/count.tsv" not in history - assert f"{prefix}/cpu/parallelism_percent.tsv" not in history - assert f"{prefix}/cpu/ram_usage_percent.tsv" not in history - assert f"{prefix}/cpu/ram_total_GB.tsv" not in history - assert f"{prefix}/cpu/write_speed_MB.tsv" not in history - assert f"{prefix}/cpu/read_speed_MB.tsv" not in history + assert all("system/cpu/usage_avg_percent.tsv" not in key for key in timeseries) + assert all("system/cpu/usage_max_percent.tsv" not in key for key in timeseries) + assert all("system/cpu/count.tsv" not in key for key in timeseries) + assert all("system/cpu/parallelism_percent.tsv" not in key for key in timeseries) + assert all("system/cpu/ram_usage_percent.tsv" not in key for key in timeseries) + assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries) + assert all("system/cpu/write_speed_MB.tsv" not in key for key in timeseries) + assert all("system/cpu/read_speed_MB.tsv" not in key for key in timeseries) From 36fd94dc2041f67dac8e5ef52e6d4ac1345bfdf4 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 9 Feb 2024 14:08:51 +0100 Subject: [PATCH 12/39] fix tmp_dir for test on windows and macos --- tests/test_system_metrics.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index 819ebd5e..6d91d62d 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -84,17 +84,17 @@ def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot): assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] - assert any("system/cpu/usage_avg_percent.tsv" in key for key in timeseries) - assert any("system/cpu/usage_max_percent.tsv" in key for key in timeseries) - assert any("system/cpu/parallelism_percent.tsv" in key for key in timeseries) - assert any("system/cpu/ram_usage_percent.tsv" in key for key in timeseries) - assert any("system/io/write_speed_MB.tsv" in key for key in timeseries) - assert any("system/io/read_speed_MB.tsv" in key for key in timeseries) - assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key) + assert any("usage_avg_percent" in key for key in timeseries) + assert any("usage_max_percent.tsv" in key for key in timeseries) + assert any("parallelism_percent.tsv" in key for key in timeseries) + assert any("ram_usage_percent.tsv" in key for key in timeseries) + assert any("write_speed_MB.tsv" in key for key in timeseries) + assert any("read_speed_MB.tsv" in key for key in timeseries) + # assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key) - # not plot for fix values - assert all("system/cpu/count.tsv" not in key for key in timeseries) - assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries) + # not plot for constant values + assert all("count.tsv" not in key for key in timeseries) + assert all("ram_total_GB.tsv" not in key for key in timeseries) @pytest.mark.parametrize( @@ -127,11 +127,11 @@ def test_cpumetricscallback_without_plot(tmp_dir, duration, interval, plot): assert "read_speed_MB" in latest["system"]["io"] assert "write_speed_MB" in latest["system"]["io"] - assert all("system/cpu/usage_avg_percent.tsv" not in key for key in timeseries) - assert all("system/cpu/usage_max_percent.tsv" not in key for key in timeseries) - assert all("system/cpu/count.tsv" not in key for key in timeseries) - assert all("system/cpu/parallelism_percent.tsv" not in key for key in timeseries) - assert all("system/cpu/ram_usage_percent.tsv" not in key for key in timeseries) - assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries) - assert all("system/cpu/write_speed_MB.tsv" not in key for key in timeseries) - assert all("system/cpu/read_speed_MB.tsv" not in key for key in timeseries) + assert all("usage_avg_percent.tsv" not in key for key in timeseries) + assert all("usage_max_percent.tsv" not in key for key in timeseries) + assert all("count.tsv" not in key for key in timeseries) + assert all("parallelism_percent.tsv" not in key for key in timeseries) + assert all("ram_usage_percent.tsv" not in key for key in timeseries) + assert all("ram_total_GB.tsv" not in key for key in timeseries) + assert all("write_speed_MB.tsv" not in key for key in timeseries) + assert all("read_speed_MB.tsv" not in key for key in timeseries) From 59db522626b1f8a535ead622aa0578e0813fbc4a Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 9 Feb 2024 14:45:16 +0100 Subject: [PATCH 13/39] fix update data to studio live experiment --- src/dvclive/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py index 0c293c4d..d487ae43 100644 --- a/src/dvclive/utils.py +++ b/src/dvclive/utils.py @@ -111,7 +111,7 @@ def parse_metrics(live): metrics_path = Path(live.plots_dir) / Metric.subfolder history = {} for suffix in Metric.suffixes: - for scalar_file in metrics_path.rglob(f"*{suffix}"): + for scalar_file in metrics_path.rglob(f"**/*{suffix}"): history[str(scalar_file)] = parse_tsv(scalar_file) latest = parse_json(live.metrics_file) return history, latest From 2393fb0af41c40a103f62d17cb79c2f138f5f52e Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 9 Feb 2024 14:54:32 +0100 Subject: [PATCH 14/39] fix studio update data problem --- src/dvclive/studio.py | 5 ++--- src/dvclive/utils.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py index f74db7ab..90c1ed10 100644 --- a/src/dvclive/studio.py +++ b/src/dvclive/studio.py @@ -14,7 +14,7 @@ def _get_unsent_datapoints(plot, latest_step): - return [x for x in plot if int(x["step"]) > latest_step] + return [x for x in plot if int(x["step"]) >= latest_step] def _cast_to_numbers(datapoints): @@ -53,7 +53,7 @@ def _adapt_images(live): return { _adapt_path(live, image.output_path): {"image": _adapt_image(image.output_path)} for image in live._images.values() - if image.step > live._latest_studio_step + if image.step >= live._latest_studio_step } @@ -76,7 +76,6 @@ def get_studio_updates(live): for name, plot in plots.items() } plots = {k: {"data": v} for k, v in plots.items()} - plots.update(_adapt_images(live)) return metrics, params, plots diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py index d487ae43..0c293c4d 100644 --- a/src/dvclive/utils.py +++ b/src/dvclive/utils.py @@ -111,7 +111,7 @@ def parse_metrics(live): metrics_path = Path(live.plots_dir) / Metric.subfolder history = {} for suffix in Metric.suffixes: - for scalar_file in metrics_path.rglob(f"**/*{suffix}"): + for scalar_file in metrics_path.rglob(f"*{suffix}"): history[str(scalar_file)] = parse_tsv(scalar_file) latest = parse_json(live.metrics_file) return history, latest From 3b327c5c7e74b4eccc24a8efd425208039e53759 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 9 Feb 2024 17:25:58 +0100 Subject: [PATCH 15/39] debug studio updates --- src/dvclive/live.py | 5 +++++ src/dvclive/studio.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index beeba117..18867279 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -362,6 +362,7 @@ def sync(self): self.make_report() + logger.warning(f"post_to_studio step{self.step}") self.post_to_studio("data") def next_step(self): @@ -392,6 +393,10 @@ def log_metric( self._metrics[name] = metric metric.step = self.step + + logger.warning( + f"Logging {name} with value {val} and step{self.step} timestamp {timestamp}" + ) if plot: metric.dump(val, timestamp=timestamp) diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py index 90c1ed10..80d55b8a 100644 --- a/src/dvclive/studio.py +++ b/src/dvclive/studio.py @@ -14,7 +14,7 @@ def _get_unsent_datapoints(plot, latest_step): - return [x for x in plot if int(x["step"]) >= latest_step] + return [x for x in plot if int(x["step"]) > latest_step] def _cast_to_numbers(datapoints): @@ -53,7 +53,7 @@ def _adapt_images(live): return { _adapt_path(live, image.output_path): {"image": _adapt_image(image.output_path)} for image in live._images.values() - if image.step >= live._latest_studio_step + if image.step > live._latest_studio_step } @@ -71,6 +71,8 @@ def get_studio_updates(live): metrics_file = _adapt_path(live, metrics_file) metrics = {metrics_file: {"data": metrics}} + logger.warning(f"steeep: {live._latest_studio_step}") + logger.warning(plots) plots = { _adapt_path(live, name): _adapt_plot_datapoints(live, plot) for name, plot in plots.items() From b0ac980505c87a0bb69b4145e7498d70da914e6d Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Mon, 12 Feb 2024 11:02:59 +0100 Subject: [PATCH 16/39] improve code readability: hide CPUMetrics properties change duration to nb_samples log only once if problem with psutil --- src/dvclive/live.py | 14 +++----- src/dvclive/studio.py | 2 -- src/dvclive/system_metrics.py | 39 +++++++++++----------- src/dvclive/utils.py | 12 +------ tests/test_system_metrics.py | 61 ++++------------------------------- tests/test_utils.py | 19 ----------- 6 files changed, 34 insertions(+), 113 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 18867279..c3bcbc66 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -11,7 +11,6 @@ from pathlib import Path, PurePath from typing import ( Any, - Callable, Dict, List, Optional, @@ -51,6 +50,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio +from .system_metrics import CPUMetrics from .utils import ( StrPath, catch_and_warn, @@ -89,7 +89,7 @@ def __init__( cache_images: bool = False, exp_name: Optional[str] = None, exp_message: Optional[str] = None, - callbacks: Optional[List[Callable]] = None, + monitor_system: bool = False, ): self.summary: Dict[str, Any] = {} @@ -135,8 +135,8 @@ def __init__( self._dvc_studio_config: Dict[str, Any] = {} self._init_studio() - self._callbacks = callbacks or [] - for callback in self._callbacks: + self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else [] + for callback in self._system_monitoring_callbacks: callback(self) def _init_resume(self): @@ -362,7 +362,6 @@ def sync(self): self.make_report() - logger.warning(f"post_to_studio step{self.step}") self.post_to_studio("data") def next_step(self): @@ -394,9 +393,6 @@ def log_metric( metric.step = self.step - logger.warning( - f"Logging {name} with value {val} and step{self.step} timestamp {timestamp}" - ) if plot: metric.dump(val, timestamp=timestamp) @@ -626,7 +622,7 @@ def end(self): self.step = self.summary["step"] # Kill all independent threads that can be found in callbacks - for callback in self._callbacks: + for callback in self._system_monitoring_callbacks: callback.end() self.sync() diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py index 80d55b8a..bff27040 100644 --- a/src/dvclive/studio.py +++ b/src/dvclive/studio.py @@ -71,8 +71,6 @@ def get_studio_updates(live): metrics_file = _adapt_path(live, metrics_file) metrics = {metrics_file: {"data": metrics}} - logger.warning(f"steeep: {live._latest_studio_step}") - logger.warning(plots) plots = { _adapt_path(live, name): _adapt_plot_datapoints(live, plot) for name, plot in plots.items() diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index b9b27161..dd707007 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -4,8 +4,7 @@ import psutil from statistics import mean from threading import Event, Thread - -from .utils import append_dict +from funcy import merge_with logger = logging.getLogger("dvclive") MEGABYTES_DIVIDER = 1024.0**2 @@ -14,44 +13,48 @@ MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30 -class CPUMetricsCallback: +class CPUMetrics: def __init__( self, - duration: Union[int, float] = 5, - interval: Union[int, float] = 1.0, + interval: float = 0.5, + nb_samples: int = 10, plot: bool = True, ): - self.duration = duration - self.interval = interval - self.plot = plot + self._interval = interval # seconds + self._nb_samples = nb_samples + self._plot = plot self._no_plot_metrics = ["system/cpu/count", "system/cpu/ram_total_GB"] + self._warn_user = True def __call__(self, live): self._live = live self._shutdown_event = Event() Thread( - target=self.monitoring_loop, + target=self._monitoring_loop, ).start() - def monitoring_loop(self): + def _monitoring_loop(self): while not self._shutdown_event.is_set(): self._cpu_metrics = {} - for _ in range(int(self.duration // self.interval)): + for _ in range(self._nb_samples): last_cpu_metrics = {} try: - last_cpu_metrics = get_cpus_metrics() + last_cpu_metrics = _get_cpus_metrics() except psutil.Error: - logger.exception("Failed to monitor CPU metrics:") - self._cpu_metrics = append_dict(self._cpu_metrics, last_cpu_metrics) - self._shutdown_event.wait(self.interval) + if self._warn_user: + logger.exception("Failed to monitor CPU metrics") + self._warn_user = False + + self._cpu_metrics = merge_with(sum, self._cpu_metrics, last_cpu_metrics) + self._shutdown_event.wait(self._interval) if self._shutdown_event.is_set(): break for metric_name, metric_values in self._cpu_metrics.items(): self._live.log_metric( metric_name, - mean(metric_values), + metric_values / self._nb_samples, timestamp=True, - plot=self.plot + plot=self._plot if metric_name not in self._no_plot_metrics else False, ) @@ -60,7 +63,7 @@ def end(self): self._shutdown_event.set() -def get_cpus_metrics() -> Dict[str, Union[float, int]]: +def _get_cpus_metrics() -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() io_info = psutil.disk_io_counters() nb_cpus = psutil.cpu_count() diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py index 0c293c4d..8b762590 100644 --- a/src/dvclive/utils.py +++ b/src/dvclive/utils.py @@ -6,7 +6,7 @@ import shutil from pathlib import Path, PurePath from platform import uname -from typing import Any, Union, List, Dict, TYPE_CHECKING +from typing import Union, List, Dict, TYPE_CHECKING import webbrowser from .error import InvalidDataTypeError @@ -245,13 +245,3 @@ def convert_datapoints_to_list_of_dicts( # Raise an error if the input is not a supported type raise InvalidDataTypeError("datapoints", type(datapoints)) - - -def append_dict(previous_values: Dict[str, List[Any]], new_value: Dict[str, Any]): - return { - **previous_values, - **{ - metric_name: [*previous_values.get(metric_name, []), value] - for metric_name, value in new_value.items() - }, - } diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index 6d91d62d..3195800c 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -3,7 +3,7 @@ import pytest from dvclive import Live -from dvclive.system_metrics import CPUMetricsCallback, get_cpus_metrics +from dvclive.system_metrics import _get_cpus_metrics from dvclive.utils import parse_metrics @@ -50,26 +50,19 @@ def mock_psutil(mocker): ) def test_get_cpus_metrics(mocker, metric_name): mock_psutil(mocker) - metrics = get_cpus_metrics() + metrics = _get_cpus_metrics() assert metric_name in metrics -@pytest.mark.parametrize( - ("duration", "interval", "plot"), - [ - (1, 0.5, True), - (2.0, 1, True), - ], -) -def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot): +def test_monitor_system(tmp_dir): with Live( tmp_dir, save_dvc_exp=False, - callbacks=[CPUMetricsCallback(duration, interval, plot)], + monitor_system=True, ) as live: - time.sleep(duration * 2) + time.sleep(5 + 1) # allow the thread to finish live.next_step() - time.sleep(duration * 2 + 0.1) # allow the thread to finish + time.sleep(5 + 1) # allow the thread to finish timeseries, latest = parse_metrics(live) assert "system" in latest @@ -90,48 +83,8 @@ def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot): assert any("ram_usage_percent.tsv" in key for key in timeseries) assert any("write_speed_MB.tsv" in key for key in timeseries) assert any("read_speed_MB.tsv" in key for key in timeseries) - # assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key) + assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key) # not plot for constant values assert all("count.tsv" not in key for key in timeseries) assert all("ram_total_GB.tsv" not in key for key in timeseries) - - -@pytest.mark.parametrize( - ("duration", "interval", "plot"), - [ - (1, 0.5, False), - (2.0, 1, False), - ], -) -def test_cpumetricscallback_without_plot(tmp_dir, duration, interval, plot): - with Live( - tmp_dir, - save_dvc_exp=False, - callbacks=[CPUMetricsCallback(duration, interval, plot)], - ) as live: - time.sleep(duration * 2) - live.next_step() - time.sleep(duration * 2 + 0.1) # allow the thread to finish - timeseries, latest = parse_metrics(live) - - assert "system" in latest - assert "cpu" in latest["system"] - assert "usage_avg_percent" in latest["system"]["cpu"] - assert "usage_max_percent" in latest["system"]["cpu"] - assert "count" in latest["system"]["cpu"] - assert "parallelism_percent" in latest["system"]["cpu"] - assert "ram_usage_percent" in latest["system"]["cpu"] - assert "ram_total_GB" in latest["system"]["cpu"] - assert "io" in latest["system"] - assert "read_speed_MB" in latest["system"]["io"] - assert "write_speed_MB" in latest["system"]["io"] - - assert all("usage_avg_percent.tsv" not in key for key in timeseries) - assert all("usage_max_percent.tsv" not in key for key in timeseries) - assert all("count.tsv" not in key for key in timeseries) - assert all("parallelism_percent.tsv" not in key for key in timeseries) - assert all("ram_usage_percent.tsv" not in key for key in timeseries) - assert all("ram_total_GB.tsv" not in key for key in timeseries) - assert all("write_speed_MB.tsv" not in key for key in timeseries) - assert all("read_speed_MB.tsv" not in key for key in timeseries) diff --git a/tests/test_utils.py b/tests/test_utils.py index ab782a29..ffbc66ef 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,7 +5,6 @@ from dvclive.utils import ( standardize_metric_name, convert_datapoints_to_list_of_dicts, - append_dict, ) from dvclive.error import InvalidDataTypeError @@ -49,21 +48,3 @@ def test_unsupported_format(): convert_datapoints_to_list_of_dicts("unsupported data format") assert "not supported type" in str(exc_info.value) - - -@pytest.mark.parametrize( - ("aggregate", "new_metrics", "result"), - [ - ({"a": [1], "b": [0.8]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [0.8, 1.6]}), - ({"a": [1]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [1.6]}), - ( - {"a": [1], "b": [0.8], "c": [2]}, - {"a": 2, "b": 1.6}, - {"a": [1, 2], "b": [0.8, 1.6], "c": [2]}, - ), - ({}, {"a": 2, "b": 1.6}, {"a": [2], "b": [1.6]}), - ({}, {}, {}), - ], -) -def test_append_dict(aggregate, new_metrics, result): - assert append_dict(aggregate, new_metrics) == result From 5162ac23c244379858a641274dac652ed3576ab5 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Mon, 12 Feb 2024 18:19:19 +0100 Subject: [PATCH 17/39] remove hack lightning --- src/dvclive/lightning.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/dvclive/lightning.py b/src/dvclive/lightning.py index 2c8d6006..dc159526 100644 --- a/src/dvclive/lightning.py +++ b/src/dvclive/lightning.py @@ -71,11 +71,6 @@ def log_metrics( sync: Optional[bool] = False, ) -> None: if not sync and _should_sync(): - if step == self.experiment._latest_studio_step: # noqa: SLF001 - # We are in log_eval_end_metrics but there has been already - # a studio request sent with `step`. - # We decrease the number to bypass `live.studio._get_unsent_datapoints` - self.experiment._latest_studio_step -= 1 # noqa: SLF001 sync = True super().log_metrics(metrics, step, sync) From 93536456d66cf8c0072d9e78b83a914fb826ba8e Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Mon, 12 Feb 2024 18:47:53 +0100 Subject: [PATCH 18/39] fix lightning problem with steps in studio --- src/dvclive/live.py | 1 + src/dvclive/studio.py | 31 ++++++++++++------------------- tests/test_post_to_studio.py | 6 +----- 3 files changed, 14 insertions(+), 24 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index c3bcbc66..bded46e3 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -133,6 +133,7 @@ def __init__( self._latest_studio_step = self.step if resume else -1 self._studio_events_to_skip: Set[str] = set() self._dvc_studio_config: Dict[str, Any] = {} + self._nb_points_sent_to_studio = {} self._init_studio() self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else [] diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py index bff27040..4ed80224 100644 --- a/src/dvclive/studio.py +++ b/src/dvclive/studio.py @@ -13,17 +13,11 @@ logger = logging.getLogger("dvclive") -def _get_unsent_datapoints(plot, latest_step): - return [x for x in plot if int(x["step"]) > latest_step] - - def _cast_to_numbers(datapoints): for datapoint in datapoints: for k, v in datapoint.items(): - if k == "step": + if k in ["step", "timestamp"]: datapoint[k] = int(v) - elif k == "timestamp": - continue else: float_v = float(v) if math.isnan(float_v) or math.isinf(float_v): @@ -39,11 +33,6 @@ def _adapt_path(live, name): return name -def _adapt_plot_datapoints(live, plot): - datapoints = _get_unsent_datapoints(plot, live._latest_studio_step) - return _cast_to_numbers(datapoints) - - def _adapt_image(image_path): with open(image_path, "rb") as fobj: return base64.b64encode(fobj.read()).decode("utf-8") @@ -71,14 +60,18 @@ def get_studio_updates(live): metrics_file = _adapt_path(live, metrics_file) metrics = {metrics_file: {"data": metrics}} - plots = { - _adapt_path(live, name): _adapt_plot_datapoints(live, plot) - for name, plot in plots.items() - } - plots = {k: {"data": v} for k, v in plots.items()} - plots.update(_adapt_images(live)) + plots_to_send = {} + for name, plot in plots.items(): + path = _adapt_path(live, name) + nb_points_already_sent = live._nb_points_sent_to_studio.get(path, 0) + live._nb_points_sent_to_studio[path] = len(plot) + + plots_to_send[path] = _cast_to_numbers(plot[nb_points_already_sent:]) + + plots_to_send = {k: {"data": v} for k, v in plots_to_send.items()} + plots_to_send.update(_adapt_images(live)) - return metrics, params, plots + return metrics, params, plots_to_send def get_dvc_studio_config(live): diff --git a/tests/test_post_to_studio.py b/tests/test_post_to_studio.py index a22dfa55..6fc5dcfb 100644 --- a/tests/test_post_to_studio.py +++ b/tests/test_post_to_studio.py @@ -105,11 +105,7 @@ def test_post_to_studio_failed_data_request( "data", exp_name=live._exp_name, step=1, - plots={ - f"{foo_path}": { - "data": [{"step": 0, "foo": 1.0}, {"step": 1, "foo": 2.0}] - } - }, + plots={f"{foo_path}": {"data": [{"step": 1, "foo": 2.0}]}}, ), ) From 871bebc4942c639ba1cee5d1dd76b7e14893ed43 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 13 Feb 2024 09:12:57 +0100 Subject: [PATCH 19/39] simplify the metric names --- src/dvclive/system_metrics.py | 19 ++++++++-------- tests/test_system_metrics.py | 42 +++++++++++++++++------------------ 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index dd707007..97760800 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -10,7 +10,7 @@ MEGABYTES_DIVIDER = 1024.0**2 GIGABYTES_DIVIDER = 1024.0**3 -MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30 +MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20 class CPUMetrics: @@ -23,7 +23,7 @@ def __init__( self._interval = interval # seconds self._nb_samples = nb_samples self._plot = plot - self._no_plot_metrics = ["system/cpu/count", "system/cpu/ram_total_GB"] + self._no_plot_metrics = ["system/cpu/count", "system/ram/total (GB)"] self._warn_user = True def __call__(self, live): @@ -69,10 +69,9 @@ def _get_cpus_metrics() -> Dict[str, Union[float, int]]: nb_cpus = psutil.cpu_count() cpus_percent = psutil.cpu_percent(percpu=True) return { - "system/cpu/usage_avg_percent": mean(cpus_percent), - "system/cpu/usage_max_percent": max(cpus_percent), + "system/cpu/usage (%)": mean(cpus_percent), "system/cpu/count": nb_cpus, - "system/cpu/parallelism_percent": len( + "system/cpu/parallelization (%)": len( [ percent for percent in cpus_percent @@ -81,10 +80,12 @@ def _get_cpus_metrics() -> Dict[str, Union[float, int]]: ) * 100 / nb_cpus, - "system/cpu/ram_usage_percent": ram_info.percent, - "system/cpu/ram_total_GB": ram_info.total / GIGABYTES_DIVIDER, - "system/io/read_speed_MB": io_info.read_bytes + "system/ram/usage (%)": ram_info.percent, + "system/ram/usage (GB)": (ram_info.percent / 100) + * (ram_info.total / GIGABYTES_DIVIDER), + "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER, + "system/io/read speed (MB)": io_info.read_bytes / (io_info.read_time * MEGABYTES_DIVIDER), - "system/io/write_speed_MB": io_info.write_bytes + "system/io/write speed (MB)": io_info.write_bytes / (io_info.write_time * MEGABYTES_DIVIDER), } diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index 3195800c..97b5ddf8 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -38,14 +38,14 @@ def mock_psutil(mocker): @pytest.mark.parametrize( ("metric_name"), [ - "system/cpu/usage_avg_percent", - "system/cpu/usage_max_percent", + "system/cpu/usage (%)", "system/cpu/count", - "system/cpu/parallelism_percent", - "system/cpu/ram_usage_percent", - "system/cpu/ram_total_GB", - "system/io/read_speed_MB", - "system/io/write_speed_MB", + "system/cpu/parallelization (%)", + "system/ram/usage (%)", + "system/ram/usage (GB)", + "system/ram/total (GB)", + "system/io/read speed (MB)", + "system/io/write speed (MB)", ], ) def test_get_cpus_metrics(mocker, metric_name): @@ -67,24 +67,24 @@ def test_monitor_system(tmp_dir): assert "system" in latest assert "cpu" in latest["system"] - assert "usage_avg_percent" in latest["system"]["cpu"] - assert "usage_max_percent" in latest["system"]["cpu"] + assert "usage (%)" in latest["system"]["cpu"] assert "count" in latest["system"]["cpu"] - assert "parallelism_percent" in latest["system"]["cpu"] - assert "ram_usage_percent" in latest["system"]["cpu"] - assert "ram_total_GB" in latest["system"]["cpu"] + assert "parallelization (%)" in latest["system"]["cpu"] + assert "ram" in latest["system"] + assert "usage (%)" in latest["system"]["ram"] + assert "usage (GB)" in latest["system"]["ram"] + assert "total (GB)" in latest["system"]["ram"] assert "io" in latest["system"] - assert "read_speed_MB" in latest["system"]["io"] - assert "write_speed_MB" in latest["system"]["io"] + assert "read speed (MB)" in latest["system"]["io"] + assert "write speed (MB)" in latest["system"]["io"] - assert any("usage_avg_percent" in key for key in timeseries) - assert any("usage_max_percent.tsv" in key for key in timeseries) - assert any("parallelism_percent.tsv" in key for key in timeseries) - assert any("ram_usage_percent.tsv" in key for key in timeseries) - assert any("write_speed_MB.tsv" in key for key in timeseries) - assert any("read_speed_MB.tsv" in key for key in timeseries) + assert any("usage (%).tsv" in key for key in timeseries) + assert any("parallelization (%).tsv" in key for key in timeseries) + assert any("usage (GB).tsv" in key for key in timeseries) + assert any("read speed (MB).tsv" in key for key in timeseries) + assert any("write speed (MB).tsv" in key for key in timeseries) assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key) # not plot for constant values assert all("count.tsv" not in key for key in timeseries) - assert all("ram_total_GB.tsv" not in key for key in timeseries) + assert all("total (GB).tsv" not in key for key in timeseries) From e169abc57e3a925e0c585390b7f27fbaf240b733 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Wed, 14 Feb 2024 16:24:33 +0100 Subject: [PATCH 20/39] don't increment `num_point_sent_to_studio` if studio didn't received the data --- src/dvclive/live.py | 3 ++- src/dvclive/studio.py | 8 +++++--- tests/test_post_to_studio.py | 6 +++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index ac74e2d3..1c1879ca 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -172,7 +172,8 @@ def __init__( self._latest_studio_step = self.step if resume else -1 self._studio_events_to_skip: Set[str] = set() self._dvc_studio_config: Dict[str, Any] = {} - self._nb_points_sent_to_studio = {} + self._num_points_sent_to_studio: Dict[str, int] = {} + self._num_points_read_from_file: Dict[str, int] = {} self._init_studio() self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else [] diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py index 4ed80224..8a3109d9 100644 --- a/src/dvclive/studio.py +++ b/src/dvclive/studio.py @@ -61,12 +61,12 @@ def get_studio_updates(live): metrics = {metrics_file: {"data": metrics}} plots_to_send = {} + live._num_points_read_from_file = {} for name, plot in plots.items(): path = _adapt_path(live, name) - nb_points_already_sent = live._nb_points_sent_to_studio.get(path, 0) - live._nb_points_sent_to_studio[path] = len(plot) - + nb_points_already_sent = live._num_points_sent_to_studio.get(path, 0) plots_to_send[path] = _cast_to_numbers(plot[nb_points_already_sent:]) + live._num_points_read_from_file.update({path: len(plot)}) plots_to_send = {k: {"data": v} for k, v in plots_to_send.items()} plots_to_send.update(_adapt_images(live)) @@ -112,6 +112,8 @@ def post_to_studio(live, event): live._studio_events_to_skip.add("data") live._studio_events_to_skip.add("done") elif event == "data": + for path, num_points in live._num_points_read_from_file.items(): + live._num_points_sent_to_studio[path] = num_points live._latest_studio_step = live.step if event == "done": diff --git a/tests/test_post_to_studio.py b/tests/test_post_to_studio.py index 6fc5dcfb..a22dfa55 100644 --- a/tests/test_post_to_studio.py +++ b/tests/test_post_to_studio.py @@ -105,7 +105,11 @@ def test_post_to_studio_failed_data_request( "data", exp_name=live._exp_name, step=1, - plots={f"{foo_path}": {"data": [{"step": 1, "foo": 2.0}]}}, + plots={ + f"{foo_path}": { + "data": [{"step": 0, "foo": 1.0}, {"step": 1, "foo": 2.0}] + } + }, ), ) From aa5b511e414938f07aad66d3502e4a21bdb6de3b Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 15 Feb 2024 10:46:17 +0100 Subject: [PATCH 21/39] add directory metrics to the list of metrics tracked + refacto --- src/dvclive/live.py | 20 ++++-- src/dvclive/system_metrics.py | 106 ++++++++++++++++++++------------ tests/test_system_metrics.py | 112 ++++++++++++++++++++++------------ 3 files changed, 156 insertions(+), 82 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 1c1879ca..ce6ba98f 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -176,9 +176,10 @@ def __init__( self._num_points_read_from_file: Dict[str, int] = {} self._init_studio() - self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else [] - for callback in self._system_monitoring_callbacks: - callback(self) + self._cpu_metrics = None + if monitor_system: + self._cpu_metrics = CPUMetrics() + self._cpu_metrics(self) def _init_resume(self): self._read_params() @@ -398,6 +399,13 @@ def step(self, value: int) -> None: self._step = value logger.debug(f"Step: {self.step}") + @step.setter + def cpu_metrics(self, cpu_metrics: CPUMetrics) -> None: + if self._cpu_metrics is not None: + self._cpu_metrics.end() + self._cpu_metrics = cpu_metrics + self._cpu_metrics(self) + def sync(self): self.make_summary() @@ -887,9 +895,9 @@ def end(self): if "step" in self.summary: self.step = self.summary["step"] - # Kill all independent threads that can be found in callbacks - for callback in self._system_monitoring_callbacks: - callback.end() + # Kill threads that monitor the system metrics + if self._cpu_metrics is not None: + self._cpu_metrics.end() self.sync() diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py index 97760800..9bb4685e 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/system_metrics.py @@ -1,5 +1,6 @@ import logging -from typing import Dict, Union +from typing import Dict, Union, Optional, List +from pathlib import Path import psutil from statistics import mean @@ -13,7 +14,9 @@ MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20 -class CPUMetrics: +class _SystemMetrics: + _plot_blacklist_prefix = () + def __init__( self, interval: float = 0.5, @@ -23,7 +26,6 @@ def __init__( self._interval = interval # seconds self._nb_samples = nb_samples self._plot = plot - self._no_plot_metrics = ["system/cpu/count", "system/ram/total (GB)"] self._warn_user = True def __call__(self, live): @@ -35,57 +37,85 @@ def __call__(self, live): def _monitoring_loop(self): while not self._shutdown_event.is_set(): - self._cpu_metrics = {} + self._metrics = {} for _ in range(self._nb_samples): - last_cpu_metrics = {} + last_metrics = {} try: - last_cpu_metrics = _get_cpus_metrics() + last_metrics = self._get_metrics() except psutil.Error: if self._warn_user: logger.exception("Failed to monitor CPU metrics") self._warn_user = False - self._cpu_metrics = merge_with(sum, self._cpu_metrics, last_cpu_metrics) + self._metrics = merge_with(sum, self._metrics, last_metrics) self._shutdown_event.wait(self._interval) if self._shutdown_event.is_set(): break - for metric_name, metric_values in self._cpu_metrics.items(): + for name, values in self._metrics.items(): + blacklisted = any( + name.startswith(prefix) for prefix in self._plot_blacklist_prefix + ) self._live.log_metric( - metric_name, - metric_values / self._nb_samples, + name, + values / self._nb_samples, timestamp=True, - plot=self._plot - if metric_name not in self._no_plot_metrics - else False, + plot=None if blacklisted else self._plot, ) + def _get_metrics() -> Dict[str, Union[float, int]]: + pass + def end(self): self._shutdown_event.set() -def _get_cpus_metrics() -> Dict[str, Union[float, int]]: - ram_info = psutil.virtual_memory() - io_info = psutil.disk_io_counters() - nb_cpus = psutil.cpu_count() - cpus_percent = psutil.cpu_percent(percpu=True) - return { - "system/cpu/usage (%)": mean(cpus_percent), - "system/cpu/count": nb_cpus, - "system/cpu/parallelization (%)": len( - [ - percent - for percent in cpus_percent - if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE - ] +class CPUMetrics(_SystemMetrics): + _plot_blacklist_prefix = ( + "system/cpu/count", + "system/ram/total (GB)", + "system/disk/total (GB)", + ) + + def __init__( + self, + interval: float = 0.5, + nb_samples: int = 10, + directories_to_monitor: Optional[List[str]] = None, + plot: bool = True, + ): + super().__init__(interval=interval, nb_samples=nb_samples, plot=plot) + self._directories_to_monitor = ( + ["."] if directories_to_monitor is None else directories_to_monitor ) - * 100 - / nb_cpus, - "system/ram/usage (%)": ram_info.percent, - "system/ram/usage (GB)": (ram_info.percent / 100) - * (ram_info.total / GIGABYTES_DIVIDER), - "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER, - "system/io/read speed (MB)": io_info.read_bytes - / (io_info.read_time * MEGABYTES_DIVIDER), - "system/io/write speed (MB)": io_info.write_bytes - / (io_info.write_time * MEGABYTES_DIVIDER), - } + + def _get_metrics(self) -> Dict[str, Union[float, int]]: + ram_info = psutil.virtual_memory() + nb_cpus = psutil.cpu_count() + cpus_percent = psutil.cpu_percent(percpu=True) + result = { + "system/cpu/usage (%)": mean(cpus_percent), + "system/cpu/count": nb_cpus, + "system/cpu/parallelization (%)": len( + [ + percent + for percent in cpus_percent + if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE + ] + ) + * 100 + / nb_cpus, + "system/ram/usage (%)": ram_info.percent, + "system/ram/usage (GB)": (ram_info.percent / 100) + * (ram_info.total / GIGABYTES_DIVIDER), + "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER, + } + for idx, directory in enumerate(self._directories_to_monitor): + if not Path(directory).exists(): + continue + disk_info = psutil.disk_usage(directory) + result[f"system/disk/usage (%)/{idx}"] = disk_info.percent + result[f"system/disk/usage (GB)/{idx}"] = disk_info.used / GIGABYTES_DIVIDER + result[f"system/disk/total (GB)/{idx}"] = ( + disk_info.total / GIGABYTES_DIVIDER + ) + return result diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py index 97b5ddf8..53ad9060 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_system_metrics.py @@ -1,9 +1,8 @@ import time - -import pytest +from pathlib import Path from dvclive import Live -from dvclive.system_metrics import _get_cpus_metrics +from dvclive.system_metrics import CPUMetrics from dvclive.utils import parse_metrics @@ -18,15 +17,14 @@ def mock_psutil(mocker): mocked_virtual_memory.percent = 20 mocked_virtual_memory.total = 4 * 1024**3 - mocked_disk_io_counters = mocker.MagicMock() - mocked_disk_io_counters.read_bytes = 2 * 1024**2 - mocked_disk_io_counters.read_time = 1 - mocked_disk_io_counters.write_bytes = 2 * 1024**2 - mocked_disk_io_counters.write_time = 1 + mocked_disk_usage = mocker.MagicMock() + mocked_disk_usage.used = 16 * 1024**3 + mocked_disk_usage.percent = 50 + mocked_disk_usage.total = 32 * 1024**3 mocking_dict = { "virtual_memory": mocked_virtual_memory, - "disk_io_counters": mocked_disk_io_counters, + "disk_usage": mocked_disk_usage, } for function_name, return_value in mocking_dict.items(): mocker.patch( @@ -35,26 +33,54 @@ def mock_psutil(mocker): ) -@pytest.mark.parametrize( - ("metric_name"), - [ - "system/cpu/usage (%)", - "system/cpu/count", - "system/cpu/parallelization (%)", - "system/ram/usage (%)", - "system/ram/usage (GB)", - "system/ram/total (GB)", - "system/io/read speed (MB)", - "system/io/write speed (MB)", - ], -) -def test_get_cpus_metrics(mocker, metric_name): +def test_get_cpus_metrics_mocker(tmp_dir, mocker): mock_psutil(mocker) - metrics = _get_cpus_metrics() - assert metric_name in metrics + with Live( + tmp_dir, + save_dvc_exp=False, + monitor_system=False, + ) as live: + monitor = CPUMetrics(directories_to_monitor=["/", "/"]) + monitor(live) + metrics = monitor._get_metrics() + monitor.end() + assert "system/cpu/usage (%)" in metrics + assert "system/cpu/count" in metrics + assert "system/cpu/parallelization (%)" in metrics + assert "system/ram/usage (%)" in metrics + assert "system/ram/usage (GB)" in metrics + assert "system/ram/total (GB)" in metrics + assert "system/disk/usage (%)/0" in metrics + assert "system/disk/usage (%)/1" in metrics + assert "system/disk/usage (GB)/0" in metrics + assert "system/disk/usage (GB)/1" in metrics + assert "system/disk/total (GB)/0" in metrics + assert "system/disk/total (GB)/1" in metrics -def test_monitor_system(tmp_dir): + +def test_ignore_missing_directories(tmp_dir, mocker): + mock_psutil(mocker) + with Live( + tmp_dir, + save_dvc_exp=False, + monitor_system=False, + ) as live: + missing_directories = "______" + monitor = CPUMetrics(directories_to_monitor=["/", missing_directories]) + monitor(live) + metrics = monitor._get_metrics() + monitor.end() + + assert not Path(missing_directories).exists() + + assert "system/disk/usage (%)/1" not in metrics + assert "system/disk/usage (GB)/1" not in metrics + assert "system/disk/total (GB)/1" not in metrics + + +def test_monitor_system(tmp_dir, mocker): + mock_psutil(mocker) with Live( tmp_dir, save_dvc_exp=False, @@ -65,6 +91,7 @@ def test_monitor_system(tmp_dir): time.sleep(5 + 1) # allow the thread to finish timeseries, latest = parse_metrics(live) + # metrics.json file contains all the system metrics assert "system" in latest assert "cpu" in latest["system"] assert "usage (%)" in latest["system"]["cpu"] @@ -74,17 +101,26 @@ def test_monitor_system(tmp_dir): assert "usage (%)" in latest["system"]["ram"] assert "usage (GB)" in latest["system"]["ram"] assert "total (GB)" in latest["system"]["ram"] - assert "io" in latest["system"] - assert "read speed (MB)" in latest["system"]["io"] - assert "write speed (MB)" in latest["system"]["io"] - - assert any("usage (%).tsv" in key for key in timeseries) - assert any("parallelization (%).tsv" in key for key in timeseries) - assert any("usage (GB).tsv" in key for key in timeseries) - assert any("read speed (MB).tsv" in key for key in timeseries) - assert any("write speed (MB).tsv" in key for key in timeseries) + assert "disk" in latest["system"] + assert "usage (%)" in latest["system"]["disk"] + assert "0" in latest["system"]["disk"]["usage (%)"] + assert "usage (GB)" in latest["system"]["disk"] + assert "total (GB)" in latest["system"]["disk"] + + # timeseries contains all the system metrics + assert any(str(Path("system/cpu/usage (%).tsv")) in key for key in timeseries) + assert any( + str(Path("system/cpu/parallelization (%).tsv")) in key for key in timeseries + ) + assert any(str(Path("system/ram/usage (%).tsv")) in key for key in timeseries) + assert any(str(Path("system/ram/usage (GB).tsv")) in key for key in timeseries) + assert any(str(Path("system/disk/usage (%)/0.tsv")) in key for key in timeseries) + assert any(str(Path("system/disk/usage (GB)/0.tsv")) in key for key in timeseries) assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key) - # not plot for constant values - assert all("count.tsv" not in key for key in timeseries) - assert all("total (GB).tsv" not in key for key in timeseries) + # blacklisted timeseries + assert all(str(Path("system/cpu/count.tsv")) not in key for key in timeseries) + assert all(str(Path("system/ram/total (GB).tsv")) not in key for key in timeseries) + assert all( + str(Path("system/disk/total (GB)/0.tsv")) not in key for key in timeseries + ) From 9d0e70d264b21e9b7eb0b3556e2d94db5a2e65dd Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 15 Feb 2024 11:05:46 +0100 Subject: [PATCH 22/39] clean code and split features into several PRs --- src/dvclive/lightning.py | 5 +++++ src/dvclive/live.py | 9 +++++---- src/dvclive/studio.py | 32 +++++++++++++++++++------------- tests/test_utils.py | 5 +---- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/dvclive/lightning.py b/src/dvclive/lightning.py index dc159526..2c8d6006 100644 --- a/src/dvclive/lightning.py +++ b/src/dvclive/lightning.py @@ -71,6 +71,11 @@ def log_metrics( sync: Optional[bool] = False, ) -> None: if not sync and _should_sync(): + if step == self.experiment._latest_studio_step: # noqa: SLF001 + # We are in log_eval_end_metrics but there has been already + # a studio request sent with `step`. + # We decrease the number to bypass `live.studio._get_unsent_datapoints` + self.experiment._latest_studio_step -= 1 # noqa: SLF001 sync = True super().log_metrics(metrics, step, sync) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index ce6ba98f..4ae1006e 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -172,8 +172,6 @@ def __init__( self._latest_studio_step = self.step if resume else -1 self._studio_events_to_skip: Set[str] = set() self._dvc_studio_config: Dict[str, Any] = {} - self._num_points_sent_to_studio: Dict[str, int] = {} - self._num_points_read_from_file: Dict[str, int] = {} self._init_studio() self._cpu_metrics = None @@ -399,7 +397,11 @@ def step(self, value: int) -> None: self._step = value logger.debug(f"Step: {self.step}") - @step.setter + @property + def cpu_metrics(self) -> int: + return self._cpu_metrics or None + + @cpu_metrics.setter def cpu_metrics(self, cpu_metrics: CPUMetrics) -> None: if self._cpu_metrics is not None: self._cpu_metrics.end() @@ -474,7 +476,6 @@ def log_metric( self._metrics[name] = metric metric.step = self.step - if plot: metric.dump(val, timestamp=timestamp) diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py index 8a3109d9..f74db7ab 100644 --- a/src/dvclive/studio.py +++ b/src/dvclive/studio.py @@ -13,11 +13,17 @@ logger = logging.getLogger("dvclive") +def _get_unsent_datapoints(plot, latest_step): + return [x for x in plot if int(x["step"]) > latest_step] + + def _cast_to_numbers(datapoints): for datapoint in datapoints: for k, v in datapoint.items(): - if k in ["step", "timestamp"]: + if k == "step": datapoint[k] = int(v) + elif k == "timestamp": + continue else: float_v = float(v) if math.isnan(float_v) or math.isinf(float_v): @@ -33,6 +39,11 @@ def _adapt_path(live, name): return name +def _adapt_plot_datapoints(live, plot): + datapoints = _get_unsent_datapoints(plot, live._latest_studio_step) + return _cast_to_numbers(datapoints) + + def _adapt_image(image_path): with open(image_path, "rb") as fobj: return base64.b64encode(fobj.read()).decode("utf-8") @@ -60,18 +71,15 @@ def get_studio_updates(live): metrics_file = _adapt_path(live, metrics_file) metrics = {metrics_file: {"data": metrics}} - plots_to_send = {} - live._num_points_read_from_file = {} - for name, plot in plots.items(): - path = _adapt_path(live, name) - nb_points_already_sent = live._num_points_sent_to_studio.get(path, 0) - plots_to_send[path] = _cast_to_numbers(plot[nb_points_already_sent:]) - live._num_points_read_from_file.update({path: len(plot)}) + plots = { + _adapt_path(live, name): _adapt_plot_datapoints(live, plot) + for name, plot in plots.items() + } + plots = {k: {"data": v} for k, v in plots.items()} - plots_to_send = {k: {"data": v} for k, v in plots_to_send.items()} - plots_to_send.update(_adapt_images(live)) + plots.update(_adapt_images(live)) - return metrics, params, plots_to_send + return metrics, params, plots def get_dvc_studio_config(live): @@ -112,8 +120,6 @@ def post_to_studio(live, event): live._studio_events_to_skip.add("data") live._studio_events_to_skip.add("done") elif event == "data": - for path, num_points in live._num_points_read_from_file.items(): - live._num_points_sent_to_studio[path] = num_points live._latest_studio_step = live.step if event == "done": diff --git a/tests/test_utils.py b/tests/test_utils.py index ffbc66ef..2b5d56ca 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,10 +2,7 @@ import pandas as pd import pytest -from dvclive.utils import ( - standardize_metric_name, - convert_datapoints_to_list_of_dicts, -) +from dvclive.utils import standardize_metric_name, convert_datapoints_to_list_of_dicts from dvclive.error import InvalidDataTypeError From ba72e0132e313b64cbc1ccbbd52e332b7f3bf724 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 15 Feb 2024 11:22:17 +0100 Subject: [PATCH 23/39] cleaner user interface --- src/dvclive/live.py | 28 +++++++++---------- .../{system_metrics.py => monitor_system.py} | 4 +-- ...stem_metrics.py => test_monitor_system.py} | 12 ++++---- 3 files changed, 22 insertions(+), 22 deletions(-) rename src/dvclive/{system_metrics.py => monitor_system.py} (98%) rename tests/{test_system_metrics.py => test_monitor_system.py} (92%) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 4ae1006e..324414ac 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -51,7 +51,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio -from .system_metrics import CPUMetrics +from .monitor_system import MonitorCPU from .utils import ( StrPath, catch_and_warn, @@ -174,10 +174,10 @@ def __init__( self._dvc_studio_config: Dict[str, Any] = {} self._init_studio() - self._cpu_metrics = None + self._monitor_cpu = None if monitor_system: - self._cpu_metrics = CPUMetrics() - self._cpu_metrics(self) + self._monitor_cpu = MonitorCPU() + self._monitor_cpu(self) def _init_resume(self): self._read_params() @@ -398,15 +398,15 @@ def step(self, value: int) -> None: logger.debug(f"Step: {self.step}") @property - def cpu_metrics(self) -> int: - return self._cpu_metrics or None + def monitor_cpu(self) -> int: + return self._monitor_cpu or None - @cpu_metrics.setter - def cpu_metrics(self, cpu_metrics: CPUMetrics) -> None: - if self._cpu_metrics is not None: - self._cpu_metrics.end() - self._cpu_metrics = cpu_metrics - self._cpu_metrics(self) + @monitor_cpu.setter + def monitor_cpu(self, monitor_cpu: MonitorCPU) -> None: + if self._monitor_cpu is not None: + self._monitor_cpu.end() + self._monitor_cpu = monitor_cpu + self._monitor_cpu(self) def sync(self): self.make_summary() @@ -897,8 +897,8 @@ def end(self): self.step = self.summary["step"] # Kill threads that monitor the system metrics - if self._cpu_metrics is not None: - self._cpu_metrics.end() + if self._monitor_cpu is not None: + self._monitor_cpu.end() self.sync() diff --git a/src/dvclive/system_metrics.py b/src/dvclive/monitor_system.py similarity index 98% rename from src/dvclive/system_metrics.py rename to src/dvclive/monitor_system.py index 9bb4685e..9e4f9f54 100644 --- a/src/dvclive/system_metrics.py +++ b/src/dvclive/monitor_system.py @@ -14,7 +14,7 @@ MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20 -class _SystemMetrics: +class _MonitorSystem: _plot_blacklist_prefix = () def __init__( @@ -69,7 +69,7 @@ def end(self): self._shutdown_event.set() -class CPUMetrics(_SystemMetrics): +class MonitorCPU(_MonitorSystem): _plot_blacklist_prefix = ( "system/cpu/count", "system/ram/total (GB)", diff --git a/tests/test_system_metrics.py b/tests/test_monitor_system.py similarity index 92% rename from tests/test_system_metrics.py rename to tests/test_monitor_system.py index 53ad9060..9653c2e1 100644 --- a/tests/test_system_metrics.py +++ b/tests/test_monitor_system.py @@ -2,16 +2,16 @@ from pathlib import Path from dvclive import Live -from dvclive.system_metrics import CPUMetrics +from dvclive.monitor_system import MonitorCPU from dvclive.utils import parse_metrics def mock_psutil(mocker): mocker.patch( - "dvclive.system_metrics.psutil.cpu_percent", + "dvclive.monitor_system.psutil.cpu_percent", return_value=[10, 20, 30, 40, 50, 60], ) - mocker.patch("dvclive.system_metrics.psutil.cpu_count", return_value=6) + mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6) mocked_virtual_memory = mocker.MagicMock() mocked_virtual_memory.percent = 20 @@ -28,7 +28,7 @@ def mock_psutil(mocker): } for function_name, return_value in mocking_dict.items(): mocker.patch( - f"dvclive.system_metrics.psutil.{function_name}", + f"dvclive.monitor_system.psutil.{function_name}", return_value=return_value, ) @@ -40,7 +40,7 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - monitor = CPUMetrics(directories_to_monitor=["/", "/"]) + monitor = MonitorCPU(directories_to_monitor=["/", "/"]) monitor(live) metrics = monitor._get_metrics() monitor.end() @@ -67,7 +67,7 @@ def test_ignore_missing_directories(tmp_dir, mocker): monitor_system=False, ) as live: missing_directories = "______" - monitor = CPUMetrics(directories_to_monitor=["/", missing_directories]) + monitor = MonitorCPU(directories_to_monitor=["/", missing_directories]) monitor(live) metrics = monitor._get_metrics() monitor.end() From 6e29ff6193cd601bc921fec40df179b95e5f2c1c Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 15 Feb 2024 12:26:14 +0100 Subject: [PATCH 24/39] add docstrings --- src/dvclive/monitor_system.py | 47 +++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 9e4f9f54..26324dc5 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -7,6 +7,9 @@ from threading import Event, Thread from funcy import merge_with +from .error import InvalidDataTypeError + + logger = logging.getLogger("dvclive") MEGABYTES_DIVIDER = 1024.0**2 GIGABYTES_DIVIDER = 1024.0**3 @@ -15,16 +18,29 @@ class _MonitorSystem: + """ + Monitor system resources and log them to DVC Live. + Use a separate thread to call a `_get_metrics` function at fix interval and + aggregate the results of this sampling using the average. + """ + _plot_blacklist_prefix = () def __init__( self, interval: float = 0.5, - nb_samples: int = 10, + num_samples: int = 10, plot: bool = True, ): + if not isinstance(interval, (int, float)): + raise InvalidDataTypeError("MonitorSystem.interval", type(interval)) + if not isinstance(num_samples, int): + raise InvalidDataTypeError("MonitorSystem.sample", type(num_samples)) + if not isinstance(plot, bool): + raise InvalidDataTypeError("MonitorSystem.plot", type(plot)) + self._interval = interval # seconds - self._nb_samples = nb_samples + self._nb_samples = num_samples self._plot = plot self._warn_user = True @@ -79,14 +95,35 @@ class MonitorCPU(_MonitorSystem): def __init__( self, interval: float = 0.5, - nb_samples: int = 10, + num_samples: int = 10, directories_to_monitor: Optional[List[str]] = None, plot: bool = True, ): - super().__init__(interval=interval, nb_samples=nb_samples, plot=plot) - self._directories_to_monitor = ( + """Monitor CPU resources and log them to DVC Live. + + Args: + interval (float): interval in seconds between two measurements. + Defaults to 0.5. + num_samples (int): number of samples to average. Defaults to 10. + directories_to_monitor (Optional[List[str]]): paths of the directories that + needs to be monitor for disk storage metrics. Defaults to the current + directory. + plot (bool): should the system metrics be saved as plots. Defaults to True. + + Raises: + InvalidDataTypeError: if the arguments passed to the function don't have a + supported type. + """ + super().__init__(interval=interval, num_samples=num_samples, plot=plot) + directories_to_monitor = ( ["."] if directories_to_monitor is None else directories_to_monitor ) + for idx, path in enumerate(directories_to_monitor): + if not isinstance(path, str): + raise InvalidDataTypeError( + f"MonitorCPU.directories_to_monitor[{idx}]", type(path) + ) + self._directories_to_monitor = directories_to_monitor def _get_metrics(self) -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() From 719087be3d6256b1306581471772aa680c2dca9d Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 15 Feb 2024 12:41:50 +0100 Subject: [PATCH 25/39] mypy conflicts --- src/dvclive/live.py | 2 +- src/dvclive/monitor_system.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 324414ac..05ff9943 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -398,7 +398,7 @@ def step(self, value: int) -> None: logger.debug(f"Step: {self.step}") @property - def monitor_cpu(self) -> int: + def monitor_cpu(self) -> Optional[MonitorCPU]: return self._monitor_cpu or None @monitor_cpu.setter diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 26324dc5..6820a1f2 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Union, Optional, List +from typing import Dict, Union, Optional, List, Tuple from pathlib import Path import psutil @@ -24,7 +24,7 @@ class _MonitorSystem: aggregate the results of this sampling using the average. """ - _plot_blacklist_prefix = () + _plot_blacklist_prefix: Tuple = () def __init__( self, @@ -78,7 +78,7 @@ def _monitoring_loop(self): plot=None if blacklisted else self._plot, ) - def _get_metrics() -> Dict[str, Union[float, int]]: + def _get_metrics(self): pass def end(self): @@ -86,7 +86,7 @@ def end(self): class MonitorCPU(_MonitorSystem): - _plot_blacklist_prefix = ( + _plot_blacklist_prefix: Tuple = ( "system/cpu/count", "system/ram/total (GB)", "system/disk/total (GB)", From 32de58ca4a4ceb119fad7489dba4f37e41b363e7 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 15 Feb 2024 13:08:27 +0100 Subject: [PATCH 26/39] monitor GPU metrics --- pyproject.toml | 3 +- src/dvclive/live.py | 10 +++- src/dvclive/monitor_system.py | 59 +++++++++++++++++++++++ tests/test_monitor_system.py | 91 ++++++++++++++++++++++++++++++++++- 4 files changed, 160 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 62294cfa..e425e8eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,8 @@ dependencies = [ "gto", "ruamel.yaml", "scmrepo", - "psutil" + "psutil", + "py3nvml" ] [project.optional-dependencies] diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 05ff9943..cecf2e01 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -9,6 +9,7 @@ import tempfile from pathlib import Path, PurePath +from py3nvml import get_num_procs from typing import ( Any, Dict, @@ -51,7 +52,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio -from .monitor_system import MonitorCPU +from .monitor_system import MonitorCPU, MonitorGPU from .utils import ( StrPath, catch_and_warn, @@ -175,10 +176,15 @@ def __init__( self._init_studio() self._monitor_cpu = None + self._monitor_gpu = None if monitor_system: self._monitor_cpu = MonitorCPU() self._monitor_cpu(self) + if len(get_num_procs()): + self._monitor_gpu = MonitorGPU() + self._monitor_gpu(self) + def _init_resume(self): self._read_params() self.summary = self.read_latest() @@ -899,6 +905,8 @@ def end(self): # Kill threads that monitor the system metrics if self._monitor_cpu is not None: self._monitor_cpu.end() + if self._monitor_gpu is not None: + self._monitor_gpu.end() self.sync() diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 6820a1f2..4725066d 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -6,6 +6,14 @@ from statistics import mean from threading import Event, Thread from funcy import merge_with +from py3nvml.py3nvml import ( + nvmlInit, + nvmlDeviceGetCount, + nvmlDeviceGetHandleByIndex, + nvmlDeviceGetMemoryInfo, + nvmlDeviceGetUtilizationRates, + nvmlShutdown, +) from .error import InvalidDataTypeError @@ -156,3 +164,54 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: disk_info.total / GIGABYTES_DIVIDER ) return result + + +class MonitorGPU(_MonitorSystem): + _plot_blacklist_prefix = ("system/gpu/count", "system/vram/total (GB)") + + def __init__( + self, + interval: float = 0.5, + num_samples: int = 10, + plot: bool = True, + ): + """Monitor GPU resources and log them to DVC Live. + + Args: + interval (float): interval in seconds between two measurements. + Defaults to 0.5. + num_samples (int): number of samples to average. Defaults to 10. + plot (bool): should the system metrics be saved as plots. Defaults to True. + + Raises: + InvalidDataTypeError: if the arguments passed to the function don't have a + supported type. + """ + super().__init__(interval=interval, num_samples=num_samples, plot=plot) + + def _get_metrics(self) -> Dict[str, Union[float, int]]: + nvmlInit() + num_gpus = nvmlDeviceGetCount() + gpu_metrics = { + "system/gpu/count": num_gpus, + } + + for gpu_idx in range(num_gpus): + gpu_handle = nvmlDeviceGetHandleByIndex(gpu_idx) + memory_info = nvmlDeviceGetMemoryInfo(gpu_handle) + usage_info = nvmlDeviceGetUtilizationRates(gpu_handle) + + gpu_metrics[f"system/gpu/usage (%)/{gpu_idx}"] = ( + 100 * usage_info.memory / usage_info.gpu if usage_info.gpu else 0 + ) + gpu_metrics[f"system/vram/usage (%)/{gpu_idx}"] = ( + 100 * memory_info.used / memory_info.total + ) + gpu_metrics[f"system/vram/usage (GB)/{gpu_idx}"] = ( + memory_info.used / GIGABYTES_DIVIDER + ) + gpu_metrics[f"system/vram/total (GB)/{gpu_idx}"] = ( + memory_info.total / GIGABYTES_DIVIDER + ) + nvmlShutdown() + return gpu_metrics diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 9653c2e1..d0fbffbf 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -2,7 +2,7 @@ from pathlib import Path from dvclive import Live -from dvclive.monitor_system import MonitorCPU +from dvclive.monitor_system import MonitorCPU, MonitorGPU from dvclive.utils import parse_metrics @@ -124,3 +124,92 @@ def test_monitor_system(tmp_dir, mocker): assert all( str(Path("system/disk/total (GB)/0.tsv")) not in key for key in timeseries ) + + +def mock_py3nvml(mocker, nb_gpus=2, crash_index=None): + mocker.patch("dvclive.live.get_num_procs", return_value=[1 for _ in range(nb_gpus)]) + mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=nb_gpus) + + mocked_memory_info = mocker.MagicMock() + mocked_memory_info.used = 3 * 1024**3 + mocked_memory_info.total = 5 * 1024**3 + + mocked_utilization_rate = mocker.MagicMock() + mocked_utilization_rate.memory = 5 + mocked_utilization_rate.gpu = 10 + + mocking_dict = { + "nvmlDeviceGetHandleByIndex": None, + "nvmlDeviceGetMemoryInfo": mocked_memory_info, + "nvmlDeviceGetUtilizationRates": mocked_utilization_rate, + } + + for function_name, return_value in mocking_dict.items(): + mocker.patch( + f"dvclive.monitor_system.{function_name}", + return_value=return_value, + ) + + +def test_get_gpus_metrics_mocker(mocker, tmp_dir): + mock_py3nvml(mocker, nb_gpus=2) + with Live( + tmp_dir, + save_dvc_exp=False, + monitor_system=False, + ) as live: + monitor = MonitorGPU() + monitor(live) + metrics = monitor._get_metrics() + monitor.end() + assert "system/gpu/usage (%)/0" in metrics + assert "system/gpu/usage (%)/1" in metrics + assert "system/vram/usage (%)/0" in metrics + assert "system/vram/usage (%)/1" in metrics + assert "system/vram/usage (GB)/0" in metrics + assert "system/vram/usage (GB)/1" in metrics + assert "system/vram/total (GB)/0" in metrics + assert "system/vram/total (GB)/1" in metrics + + +def test_monitor_gpu_system(tmp_dir, mocker): + mock_psutil(mocker) + mock_py3nvml(mocker, nb_gpus=1) + with Live( + tmp_dir, + save_dvc_exp=False, + monitor_system=True, + ) as live: + time.sleep(5 + 1) # allow the thread to finish + live.next_step() + time.sleep(5 + 1) # allow the thread to finish + timeseries, latest = parse_metrics(live) + + # metrics.json records CPU and RAM metrics if GPU detected + assert "system" in latest + assert "cpu" in latest["system"] + assert "ram" in latest["system"] + assert "disk" in latest["system"] + + # metrics.json file contains all the system metrics + assert "gpu" in latest["system"] + assert "count" in latest["system"]["gpu"] + assert "usage (%)" in latest["system"]["gpu"] + assert "0" in latest["system"]["gpu"]["usage (%)"] + assert "vram" in latest["system"] + assert "usage (%)" in latest["system"]["vram"] + assert "0" in latest["system"]["vram"]["usage (%)"] + assert "usage (GB)" in latest["system"]["vram"] + assert "0" in latest["system"]["vram"]["usage (GB)"] + assert "total (GB)" in latest["system"]["vram"] + assert "0" in latest["system"]["vram"]["total (GB)"] + + # timeseries contains all the system metrics + assert any(str(Path("system/gpu/usage (%)/0.tsv")) in key for key in timeseries) + assert any(str(Path("system/vram/usage (%)/0.tsv")) in key for key in timeseries) + assert any(str(Path("system/vram/usage (GB)/0.tsv")) in key for key in timeseries) + assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key) + + # blacklisted timeseries + assert all(str(Path("system/gpu/count.tsv")) not in key for key in timeseries) + assert all(str(Path("system/vram/total (GB).tsv")) not in key for key in timeseries) From b9e9bac9999384b6246ad8b55f3fdaf0756b4068 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 15 Feb 2024 16:25:05 +0100 Subject: [PATCH 27/39] detect GPUs with nmvl import --- src/dvclive/live.py | 7 +++---- src/dvclive/monitor_system.py | 22 ++++++++++++++-------- tests/test_monitor_system.py | 10 +++++----- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index cecf2e01..c0a85274 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -9,7 +9,6 @@ import tempfile from pathlib import Path, PurePath -from py3nvml import get_num_procs from typing import ( Any, Dict, @@ -22,6 +21,7 @@ Literal, ) + if TYPE_CHECKING: import numpy as np import pandas as pd @@ -52,7 +52,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio -from .monitor_system import MonitorCPU, MonitorGPU +from .monitor_system import GPU_AVAILABLE, MonitorCPU, MonitorGPU from .utils import ( StrPath, catch_and_warn, @@ -180,8 +180,7 @@ def __init__( if monitor_system: self._monitor_cpu = MonitorCPU() self._monitor_cpu(self) - - if len(get_num_procs()): + if GPU_AVAILABLE: self._monitor_gpu = MonitorGPU() self._monitor_gpu(self) diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 4725066d..78710b40 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -6,18 +6,24 @@ from statistics import mean from threading import Event, Thread from funcy import merge_with -from py3nvml.py3nvml import ( - nvmlInit, - nvmlDeviceGetCount, - nvmlDeviceGetHandleByIndex, - nvmlDeviceGetMemoryInfo, - nvmlDeviceGetUtilizationRates, - nvmlShutdown, -) from .error import InvalidDataTypeError +try: + from py3nvml.py3nvml import ( + nvmlInit, + nvmlDeviceGetCount, + nvmlDeviceGetHandleByIndex, + nvmlDeviceGetMemoryInfo, + nvmlDeviceGetUtilizationRates, + nvmlShutdown, + ) + + GPU_AVAILABLE = True +except ImportError: + GPU_AVAILABLE = False + logger = logging.getLogger("dvclive") MEGABYTES_DIVIDER = 1024.0**2 GIGABYTES_DIVIDER = 1024.0**3 diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index d0fbffbf..73d0e155 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -126,9 +126,9 @@ def test_monitor_system(tmp_dir, mocker): ) -def mock_py3nvml(mocker, nb_gpus=2, crash_index=None): - mocker.patch("dvclive.live.get_num_procs", return_value=[1 for _ in range(nb_gpus)]) - mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=nb_gpus) +def mock_py3nvml(mocker, num_gpus=2, crash_index=None): + mocker.patch("dvclive.monitor_system.GPU_AVAILABLE", return_value=num_gpus) + mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=num_gpus) mocked_memory_info = mocker.MagicMock() mocked_memory_info.used = 3 * 1024**3 @@ -152,7 +152,7 @@ def mock_py3nvml(mocker, nb_gpus=2, crash_index=None): def test_get_gpus_metrics_mocker(mocker, tmp_dir): - mock_py3nvml(mocker, nb_gpus=2) + mock_py3nvml(mocker, num_gpus=2) with Live( tmp_dir, save_dvc_exp=False, @@ -174,7 +174,7 @@ def test_get_gpus_metrics_mocker(mocker, tmp_dir): def test_monitor_gpu_system(tmp_dir, mocker): mock_psutil(mocker) - mock_py3nvml(mocker, nb_gpus=1) + mock_py3nvml(mocker, num_gpus=1) with Live( tmp_dir, save_dvc_exp=False, From b44881b0b794ef4b6ef134e179a2c8e0bb65f365 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 16 Feb 2024 09:56:01 +0100 Subject: [PATCH 28/39] replace py3nvml to pynvml and add setter to monitor_gpu --- pyproject.toml | 2 +- src/dvclive/live.py | 11 +++++++++++ src/dvclive/monitor_system.py | 2 +- tests/test_monitor_system.py | 8 +++++--- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e425e8eb..4366ab64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "ruamel.yaml", "scmrepo", "psutil", - "py3nvml" + "pynvml" ] [project.optional-dependencies] diff --git a/src/dvclive/live.py b/src/dvclive/live.py index c0a85274..e1813ddc 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -413,6 +413,17 @@ def monitor_cpu(self, monitor_cpu: MonitorCPU) -> None: self._monitor_cpu = monitor_cpu self._monitor_cpu(self) + @property + def monitor_gpu(self) -> Optional[MonitorGPU]: + return self._monitor_gpu or None + + @monitor_gpu.setter + def monitor_gpu(self, monitor_gpu: MonitorGPU) -> None: + if self._monitor_gpu is not None: + self._monitor_gpu.end() + self._monitor_gpu = monitor_gpu + self._monitor_gpu(self) + def sync(self): self.make_summary() diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 78710b40..33352008 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -11,7 +11,7 @@ try: - from py3nvml.py3nvml import ( + from pynvml import ( nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 73d0e155..5165cde3 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -126,9 +126,11 @@ def test_monitor_system(tmp_dir, mocker): ) -def mock_py3nvml(mocker, num_gpus=2, crash_index=None): +def mock_pynvml(mocker, num_gpus=2, crash_index=None): mocker.patch("dvclive.monitor_system.GPU_AVAILABLE", return_value=num_gpus) mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=num_gpus) + mocker.patch("dvclive.monitor_system.nvmlInit", return_value=None) + mocker.patch("dvclive.monitor_system.nvmlShutdown", return_value=None) mocked_memory_info = mocker.MagicMock() mocked_memory_info.used = 3 * 1024**3 @@ -152,7 +154,7 @@ def mock_py3nvml(mocker, num_gpus=2, crash_index=None): def test_get_gpus_metrics_mocker(mocker, tmp_dir): - mock_py3nvml(mocker, num_gpus=2) + mock_pynvml(mocker, num_gpus=2) with Live( tmp_dir, save_dvc_exp=False, @@ -174,7 +176,7 @@ def test_get_gpus_metrics_mocker(mocker, tmp_dir): def test_monitor_gpu_system(tmp_dir, mocker): mock_psutil(mocker) - mock_py3nvml(mocker, num_gpus=1) + mock_pynvml(mocker, num_gpus=1) with Live( tmp_dir, save_dvc_exp=False, From 8654aee32e2b4cf01ed1c60c761368808d24e870 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 16 Feb 2024 11:46:30 +0100 Subject: [PATCH 29/39] change error types and `monitor_cpu` to `cpu_monitor` --- src/dvclive/live.py | 40 ++++++++------------ src/dvclive/monitor_system.py | 69 ++++++++++++++++++++--------------- tests/test_monitor_system.py | 39 +++++++++----------- 3 files changed, 73 insertions(+), 75 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 05ff9943..b71f3b6c 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -9,17 +9,7 @@ import tempfile from pathlib import Path, PurePath -from typing import ( - Any, - Dict, - List, - Optional, - Set, - Tuple, - Union, - TYPE_CHECKING, - Literal, -) +from typing import Any, Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING, Literal if TYPE_CHECKING: import numpy as np @@ -51,7 +41,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio -from .monitor_system import MonitorCPU +from .monitor_system import CPUMonitor from .utils import ( StrPath, catch_and_warn, @@ -174,10 +164,10 @@ def __init__( self._dvc_studio_config: Dict[str, Any] = {} self._init_studio() - self._monitor_cpu = None + self._cpu_monitor = None if monitor_system: - self._monitor_cpu = MonitorCPU() - self._monitor_cpu(self) + self._cpu_monitor = CPUMonitor() + self._cpu_monitor(self) def _init_resume(self): self._read_params() @@ -398,15 +388,15 @@ def step(self, value: int) -> None: logger.debug(f"Step: {self.step}") @property - def monitor_cpu(self) -> Optional[MonitorCPU]: - return self._monitor_cpu or None + def cpu_monitor(self) -> Optional[CPUMonitor]: + return self._cpu_monitor or None - @monitor_cpu.setter - def monitor_cpu(self, monitor_cpu: MonitorCPU) -> None: - if self._monitor_cpu is not None: - self._monitor_cpu.end() - self._monitor_cpu = monitor_cpu - self._monitor_cpu(self) + @cpu_monitor.setter + def cpu_monitor(self, cpu_monitor: CPUMonitor) -> None: + if self._cpu_monitor is not None: + self._cpu_monitor.end() + self._cpu_monitor = cpu_monitor + self._cpu_monitor(self) def sync(self): self.make_summary() @@ -897,8 +887,8 @@ def end(self): self.step = self.summary["step"] # Kill threads that monitor the system metrics - if self._monitor_cpu is not None: - self._monitor_cpu.end() + if self._cpu_monitor is not None: + self._cpu_monitor.end() self.sync() diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 6820a1f2..67a1fc29 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -1,3 +1,4 @@ +import abc import logging from typing import Dict, Union, Optional, List, Tuple from pathlib import Path @@ -7,8 +8,6 @@ from threading import Event, Thread from funcy import merge_with -from .error import InvalidDataTypeError - logger = logging.getLogger("dvclive") MEGABYTES_DIVIDER = 1024.0**2 @@ -17,10 +16,10 @@ MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20 -class _MonitorSystem: +class _SystemMonitor(abc.ABC): """ Monitor system resources and log them to DVC Live. - Use a separate thread to call a `_get_metrics` function at fix interval and + Use a separate thread to call a `_get_metrics` function at fix interval and aggregate the results of this sampling using the average. """ @@ -33,11 +32,19 @@ def __init__( plot: bool = True, ): if not isinstance(interval, (int, float)): - raise InvalidDataTypeError("MonitorSystem.interval", type(interval)) + raise TypeError( # noqa: TRY003 + "System monitoring `interval` should be an `int` or a `float`, but got " + f"{type(interval)}" + ) if not isinstance(num_samples, int): - raise InvalidDataTypeError("MonitorSystem.sample", type(num_samples)) + raise TypeError( # noqa: TRY003 + "System monitoring `num_samples` should be an `int`, but got " + f"{type(num_samples)}" + ) if not isinstance(plot, bool): - raise InvalidDataTypeError("MonitorSystem.plot", type(plot)) + raise TypeError( # noqa: TRY003 + f"System monitoring `plot` should be a `bool`, but got {type(plot)}" + ) self._interval = interval # seconds self._nb_samples = num_samples @@ -78,14 +85,15 @@ def _monitoring_loop(self): plot=None if blacklisted else self._plot, ) - def _get_metrics(self): + @abc.abstractmethod + def _get_metrics(self) -> Dict[str, Union[float, int]]: pass def end(self): self._shutdown_event.set() -class MonitorCPU(_MonitorSystem): +class CPUMonitor(_SystemMonitor): _plot_blacklist_prefix: Tuple = ( "system/cpu/count", "system/ram/total (GB)", @@ -96,7 +104,7 @@ def __init__( self, interval: float = 0.5, num_samples: int = 10, - directories_to_monitor: Optional[List[str]] = None, + disks_to_monitor: Optional[List[str]] = None, plot: bool = True, ): """Monitor CPU resources and log them to DVC Live. @@ -105,25 +113,23 @@ def __init__( interval (float): interval in seconds between two measurements. Defaults to 0.5. num_samples (int): number of samples to average. Defaults to 10. - directories_to_monitor (Optional[List[str]]): paths of the directories that - needs to be monitor for disk storage metrics. Defaults to the current - directory. + disks_to_monitor (Optional[List[str]]): paths to the disks or partitions to + monitor disk usage statistics. Defaults to "/". plot (bool): should the system metrics be saved as plots. Defaults to True. Raises: - InvalidDataTypeError: if the arguments passed to the function don't have a + TypeError: if the arguments passed to the function don't have a supported type. """ super().__init__(interval=interval, num_samples=num_samples, plot=plot) - directories_to_monitor = ( - ["."] if directories_to_monitor is None else directories_to_monitor - ) - for idx, path in enumerate(directories_to_monitor): + disks_to_monitor = ["/"] if disks_to_monitor is None else disks_to_monitor + for idx, path in enumerate(disks_to_monitor): if not isinstance(path, str): - raise InvalidDataTypeError( - f"MonitorCPU.directories_to_monitor[{idx}]", type(path) + raise TypeError( # noqa: TRY003 + "CPU monitoring `partitions_to_monitor` should be a `List[str]`, " + f"but got {type(path)} at position {idx}" ) - self._directories_to_monitor = directories_to_monitor + self._disks_to_monitor = disks_to_monitor def _get_metrics(self) -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() @@ -146,13 +152,18 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: * (ram_info.total / GIGABYTES_DIVIDER), "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER, } - for idx, directory in enumerate(self._directories_to_monitor): - if not Path(directory).exists(): + for disk_name in self._disks_to_monitor: + if not Path(disk_name).exists(): continue - disk_info = psutil.disk_usage(directory) - result[f"system/disk/usage (%)/{idx}"] = disk_info.percent - result[f"system/disk/usage (GB)/{idx}"] = disk_info.used / GIGABYTES_DIVIDER - result[f"system/disk/total (GB)/{idx}"] = ( - disk_info.total / GIGABYTES_DIVIDER - ) + disk_info = psutil.disk_usage(disk_name) + disk_path = Path(disk_name).as_posix().lstrip("/") + disk_metrics = { + f"system/disk/usage (%)/{disk_path}": disk_info.percent, + f"system/disk/usage (GB)/{disk_path}": disk_info.used + / GIGABYTES_DIVIDER, + f"system/disk/total (GB)/{disk_path}": disk_info.total + / GIGABYTES_DIVIDER, + } + disk_metrics = {k.rstrip("/"): v for k, v in disk_metrics.items()} + result.update(disk_metrics) return result diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 9653c2e1..1ddee6ec 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -2,7 +2,7 @@ from pathlib import Path from dvclive import Live -from dvclive.monitor_system import MonitorCPU +from dvclive.monitor_system import CPUMonitor from dvclive.utils import parse_metrics @@ -40,7 +40,7 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - monitor = MonitorCPU(directories_to_monitor=["/", "/"]) + monitor = CPUMonitor(disks_to_monitor=["/", "/home"]) monitor(live) metrics = monitor._get_metrics() monitor.end() @@ -51,32 +51,32 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker): assert "system/ram/usage (%)" in metrics assert "system/ram/usage (GB)" in metrics assert "system/ram/total (GB)" in metrics - assert "system/disk/usage (%)/0" in metrics - assert "system/disk/usage (%)/1" in metrics - assert "system/disk/usage (GB)/0" in metrics - assert "system/disk/usage (GB)/1" in metrics - assert "system/disk/total (GB)/0" in metrics - assert "system/disk/total (GB)/1" in metrics + assert "system/disk/usage (%)" in metrics + assert "system/disk/usage (%)/home" in metrics + assert "system/disk/usage (GB)" in metrics + assert "system/disk/usage (GB)/home" in metrics + assert "system/disk/total (GB)" in metrics + assert "system/disk/total (GB)/home" in metrics -def test_ignore_missing_directories(tmp_dir, mocker): +def test_ignore_non_existent_directories(tmp_dir, mocker): mock_psutil(mocker) with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - missing_directories = "______" - monitor = MonitorCPU(directories_to_monitor=["/", missing_directories]) + non_existent_directories = "/non-existent" + monitor = CPUMonitor(disks_to_monitor=["/", non_existent_directories]) monitor(live) metrics = monitor._get_metrics() monitor.end() - assert not Path(missing_directories).exists() + assert not Path(non_existent_directories).exists() - assert "system/disk/usage (%)/1" not in metrics - assert "system/disk/usage (GB)/1" not in metrics - assert "system/disk/total (GB)/1" not in metrics + assert "system/disk/usage (%)/non-existent" not in metrics + assert "system/disk/usage (GB)/non-existent" not in metrics + assert "system/disk/total (GB)/non-existent" not in metrics def test_monitor_system(tmp_dir, mocker): @@ -103,7 +103,6 @@ def test_monitor_system(tmp_dir, mocker): assert "total (GB)" in latest["system"]["ram"] assert "disk" in latest["system"] assert "usage (%)" in latest["system"]["disk"] - assert "0" in latest["system"]["disk"]["usage (%)"] assert "usage (GB)" in latest["system"]["disk"] assert "total (GB)" in latest["system"]["disk"] @@ -114,13 +113,11 @@ def test_monitor_system(tmp_dir, mocker): ) assert any(str(Path("system/ram/usage (%).tsv")) in key for key in timeseries) assert any(str(Path("system/ram/usage (GB).tsv")) in key for key in timeseries) - assert any(str(Path("system/disk/usage (%)/0.tsv")) in key for key in timeseries) - assert any(str(Path("system/disk/usage (GB)/0.tsv")) in key for key in timeseries) + assert any(str(Path("system/disk/usage (%).tsv")) in key for key in timeseries) + assert any(str(Path("system/disk/usage (GB).tsv")) in key for key in timeseries) assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key) # blacklisted timeseries assert all(str(Path("system/cpu/count.tsv")) not in key for key in timeseries) assert all(str(Path("system/ram/total (GB).tsv")) not in key for key in timeseries) - assert all( - str(Path("system/disk/total (GB)/0.tsv")) not in key for key in timeseries - ) + assert all(str(Path("system/disk/total (GB).tsv")) not in key for key in timeseries) From f0d6234e157bd8fa64ad60b282c0e3ef7eb2383a Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 16 Feb 2024 15:18:12 +0100 Subject: [PATCH 30/39] add unit tests about _num_points_sent_to_studio behavior --- pyproject.toml | 6 +- src/dvclive/monitor_system.py | 52 ++++++---- tests/test_monitor_system.py | 172 ++++++++++++++++++++++++---------- 3 files changed, 162 insertions(+), 68 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 62294cfa..68fd9dce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,12 +52,14 @@ tests = [ "pytest-cov>=3.0.0,<4.0", "pytest-mock>=3.8.2,<4.0", "dvclive[image,plots,markdown]", - "ipython" + "ipython", + "pytest_voluptuous" ] dev = [ "dvclive[all,tests]", "mypy>=1.1.1", - "types-PyYAML" + "types-PyYAML", + "voluptuous" ] mmcv = ["mmcv"] tf = ["tensorflow"] diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 67a1fc29..bb9fbfa0 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -1,6 +1,7 @@ import abc import logging -from typing import Dict, Union, Optional, List, Tuple +import os +from typing import Dict, Union, Optional, Tuple from pathlib import Path import psutil @@ -34,16 +35,16 @@ def __init__( if not isinstance(interval, (int, float)): raise TypeError( # noqa: TRY003 "System monitoring `interval` should be an `int` or a `float`, but got " - f"{type(interval)}" + f"'{type(interval)}'" ) if not isinstance(num_samples, int): raise TypeError( # noqa: TRY003 "System monitoring `num_samples` should be an `int`, but got " - f"{type(num_samples)}" + f"'{type(num_samples)}'" ) if not isinstance(plot, bool): raise TypeError( # noqa: TRY003 - f"System monitoring `plot` should be a `bool`, but got {type(plot)}" + f"System monitoring `plot` should be a `bool`, but got '{type(plot)}'" ) self._interval = interval # seconds @@ -104,7 +105,7 @@ def __init__( self, interval: float = 0.5, num_samples: int = 10, - disks_to_monitor: Optional[List[str]] = None, + disks_to_monitor: Optional[Dict[str, str]] = None, plot: bool = True, ): """Monitor CPU resources and log them to DVC Live. @@ -113,8 +114,10 @@ def __init__( interval (float): interval in seconds between two measurements. Defaults to 0.5. num_samples (int): number of samples to average. Defaults to 10. - disks_to_monitor (Optional[List[str]]): paths to the disks or partitions to - monitor disk usage statistics. Defaults to "/". + disks_to_monitor (Optional[Dict[str, str]]): paths to the disks or + partitions to monitor disk usage statistics. The key is the name that + will be displayed in the metric name. The value is the path to the disk + or partition. Defaults to {"main": "/"}. plot (bool): should the system metrics be saved as plots. Defaults to True. Raises: @@ -122,12 +125,24 @@ def __init__( supported type. """ super().__init__(interval=interval, num_samples=num_samples, plot=plot) - disks_to_monitor = ["/"] if disks_to_monitor is None else disks_to_monitor - for idx, path in enumerate(disks_to_monitor): - if not isinstance(path, str): + disks_to_monitor = ( + {"main": "/"} if disks_to_monitor is None else disks_to_monitor + ) + for disk_name, disk_path in disks_to_monitor.items(): + if not isinstance(disk_name, str): raise TypeError( # noqa: TRY003 - "CPU monitoring `partitions_to_monitor` should be a `List[str]`, " - f"but got {type(path)} at position {idx}" + "Keys for `partitions_to_monitor` should be a `str`" + f", but got '{type(disk_name)}'" + ) + if not isinstance(disk_path, str): + raise TypeError( # noqa: TRY003 + "Value for `partitions_to_monitor` should be a `str`" + f", but got '{type(disk_path)}'" + ) + if disk_name != os.path.normpath(disk_name): + raise ValueError( # noqa: TRY003 + "Keys for `partitions_to_monitor` should be a valid name" + f", but got '{disk_name}'" ) self._disks_to_monitor = disks_to_monitor @@ -152,16 +167,15 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: * (ram_info.total / GIGABYTES_DIVIDER), "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER, } - for disk_name in self._disks_to_monitor: - if not Path(disk_name).exists(): + for disk_name, disk_path in self._disks_to_monitor.items(): + if not Path(disk_path).exists(): continue - disk_info = psutil.disk_usage(disk_name) - disk_path = Path(disk_name).as_posix().lstrip("/") + disk_info = psutil.disk_usage(disk_path) disk_metrics = { - f"system/disk/usage (%)/{disk_path}": disk_info.percent, - f"system/disk/usage (GB)/{disk_path}": disk_info.used + f"system/disk/usage (%)/{disk_name}": disk_info.percent, + f"system/disk/usage (GB)/{disk_name}": disk_info.used / GIGABYTES_DIVIDER, - f"system/disk/total (GB)/{disk_path}": disk_info.total + f"system/disk/total (GB)/{disk_name}": disk_info.total / GIGABYTES_DIVIDER, } disk_metrics = {k.rstrip("/"): v for k, v in disk_metrics.items()} diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 1ddee6ec..798471e7 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -5,6 +5,9 @@ from dvclive.monitor_system import CPUMonitor from dvclive.utils import parse_metrics +from voluptuous import Union +from pytest_voluptuous import S + def mock_psutil(mocker): mocker.patch( @@ -40,23 +43,53 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - monitor = CPUMonitor(disks_to_monitor=["/", "/home"]) + monitor = CPUMonitor(disks_to_monitor={"main": "/", "home": "/"}) monitor(live) metrics = monitor._get_metrics() monitor.end() - assert "system/cpu/usage (%)" in metrics - assert "system/cpu/count" in metrics - assert "system/cpu/parallelization (%)" in metrics - assert "system/ram/usage (%)" in metrics - assert "system/ram/usage (GB)" in metrics - assert "system/ram/total (GB)" in metrics - assert "system/disk/usage (%)" in metrics - assert "system/disk/usage (%)/home" in metrics - assert "system/disk/usage (GB)" in metrics - assert "system/disk/usage (GB)/home" in metrics - assert "system/disk/total (GB)" in metrics - assert "system/disk/total (GB)/home" in metrics + assert metrics == S( + { + "system/cpu/usage (%)": Union(int, float), + "system/cpu/count": int, + "system/cpu/parallelization (%)": Union(int, float), + "system/ram/usage (%)": Union(int, float), + "system/ram/usage (GB)": Union(int, float), + "system/ram/total (GB)": Union(int, float), + "system/disk/usage (%)/main": Union(int, float), + "system/disk/usage (%)/home": Union(int, float), + "system/disk/usage (GB)/main": Union(int, float), + "system/disk/usage (GB)/home": Union(int, float), + "system/disk/total (GB)/main": Union(int, float), + "system/disk/total (GB)/home": Union(int, float), + } + ) + + +def test_monitor_system_is_false(tmp_dir, mocker): + mock_psutil(mocker) + with Live( + tmp_dir, + save_dvc_exp=False, + monitor_system=False, + ) as live: + assert live._cpu_monitor is None + + +def test_monitor_system_is_true(tmp_dir, mocker): + mock_psutil(mocker) + with Live( + tmp_dir, + save_dvc_exp=False, + monitor_system=True, + ) as live: + cpu_monitor = live._cpu_monitor + assert isinstance(live._cpu_monitor, CPUMonitor) + # start is called but end not called + assert cpu_monitor._shutdown_event._flag is False + + # end was called + assert cpu_monitor._shutdown_event._flag is True def test_ignore_non_existent_directories(tmp_dir, mocker): @@ -66,58 +99,103 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - non_existent_directories = "/non-existent" - monitor = CPUMonitor(disks_to_monitor=["/", non_existent_directories]) + non_existent_disk = "/non-existent" + monitor = CPUMonitor( + disks_to_monitor={"main": "/", "non-existent": non_existent_disk} + ) monitor(live) metrics = monitor._get_metrics() monitor.end() - assert not Path(non_existent_directories).exists() + assert not Path(non_existent_disk).exists() assert "system/disk/usage (%)/non-existent" not in metrics assert "system/disk/usage (GB)/non-existent" not in metrics assert "system/disk/total (GB)/non-existent" not in metrics -def test_monitor_system(tmp_dir, mocker): +def test_monitor_system_metrics(tmp_dir, mocker): mock_psutil(mocker) + interval = 0.05 + num_samples = 4 with Live( tmp_dir, save_dvc_exp=False, - monitor_system=True, + monitor_system=False, ) as live: - time.sleep(5 + 1) # allow the thread to finish + live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + time.sleep(interval * num_samples + interval) # log metrics once live.next_step() - time.sleep(5 + 1) # allow the thread to finish - timeseries, latest = parse_metrics(live) + time.sleep(interval * num_samples + interval) # log metrics twice + live.make_summary() + + _, latest = parse_metrics(live) # metrics.json file contains all the system metrics - assert "system" in latest - assert "cpu" in latest["system"] - assert "usage (%)" in latest["system"]["cpu"] - assert "count" in latest["system"]["cpu"] - assert "parallelization (%)" in latest["system"]["cpu"] - assert "ram" in latest["system"] - assert "usage (%)" in latest["system"]["ram"] - assert "usage (GB)" in latest["system"]["ram"] - assert "total (GB)" in latest["system"]["ram"] - assert "disk" in latest["system"] - assert "usage (%)" in latest["system"]["disk"] - assert "usage (GB)" in latest["system"]["disk"] - assert "total (GB)" in latest["system"]["disk"] + assert latest == S( + { + "step": Union(int, float), + "system": { + "cpu": { + "usage (%)": Union(int, float), + "count": Union(int, float), + "parallelization (%)": Union(int, float), + }, + "ram": { + "usage (%)": Union(int, float), + "usage (GB)": Union(int, float), + "total (GB)": Union(int, float), + }, + "disk": { + "usage (%)": {"main": Union(int, float)}, + "usage (GB)": {"main": Union(int, float)}, + "total (GB)": {"main": Union(int, float)}, + }, + }, + } + ) + + +def test_monitor_system_timeseries(tmp_dir, mocker): + mock_psutil(mocker) + interval = 0.05 + num_samples = 4 + with Live( + tmp_dir, + save_dvc_exp=False, + monitor_system=False, + ) as live: + live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + time.sleep(interval * num_samples + interval) # log metrics once + live.next_step() + time.sleep(interval * num_samples + interval) # log metrics twice + live.make_summary() + + timeseries, _ = parse_metrics(live) + + def timeserie_metric_schema(name): + return [{name: str, "timestamp": str, "step": str} for _ in range(2)] # timeseries contains all the system metrics - assert any(str(Path("system/cpu/usage (%).tsv")) in key for key in timeseries) - assert any( - str(Path("system/cpu/parallelization (%).tsv")) in key for key in timeseries + assert timeseries == S( + { + str( + Path(tmp_dir) / "plots/metrics" / "system/cpu/usage (%).tsv" + ): timeserie_metric_schema("usage (%)"), + str( + Path(tmp_dir) / "plots/metrics" / "system/cpu/parallelization (%).tsv" + ): timeserie_metric_schema("parallelization (%)"), + str( + Path(tmp_dir) / "plots/metrics" / "system/ram/usage (%).tsv" + ): timeserie_metric_schema("usage (%)"), + str( + Path(tmp_dir) / "plots/metrics" / "system/ram/usage (GB).tsv" + ): timeserie_metric_schema("usage (GB)"), + str( + Path(tmp_dir) / "plots/metrics" / "system/disk/usage (%)/main.tsv" + ): timeserie_metric_schema("main"), + str( + Path(tmp_dir) / "plots/metrics" / "system/disk/usage (GB)/main.tsv" + ): timeserie_metric_schema("main"), + } ) - assert any(str(Path("system/ram/usage (%).tsv")) in key for key in timeseries) - assert any(str(Path("system/ram/usage (GB).tsv")) in key for key in timeseries) - assert any(str(Path("system/disk/usage (%).tsv")) in key for key in timeseries) - assert any(str(Path("system/disk/usage (GB).tsv")) in key for key in timeseries) - assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key) - - # blacklisted timeseries - assert all(str(Path("system/cpu/count.tsv")) not in key for key in timeseries) - assert all(str(Path("system/ram/total (GB).tsv")) not in key for key in timeseries) - assert all(str(Path("system/disk/total (GB).tsv")) not in key for key in timeseries) From d2c1c84fdc15acdfbfac92b032caba8e3960b014 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Fri, 16 Feb 2024 17:08:15 +0100 Subject: [PATCH 31/39] use constant values for metrics names --- pyproject.toml | 1 + src/dvclive/monitor_system.py | 34 +++++++--- tests/test_monitor_system.py | 124 ++++++++++++++++++---------------- 3 files changed, 88 insertions(+), 71 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 68fd9dce..c622102d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "dvc>=3.33.3", "dvc-render>=1.0.0,<2", "dvc-studio-client>=0.17.1,<1", + "dpath", "funcy", "gto", "ruamel.yaml", diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index bb9fbfa0..43bd8786 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -16,6 +16,18 @@ MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20 +METRIC_CPU_COUNT = "system/cpu/count" +METRIC_CPU_USAGE_PERCENT = "system/cpu/usage (%)" +METRIC_CPU_PARALLELIZATION_PERCENT = "system/cpu/parallelization (%)" + +METRIC_RAM_USAGE_PERCENT = "system/ram/usage (%)" +METRIC_RAM_USAGE_GB = "system/ram/usage (GB)" +METRIC_RAM_TOTAL_GB = "system/ram/total (GB)" + +METRIC_DISK_USAGE_PERCENT = "system/disk/usage (%)" +METRIC_DISK_USAGE_GB = "system/disk/usage (GB)" +METRIC_DISK_TOTAL_GB = "system/disk/total (GB)" + class _SystemMonitor(abc.ABC): """ @@ -96,8 +108,8 @@ def end(self): class CPUMonitor(_SystemMonitor): _plot_blacklist_prefix: Tuple = ( - "system/cpu/count", - "system/ram/total (GB)", + METRIC_CPU_COUNT, + METRIC_RAM_TOTAL_GB, "system/disk/total (GB)", ) @@ -151,9 +163,9 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: nb_cpus = psutil.cpu_count() cpus_percent = psutil.cpu_percent(percpu=True) result = { - "system/cpu/usage (%)": mean(cpus_percent), - "system/cpu/count": nb_cpus, - "system/cpu/parallelization (%)": len( + METRIC_CPU_COUNT: nb_cpus, + METRIC_CPU_USAGE_PERCENT: mean(cpus_percent), + METRIC_CPU_PARALLELIZATION_PERCENT: len( [ percent for percent in cpus_percent @@ -162,20 +174,20 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: ) * 100 / nb_cpus, - "system/ram/usage (%)": ram_info.percent, - "system/ram/usage (GB)": (ram_info.percent / 100) + METRIC_RAM_USAGE_PERCENT: ram_info.percent, + METRIC_RAM_USAGE_GB: (ram_info.percent / 100) * (ram_info.total / GIGABYTES_DIVIDER), - "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER, + METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER, } for disk_name, disk_path in self._disks_to_monitor.items(): if not Path(disk_path).exists(): continue disk_info = psutil.disk_usage(disk_path) disk_metrics = { - f"system/disk/usage (%)/{disk_name}": disk_info.percent, - f"system/disk/usage (GB)/{disk_name}": disk_info.used + f"{METRIC_DISK_USAGE_PERCENT}/{disk_name}": disk_info.percent, + f"{METRIC_DISK_USAGE_GB}/{disk_name}": disk_info.used / GIGABYTES_DIVIDER, - f"system/disk/total (GB)/{disk_name}": disk_info.total + f"{METRIC_DISK_TOTAL_GB}/{disk_name}": disk_info.total / GIGABYTES_DIVIDER, } disk_metrics = {k.rstrip("/"): v for k, v in disk_metrics.items()} diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 798471e7..832c576f 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -1,8 +1,20 @@ import time from pathlib import Path +import dpath from dvclive import Live -from dvclive.monitor_system import CPUMonitor +from dvclive.monitor_system import ( + CPUMonitor, + METRIC_CPU_COUNT, + METRIC_CPU_USAGE_PERCENT, + METRIC_CPU_PARALLELIZATION_PERCENT, + METRIC_RAM_USAGE_PERCENT, + METRIC_RAM_USAGE_GB, + METRIC_RAM_TOTAL_GB, + METRIC_DISK_USAGE_PERCENT, + METRIC_DISK_USAGE_GB, + METRIC_DISK_TOTAL_GB, +) from dvclive.utils import parse_metrics from voluptuous import Union @@ -50,18 +62,18 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker): assert metrics == S( { - "system/cpu/usage (%)": Union(int, float), - "system/cpu/count": int, - "system/cpu/parallelization (%)": Union(int, float), - "system/ram/usage (%)": Union(int, float), - "system/ram/usage (GB)": Union(int, float), - "system/ram/total (GB)": Union(int, float), - "system/disk/usage (%)/main": Union(int, float), - "system/disk/usage (%)/home": Union(int, float), - "system/disk/usage (GB)/main": Union(int, float), - "system/disk/usage (GB)/home": Union(int, float), - "system/disk/total (GB)/main": Union(int, float), - "system/disk/total (GB)/home": Union(int, float), + METRIC_CPU_COUNT: int, + METRIC_CPU_USAGE_PERCENT: Union(int, float), + METRIC_CPU_PARALLELIZATION_PERCENT: Union(int, float), + METRIC_RAM_USAGE_PERCENT: Union(int, float), + METRIC_RAM_USAGE_GB: Union(int, float), + METRIC_RAM_TOTAL_GB: Union(int, float), + f"{METRIC_DISK_USAGE_PERCENT}/main": Union(int, float), + f"{METRIC_DISK_USAGE_PERCENT}/home": Union(int, float), + f"{METRIC_DISK_USAGE_GB}/main": Union(int, float), + f"{METRIC_DISK_USAGE_GB}/home": Union(int, float), + f"{METRIC_DISK_TOTAL_GB}/main": Union(int, float), + f"{METRIC_DISK_TOTAL_GB}/home": Union(int, float), } ) @@ -109,9 +121,9 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): assert not Path(non_existent_disk).exists() - assert "system/disk/usage (%)/non-existent" not in metrics - assert "system/disk/usage (GB)/non-existent" not in metrics - assert "system/disk/total (GB)/non-existent" not in metrics + assert f"{METRIC_DISK_USAGE_PERCENT}/non-existent" not in metrics + assert f"{METRIC_DISK_USAGE_GB}/non-existent" not in metrics + assert f"{METRIC_DISK_TOTAL_GB}/non-existent" not in metrics def test_monitor_system_metrics(tmp_dir, mocker): @@ -123,7 +135,7 @@ def test_monitor_system_metrics(tmp_dir, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples) time.sleep(interval * num_samples + interval) # log metrics once live.next_step() time.sleep(interval * num_samples + interval) # log metrics twice @@ -131,29 +143,22 @@ def test_monitor_system_metrics(tmp_dir, mocker): _, latest = parse_metrics(live) - # metrics.json file contains all the system metrics - assert latest == S( - { - "step": Union(int, float), - "system": { - "cpu": { - "usage (%)": Union(int, float), - "count": Union(int, float), - "parallelization (%)": Union(int, float), - }, - "ram": { - "usage (%)": Union(int, float), - "usage (GB)": Union(int, float), - "total (GB)": Union(int, float), - }, - "disk": { - "usage (%)": {"main": Union(int, float)}, - "usage (GB)": {"main": Union(int, float)}, - "total (GB)": {"main": Union(int, float)}, - }, - }, - } - ) + schema = {} + for name in [ + "step", + METRIC_CPU_COUNT, + METRIC_CPU_USAGE_PERCENT, + METRIC_CPU_PARALLELIZATION_PERCENT, + METRIC_RAM_USAGE_PERCENT, + METRIC_RAM_USAGE_GB, + METRIC_RAM_TOTAL_GB, + f"{METRIC_DISK_USAGE_PERCENT}/main", + f"{METRIC_DISK_USAGE_GB}/main", + f"{METRIC_DISK_TOTAL_GB}/main", + ]: + dpath.new(schema, name, Union(int, float)) + + assert latest == S(schema) def test_monitor_system_timeseries(tmp_dir, mocker): @@ -165,7 +170,7 @@ def test_monitor_system_timeseries(tmp_dir, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples) time.sleep(interval * num_samples + interval) # log metrics once live.next_step() time.sleep(interval * num_samples + interval) # log metrics twice @@ -173,29 +178,28 @@ def test_monitor_system_timeseries(tmp_dir, mocker): timeseries, _ = parse_metrics(live) - def timeserie_metric_schema(name): + def timeserie_schema(name): return [{name: str, "timestamp": str, "step": str} for _ in range(2)] # timeseries contains all the system metrics + prefix = Path(tmp_dir) / "plots/metrics" assert timeseries == S( { - str( - Path(tmp_dir) / "plots/metrics" / "system/cpu/usage (%).tsv" - ): timeserie_metric_schema("usage (%)"), - str( - Path(tmp_dir) / "plots/metrics" / "system/cpu/parallelization (%).tsv" - ): timeserie_metric_schema("parallelization (%)"), - str( - Path(tmp_dir) / "plots/metrics" / "system/ram/usage (%).tsv" - ): timeserie_metric_schema("usage (%)"), - str( - Path(tmp_dir) / "plots/metrics" / "system/ram/usage (GB).tsv" - ): timeserie_metric_schema("usage (GB)"), - str( - Path(tmp_dir) / "plots/metrics" / "system/disk/usage (%)/main.tsv" - ): timeserie_metric_schema("main"), - str( - Path(tmp_dir) / "plots/metrics" / "system/disk/usage (GB)/main.tsv" - ): timeserie_metric_schema("main"), + str(prefix / f"{METRIC_CPU_USAGE_PERCENT}.tsv"): timeserie_schema( + METRIC_CPU_USAGE_PERCENT.split("/")[-1] + ), + str(prefix / f"{METRIC_CPU_PARALLELIZATION_PERCENT}.tsv"): timeserie_schema( + METRIC_CPU_PARALLELIZATION_PERCENT.split("/")[-1] + ), + str(prefix / f"{METRIC_RAM_USAGE_PERCENT}.tsv"): timeserie_schema( + METRIC_RAM_USAGE_PERCENT.split("/")[-1] + ), + str(prefix / f"{METRIC_RAM_USAGE_GB}.tsv"): timeserie_schema( + METRIC_RAM_USAGE_GB.split("/")[-1] + ), + str(prefix / f"{METRIC_DISK_USAGE_PERCENT}/main.tsv"): timeserie_schema( + "main" + ), + str(prefix / f"{METRIC_DISK_USAGE_GB}/main.tsv"): timeserie_schema("main"), } ) From 7ab891500b37a1b17720fa8738b7c70d6a590d7f Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Mon, 19 Feb 2024 16:57:48 +0100 Subject: [PATCH 32/39] improve test and user inputs --- pyproject.toml | 8 +-- src/dvclive/monitor_system.py | 77 +++++++++++----------- tests/test_monitor_system.py | 116 +++++++++++++++++++--------------- 3 files changed, 106 insertions(+), 95 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c622102d..5c0544f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ dependencies = [ "dvc>=3.33.3", "dvc-render>=1.0.0,<2", "dvc-studio-client>=0.17.1,<1", - "dpath", "funcy", "gto", "ruamel.yaml", @@ -54,13 +53,14 @@ tests = [ "pytest-mock>=3.8.2,<4.0", "dvclive[image,plots,markdown]", "ipython", - "pytest_voluptuous" + "pytest_voluptuous", + "dpath", + "voluptuous" ] dev = [ "dvclive[all,tests]", "mypy>=1.1.1", - "types-PyYAML", - "voluptuous" + "types-PyYAML" ] mmcv = ["mmcv"] tf = ["tensorflow"] diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 43bd8786..6277ce26 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -2,7 +2,6 @@ import logging import os from typing import Dict, Union, Optional, Tuple -from pathlib import Path import psutil from statistics import mean @@ -40,23 +39,25 @@ class _SystemMonitor(abc.ABC): def __init__( self, - interval: float = 0.5, - num_samples: int = 10, + interval: float, + num_samples: int, plot: bool = True, ): - if not isinstance(interval, (int, float)): - raise TypeError( # noqa: TRY003 - "System monitoring `interval` should be an `int` or a `float`, but got " - f"'{type(interval)}'" + max_interval = 0.1 + if interval > max_interval: + interval = max_interval + logger.warning( + f"System monitoring `interval` should be less than {max_interval} " + f"seconds. Setting `interval` to {interval} seconds." ) - if not isinstance(num_samples, int): - raise TypeError( # noqa: TRY003 - "System monitoring `num_samples` should be an `int`, but got " - f"'{type(num_samples)}'" - ) - if not isinstance(plot, bool): - raise TypeError( # noqa: TRY003 - f"System monitoring `plot` should be a `bool`, but got '{type(plot)}'" + + min_num_samples = 1 + max_num_samples = 30 + if min_num_samples < num_samples < max_num_samples: + num_samples = max(min(num_samples, max_num_samples), min_num_samples) + logger.warning( + f"System monitoring `num_samples` should be between {min_num_samples} " + f"and {max_num_samples}. Setting `num_samples` to {num_samples}." ) self._interval = interval # seconds @@ -115,9 +116,9 @@ class CPUMonitor(_SystemMonitor): def __init__( self, - interval: float = 0.5, - num_samples: int = 10, - disks_to_monitor: Optional[Dict[str, str]] = None, + interval: float = 0.1, + num_samples: int = 20, + folders_to_monitor: Optional[Dict[str, str]] = None, plot: bool = True, ): """Monitor CPU resources and log them to DVC Live. @@ -126,37 +127,35 @@ def __init__( interval (float): interval in seconds between two measurements. Defaults to 0.5. num_samples (int): number of samples to average. Defaults to 10. - disks_to_monitor (Optional[Dict[str, str]]): paths to the disks or - partitions to monitor disk usage statistics. The key is the name that - will be displayed in the metric name. The value is the path to the disk - or partition. Defaults to {"main": "/"}. + folders_to_monitor (Optional[Dict[str, str]]): monitor disk usage + statistics about the partition which contains the given paths. The + statistics include total and used space in gygabytes and percent. + This argument expect a dict where the key is the name that will be used + in the metric's name and the value is the path to the folder to monitor. + Defaults to {"main": "/"}. plot (bool): should the system metrics be saved as plots. Defaults to True. Raises: - TypeError: if the arguments passed to the function don't have a + ValueError: if the arguments passed to the function don't have a supported type. """ super().__init__(interval=interval, num_samples=num_samples, plot=plot) - disks_to_monitor = ( - {"main": "/"} if disks_to_monitor is None else disks_to_monitor + folders_to_monitor = ( + {"main": "/"} if folders_to_monitor is None else folders_to_monitor ) - for disk_name, disk_path in disks_to_monitor.items(): - if not isinstance(disk_name, str): - raise TypeError( # noqa: TRY003 - "Keys for `partitions_to_monitor` should be a `str`" - f", but got '{type(disk_name)}'" - ) - if not isinstance(disk_path, str): - raise TypeError( # noqa: TRY003 - "Value for `partitions_to_monitor` should be a `str`" - f", but got '{type(disk_path)}'" - ) + self._disks_to_monitor = {} + for disk_name, disk_path in folders_to_monitor.items(): if disk_name != os.path.normpath(disk_name): raise ValueError( # noqa: TRY003 "Keys for `partitions_to_monitor` should be a valid name" - f", but got '{disk_name}'" + f", but got '{disk_name}'." ) - self._disks_to_monitor = disks_to_monitor + try: + psutil.disk_usage(disk_path) + except OSError: + logger.warning(f"Couldn't find partition '{disk_path}', ignoring it.") + continue + self._disks_to_monitor[disk_name] = disk_path def _get_metrics(self) -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() @@ -180,8 +179,6 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER, } for disk_name, disk_path in self._disks_to_monitor.items(): - if not Path(disk_path).exists(): - continue disk_info = psutil.disk_usage(disk_path) disk_metrics = { f"{METRIC_DISK_USAGE_PERCENT}/{disk_name}": disk_info.percent, diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 832c576f..ae06de70 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -1,6 +1,10 @@ import time from pathlib import Path +import pytest + import dpath +from voluptuous import Union +from pytest_voluptuous import S from dvclive import Live from dvclive.monitor_system import ( @@ -17,9 +21,6 @@ ) from dvclive.utils import parse_metrics -from voluptuous import Union -from pytest_voluptuous import S - def mock_psutil(mocker): mocker.patch( @@ -48,64 +49,75 @@ def mock_psutil(mocker): ) -def test_get_cpus_metrics_mocker(tmp_dir, mocker): - mock_psutil(mocker) - with Live( - tmp_dir, - save_dvc_exp=False, - monitor_system=False, - ) as live: - monitor = CPUMonitor(disks_to_monitor={"main": "/", "home": "/"}) - monitor(live) - metrics = monitor._get_metrics() - monitor.end() +def mock_psutil_with_oserror(mocker): + mocker.patch( + "dvclive.monitor_system.psutil.cpu_percent", + return_value=[10, 20, 30, 40, 50, 60], + ) + mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6) - assert metrics == S( - { - METRIC_CPU_COUNT: int, - METRIC_CPU_USAGE_PERCENT: Union(int, float), - METRIC_CPU_PARALLELIZATION_PERCENT: Union(int, float), - METRIC_RAM_USAGE_PERCENT: Union(int, float), - METRIC_RAM_USAGE_GB: Union(int, float), - METRIC_RAM_TOTAL_GB: Union(int, float), - f"{METRIC_DISK_USAGE_PERCENT}/main": Union(int, float), - f"{METRIC_DISK_USAGE_PERCENT}/home": Union(int, float), - f"{METRIC_DISK_USAGE_GB}/main": Union(int, float), - f"{METRIC_DISK_USAGE_GB}/home": Union(int, float), - f"{METRIC_DISK_TOTAL_GB}/main": Union(int, float), - f"{METRIC_DISK_TOTAL_GB}/home": Union(int, float), - } + mocked_virtual_memory = mocker.MagicMock() + mocked_virtual_memory.percent = 20 + mocked_virtual_memory.total = 4 * 1024**3 + + mocked_disk_usage = mocker.MagicMock() + mocked_disk_usage.used = 16 * 1024**3 + mocked_disk_usage.percent = 50 + mocked_disk_usage.total = 32 * 1024**3 + + mocker.patch( + "dvclive.monitor_system.psutil.virtual_memory", + return_value=mocked_virtual_memory, + ) + mocker.patch( + "dvclive.monitor_system.psutil.disk_usage", + side_effect=[ + mocked_disk_usage, + OSError, + mocked_disk_usage, + mocked_disk_usage, + ], ) def test_monitor_system_is_false(tmp_dir, mocker): mock_psutil(mocker) + + cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor") with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - assert live._cpu_monitor is None + assert live.cpu_monitor is None + + cpu_monitor_mock.assert_not_called() def test_monitor_system_is_true(tmp_dir, mocker): mock_psutil(mocker) + cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor", spec=CPUMonitor) + with Live( tmp_dir, save_dvc_exp=False, monitor_system=True, ) as live: - cpu_monitor = live._cpu_monitor - assert isinstance(live._cpu_monitor, CPUMonitor) - # start is called but end not called - assert cpu_monitor._shutdown_event._flag is False + cpu_monitor = live.cpu_monitor + + assert isinstance(cpu_monitor, CPUMonitor) + cpu_monitor_mock.assert_called_once() - # end was called - assert cpu_monitor._shutdown_event._flag is True + end_spy = mocker.spy(cpu_monitor, "end") + end_spy.assert_not_called() + + # check the monitoring thread is stopped + end_spy.assert_called_once() def test_ignore_non_existent_directories(tmp_dir, mocker): - mock_psutil(mocker) + mock_psutil_with_oserror(mocker) + with Live( tmp_dir, save_dvc_exp=False, @@ -113,7 +125,7 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): ) as live: non_existent_disk = "/non-existent" monitor = CPUMonitor( - disks_to_monitor={"main": "/", "non-existent": non_existent_disk} + folders_to_monitor={"main": "/", "non-existent": non_existent_disk} ) monitor(live) metrics = monitor._get_metrics() @@ -126,20 +138,20 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): assert f"{METRIC_DISK_TOTAL_GB}/non-existent" not in metrics +@pytest.mark.timeout(2) def test_monitor_system_metrics(tmp_dir, mocker): mock_psutil(mocker) - interval = 0.05 - num_samples = 4 with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples) - time.sleep(interval * num_samples + interval) # log metrics once + live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + # wait for the metrics to be logged. + # METRIC_DISK_TOTAL_GB is the last metric to be logged. + while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0: + time.sleep(0.001) live.next_step() - time.sleep(interval * num_samples + interval) # log metrics twice - live.make_summary() _, latest = parse_metrics(live) @@ -161,25 +173,27 @@ def test_monitor_system_metrics(tmp_dir, mocker): assert latest == S(schema) +@pytest.mark.timeout(2) def test_monitor_system_timeseries(tmp_dir, mocker): mock_psutil(mocker) - interval = 0.05 - num_samples = 4 with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples) - time.sleep(interval * num_samples + interval) # log metrics once + live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + + # wait for the metrics to be logged. + # METRIC_DISK_TOTAL_GB is the last metric to be logged. + while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0: + time.sleep(0.001) + live.next_step() - time.sleep(interval * num_samples + interval) # log metrics twice - live.make_summary() timeseries, _ = parse_metrics(live) def timeserie_schema(name): - return [{name: str, "timestamp": str, "step": str} for _ in range(2)] + return [{name: str, "timestamp": str, "step": str(0)}] # timeseries contains all the system metrics prefix = Path(tmp_dir) / "plots/metrics" From a8d1bfe0063e7a0e52e75d4822f6528ef06bc57e Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 20 Feb 2024 09:30:39 +0100 Subject: [PATCH 33/39] improve tests and error catching --- pyproject.toml | 3 +- src/dvclive/monitor_system.py | 37 ++++++----- tests/test_monitor_system.py | 114 ++++++++++++++++------------------ 3 files changed, 74 insertions(+), 80 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5c0544f2..aec95d71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,8 +54,7 @@ tests = [ "dvclive[image,plots,markdown]", "ipython", "pytest_voluptuous", - "dpath", - "voluptuous" + "dpath" ] dev = [ "dvclive[all,tests]", diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 6277ce26..69a7db6e 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -10,7 +10,6 @@ logger = logging.getLogger("dvclive") -MEGABYTES_DIVIDER = 1024.0**2 GIGABYTES_DIVIDER = 1024.0**3 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20 @@ -118,7 +117,7 @@ def __init__( self, interval: float = 0.1, num_samples: int = 20, - folders_to_monitor: Optional[Dict[str, str]] = None, + directories_to_monitor: Optional[Dict[str, str]] = None, plot: bool = True, ): """Monitor CPU resources and log them to DVC Live. @@ -127,12 +126,12 @@ def __init__( interval (float): interval in seconds between two measurements. Defaults to 0.5. num_samples (int): number of samples to average. Defaults to 10. - folders_to_monitor (Optional[Dict[str, str]]): monitor disk usage + directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage statistics about the partition which contains the given paths. The statistics include total and used space in gygabytes and percent. This argument expect a dict where the key is the name that will be used - in the metric's name and the value is the path to the folder to monitor. - Defaults to {"main": "/"}. + in the metric's name and the value is the path to the directory to + monitor. Defaults to {"main": "/"}. plot (bool): should the system metrics be saved as plots. Defaults to True. Raises: @@ -140,23 +139,20 @@ def __init__( supported type. """ super().__init__(interval=interval, num_samples=num_samples, plot=plot) - folders_to_monitor = ( - {"main": "/"} if folders_to_monitor is None else folders_to_monitor + directories_to_monitor = ( + {"main": "/"} if directories_to_monitor is None else directories_to_monitor ) self._disks_to_monitor = {} - for disk_name, disk_path in folders_to_monitor.items(): + for disk_name, disk_path in directories_to_monitor.items(): if disk_name != os.path.normpath(disk_name): raise ValueError( # noqa: TRY003 - "Keys for `partitions_to_monitor` should be a valid name" + "Keys for `directories_to_monitor` should be a valid name" f", but got '{disk_name}'." ) - try: - psutil.disk_usage(disk_path) - except OSError: - logger.warning(f"Couldn't find partition '{disk_path}', ignoring it.") - continue self._disks_to_monitor[disk_name] = disk_path + self._warn_disk_doesnt_exist: Dict[str, bool] = {} + def _get_metrics(self) -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() nb_cpus = psutil.cpu_count() @@ -174,12 +170,19 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: * 100 / nb_cpus, METRIC_RAM_USAGE_PERCENT: ram_info.percent, - METRIC_RAM_USAGE_GB: (ram_info.percent / 100) - * (ram_info.total / GIGABYTES_DIVIDER), + METRIC_RAM_USAGE_GB: ram_info.used / GIGABYTES_DIVIDER, METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER, } for disk_name, disk_path in self._disks_to_monitor.items(): - disk_info = psutil.disk_usage(disk_path) + try: + disk_info = psutil.disk_usage(disk_path) + except OSError: + if self._warn_disk_doesnt_exist.get(disk_name, True): + logger.warning( + f"Couldn't find directory '{disk_path}', ignoring it." + ) + self._warn_disk_doesnt_exist[disk_name] = False + continue disk_metrics = { f"{METRIC_DISK_USAGE_PERCENT}/{disk_name}": disk_info.percent, f"{METRIC_DISK_USAGE_GB}/{disk_name}": disk_info.used diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index ae06de70..49011639 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -3,7 +3,6 @@ import pytest import dpath -from voluptuous import Union from pytest_voluptuous import S from dvclive import Live @@ -18,71 +17,57 @@ METRIC_DISK_USAGE_PERCENT, METRIC_DISK_USAGE_GB, METRIC_DISK_TOTAL_GB, + GIGABYTES_DIVIDER, ) from dvclive.utils import parse_metrics -def mock_psutil(mocker): +def mock_psutil_cpu(mocker): mocker.patch( "dvclive.monitor_system.psutil.cpu_percent", - return_value=[10, 20, 30, 40, 50, 60], + return_value=[10, 10, 10, 40, 50, 60], ) mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6) - mocked_virtual_memory = mocker.MagicMock() - mocked_virtual_memory.percent = 20 - mocked_virtual_memory.total = 4 * 1024**3 - - mocked_disk_usage = mocker.MagicMock() - mocked_disk_usage.used = 16 * 1024**3 - mocked_disk_usage.percent = 50 - mocked_disk_usage.total = 32 * 1024**3 - - mocking_dict = { - "virtual_memory": mocked_virtual_memory, - "disk_usage": mocked_disk_usage, - } - for function_name, return_value in mocking_dict.items(): - mocker.patch( - f"dvclive.monitor_system.psutil.{function_name}", - return_value=return_value, - ) - -def mock_psutil_with_oserror(mocker): +def mock_psutil_ram(mocker): + mocked_ram = mocker.MagicMock() + mocked_ram.percent = 50 + mocked_ram.used = 2 * GIGABYTES_DIVIDER + mocked_ram.total = 4 * GIGABYTES_DIVIDER mocker.patch( - "dvclive.monitor_system.psutil.cpu_percent", - return_value=[10, 20, 30, 40, 50, 60], + "dvclive.monitor_system.psutil.virtual_memory", return_value=mocked_ram ) - mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6) - mocked_virtual_memory = mocker.MagicMock() - mocked_virtual_memory.percent = 20 - mocked_virtual_memory.total = 4 * 1024**3 - mocked_disk_usage = mocker.MagicMock() - mocked_disk_usage.used = 16 * 1024**3 - mocked_disk_usage.percent = 50 - mocked_disk_usage.total = 32 * 1024**3 +def mock_psutil_disk(mocker): + mocked_disk = mocker.MagicMock() + mocked_disk.percent = 50 + mocked_disk.used = 16 * GIGABYTES_DIVIDER + mocked_disk.total = 32 * GIGABYTES_DIVIDER + mocker.patch("dvclive.monitor_system.psutil.disk_usage", return_value=mocked_disk) - mocker.patch( - "dvclive.monitor_system.psutil.virtual_memory", - return_value=mocked_virtual_memory, - ) + +def mock_psutil_disk_with_oserror(mocker): + mocked_disk = mocker.MagicMock() + mocked_disk.percent = 50 + mocked_disk.used = 16 * GIGABYTES_DIVIDER + mocked_disk.total = 32 * GIGABYTES_DIVIDER mocker.patch( "dvclive.monitor_system.psutil.disk_usage", side_effect=[ - mocked_disk_usage, + mocked_disk, + OSError, + mocked_disk, OSError, - mocked_disk_usage, - mocked_disk_usage, ], ) def test_monitor_system_is_false(tmp_dir, mocker): - mock_psutil(mocker) - + mock_psutil_cpu(mocker) + mock_psutil_ram(mocker) + mock_psutil_disk(mocker) cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor") with Live( tmp_dir, @@ -95,7 +80,9 @@ def test_monitor_system_is_false(tmp_dir, mocker): def test_monitor_system_is_true(tmp_dir, mocker): - mock_psutil(mocker) + mock_psutil_cpu(mocker) + mock_psutil_ram(mocker) + mock_psutil_disk(mocker) cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor", spec=CPUMonitor) with Live( @@ -116,8 +103,9 @@ def test_monitor_system_is_true(tmp_dir, mocker): def test_ignore_non_existent_directories(tmp_dir, mocker): - mock_psutil_with_oserror(mocker) - + mock_psutil_cpu(mocker) + mock_psutil_ram(mocker) + mock_psutil_disk_with_oserror(mocker) with Live( tmp_dir, save_dvc_exp=False, @@ -125,7 +113,7 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): ) as live: non_existent_disk = "/non-existent" monitor = CPUMonitor( - folders_to_monitor={"main": "/", "non-existent": non_existent_disk} + directories_to_monitor={"main": "/", "non-existent": non_existent_disk} ) monitor(live) metrics = monitor._get_metrics() @@ -140,7 +128,9 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): @pytest.mark.timeout(2) def test_monitor_system_metrics(tmp_dir, mocker): - mock_psutil(mocker) + mock_psutil_cpu(mocker) + mock_psutil_ram(mocker) + mock_psutil_disk(mocker) with Live( tmp_dir, save_dvc_exp=False, @@ -156,26 +146,28 @@ def test_monitor_system_metrics(tmp_dir, mocker): _, latest = parse_metrics(live) schema = {} - for name in [ - "step", - METRIC_CPU_COUNT, - METRIC_CPU_USAGE_PERCENT, - METRIC_CPU_PARALLELIZATION_PERCENT, - METRIC_RAM_USAGE_PERCENT, - METRIC_RAM_USAGE_GB, - METRIC_RAM_TOTAL_GB, - f"{METRIC_DISK_USAGE_PERCENT}/main", - f"{METRIC_DISK_USAGE_GB}/main", - f"{METRIC_DISK_TOTAL_GB}/main", - ]: - dpath.new(schema, name, Union(int, float)) + for name, value in { + "step": 0, + METRIC_CPU_COUNT: 6, + METRIC_CPU_USAGE_PERCENT: 30.0, + METRIC_CPU_PARALLELIZATION_PERCENT: 50.0, + METRIC_RAM_USAGE_PERCENT: 50.0, + METRIC_RAM_USAGE_GB: 2.0, + METRIC_RAM_TOTAL_GB: 4.0, + f"{METRIC_DISK_USAGE_PERCENT}/main": 50.0, + f"{METRIC_DISK_USAGE_GB}/main": 16.0, + f"{METRIC_DISK_TOTAL_GB}/main": 32.0, + }.items(): + dpath.new(schema, name, value) assert latest == S(schema) @pytest.mark.timeout(2) def test_monitor_system_timeseries(tmp_dir, mocker): - mock_psutil(mocker) + mock_psutil_cpu(mocker) + mock_psutil_ram(mocker) + mock_psutil_disk(mocker) with Live( tmp_dir, save_dvc_exp=False, From 33590016daf0476dbea57e252dccf51aa3b8c0ee Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 20 Feb 2024 10:06:32 +0100 Subject: [PATCH 34/39] add docstring --- src/dvclive/live.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 4addceb3..f1a15f63 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -121,6 +121,8 @@ def __init__( provided string will be passed to `dvc exp save --message`. If DVCLive is used inside `dvc exp run`, the option will be ignored, use `dvc exp run --message` instead. + monitor_system (bool): if `True`, DVCLive will monitor CPU, ram, and disk + usage. Defaults to `False`. """ self.summary: Dict[str, Any] = {} From 3f94e241e900d2f8b36878d73ba2a2702c9b15fc Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Tue, 20 Feb 2024 18:27:45 +0100 Subject: [PATCH 35/39] merge cpu and gpu monitoring into a system monitor object general improvements in the code as well --- src/dvclive/live.py | 45 ++---- src/dvclive/monitor_system.py | 245 +++++++++++++++--------------- tests/test_monitor_system.py | 276 ++++++++++++++++++---------------- 3 files changed, 286 insertions(+), 280 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 3f1b2391..51501c0c 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -42,7 +42,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio -from .monitor_system import GPU_AVAILABLE, CPUMonitor, GPUMonitor +from .monitor_system import SystemMonitor from .utils import ( StrPath, catch_and_warn, @@ -169,14 +169,10 @@ def __init__( self._dvc_studio_config: Dict[str, Any] = {} self._init_studio() - self._monitor_cpu = None - self._monitor_gpu = None + self._system_monitor = None if monitor_system: - self._monitor_cpu = CPUMonitor() - self._monitor_cpu(self) - if GPU_AVAILABLE: - self._monitor_gpu = GPUMonitor() - self._monitor_gpu(self) + self._system_monitor = SystemMonitor() + self._system_monitor(self) def _init_resume(self): self._read_params() @@ -382,26 +378,15 @@ def step(self, value: int) -> None: logger.debug(f"Step: {self.step}") @property - def cpu_monitor(self) -> Optional[CPUMonitor]: - return self._monitor_cpu or None + def system_monitor(self) -> Optional[SystemMonitor]: + return self._system_monitor or None - @cpu_monitor.setter - def cpu_monitor(self, cpu_monitor: CPUMonitor) -> None: - if self._monitor_cpu is not None: - self._monitor_cpu.end() - self._monitor_cpu = cpu_monitor - self._monitor_cpu(self) - - @property - def monitor_gpu(self) -> Optional[GPUMonitor]: - return self._monitor_gpu or None - - @monitor_gpu.setter - def monitor_gpu(self, monitor_gpu: GPUMonitor) -> None: - if self._monitor_gpu is not None: - self._monitor_gpu.end() - self._monitor_gpu = monitor_gpu - self._monitor_gpu(self) + @system_monitor.setter + def system_monitor(self, system_monitor: SystemMonitor) -> None: + if self._system_monitor is not None: + self._system_monitor.end() + self._system_monitor = system_monitor + self._system_monitor(self) def sync(self): self.make_summary() @@ -892,10 +877,8 @@ def end(self): self.step = self.summary["step"] # Kill threads that monitor the system metrics - if self._monitor_cpu is not None: - self._monitor_cpu.end() - if self._monitor_gpu is not None: - self._monitor_gpu.end() + if self._system_monitor is not None: + self._system_monitor.end() self.sync() diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index efaa071b..df0293f9 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -1,4 +1,3 @@ -import abc import logging import os from typing import Dict, Union, Optional, Tuple @@ -17,6 +16,7 @@ nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates, nvmlShutdown, + NVMLError, ) GPU_AVAILABLE = True @@ -40,43 +40,98 @@ METRIC_DISK_USAGE_GB = "system/disk/usage (GB)" METRIC_DISK_TOTAL_GB = "system/disk/total (GB)" - -class _SystemMonitor(abc.ABC): - """ - Monitor system resources and log them to DVC Live. - Use a separate thread to call a `_get_metrics` function at fix interval and - aggregate the results of this sampling using the average. +METRIC_GPU_COUNT = "system/gpu/count" +METRIC_GPU_USAGE_PERCENT = "system/gpu/usage (%)" +METRIC_VRAM_USAGE_PERCENT = "system/vram/usage (%)" +METRIC_VRAM_USAGE_GB = "system/vram/usage (GB)" +METRIC_VRAM_TOTAL_GB = "system/vram/total (GB)" + + +class SystemMonitor: + """Monitor CPU, ram, and disk resources and log them to DVC Live. Monitor also the + GPU resources if available. + + Args: + interval (float): interval in seconds between two measurements. + Defaults to 0.05. + num_samples (int): number of samples to average. Defaults to 20. + directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage + statistics about the partition which contains the given paths. The + statistics include total and used space in gygabytes and percent. + This argument expect a dict where the key is the name that will be used + in the metric's name and the value is the path to the directory to + monitor. Defaults to {"main": "/"}. + plot (bool): should the system metrics be saved as plots. Defaults to True. + + Raises: + ValueError: if the keys in `directories_to_monitor` contains invalid characters + as defined by `os.path.normpath`. """ - _plot_blacklist_prefix: Tuple = () + _plot_blacklist_prefix: Tuple = ( + METRIC_CPU_COUNT, + METRIC_RAM_TOTAL_GB, + METRIC_DISK_TOTAL_GB, + METRIC_GPU_COUNT, + METRIC_VRAM_TOTAL_GB, + ) def __init__( self, - interval: float, - num_samples: int, + interval: float = 0.05, # seconds + num_samples: int = 20, + directories_to_monitor: Optional[Dict[str, str]] = None, plot: bool = True, ): - max_interval = 0.1 + self._interval = self._check_interval(interval, max_interval=0.1) + self._num_samples = self._check_num_samples( + num_samples, min_num_samples=1, max_num_samples=30 + ) + self._disks_to_monitor = self._check_directories_to_monitor( + directories_to_monitor + ) + self._plot = plot + self._warn_cpu_problem = True + self._warn_gpu_problem = True + self._warn_disk_doesnt_exist: Dict[str, bool] = {} + + def _check_interval(self, interval: float, max_interval: float) -> float: if interval > max_interval: - interval = max_interval logger.warning( f"System monitoring `interval` should be less than {max_interval} " - f"seconds. Setting `interval` to {interval} seconds." + f"seconds. Setting `interval` to {max_interval} seconds." ) + return max_interval + return interval + def _check_num_samples( + self, num_samples: int, min_num_samples: int, max_num_samples: int + ) -> int: min_num_samples = 1 max_num_samples = 30 - if min_num_samples < num_samples < max_num_samples: + if not min_num_samples < num_samples < max_num_samples: num_samples = max(min(num_samples, max_num_samples), min_num_samples) logger.warning( f"System monitoring `num_samples` should be between {min_num_samples} " f"and {max_num_samples}. Setting `num_samples` to {num_samples}." ) + return num_samples - self._interval = interval # seconds - self._nb_samples = num_samples - self._plot = plot - self._warn_user = True + def _check_directories_to_monitor( + self, directories_to_monitor: Optional[Dict[str, str]] + ) -> Dict[str, str]: + if directories_to_monitor is None: + return {"main": "/"} + + disks_to_monitor = {} + for disk_name, disk_path in directories_to_monitor.items(): + if disk_name != os.path.normpath(disk_name): + raise ValueError( # noqa: TRY003 + "Keys for `directories_to_monitor` should be a valid name" + f", but got '{disk_name}'." + ) + disks_to_monitor[disk_name] = disk_path + return disks_to_monitor def __call__(self, live): self._live = live @@ -88,14 +143,17 @@ def __call__(self, live): def _monitoring_loop(self): while not self._shutdown_event.is_set(): self._metrics = {} - for _ in range(self._nb_samples): - last_metrics = {} + for _ in range(self._num_samples): try: last_metrics = self._get_metrics() except psutil.Error: - if self._warn_user: + if self._warn_cpu_problem: logger.exception("Failed to monitor CPU metrics") - self._warn_user = False + self._warn_cpu_problem = False + except NVMLError: + if self._warn_gpu_problem: + logger.exception("Failed to monitor GPU metrics") + self._warn_gpu_problem = False self._metrics = merge_with(sum, self._metrics, last_metrics) self._shutdown_event.wait(self._interval) @@ -107,72 +165,34 @@ def _monitoring_loop(self): ) self._live.log_metric( name, - values / self._nb_samples, + values / self._num_samples, timestamp=True, plot=None if blacklisted else self._plot, ) - @abc.abstractmethod def _get_metrics(self) -> Dict[str, Union[float, int]]: - pass - - def end(self): - self._shutdown_event.set() - - -class CPUMonitor(_SystemMonitor): - _plot_blacklist_prefix: Tuple = ( - METRIC_CPU_COUNT, - METRIC_RAM_TOTAL_GB, - "system/disk/total (GB)", - ) - - def __init__( - self, - interval: float = 0.1, - num_samples: int = 20, - directories_to_monitor: Optional[Dict[str, str]] = None, - plot: bool = True, - ): - """Monitor CPU resources and log them to DVC Live. - - Args: - interval (float): interval in seconds between two measurements. - Defaults to 0.5. - num_samples (int): number of samples to average. Defaults to 10. - directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage - statistics about the partition which contains the given paths. The - statistics include total and used space in gygabytes and percent. - This argument expect a dict where the key is the name that will be used - in the metric's name and the value is the path to the directory to - monitor. Defaults to {"main": "/"}. - plot (bool): should the system metrics be saved as plots. Defaults to True. - - Raises: - ValueError: if the arguments passed to the function don't have a - supported type. - """ - super().__init__(interval=interval, num_samples=num_samples, plot=plot) - directories_to_monitor = ( - {"main": "/"} if directories_to_monitor is None else directories_to_monitor - ) - self._disks_to_monitor = {} - for disk_name, disk_path in directories_to_monitor.items(): - if disk_name != os.path.normpath(disk_name): - raise ValueError( # noqa: TRY003 - "Keys for `directories_to_monitor` should be a valid name" - f", but got '{disk_name}'." - ) - self._disks_to_monitor[disk_name] = disk_path - - self._warn_disk_doesnt_exist: Dict[str, bool] = {} + result = { + **self._get_cpu_info(), + **self._get_ram_info(), + **self._get_disk_info(), + } + if GPU_AVAILABLE: + result.update(self.__get_gpu_info()) + return result - def _get_metrics(self) -> Dict[str, Union[float, int]]: + def _get_ram_info(self) -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() - nb_cpus = psutil.cpu_count() + return { + METRIC_RAM_USAGE_PERCENT: ram_info.percent, + METRIC_RAM_USAGE_GB: ram_info.used / GIGABYTES_DIVIDER, + METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER, + } + + def _get_cpu_info(self) -> Dict[str, Union[float, int]]: + num_cpus = psutil.cpu_count() cpus_percent = psutil.cpu_percent(percpu=True) - result = { - METRIC_CPU_COUNT: nb_cpus, + return { + METRIC_CPU_COUNT: num_cpus, METRIC_CPU_USAGE_PERCENT: mean(cpus_percent), METRIC_CPU_PARALLELIZATION_PERCENT: len( [ @@ -182,11 +202,11 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: ] ) * 100 - / nb_cpus, - METRIC_RAM_USAGE_PERCENT: ram_info.percent, - METRIC_RAM_USAGE_GB: ram_info.used / GIGABYTES_DIVIDER, - METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER, + / num_cpus, } + + def _get_disk_info(self) -> Dict[str, Union[float, int]]: + result = {} for disk_name, disk_path in self._disks_to_monitor.items(): try: disk_info = psutil.disk_usage(disk_path) @@ -208,31 +228,7 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: result.update(disk_metrics) return result - -class GPUMonitor(_SystemMonitor): - _plot_blacklist_prefix = ("system/gpu/count", "system/vram/total (GB)") - - def __init__( - self, - interval: float = 0.5, - num_samples: int = 10, - plot: bool = True, - ): - """Monitor GPU resources and log them to DVC Live. - - Args: - interval (float): interval in seconds between two measurements. - Defaults to 0.5. - num_samples (int): number of samples to average. Defaults to 10. - plot (bool): should the system metrics be saved as plots. Defaults to True. - - Raises: - InvalidDataTypeError: if the arguments passed to the function don't have a - supported type. - """ - super().__init__(interval=interval, num_samples=num_samples, plot=plot) - - def _get_metrics(self) -> Dict[str, Union[float, int]]: + def __get_gpu_info(self) -> Dict[str, Union[float, int]]: nvmlInit() num_gpus = nvmlDeviceGetCount() gpu_metrics = { @@ -244,17 +240,26 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: memory_info = nvmlDeviceGetMemoryInfo(gpu_handle) usage_info = nvmlDeviceGetUtilizationRates(gpu_handle) - gpu_metrics[f"system/gpu/usage (%)/{gpu_idx}"] = ( - 100 * usage_info.memory / usage_info.gpu if usage_info.gpu else 0 - ) - gpu_metrics[f"system/vram/usage (%)/{gpu_idx}"] = ( - 100 * memory_info.used / memory_info.total - ) - gpu_metrics[f"system/vram/usage (GB)/{gpu_idx}"] = ( - memory_info.used / GIGABYTES_DIVIDER - ) - gpu_metrics[f"system/vram/total (GB)/{gpu_idx}"] = ( - memory_info.total / GIGABYTES_DIVIDER + gpu_metrics.update( + { + f"{METRIC_GPU_USAGE_PERCENT}/{gpu_idx}": ( + 100 * usage_info.memory / usage_info.gpu + if usage_info.gpu + else 0 + ), + f"{METRIC_VRAM_USAGE_PERCENT}/{gpu_idx}": ( + 100 * memory_info.used / memory_info.total + ), + f"{METRIC_VRAM_USAGE_GB}/{gpu_idx}": ( + memory_info.used / GIGABYTES_DIVIDER + ), + f"{METRIC_VRAM_TOTAL_GB}/{gpu_idx}": ( + memory_info.total / GIGABYTES_DIVIDER + ), + } ) nvmlShutdown() return gpu_metrics + + def end(self): + self._shutdown_event.set() diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 8f993659..2476a0f0 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -7,8 +7,7 @@ from dvclive import Live from dvclive.monitor_system import ( - CPUMonitor, - GPUMonitor, + SystemMonitor, METRIC_CPU_COUNT, METRIC_CPU_USAGE_PERCENT, METRIC_CPU_PARALLELIZATION_PERCENT, @@ -18,6 +17,11 @@ METRIC_DISK_USAGE_PERCENT, METRIC_DISK_USAGE_GB, METRIC_DISK_TOTAL_GB, + METRIC_GPU_COUNT, + METRIC_GPU_USAGE_PERCENT, + METRIC_VRAM_USAGE_PERCENT, + METRIC_VRAM_USAGE_GB, + METRIC_VRAM_TOTAL_GB, GIGABYTES_DIVIDER, ) from dvclive.utils import parse_metrics @@ -65,38 +69,115 @@ def mock_psutil_disk_with_oserror(mocker): ) +def mock_pynvml(mocker, num_gpus=2): + prefix = "dvclive.monitor_system" + mocker.patch(f"{prefix}.GPU_AVAILABLE", bool(num_gpus)) + mocker.patch(f"{prefix}.nvmlDeviceGetCount", return_value=num_gpus) + mocker.patch(f"{prefix}.nvmlInit", return_value=None) + mocker.patch(f"{prefix}.nvmlShutdown", return_value=None) + mocker.patch(f"{prefix}.nvmlDeviceGetHandleByIndex", return_value=None) + + vram_info = mocker.MagicMock() + vram_info.used = 3 * 1024**3 + vram_info.total = 6 * 1024**3 + + gpu_usage = mocker.MagicMock() + gpu_usage.memory = 5 + gpu_usage.gpu = 10 + + mocker.patch(f"{prefix}.nvmlDeviceGetMemoryInfo", return_value=vram_info) + mocker.patch(f"{prefix}.nvmlDeviceGetUtilizationRates", return_value=gpu_usage) + + +@pytest.fixture() +def cpu_metrics(): + content = { + METRIC_CPU_COUNT: 6, + METRIC_CPU_USAGE_PERCENT: 30.0, + METRIC_CPU_PARALLELIZATION_PERCENT: 50.0, + METRIC_RAM_USAGE_PERCENT: 50.0, + METRIC_RAM_USAGE_GB: 2.0, + METRIC_RAM_TOTAL_GB: 4.0, + f"{METRIC_DISK_USAGE_PERCENT}/main": 50.0, + f"{METRIC_DISK_USAGE_GB}/main": 16.0, + f"{METRIC_DISK_TOTAL_GB}/main": 32.0, + } + result = {} + for name, value in content.items(): + dpath.new(result, name, value) + return result + + +def _timeserie_schema(name, value): + return [{name: str(value), "timestamp": str, "step": "0"}] + + +@pytest.fixture() +def cpu_timeseries(): + return { + f"{METRIC_CPU_USAGE_PERCENT}.tsv": _timeserie_schema( + METRIC_CPU_USAGE_PERCENT.split("/")[-1], 30.0 + ), + f"{METRIC_CPU_PARALLELIZATION_PERCENT}.tsv": _timeserie_schema( + METRIC_CPU_PARALLELIZATION_PERCENT.split("/")[-1], 50.0 + ), + f"{METRIC_RAM_USAGE_PERCENT}.tsv": _timeserie_schema( + METRIC_RAM_USAGE_PERCENT.split("/")[-1], 50.0 + ), + f"{METRIC_RAM_USAGE_GB}.tsv": _timeserie_schema( + METRIC_RAM_USAGE_GB.split("/")[-1], 2.0 + ), + f"{METRIC_DISK_USAGE_PERCENT}/main.tsv": _timeserie_schema("main", 50.0), + f"{METRIC_DISK_USAGE_GB}/main.tsv": _timeserie_schema("main", 16.0), + } + + +@pytest.fixture() +def gpu_timeseries(): + return { + f"{METRIC_GPU_USAGE_PERCENT}/0.tsv": _timeserie_schema("0", 50.0), + f"{METRIC_GPU_USAGE_PERCENT}/1.tsv": _timeserie_schema("1", 50.0), + f"{METRIC_VRAM_USAGE_PERCENT}/0.tsv": _timeserie_schema("0", 50.0), + f"{METRIC_VRAM_USAGE_PERCENT}/1.tsv": _timeserie_schema("1", 50.0), + f"{METRIC_VRAM_USAGE_GB}/0.tsv": _timeserie_schema("0", 3.0), + f"{METRIC_VRAM_USAGE_GB}/1.tsv": _timeserie_schema("1", 3.0), + } + + def test_monitor_system_is_false(tmp_dir, mocker): mock_psutil_cpu(mocker) mock_psutil_ram(mocker) mock_psutil_disk(mocker) - cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor") + mock_pynvml(mocker, num_gpus=0) + system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor) with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - assert live.cpu_monitor is None + assert live.system_monitor is None - cpu_monitor_mock.assert_not_called() + system_monitor_mock.assert_not_called() def test_monitor_system_is_true(tmp_dir, mocker): mock_psutil_cpu(mocker) mock_psutil_ram(mocker) mock_psutil_disk(mocker) - cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor", spec=CPUMonitor) + mock_pynvml(mocker, num_gpus=0) + system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor) with Live( tmp_dir, save_dvc_exp=False, monitor_system=True, ) as live: - cpu_monitor = live.cpu_monitor + system_monitor = live.system_monitor - assert isinstance(cpu_monitor, CPUMonitor) - cpu_monitor_mock.assert_called_once() + assert isinstance(system_monitor, SystemMonitor) + system_monitor_mock.assert_called_once() - end_spy = mocker.spy(cpu_monitor, "end") + end_spy = mocker.spy(system_monitor, "end") end_spy.assert_not_called() # check the monitoring thread is stopped @@ -107,13 +188,14 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): mock_psutil_cpu(mocker) mock_psutil_ram(mocker) mock_psutil_disk_with_oserror(mocker) + mock_pynvml(mocker, num_gpus=0) with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: non_existent_disk = "/non-existent" - monitor = CPUMonitor( + monitor = SystemMonitor( directories_to_monitor={"main": "/", "non-existent": non_existent_disk} ) monitor(live) @@ -128,16 +210,17 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): @pytest.mark.timeout(2) -def test_monitor_system_metrics(tmp_dir, mocker): +def test_monitor_system_metrics(tmp_dir, cpu_metrics, mocker): mock_psutil_cpu(mocker) mock_psutil_ram(mocker) mock_psutil_disk(mocker) + mock_pynvml(mocker, num_gpus=0) with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) # wait for the metrics to be logged. # METRIC_DISK_TOTAL_GB is the last metric to be logged. while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0: @@ -146,35 +229,22 @@ def test_monitor_system_metrics(tmp_dir, mocker): _, latest = parse_metrics(live) - schema = {} - for name, value in { - "step": 0, - METRIC_CPU_COUNT: 6, - METRIC_CPU_USAGE_PERCENT: 30.0, - METRIC_CPU_PARALLELIZATION_PERCENT: 50.0, - METRIC_RAM_USAGE_PERCENT: 50.0, - METRIC_RAM_USAGE_GB: 2.0, - METRIC_RAM_TOTAL_GB: 4.0, - f"{METRIC_DISK_USAGE_PERCENT}/main": 50.0, - f"{METRIC_DISK_USAGE_GB}/main": 16.0, - f"{METRIC_DISK_TOTAL_GB}/main": 32.0, - }.items(): - dpath.new(schema, name, value) - + schema = {"step": 0, **cpu_metrics} assert latest == S(schema) @pytest.mark.timeout(2) -def test_monitor_system_timeseries(tmp_dir, mocker): +def test_monitor_system_timeseries(tmp_dir, cpu_timeseries, mocker): mock_psutil_cpu(mocker) mock_psutil_ram(mocker) mock_psutil_disk(mocker) + mock_pynvml(mocker, num_gpus=0) with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4) + live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) # wait for the metrics to be logged. # METRIC_DISK_TOTAL_GB is the last metric to be logged. @@ -185,121 +255,69 @@ def test_monitor_system_timeseries(tmp_dir, mocker): timeseries, _ = parse_metrics(live) - def timeserie_schema(name): - return [{name: str, "timestamp": str, "step": str(0)}] - - # timeseries contains all the system metrics prefix = Path(tmp_dir) / "plots/metrics" - assert timeseries == S( - { - str(prefix / f"{METRIC_CPU_USAGE_PERCENT}.tsv"): timeserie_schema( - METRIC_CPU_USAGE_PERCENT.split("/")[-1] - ), - str(prefix / f"{METRIC_CPU_PARALLELIZATION_PERCENT}.tsv"): timeserie_schema( - METRIC_CPU_PARALLELIZATION_PERCENT.split("/")[-1] - ), - str(prefix / f"{METRIC_RAM_USAGE_PERCENT}.tsv"): timeserie_schema( - METRIC_RAM_USAGE_PERCENT.split("/")[-1] - ), - str(prefix / f"{METRIC_RAM_USAGE_GB}.tsv"): timeserie_schema( - METRIC_RAM_USAGE_GB.split("/")[-1] - ), - str(prefix / f"{METRIC_DISK_USAGE_PERCENT}/main.tsv"): timeserie_schema( - "main" - ), - str(prefix / f"{METRIC_DISK_USAGE_GB}/main.tsv"): timeserie_schema("main"), - } - ) - + schema = {str(prefix / name): value for name, value in cpu_timeseries.items()} + assert timeseries == S(schema) -def mock_pynvml(mocker, num_gpus=2, crash_index=None): - mocker.patch("dvclive.monitor_system.GPU_AVAILABLE", return_value=num_gpus) - mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=num_gpus) - mocker.patch("dvclive.monitor_system.nvmlInit", return_value=None) - mocker.patch("dvclive.monitor_system.nvmlShutdown", return_value=None) - mocked_memory_info = mocker.MagicMock() - mocked_memory_info.used = 3 * 1024**3 - mocked_memory_info.total = 5 * 1024**3 - - mocked_utilization_rate = mocker.MagicMock() - mocked_utilization_rate.memory = 5 - mocked_utilization_rate.gpu = 10 - - mocking_dict = { - "nvmlDeviceGetHandleByIndex": None, - "nvmlDeviceGetMemoryInfo": mocked_memory_info, - "nvmlDeviceGetUtilizationRates": mocked_utilization_rate, - } - - for function_name, return_value in mocking_dict.items(): - mocker.patch( - f"dvclive.monitor_system.{function_name}", - return_value=return_value, - ) - - -def test_get_gpus_metrics_mocker(mocker, tmp_dir): +@pytest.mark.timeout(2) +def test_monitor_system_metrics_with_gpu(tmp_dir, cpu_metrics, mocker): + mock_psutil_cpu(mocker) + mock_psutil_ram(mocker) + mock_psutil_disk(mocker) mock_pynvml(mocker, num_gpus=2) with Live( tmp_dir, save_dvc_exp=False, monitor_system=False, ) as live: - monitor = GPUMonitor() - monitor(live) - metrics = monitor._get_metrics() - monitor.end() - assert "system/gpu/usage (%)/0" in metrics - assert "system/gpu/usage (%)/1" in metrics - assert "system/vram/usage (%)/0" in metrics - assert "system/vram/usage (%)/1" in metrics - assert "system/vram/usage (GB)/0" in metrics - assert "system/vram/usage (GB)/1" in metrics - assert "system/vram/total (GB)/0" in metrics - assert "system/vram/total (GB)/1" in metrics + live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) + # wait for the metrics to be logged. + # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU. + while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0: + time.sleep(0.001) + live.next_step() + _, latest = parse_metrics(live) -def test_monitor_gpu_system(tmp_dir, mocker): + schema = {"step": 0, **cpu_metrics} + gpu_content = { + METRIC_GPU_COUNT: 2, + f"{METRIC_GPU_USAGE_PERCENT}": {"0": 50.0, "1": 50.0}, + f"{METRIC_VRAM_USAGE_PERCENT}": {"0": 50.0, "1": 50.0}, + f"{METRIC_VRAM_USAGE_GB}": {"0": 3.0, "1": 3.0}, + f"{METRIC_VRAM_TOTAL_GB}": {"0": 6.0, "1": 6.0}, + } + for name, value in gpu_content.items(): + dpath.new(schema, name, value) + assert latest == S(schema) + + +@pytest.mark.timeout(2) +def test_monitor_system_timeseries_with_gpu( + tmp_dir, cpu_timeseries, gpu_timeseries, mocker +): mock_psutil_cpu(mocker) mock_psutil_ram(mocker) mock_psutil_disk(mocker) - mock_pynvml(mocker, num_gpus=1) + mock_pynvml(mocker, num_gpus=2) with Live( tmp_dir, save_dvc_exp=False, - monitor_system=True, + monitor_system=False, ) as live: - time.sleep(5 + 1) # allow the thread to finish + live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) + + # wait for the metrics to be logged. + # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU. + while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0: + time.sleep(0.001) + live.next_step() - time.sleep(5 + 1) # allow the thread to finish - timeseries, latest = parse_metrics(live) - - # metrics.json records CPU and RAM metrics if GPU detected - assert "system" in latest - assert "cpu" in latest["system"] - assert "ram" in latest["system"] - assert "disk" in latest["system"] - - # metrics.json file contains all the system metrics - assert "gpu" in latest["system"] - assert "count" in latest["system"]["gpu"] - assert "usage (%)" in latest["system"]["gpu"] - assert "0" in latest["system"]["gpu"]["usage (%)"] - assert "vram" in latest["system"] - assert "usage (%)" in latest["system"]["vram"] - assert "0" in latest["system"]["vram"]["usage (%)"] - assert "usage (GB)" in latest["system"]["vram"] - assert "0" in latest["system"]["vram"]["usage (GB)"] - assert "total (GB)" in latest["system"]["vram"] - assert "0" in latest["system"]["vram"]["total (GB)"] - - # timeseries contains all the system metrics - assert any(str(Path("system/gpu/usage (%)/0.tsv")) in key for key in timeseries) - assert any(str(Path("system/vram/usage (%)/0.tsv")) in key for key in timeseries) - assert any(str(Path("system/vram/usage (GB)/0.tsv")) in key for key in timeseries) - assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key) - - # blacklisted timeseries - assert all(str(Path("system/gpu/count.tsv")) not in key for key in timeseries) - assert all(str(Path("system/vram/total (GB).tsv")) not in key for key in timeseries) + + timeseries, _ = parse_metrics(live) + + prefix = Path(tmp_dir) / "plots/metrics" + schema = {str(prefix / name): value for name, value in cpu_timeseries.items()} + schema.update({str(prefix / name): value for name, value in gpu_timeseries.items()}) + assert timeseries == S(schema) From 506dd5211e39d33baa7e7a206023a0f73314ef22 Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Wed, 21 Feb 2024 10:09:26 +0100 Subject: [PATCH 36/39] change pyproject dependencies and fix typo --- pyproject.toml | 9 ++++----- src/dvclive/monitor_system.py | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7f1a9575..970ec8ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,9 +37,7 @@ dependencies = [ "funcy", "gto", "ruamel.yaml", - "scmrepo", - "psutil", - "pynvml" + "scmrepo" ] [project.optional-dependencies] @@ -52,7 +50,7 @@ tests = [ "pytest-sugar>=0.9.6,<1.0", "pytest-cov>=3.0.0,<4.0", "pytest-mock>=3.8.2,<4.0", - "dvclive[image,plots,markdown]", + "dvclive[image,plots,markdown,system]", "ipython", "pytest_voluptuous", "dpath" @@ -71,8 +69,9 @@ catalyst = ["catalyst>22"] fastai = ["fastai"] lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"] optuna = ["optuna"] +system = ["psutil", "pynvml"] all = [ - "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]" + "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown,system]" ] [project.urls] diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index df0293f9..84803917 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -177,7 +177,7 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]: **self._get_disk_info(), } if GPU_AVAILABLE: - result.update(self.__get_gpu_info()) + result.update(self._get_gpu_info()) return result def _get_ram_info(self) -> Dict[str, Union[float, int]]: @@ -228,7 +228,7 @@ def _get_disk_info(self) -> Dict[str, Union[float, int]]: result.update(disk_metrics) return result - def __get_gpu_info(self) -> Dict[str, Union[float, int]]: + def _get_gpu_info(self) -> Dict[str, Union[float, int]]: nvmlInit() num_gpus = nvmlDeviceGetCount() gpu_metrics = { From 5df5d48adf260ff7445897bf6a191b6095f9e55b Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 22 Feb 2024 09:40:56 +0100 Subject: [PATCH 37/39] install psutil and pynvml by default --- pyproject.toml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 970ec8ba..7f1a9575 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,9 @@ dependencies = [ "funcy", "gto", "ruamel.yaml", - "scmrepo" + "scmrepo", + "psutil", + "pynvml" ] [project.optional-dependencies] @@ -50,7 +52,7 @@ tests = [ "pytest-sugar>=0.9.6,<1.0", "pytest-cov>=3.0.0,<4.0", "pytest-mock>=3.8.2,<4.0", - "dvclive[image,plots,markdown,system]", + "dvclive[image,plots,markdown]", "ipython", "pytest_voluptuous", "dpath" @@ -69,9 +71,8 @@ catalyst = ["catalyst>22"] fastai = ["fastai"] lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"] optuna = ["optuna"] -system = ["psutil", "pynvml"] all = [ - "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown,system]" + "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]" ] [project.urls] From 29477ba72d329cf29b7a2d5f62ba9204eb18b1dd Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 22 Feb 2024 10:00:57 +0100 Subject: [PATCH 38/39] remove plot argument in SystemMonitor --- src/dvclive/live.py | 4 ++-- src/dvclive/monitor_system.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 51501c0c..50ed33f0 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -122,8 +122,8 @@ def __init__( provided string will be passed to `dvc exp save --message`. If DVCLive is used inside `dvc exp run`, the option will be ignored, use `dvc exp run --message` instead. - monitor_system (bool): if `True`, DVCLive will monitor CPU, ram, and disk - usage. Defaults to `False`. + monitor_system (bool): if `True`, DVCLive will monitor GPU, CPU, ram, and + disk usage. Defaults to `False`. """ self.summary: Dict[str, Any] = {} diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 84803917..4e3e406c 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -61,7 +61,6 @@ class SystemMonitor: This argument expect a dict where the key is the name that will be used in the metric's name and the value is the path to the directory to monitor. Defaults to {"main": "/"}. - plot (bool): should the system metrics be saved as plots. Defaults to True. Raises: ValueError: if the keys in `directories_to_monitor` contains invalid characters @@ -81,7 +80,6 @@ def __init__( interval: float = 0.05, # seconds num_samples: int = 20, directories_to_monitor: Optional[Dict[str, str]] = None, - plot: bool = True, ): self._interval = self._check_interval(interval, max_interval=0.1) self._num_samples = self._check_num_samples( @@ -90,7 +88,6 @@ def __init__( self._disks_to_monitor = self._check_directories_to_monitor( directories_to_monitor ) - self._plot = plot self._warn_cpu_problem = True self._warn_gpu_problem = True self._warn_disk_doesnt_exist: Dict[str, bool] = {} @@ -167,7 +164,7 @@ def _monitoring_loop(self): name, values / self._num_samples, timestamp=True, - plot=None if blacklisted else self._plot, + plot=None if blacklisted else True, ) def _get_metrics(self) -> Dict[str, Union[float, int]]: From 53d83d5fc14ad479ef917bfcaf45605b9b1c4ddb Mon Sep 17 00:00:00 2001 From: AlexandreKempf Date: Thu, 22 Feb 2024 11:08:18 +0100 Subject: [PATCH 39/39] change call to the SystemMonitor object to be a Live method --- src/dvclive/live.py | 47 ++++++++++++++++++------ src/dvclive/monitor_system.py | 58 ++++++++++-------------------- tests/test_monitor_system.py | 68 ++++++++++++++++++++--------------- 3 files changed, 93 insertions(+), 80 deletions(-) diff --git a/src/dvclive/live.py b/src/dvclive/live.py index 50ed33f0..466887cd 100644 --- a/src/dvclive/live.py +++ b/src/dvclive/live.py @@ -42,7 +42,7 @@ from .report import BLANK_NOTEBOOK_REPORT, make_report from .serialize import dump_json, dump_yaml, load_yaml from .studio import get_dvc_studio_config, post_to_studio -from .monitor_system import SystemMonitor +from .monitor_system import _SystemMonitor from .utils import ( StrPath, catch_and_warn, @@ -169,10 +169,9 @@ def __init__( self._dvc_studio_config: Dict[str, Any] = {} self._init_studio() - self._system_monitor = None + self._system_monitor: Optional[_SystemMonitor] = None # Monitoring thread if monitor_system: - self._system_monitor = SystemMonitor() - self._system_monitor(self) + self.monitor_system() def _init_resume(self): self._read_params() @@ -377,16 +376,42 @@ def step(self, value: int) -> None: self._step = value logger.debug(f"Step: {self.step}") - @property - def system_monitor(self) -> Optional[SystemMonitor]: - return self._system_monitor or None + def monitor_system( + self, + interval: float = 0.05, # seconds + num_samples: int = 20, + directories_to_monitor: Optional[Dict[str, str]] = None, + ) -> None: + """Monitor GPU, CPU, ram, and disk resources and log them to DVC Live. + + Args: + interval (float): the time interval between samples in seconds. To keep the + sampling interval small, the maximum value allowed is 0.1 seconds. + Default to 0.05. + num_samples (int): the number of samples to collect before the aggregation. + The value should be between 1 and 30 samples. Default to 20. + directories_to_monitor (Optional[Dict[str, str]]): a dictionary with the + information about which directories to monitor. The `key` would be the + name of the metric and the `value` is the path to the directory. + The metric tracked concerns the partition that contains the directory. + Default to `{"main": "/"}`. + + Raises: + ValueError: if the keys in `directories_to_monitor` contains invalid + characters as defined by `os.path.normpath`. + """ + if directories_to_monitor is None: + directories_to_monitor = {"main": "/"} - @system_monitor.setter - def system_monitor(self, system_monitor: SystemMonitor) -> None: if self._system_monitor is not None: self._system_monitor.end() - self._system_monitor = system_monitor - self._system_monitor(self) + + self._system_monitor = _SystemMonitor( + live=self, + interval=interval, + num_samples=num_samples, + directories_to_monitor=directories_to_monitor, + ) def sync(self): self.make_summary() diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py index 4e3e406c..f6d26d83 100644 --- a/src/dvclive/monitor_system.py +++ b/src/dvclive/monitor_system.py @@ -1,13 +1,12 @@ import logging import os -from typing import Dict, Union, Optional, Tuple +from typing import Dict, Union, Tuple import psutil from statistics import mean from threading import Event, Thread from funcy import merge_with - try: from pynvml import ( nvmlInit, @@ -47,26 +46,7 @@ METRIC_VRAM_TOTAL_GB = "system/vram/total (GB)" -class SystemMonitor: - """Monitor CPU, ram, and disk resources and log them to DVC Live. Monitor also the - GPU resources if available. - - Args: - interval (float): interval in seconds between two measurements. - Defaults to 0.05. - num_samples (int): number of samples to average. Defaults to 20. - directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage - statistics about the partition which contains the given paths. The - statistics include total and used space in gygabytes and percent. - This argument expect a dict where the key is the name that will be used - in the metric's name and the value is the path to the directory to - monitor. Defaults to {"main": "/"}. - - Raises: - ValueError: if the keys in `directories_to_monitor` contains invalid characters - as defined by `os.path.normpath`. - """ - +class _SystemMonitor: _plot_blacklist_prefix: Tuple = ( METRIC_CPU_COUNT, METRIC_RAM_TOTAL_GB, @@ -77,10 +57,12 @@ class SystemMonitor: def __init__( self, - interval: float = 0.05, # seconds - num_samples: int = 20, - directories_to_monitor: Optional[Dict[str, str]] = None, + live, + interval: float, # seconds + num_samples: int, + directories_to_monitor: Dict[str, str], ): + self._live = live self._interval = self._check_interval(interval, max_interval=0.1) self._num_samples = self._check_num_samples( num_samples, min_num_samples=1, max_num_samples=30 @@ -92,6 +74,11 @@ def __init__( self._warn_gpu_problem = True self._warn_disk_doesnt_exist: Dict[str, bool] = {} + self._shutdown_event = Event() + Thread( + target=self._monitoring_loop, + ).start() + def _check_interval(self, interval: float, max_interval: float) -> float: if interval > max_interval: logger.warning( @@ -115,11 +102,8 @@ def _check_num_samples( return num_samples def _check_directories_to_monitor( - self, directories_to_monitor: Optional[Dict[str, str]] + self, directories_to_monitor: Dict[str, str] ) -> Dict[str, str]: - if directories_to_monitor is None: - return {"main": "/"} - disks_to_monitor = {} for disk_name, disk_path in directories_to_monitor.items(): if disk_name != os.path.normpath(disk_name): @@ -130,13 +114,6 @@ def _check_directories_to_monitor( disks_to_monitor[disk_name] = disk_path return disks_to_monitor - def __call__(self, live): - self._live = live - self._shutdown_event = Event() - Thread( - target=self._monitoring_loop, - ).start() - def _monitoring_loop(self): while not self._shutdown_event.is_set(): self._metrics = {} @@ -168,14 +145,12 @@ def _monitoring_loop(self): ) def _get_metrics(self) -> Dict[str, Union[float, int]]: - result = { + return { + **self._get_gpu_info(), **self._get_cpu_info(), **self._get_ram_info(), **self._get_disk_info(), } - if GPU_AVAILABLE: - result.update(self._get_gpu_info()) - return result def _get_ram_info(self) -> Dict[str, Union[float, int]]: ram_info = psutil.virtual_memory() @@ -226,6 +201,9 @@ def _get_disk_info(self) -> Dict[str, Union[float, int]]: return result def _get_gpu_info(self) -> Dict[str, Union[float, int]]: + if not GPU_AVAILABLE: + return {} + nvmlInit() num_gpus = nvmlDeviceGetCount() gpu_metrics = { diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py index 2476a0f0..d704f00b 100644 --- a/tests/test_monitor_system.py +++ b/tests/test_monitor_system.py @@ -7,7 +7,7 @@ from dvclive import Live from dvclive.monitor_system import ( - SystemMonitor, + _SystemMonitor, METRIC_CPU_COUNT, METRIC_CPU_USAGE_PERCENT, METRIC_CPU_PARALLELIZATION_PERCENT, @@ -149,14 +149,10 @@ def test_monitor_system_is_false(tmp_dir, mocker): mock_psutil_ram(mocker) mock_psutil_disk(mocker) mock_pynvml(mocker, num_gpus=0) - system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor) - with Live( - tmp_dir, - save_dvc_exp=False, - monitor_system=False, - ) as live: - assert live.system_monitor is None - + system_monitor_mock = mocker.patch( + "dvclive.live._SystemMonitor", spec=_SystemMonitor + ) + Live(tmp_dir, save_dvc_exp=False, monitor_system=False) system_monitor_mock.assert_not_called() @@ -165,23 +161,35 @@ def test_monitor_system_is_true(tmp_dir, mocker): mock_psutil_ram(mocker) mock_psutil_disk(mocker) mock_pynvml(mocker, num_gpus=0) - system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor) + system_monitor_mock = mocker.patch( + "dvclive.live._SystemMonitor", spec=_SystemMonitor + ) + + Live(tmp_dir, save_dvc_exp=False, monitor_system=True) + system_monitor_mock.assert_called_once() + + +def test_all_threads_close(tmp_dir, mocker): + mock_psutil_cpu(mocker) + mock_psutil_ram(mocker) + mock_psutil_disk(mocker) + mock_pynvml(mocker, num_gpus=0) with Live( tmp_dir, save_dvc_exp=False, monitor_system=True, ) as live: - system_monitor = live.system_monitor + first_end_spy = mocker.spy(live._system_monitor, "end") + first_end_spy.assert_not_called() - assert isinstance(system_monitor, SystemMonitor) - system_monitor_mock.assert_called_once() + live.monitor_system(interval=0.01) + first_end_spy.assert_called_once() - end_spy = mocker.spy(system_monitor, "end") - end_spy.assert_not_called() + second_end_spy = mocker.spy(live._system_monitor, "end") # check the monitoring thread is stopped - end_spy.assert_called_once() + second_end_spy.assert_called_once() def test_ignore_non_existent_directories(tmp_dir, mocker): @@ -195,12 +203,14 @@ def test_ignore_non_existent_directories(tmp_dir, mocker): monitor_system=False, ) as live: non_existent_disk = "/non-existent" - monitor = SystemMonitor( - directories_to_monitor={"main": "/", "non-existent": non_existent_disk} + system_monitor = _SystemMonitor( + live=live, + interval=0.1, + num_samples=4, + directories_to_monitor={"main": "/", "non-existent": non_existent_disk}, ) - monitor(live) - metrics = monitor._get_metrics() - monitor.end() + metrics = system_monitor._get_metrics() + system_monitor.end() assert not Path(non_existent_disk).exists() @@ -220,7 +230,7 @@ def test_monitor_system_metrics(tmp_dir, cpu_metrics, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) + live.monitor_system(interval=0.05, num_samples=4) # wait for the metrics to be logged. # METRIC_DISK_TOTAL_GB is the last metric to be logged. while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0: @@ -244,7 +254,7 @@ def test_monitor_system_timeseries(tmp_dir, cpu_timeseries, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) + live.monitor_system(interval=0.05, num_samples=4) # wait for the metrics to be logged. # METRIC_DISK_TOTAL_GB is the last metric to be logged. @@ -271,10 +281,10 @@ def test_monitor_system_metrics_with_gpu(tmp_dir, cpu_metrics, mocker): save_dvc_exp=False, monitor_system=False, ) as live: - live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) + live.monitor_system(interval=0.05, num_samples=4) # wait for the metrics to be logged. - # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU. - while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0: + # METRIC_DISK_TOTAL_GB is the last metric to be logged. + while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0: time.sleep(0.001) live.next_step() @@ -306,11 +316,11 @@ def test_monitor_system_timeseries_with_gpu( save_dvc_exp=False, monitor_system=False, ) as live: - live.system_monitor = SystemMonitor(interval=0.05, num_samples=4) + live.monitor_system(interval=0.05, num_samples=4) # wait for the metrics to be logged. - # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU. - while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0: + # METRIC_DISK_TOTAL_GB is the last metric to be logged. + while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0: time.sleep(0.001) live.next_step()