From b5a817166b0e3528e6f00ee12ac0cde6c3321ff2 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 6 Feb 2024 12:06:04 +0100
Subject: [PATCH 01/39] add cpu monitoring

---
 pyproject.toml                | 56 +++++++++++++++++++++++++-----
 src/dvclive/live.py           | 13 ++++++-
 src/dvclive/system_metrics.py | 65 +++++++++++++++++++++++++++++++++++
 src/dvclive/utils.py          | 12 ++++++-
 4 files changed, 136 insertions(+), 10 deletions(-)
 create mode 100644 src/dvclive/system_metrics.py

diff --git a/pyproject.toml b/pyproject.toml
index c2e4f16e..7e06166e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
     "gto",
     "ruamel.yaml",
     "scmrepo",
+    "psutil",
 ]
 
 [project.optional-dependencies]
@@ -53,11 +54,7 @@ tests = [
     "dvclive[image,plots,markdown]",
     "ipython",
 ]
-dev = [
-    "dvclive[all,tests]",
-    "mypy>=1.1.1",
-    "types-PyYAML",
-]
+dev = ["dvclive[all,tests]", "mypy>=1.1.1", "types-PyYAML"]
 mmcv = ["mmcv"]
 tf = ["tensorflow"]
 xgb = ["xgboost"]
@@ -68,7 +65,7 @@ fastai = ["fastai"]
 lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"]
 optuna = ["optuna"]
 all = [
-   "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]"
+    "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]",
 ]
 
 [project.urls]
@@ -136,11 +133,54 @@ ignore-words-list = "fpr"
 
 [tool.ruff]
 ignore = ["N818", "UP006", "UP007", "UP035", "UP038", "B905", "PGH003"]
-select = ["F", "E", "W", "C90", "N", "UP", "YTT", "S", "BLE", "B", "A", "C4", "T10", "EXE", "ISC", "INP", "PIE", "T20", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TID", "TCH", "INT", "ARG", "PGH", "PL", "TRY", "NPY", "RUF"]
+select = [
+    "F",
+    "E",
+    "W",
+    "C90",
+    "N",
+    "UP",
+    "YTT",
+    "S",
+    "BLE",
+    "B",
+    "A",
+    "C4",
+    "T10",
+    "EXE",
+    "ISC",
+    "INP",
+    "PIE",
+    "T20",
+    "PT",
+    "Q",
+    "RSE",
+    "RET",
+    "SLF",
+    "SIM",
+    "TID",
+    "TCH",
+    "INT",
+    "ARG",
+    "PGH",
+    "PL",
+    "TRY",
+    "NPY",
+    "RUF",
+]
 
 [tool.ruff.per-file-ignores]
 "noxfile.py" = ["D", "PTH"]
-"tests/*" = ["S101", "INP001", "SLF001", "ARG001", "ARG002", "ARG005", "PLR2004", "NPY002"]
+"tests/*" = [
+    "S101",
+    "INP001",
+    "SLF001",
+    "ARG001",
+    "ARG002",
+    "ARG005",
+    "PLR2004",
+    "NPY002",
+]
 
 [tool.ruff.pylint]
 max-args = 10
diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index ce8d5357..abac3443 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -8,7 +8,7 @@
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Union, TYPE_CHECKING
+from typing import Any, Dict, List, Optional, Set, Callable, Union, TYPE_CHECKING
 
 if TYPE_CHECKING:
     import numpy as np
@@ -37,6 +37,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
+from .system_metrics import CPUMetricsCallback
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -75,6 +76,7 @@ def __init__(
         cache_images: bool = False,
         exp_name: Optional[str] = None,
         exp_message: Optional[str] = None,
+        callbacks: Optional[List[Callable]] = None,
     ):
         self.summary: Dict[str, Any] = {}
 
@@ -120,6 +122,10 @@ def __init__(
         self._dvc_studio_config: Dict[str, Any] = {}
         self._init_studio()
 
+        self._callbacks = callbacks or [CPUMetricsCallback()]
+        for callback in self._callbacks:
+            callback(self)
+
     def _init_resume(self):
         self._read_params()
         self.summary = self.read_latest()
@@ -589,6 +595,11 @@ def end(self):
         # If next_step called before end, don't want to update step number
         if "step" in self.summary:
             self.step = self.summary["step"]
+
+        # Kill all independent threads that can be found in callbacks
+        for callback in self._callbacks:
+            callback.end()
+
         self.sync()
 
         if self._inside_dvc_exp and self._dvc_repo:
diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
new file mode 100644
index 00000000..63e09aa6
--- /dev/null
+++ b/src/dvclive/system_metrics.py
@@ -0,0 +1,65 @@
+import logging
+from typing import Dict, Union
+
+import psutil
+from statistics import mean
+from threading import Event, Thread
+
+from .utils import append_dict
+
+logger = logging.getLogger("dvclive")
+MEGABYTES_DIVIDER = 1024.0**2
+
+
+class CPUMetricsCallback:
+    def __init__(
+        self,
+        duration: Union[int, float] = 30,
+        interval: Union[int, float] = 2.0,
+        plot: bool = True,
+    ):
+        self.duration = duration
+        self.interval = interval
+        self.plot = plot
+
+    def __call__(self, live):
+        self._live = live
+        self._shutdown_event = Event()
+        Thread(
+            target=self.monitoring_loop,
+        ).start()
+
+    def monitoring_loop(self):
+        while not self._shutdown_event.is_set():
+            self._cpu_metrics = {}
+            for _ in range(int(self.duration // self.interval)):
+                last_cpu_metrics = {}
+                try:
+                    last_cpu_metrics = get_cpus_metrics()
+                except psutil.Error:
+                    logger.exception("Failed to monitor CPU metrics:")
+                self._cpu_metrics = append_dict(self._cpu_metrics, last_cpu_metrics)
+
+            for metric_name, metric_values in self._cpu_metrics.items():
+                self._live.log_metric(
+                    metric_name, mean(metric_values), timestamp=True, plot=self.plot
+                )
+            self._shutdown_event.wait(self.interval)
+            if self._shutdown_event.is_set():
+                break
+
+    def end(self):
+        self._shutdown_event.set()
+
+
+def get_cpus_metrics() -> Dict[str, Union[float, int]]:
+    ram_info = psutil.virtual_memory()
+    io_info = psutil.disk_io_counters()
+    return {
+        "system/cpu/usage_percent": psutil.cpu_percent(),
+        "system/cpu/ram_usage_percent": ram_info.percent,
+        "system/io/read_speed_MB": io_info.read_bytes
+        / (io_info.read_time * MEGABYTES_DIVIDER),
+        "system/io/write_speed_MB": io_info.write_bytes
+        / (io_info.write_time * MEGABYTES_DIVIDER),
+    }
diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py
index a168de9e..5c23f490 100644
--- a/src/dvclive/utils.py
+++ b/src/dvclive/utils.py
@@ -6,7 +6,7 @@
 import shutil
 from pathlib import Path
 from platform import uname
-from typing import Union, List, Dict, TYPE_CHECKING
+from typing import Any, Union, List, Dict, TYPE_CHECKING
 import webbrowser
 
 from .error import InvalidDataTypeError
@@ -245,3 +245,13 @@ def convert_datapoints_to_list_of_dicts(
 
     # Raise an error if the input is not a supported type
     raise InvalidDataTypeError("datapoints", type(datapoints))
+
+
+def append_dict(previous_values: Dict[str, List[Any]], new_value: Dict[str, Any]):
+    return {
+        **previous_values,
+        **{
+            metric_name: [*previous_values.get(metric_name, []), value]
+            for metric_name, value in new_value.items()
+        },
+    }

From e3b654cfb793d2d76f563becb877e17b7af139f2 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 6 Feb 2024 16:39:20 +0100
Subject: [PATCH 02/39] add unit tests and more cpu metrics

---
 src/dvclive/system_metrics.py |  31 +++++++--
 tests/test_system_metrics.py  | 126 ++++++++++++++++++++++++++++++++++
 tests/test_utils.py           |  24 ++++++-
 3 files changed, 174 insertions(+), 7 deletions(-)
 create mode 100644 tests/test_system_metrics.py

diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index 63e09aa6..44f4c0cb 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -10,6 +10,8 @@
 logger = logging.getLogger("dvclive")
 MEGABYTES_DIVIDER = 1024.0**2
 
+MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30
+
 
 class CPUMetricsCallback:
     def __init__(
@@ -21,6 +23,7 @@ def __init__(
         self.duration = duration
         self.interval = interval
         self.plot = plot
+        self._no_plot_metrics = ["system/cpu/count"]
 
     def __call__(self, live):
         self._live = live
@@ -39,14 +42,18 @@ def monitoring_loop(self):
                 except psutil.Error:
                     logger.exception("Failed to monitor CPU metrics:")
                 self._cpu_metrics = append_dict(self._cpu_metrics, last_cpu_metrics)
-
+                self._shutdown_event.wait(self.interval)
+                if self._shutdown_event.is_set():
+                    break
             for metric_name, metric_values in self._cpu_metrics.items():
                 self._live.log_metric(
-                    metric_name, mean(metric_values), timestamp=True, plot=self.plot
+                    metric_name,
+                    mean(metric_values),
+                    timestamp=True,
+                    plot=self.plot
+                    if metric_name not in self._no_plot_metrics
+                    else False,
                 )
-            self._shutdown_event.wait(self.interval)
-            if self._shutdown_event.is_set():
-                break
 
     def end(self):
         self._shutdown_event.set()
@@ -55,8 +62,20 @@ def end(self):
 def get_cpus_metrics() -> Dict[str, Union[float, int]]:
     ram_info = psutil.virtual_memory()
     io_info = psutil.disk_io_counters()
+    nb_cpus = psutil.cpu_count()
+    cpus_percent = psutil.cpu_percent(percpu=True)
     return {
-        "system/cpu/usage_percent": psutil.cpu_percent(),
+        "system/cpu/usage_avg_percent": mean(cpus_percent),
+        "system/cpu/usage_max_percent": max(cpus_percent),
+        "system/cpu/count": nb_cpus,
+        "system/cpu/parallelism_percent": len(
+            [
+                percent
+                for percent in cpus_percent
+                if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE
+            ]
+        )
+        / nb_cpus,
         "system/cpu/ram_usage_percent": ram_info.percent,
         "system/io/read_speed_MB": io_info.read_bytes
         / (io_info.read_time * MEGABYTES_DIVIDER),
diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
new file mode 100644
index 00000000..cfe9fcd9
--- /dev/null
+++ b/tests/test_system_metrics.py
@@ -0,0 +1,126 @@
+import time
+
+import pytest
+
+from dvclive import Live
+from dvclive.system_metrics import CPUMetricsCallback, get_cpus_metrics
+from dvclive.utils import parse_metrics
+
+
+def mock_psutil(mocker):
+    mocker.patch(
+        "dvclive.system_metrics.psutil.cpu_percent",
+        return_value=[10, 20, 30, 40, 50, 60],
+    )
+    mocker.patch("dvclive.system_metrics.psutil.cpu_count", return_value=6)
+
+    mocked_virtual_memory = mocker.MagicMock()
+    mocked_virtual_memory.percent = 20
+
+    mocked_disk_io_counters = mocker.MagicMock()
+    mocked_disk_io_counters.read_bytes = 2 * 1024**2
+    mocked_disk_io_counters.read_time = 1
+    mocked_disk_io_counters.write_bytes = 2 * 1024**2
+    mocked_disk_io_counters.write_time = 1
+
+    mocking_dict = {
+        "virtual_memory": mocked_virtual_memory,
+        "disk_io_counters": mocked_disk_io_counters,
+    }
+    for function_name, return_value in mocking_dict.items():
+        mocker.patch(
+            f"dvclive.system_metrics.psutil.{function_name}",
+            return_value=return_value,
+        )
+
+
+@pytest.mark.parametrize(
+    ("metric_name"),
+    [
+        "system/cpu/usage_avg_percent",
+        "system/cpu/usage_max_percent",
+        "system/cpu/count",
+        "system/cpu/parallelism_percent",
+        "system/cpu/ram_usage_percent",
+        "system/io/read_speed_MB",
+        "system/io/write_speed_MB",
+    ],
+)
+def test_get_cpus_metrics(mocker, metric_name):
+    mock_psutil(mocker)
+    metrics = get_cpus_metrics()
+    assert metric_name in metrics
+
+
+@pytest.mark.parametrize(
+    ("duration", "interval", "plot"),
+    [
+        (1, 0.5, True),
+        (2.0, 1, True),
+    ],
+)
+def test_cpumetricscallback_with_plot(duration, interval, plot):
+    with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live:
+        time.sleep(duration * 2)
+        live.next_step()
+        time.sleep(duration * 2 + 0.1)  # allow the thread to finish
+        history, latest = parse_metrics(live)
+        dir_path = live.dir
+
+    assert "system" in latest
+    assert "cpu" in latest["system"]
+    assert "usage_avg_percent" in latest["system"]["cpu"]
+    assert "usage_max_percent" in latest["system"]["cpu"]
+    assert "count" in latest["system"]["cpu"]
+    assert "parallelism_percent" in latest["system"]["cpu"]
+    assert "ram_usage_percent" in latest["system"]["cpu"]
+    assert "io" in latest["system"]
+    assert "read_speed_MB" in latest["system"]["io"]
+    assert "write_speed_MB" in latest["system"]["io"]
+
+    prefix = f"{dir_path}/plots/metrics/system"
+    assert f"{prefix}/cpu/ram_usage_percent.tsv" in history
+    assert f"{prefix}/cpu/usage_avg_percent.tsv" in history
+    assert f"{prefix}/cpu/usage_max_percent.tsv" in history
+    assert f"{prefix}/cpu/parallelism_percent.tsv" in history
+    assert f"{prefix}/io/write_speed_MB.tsv" in history
+    assert f"{prefix}/io/read_speed_MB.tsv" in history
+    assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4
+
+    assert f"{prefix}/cpu/count.tsv" not in history  # no plot for count
+
+
+@pytest.mark.parametrize(
+    ("duration", "interval", "plot"),
+    [
+        (1, 0.5, False),
+        (2.0, 1, False),
+    ],
+)
+def test_cpumetricscallback_without_plot(duration, interval, plot):
+    with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live:
+        time.sleep(duration * 2)
+        live.next_step()
+        time.sleep(duration * 2 + 0.1)  # allow the thread to finish
+        history, latest = parse_metrics(live)
+        dir_path = live.dir
+
+    assert "system" in latest
+    assert "cpu" in latest["system"]
+    assert "usage_avg_percent" in latest["system"]["cpu"]
+    assert "usage_max_percent" in latest["system"]["cpu"]
+    assert "count" in latest["system"]["cpu"]
+    assert "parallelism_percent" in latest["system"]["cpu"]
+    assert "ram_usage_percent" in latest["system"]["cpu"]
+    assert "io" in latest["system"]
+    assert "read_speed_MB" in latest["system"]["io"]
+    assert "write_speed_MB" in latest["system"]["io"]
+
+    prefix = f"{dir_path}/plots/metrics/system"
+    assert f"{prefix}/cpu/usage_avg_percent.tsv" not in history
+    assert f"{prefix}/cpu/usage_max_percent.tsv" not in history
+    assert f"{prefix}/cpu/count.tsv" not in history
+    assert f"{prefix}/cpu/parallelism_percent.tsv" not in history
+    assert f"{prefix}/cpu/ram_usage_percent.tsv" not in history
+    assert f"{prefix}/cpu/write_speed_MB.tsv" not in history
+    assert f"{prefix}/cpu/read_speed_MB.tsv" not in history
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 2b5d56ca..ab782a29 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,11 @@
 import pandas as pd
 import pytest
 
-from dvclive.utils import standardize_metric_name, convert_datapoints_to_list_of_dicts
+from dvclive.utils import (
+    standardize_metric_name,
+    convert_datapoints_to_list_of_dicts,
+    append_dict,
+)
 from dvclive.error import InvalidDataTypeError
 
 
@@ -45,3 +49,21 @@ def test_unsupported_format():
         convert_datapoints_to_list_of_dicts("unsupported data format")
 
     assert "not supported type" in str(exc_info.value)
+
+
+@pytest.mark.parametrize(
+    ("aggregate", "new_metrics", "result"),
+    [
+        ({"a": [1], "b": [0.8]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [0.8, 1.6]}),
+        ({"a": [1]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [1.6]}),
+        (
+            {"a": [1], "b": [0.8], "c": [2]},
+            {"a": 2, "b": 1.6},
+            {"a": [1, 2], "b": [0.8, 1.6], "c": [2]},
+        ),
+        ({}, {"a": 2, "b": 1.6}, {"a": [2], "b": [1.6]}),
+        ({}, {}, {}),
+    ],
+)
+def test_append_dict(aggregate, new_metrics, result):
+    assert append_dict(aggregate, new_metrics) == result

From e6cff3254875eb11a2464fc2d55edbc8a64aa21a Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 6 Feb 2024 16:44:01 +0100
Subject: [PATCH 03/39] change default value for callback

---
 src/dvclive/system_metrics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index 44f4c0cb..3ffe4a42 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -16,8 +16,8 @@
 class CPUMetricsCallback:
     def __init__(
         self,
-        duration: Union[int, float] = 30,
-        interval: Union[int, float] = 2.0,
+        duration: Union[int, float] = 5,
+        interval: Union[int, float] = 1.0,
         plot: bool = True,
     ):
         self.duration = duration

From 8663011531007078a88599e325fa9261e2ea656e Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 6 Feb 2024 16:48:04 +0100
Subject: [PATCH 04/39] uses a percentage value for cpu parallelism

---
 src/dvclive/system_metrics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index 3ffe4a42..7bac6456 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -75,6 +75,7 @@ def get_cpus_metrics() -> Dict[str, Union[float, int]]:
                 if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE
             ]
         )
+        * 100
         / nb_cpus,
         "system/cpu/ram_usage_percent": ram_info.percent,
         "system/io/read_speed_MB": io_info.read_bytes

From 134675027e3caec95c4742ee0cf83e02df207699 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 6 Feb 2024 17:01:47 +0100
Subject: [PATCH 05/39] add ram total

---
 src/dvclive/system_metrics.py | 2 ++
 tests/test_system_metrics.py  | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index 7bac6456..00f0658b 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -9,6 +9,7 @@
 
 logger = logging.getLogger("dvclive")
 MEGABYTES_DIVIDER = 1024.0**2
+GIGABYTES_DIVIDER = 1024.0**3
 
 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30
 
@@ -78,6 +79,7 @@ def get_cpus_metrics() -> Dict[str, Union[float, int]]:
         * 100
         / nb_cpus,
         "system/cpu/ram_usage_percent": ram_info.percent,
+        "system/cpu/ram_total_GB": ram_info.total / GIGABYTES_DIVIDER,
         "system/io/read_speed_MB": io_info.read_bytes
         / (io_info.read_time * MEGABYTES_DIVIDER),
         "system/io/write_speed_MB": io_info.write_bytes
diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index cfe9fcd9..e5e8ed6d 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -16,6 +16,7 @@ def mock_psutil(mocker):
 
     mocked_virtual_memory = mocker.MagicMock()
     mocked_virtual_memory.percent = 20
+    mocked_virtual_memory.total = 4 * 1024**3
 
     mocked_disk_io_counters = mocker.MagicMock()
     mocked_disk_io_counters.read_bytes = 2 * 1024**2
@@ -42,6 +43,7 @@ def mock_psutil(mocker):
         "system/cpu/count",
         "system/cpu/parallelism_percent",
         "system/cpu/ram_usage_percent",
+        "system/cpu/ram_total_GB",
         "system/io/read_speed_MB",
         "system/io/write_speed_MB",
     ],
@@ -74,15 +76,17 @@ def test_cpumetricscallback_with_plot(duration, interval, plot):
     assert "count" in latest["system"]["cpu"]
     assert "parallelism_percent" in latest["system"]["cpu"]
     assert "ram_usage_percent" in latest["system"]["cpu"]
+    assert "ram_total_GB" in latest["system"]["cpu"]
     assert "io" in latest["system"]
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
 
     prefix = f"{dir_path}/plots/metrics/system"
-    assert f"{prefix}/cpu/ram_usage_percent.tsv" in history
     assert f"{prefix}/cpu/usage_avg_percent.tsv" in history
     assert f"{prefix}/cpu/usage_max_percent.tsv" in history
     assert f"{prefix}/cpu/parallelism_percent.tsv" in history
+    assert f"{prefix}/cpu/ram_usage_percent.tsv" in history
+    assert f"{prefix}/cpu/ram_total_GB.tsv" in history
     assert f"{prefix}/io/write_speed_MB.tsv" in history
     assert f"{prefix}/io/read_speed_MB.tsv" in history
     assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4
@@ -112,6 +116,7 @@ def test_cpumetricscallback_without_plot(duration, interval, plot):
     assert "count" in latest["system"]["cpu"]
     assert "parallelism_percent" in latest["system"]["cpu"]
     assert "ram_usage_percent" in latest["system"]["cpu"]
+    assert "ram_total_GB" in latest["system"]["cpu"]
     assert "io" in latest["system"]
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
@@ -122,5 +127,6 @@ def test_cpumetricscallback_without_plot(duration, interval, plot):
     assert f"{prefix}/cpu/count.tsv" not in history
     assert f"{prefix}/cpu/parallelism_percent.tsv" not in history
     assert f"{prefix}/cpu/ram_usage_percent.tsv" not in history
+    assert f"{prefix}/cpu/ram_total_GB.tsv" not in history
     assert f"{prefix}/cpu/write_speed_MB.tsv" not in history
     assert f"{prefix}/cpu/read_speed_MB.tsv" not in history

From 5f90bea9262bb9cb830cb3471344b480e9cd2b7f Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 6 Feb 2024 17:06:01 +0100
Subject: [PATCH 06/39] remove total ram measure from plots

---
 src/dvclive/system_metrics.py | 2 +-
 tests/test_system_metrics.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index 00f0658b..b9b27161 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -24,7 +24,7 @@ def __init__(
         self.duration = duration
         self.interval = interval
         self.plot = plot
-        self._no_plot_metrics = ["system/cpu/count"]
+        self._no_plot_metrics = ["system/cpu/count", "system/cpu/ram_total_GB"]
 
     def __call__(self, live):
         self._live = live
diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index e5e8ed6d..ae7c24b7 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -86,12 +86,12 @@ def test_cpumetricscallback_with_plot(duration, interval, plot):
     assert f"{prefix}/cpu/usage_max_percent.tsv" in history
     assert f"{prefix}/cpu/parallelism_percent.tsv" in history
     assert f"{prefix}/cpu/ram_usage_percent.tsv" in history
-    assert f"{prefix}/cpu/ram_total_GB.tsv" in history
     assert f"{prefix}/io/write_speed_MB.tsv" in history
     assert f"{prefix}/io/read_speed_MB.tsv" in history
     assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4
 
     assert f"{prefix}/cpu/count.tsv" not in history  # no plot for count
+    assert f"{prefix}/cpu/ram_total_GB.tsv" not in history
 
 
 @pytest.mark.parametrize(

From 2ac50f2e5be954d03108259dcd78f3d64c587179 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Wed, 7 Feb 2024 09:15:18 +0100
Subject: [PATCH 07/39] update pyproject.toml

---
 pyproject.toml | 56 ++++++++------------------------------------------
 1 file changed, 9 insertions(+), 47 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7e06166e..d5fb01b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,7 +54,11 @@ tests = [
     "dvclive[image,plots,markdown]",
     "ipython",
 ]
-dev = ["dvclive[all,tests]", "mypy>=1.1.1", "types-PyYAML"]
+dev = [
+    "dvclive[all,tests]",
+    "mypy>=1.1.1",
+    "types-PyYAML",
+]
 mmcv = ["mmcv"]
 tf = ["tensorflow"]
 xgb = ["xgboost"]
@@ -65,7 +69,7 @@ fastai = ["fastai"]
 lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"]
 optuna = ["optuna"]
 all = [
-    "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]",
+   "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]"
 ]
 
 [project.urls]
@@ -133,54 +137,12 @@ ignore-words-list = "fpr"
 
 [tool.ruff]
 ignore = ["N818", "UP006", "UP007", "UP035", "UP038", "B905", "PGH003"]
-select = [
-    "F",
-    "E",
-    "W",
-    "C90",
-    "N",
-    "UP",
-    "YTT",
-    "S",
-    "BLE",
-    "B",
-    "A",
-    "C4",
-    "T10",
-    "EXE",
-    "ISC",
-    "INP",
-    "PIE",
-    "T20",
-    "PT",
-    "Q",
-    "RSE",
-    "RET",
-    "SLF",
-    "SIM",
-    "TID",
-    "TCH",
-    "INT",
-    "ARG",
-    "PGH",
-    "PL",
-    "TRY",
-    "NPY",
-    "RUF",
-]
+select = ["F", "E", "W", "C90", "N", "UP", "YTT", "S", "BLE", "B", "A", "C4", "T10", "EXE", "ISC", "INP", "PIE", "T20", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TID", "TCH", "INT", "ARG", "PGH", "PL", "TRY", "NPY", "RUF"]
 
 [tool.ruff.per-file-ignores]
 "noxfile.py" = ["D", "PTH"]
-"tests/*" = [
-    "S101",
-    "INP001",
-    "SLF001",
-    "ARG001",
-    "ARG002",
-    "ARG005",
-    "PLR2004",
-    "NPY002",
-]
+"tests/*" = ["S101", "INP001", "SLF001", "ARG001", "ARG002", "ARG005", "PLR2004", "NPY002"]
 
 [tool.ruff.pylint]
 max-args = 10
+

From 5c0a2882c81d8696fb68b6841c8e5355978522d4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 7 Feb 2024 08:15:37 +0000
Subject: [PATCH 08/39] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d5fb01b1..d6f939df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -145,4 +145,3 @@ select = ["F", "E", "W", "C90", "N", "UP", "YTT", "S", "BLE", "B", "A", "C4", "T
 
 [tool.ruff.pylint]
 max-args = 10
-

From 30315bb327ee9da076c64b65ae88d0b09b4f4a63 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Wed, 7 Feb 2024 17:07:06 +0100
Subject: [PATCH 09/39] add tmpdir to metrics tests

---
 tests/test_system_metrics.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index ae7c24b7..61c0ed63 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -61,13 +61,16 @@ def test_get_cpus_metrics(mocker, metric_name):
         (2.0, 1, True),
     ],
 )
-def test_cpumetricscallback_with_plot(duration, interval, plot):
-    with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live:
+def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir):
+    with Live(
+        tmpdir,
+        save_dvc_exp=False,
+        callbacks=[CPUMetricsCallback(duration, interval, plot)],
+    ) as live:
         time.sleep(duration * 2)
         live.next_step()
         time.sleep(duration * 2 + 0.1)  # allow the thread to finish
         history, latest = parse_metrics(live)
-        dir_path = live.dir
 
     assert "system" in latest
     assert "cpu" in latest["system"]
@@ -81,7 +84,7 @@ def test_cpumetricscallback_with_plot(duration, interval, plot):
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
 
-    prefix = f"{dir_path}/plots/metrics/system"
+    prefix = f"{tmpdir}/plots/metrics/system"
     assert f"{prefix}/cpu/usage_avg_percent.tsv" in history
     assert f"{prefix}/cpu/usage_max_percent.tsv" in history
     assert f"{prefix}/cpu/parallelism_percent.tsv" in history
@@ -101,13 +104,16 @@ def test_cpumetricscallback_with_plot(duration, interval, plot):
         (2.0, 1, False),
     ],
 )
-def test_cpumetricscallback_without_plot(duration, interval, plot):
-    with Live(callbacks=[CPUMetricsCallback(duration, interval, plot)]) as live:
+def test_cpumetricscallback_without_plot(duration, interval, plot, tmpdir):
+    with Live(
+        tmpdir,
+        save_dvc_exp=False,
+        callbacks=[CPUMetricsCallback(duration, interval, plot)],
+    ) as live:
         time.sleep(duration * 2)
         live.next_step()
         time.sleep(duration * 2 + 0.1)  # allow the thread to finish
         history, latest = parse_metrics(live)
-        dir_path = live.dir
 
     assert "system" in latest
     assert "cpu" in latest["system"]
@@ -121,7 +127,7 @@ def test_cpumetricscallback_without_plot(duration, interval, plot):
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
 
-    prefix = f"{dir_path}/plots/metrics/system"
+    prefix = f"{tmpdir}/plots/metrics/system"
     assert f"{prefix}/cpu/usage_avg_percent.tsv" not in history
     assert f"{prefix}/cpu/usage_max_percent.tsv" not in history
     assert f"{prefix}/cpu/count.tsv" not in history

From 40686c7ea0a90cd710b78b865fc8056eb30dff50 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Wed, 7 Feb 2024 17:31:54 +0100
Subject: [PATCH 10/39] default to no monitoring callbacks

---
 src/dvclive/live.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 7ebcc289..beeba117 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -51,7 +51,6 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
-from .system_metrics import CPUMetricsCallback
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -136,7 +135,7 @@ def __init__(
         self._dvc_studio_config: Dict[str, Any] = {}
         self._init_studio()
 
-        self._callbacks = callbacks or [CPUMetricsCallback()]
+        self._callbacks = callbacks or []
         for callback in self._callbacks:
             callback(self)
 

From 4b9814486bf02856159d7967833c8f5e8bacf987 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 9 Feb 2024 11:57:07 +0100
Subject: [PATCH 11/39] fix tmp_dir for test on windows and macos

---
 tests/test_system_metrics.py | 49 ++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index 61c0ed63..819ebd5e 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -61,16 +61,16 @@ def test_get_cpus_metrics(mocker, metric_name):
         (2.0, 1, True),
     ],
 )
-def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir):
+def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot):
     with Live(
-        tmpdir,
+        tmp_dir,
         save_dvc_exp=False,
         callbacks=[CPUMetricsCallback(duration, interval, plot)],
     ) as live:
         time.sleep(duration * 2)
         live.next_step()
         time.sleep(duration * 2 + 0.1)  # allow the thread to finish
-        history, latest = parse_metrics(live)
+        timeseries, latest = parse_metrics(live)
 
     assert "system" in latest
     assert "cpu" in latest["system"]
@@ -84,17 +84,17 @@ def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir):
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
 
-    prefix = f"{tmpdir}/plots/metrics/system"
-    assert f"{prefix}/cpu/usage_avg_percent.tsv" in history
-    assert f"{prefix}/cpu/usage_max_percent.tsv" in history
-    assert f"{prefix}/cpu/parallelism_percent.tsv" in history
-    assert f"{prefix}/cpu/ram_usage_percent.tsv" in history
-    assert f"{prefix}/io/write_speed_MB.tsv" in history
-    assert f"{prefix}/io/read_speed_MB.tsv" in history
-    assert len(history[f"{prefix}/cpu/ram_usage_percent.tsv"]) == 4
+    assert any("system/cpu/usage_avg_percent.tsv" in key for key in timeseries)
+    assert any("system/cpu/usage_max_percent.tsv" in key for key in timeseries)
+    assert any("system/cpu/parallelism_percent.tsv" in key for key in timeseries)
+    assert any("system/cpu/ram_usage_percent.tsv" in key for key in timeseries)
+    assert any("system/io/write_speed_MB.tsv" in key for key in timeseries)
+    assert any("system/io/read_speed_MB.tsv" in key for key in timeseries)
+    assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key)
 
-    assert f"{prefix}/cpu/count.tsv" not in history  # no plot for count
-    assert f"{prefix}/cpu/ram_total_GB.tsv" not in history
+    # not plot for fix values
+    assert all("system/cpu/count.tsv" not in key for key in timeseries)
+    assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries)
 
 
 @pytest.mark.parametrize(
@@ -104,16 +104,16 @@ def test_cpumetricscallback_with_plot(duration, interval, plot, tmpdir):
         (2.0, 1, False),
     ],
 )
-def test_cpumetricscallback_without_plot(duration, interval, plot, tmpdir):
+def test_cpumetricscallback_without_plot(tmp_dir, duration, interval, plot):
     with Live(
-        tmpdir,
+        tmp_dir,
         save_dvc_exp=False,
         callbacks=[CPUMetricsCallback(duration, interval, plot)],
     ) as live:
         time.sleep(duration * 2)
         live.next_step()
         time.sleep(duration * 2 + 0.1)  # allow the thread to finish
-        history, latest = parse_metrics(live)
+        timeseries, latest = parse_metrics(live)
 
     assert "system" in latest
     assert "cpu" in latest["system"]
@@ -127,12 +127,11 @@ def test_cpumetricscallback_without_plot(duration, interval, plot, tmpdir):
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
 
-    prefix = f"{tmpdir}/plots/metrics/system"
-    assert f"{prefix}/cpu/usage_avg_percent.tsv" not in history
-    assert f"{prefix}/cpu/usage_max_percent.tsv" not in history
-    assert f"{prefix}/cpu/count.tsv" not in history
-    assert f"{prefix}/cpu/parallelism_percent.tsv" not in history
-    assert f"{prefix}/cpu/ram_usage_percent.tsv" not in history
-    assert f"{prefix}/cpu/ram_total_GB.tsv" not in history
-    assert f"{prefix}/cpu/write_speed_MB.tsv" not in history
-    assert f"{prefix}/cpu/read_speed_MB.tsv" not in history
+    assert all("system/cpu/usage_avg_percent.tsv" not in key for key in timeseries)
+    assert all("system/cpu/usage_max_percent.tsv" not in key for key in timeseries)
+    assert all("system/cpu/count.tsv" not in key for key in timeseries)
+    assert all("system/cpu/parallelism_percent.tsv" not in key for key in timeseries)
+    assert all("system/cpu/ram_usage_percent.tsv" not in key for key in timeseries)
+    assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries)
+    assert all("system/cpu/write_speed_MB.tsv" not in key for key in timeseries)
+    assert all("system/cpu/read_speed_MB.tsv" not in key for key in timeseries)

From 36fd94dc2041f67dac8e5ef52e6d4ac1345bfdf4 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 9 Feb 2024 14:08:51 +0100
Subject: [PATCH 12/39] fix tmp_dir for test on windows and macos

---
 tests/test_system_metrics.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index 819ebd5e..6d91d62d 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -84,17 +84,17 @@ def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot):
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
 
-    assert any("system/cpu/usage_avg_percent.tsv" in key for key in timeseries)
-    assert any("system/cpu/usage_max_percent.tsv" in key for key in timeseries)
-    assert any("system/cpu/parallelism_percent.tsv" in key for key in timeseries)
-    assert any("system/cpu/ram_usage_percent.tsv" in key for key in timeseries)
-    assert any("system/io/write_speed_MB.tsv" in key for key in timeseries)
-    assert any("system/io/read_speed_MB.tsv" in key for key in timeseries)
-    assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key)
+    assert any("usage_avg_percent" in key for key in timeseries)
+    assert any("usage_max_percent.tsv" in key for key in timeseries)
+    assert any("parallelism_percent.tsv" in key for key in timeseries)
+    assert any("ram_usage_percent.tsv" in key for key in timeseries)
+    assert any("write_speed_MB.tsv" in key for key in timeseries)
+    assert any("read_speed_MB.tsv" in key for key in timeseries)
+    # assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key)
 
-    # not plot for fix values
-    assert all("system/cpu/count.tsv" not in key for key in timeseries)
-    assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries)
+    # not plot for constant values
+    assert all("count.tsv" not in key for key in timeseries)
+    assert all("ram_total_GB.tsv" not in key for key in timeseries)
 
 
 @pytest.mark.parametrize(
@@ -127,11 +127,11 @@ def test_cpumetricscallback_without_plot(tmp_dir, duration, interval, plot):
     assert "read_speed_MB" in latest["system"]["io"]
     assert "write_speed_MB" in latest["system"]["io"]
 
-    assert all("system/cpu/usage_avg_percent.tsv" not in key for key in timeseries)
-    assert all("system/cpu/usage_max_percent.tsv" not in key for key in timeseries)
-    assert all("system/cpu/count.tsv" not in key for key in timeseries)
-    assert all("system/cpu/parallelism_percent.tsv" not in key for key in timeseries)
-    assert all("system/cpu/ram_usage_percent.tsv" not in key for key in timeseries)
-    assert all("system/cpu/ram_total_GB.tsv" not in key for key in timeseries)
-    assert all("system/cpu/write_speed_MB.tsv" not in key for key in timeseries)
-    assert all("system/cpu/read_speed_MB.tsv" not in key for key in timeseries)
+    assert all("usage_avg_percent.tsv" not in key for key in timeseries)
+    assert all("usage_max_percent.tsv" not in key for key in timeseries)
+    assert all("count.tsv" not in key for key in timeseries)
+    assert all("parallelism_percent.tsv" not in key for key in timeseries)
+    assert all("ram_usage_percent.tsv" not in key for key in timeseries)
+    assert all("ram_total_GB.tsv" not in key for key in timeseries)
+    assert all("write_speed_MB.tsv" not in key for key in timeseries)
+    assert all("read_speed_MB.tsv" not in key for key in timeseries)

From 59db522626b1f8a535ead622aa0578e0813fbc4a Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 9 Feb 2024 14:45:16 +0100
Subject: [PATCH 13/39] fix update data to studio live experiment

---
 src/dvclive/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py
index 0c293c4d..d487ae43 100644
--- a/src/dvclive/utils.py
+++ b/src/dvclive/utils.py
@@ -111,7 +111,7 @@ def parse_metrics(live):
     metrics_path = Path(live.plots_dir) / Metric.subfolder
     history = {}
     for suffix in Metric.suffixes:
-        for scalar_file in metrics_path.rglob(f"*{suffix}"):
+        for scalar_file in metrics_path.rglob(f"**/*{suffix}"):
             history[str(scalar_file)] = parse_tsv(scalar_file)
     latest = parse_json(live.metrics_file)
     return history, latest

From 2393fb0af41c40a103f62d17cb79c2f138f5f52e Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 9 Feb 2024 14:54:32 +0100
Subject: [PATCH 14/39] fix studio update data problem

---
 src/dvclive/studio.py | 5 ++---
 src/dvclive/utils.py  | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py
index f74db7ab..90c1ed10 100644
--- a/src/dvclive/studio.py
+++ b/src/dvclive/studio.py
@@ -14,7 +14,7 @@
 
 
 def _get_unsent_datapoints(plot, latest_step):
-    return [x for x in plot if int(x["step"]) > latest_step]
+    return [x for x in plot if int(x["step"]) >= latest_step]
 
 
 def _cast_to_numbers(datapoints):
@@ -53,7 +53,7 @@ def _adapt_images(live):
     return {
         _adapt_path(live, image.output_path): {"image": _adapt_image(image.output_path)}
         for image in live._images.values()
-        if image.step > live._latest_studio_step
+        if image.step >= live._latest_studio_step
     }
 
 
@@ -76,7 +76,6 @@ def get_studio_updates(live):
         for name, plot in plots.items()
     }
     plots = {k: {"data": v} for k, v in plots.items()}
-
     plots.update(_adapt_images(live))
 
     return metrics, params, plots
diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py
index d487ae43..0c293c4d 100644
--- a/src/dvclive/utils.py
+++ b/src/dvclive/utils.py
@@ -111,7 +111,7 @@ def parse_metrics(live):
     metrics_path = Path(live.plots_dir) / Metric.subfolder
     history = {}
     for suffix in Metric.suffixes:
-        for scalar_file in metrics_path.rglob(f"**/*{suffix}"):
+        for scalar_file in metrics_path.rglob(f"*{suffix}"):
             history[str(scalar_file)] = parse_tsv(scalar_file)
     latest = parse_json(live.metrics_file)
     return history, latest

From 3b327c5c7e74b4eccc24a8efd425208039e53759 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 9 Feb 2024 17:25:58 +0100
Subject: [PATCH 15/39] debug studio updates

---
 src/dvclive/live.py   | 5 +++++
 src/dvclive/studio.py | 6 ++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index beeba117..18867279 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -362,6 +362,7 @@ def sync(self):
 
         self.make_report()
 
+        logger.warning(f"post_to_studio step{self.step}")
         self.post_to_studio("data")
 
     def next_step(self):
@@ -392,6 +393,10 @@ def log_metric(
             self._metrics[name] = metric
 
         metric.step = self.step
+
+        logger.warning(
+            f"Logging {name} with value {val} and step{self.step} timestamp {timestamp}"
+        )
         if plot:
             metric.dump(val, timestamp=timestamp)
 
diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py
index 90c1ed10..80d55b8a 100644
--- a/src/dvclive/studio.py
+++ b/src/dvclive/studio.py
@@ -14,7 +14,7 @@
 
 
 def _get_unsent_datapoints(plot, latest_step):
-    return [x for x in plot if int(x["step"]) >= latest_step]
+    return [x for x in plot if int(x["step"]) > latest_step]
 
 
 def _cast_to_numbers(datapoints):
@@ -53,7 +53,7 @@ def _adapt_images(live):
     return {
         _adapt_path(live, image.output_path): {"image": _adapt_image(image.output_path)}
         for image in live._images.values()
-        if image.step >= live._latest_studio_step
+        if image.step > live._latest_studio_step
     }
 
 
@@ -71,6 +71,8 @@ def get_studio_updates(live):
     metrics_file = _adapt_path(live, metrics_file)
     metrics = {metrics_file: {"data": metrics}}
 
+    logger.warning(f"steeep: {live._latest_studio_step}")
+    logger.warning(plots)
     plots = {
         _adapt_path(live, name): _adapt_plot_datapoints(live, plot)
         for name, plot in plots.items()

From b0ac980505c87a0bb69b4145e7498d70da914e6d Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Mon, 12 Feb 2024 11:02:59 +0100
Subject: [PATCH 16/39] improve code readability:

hide CPUMetrics properties
change duration to nb_samples
log only once if problem with psutil
---
 src/dvclive/live.py           | 14 +++-----
 src/dvclive/studio.py         |  2 --
 src/dvclive/system_metrics.py | 39 +++++++++++-----------
 src/dvclive/utils.py          | 12 +------
 tests/test_system_metrics.py  | 61 ++++-------------------------------
 tests/test_utils.py           | 19 -----------
 6 files changed, 34 insertions(+), 113 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 18867279..c3bcbc66 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -11,7 +11,6 @@
 from pathlib import Path, PurePath
 from typing import (
     Any,
-    Callable,
     Dict,
     List,
     Optional,
@@ -51,6 +50,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
+from .system_metrics import CPUMetrics
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -89,7 +89,7 @@ def __init__(
         cache_images: bool = False,
         exp_name: Optional[str] = None,
         exp_message: Optional[str] = None,
-        callbacks: Optional[List[Callable]] = None,
+        monitor_system: bool = False,
     ):
         self.summary: Dict[str, Any] = {}
 
@@ -135,8 +135,8 @@ def __init__(
         self._dvc_studio_config: Dict[str, Any] = {}
         self._init_studio()
 
-        self._callbacks = callbacks or []
-        for callback in self._callbacks:
+        self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else []
+        for callback in self._system_monitoring_callbacks:
             callback(self)
 
     def _init_resume(self):
@@ -362,7 +362,6 @@ def sync(self):
 
         self.make_report()
 
-        logger.warning(f"post_to_studio step{self.step}")
         self.post_to_studio("data")
 
     def next_step(self):
@@ -394,9 +393,6 @@ def log_metric(
 
         metric.step = self.step
 
-        logger.warning(
-            f"Logging {name} with value {val} and step{self.step} timestamp {timestamp}"
-        )
         if plot:
             metric.dump(val, timestamp=timestamp)
 
@@ -626,7 +622,7 @@ def end(self):
             self.step = self.summary["step"]
 
         # Kill all independent threads that can be found in callbacks
-        for callback in self._callbacks:
+        for callback in self._system_monitoring_callbacks:
             callback.end()
 
         self.sync()
diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py
index 80d55b8a..bff27040 100644
--- a/src/dvclive/studio.py
+++ b/src/dvclive/studio.py
@@ -71,8 +71,6 @@ def get_studio_updates(live):
     metrics_file = _adapt_path(live, metrics_file)
     metrics = {metrics_file: {"data": metrics}}
 
-    logger.warning(f"steeep: {live._latest_studio_step}")
-    logger.warning(plots)
     plots = {
         _adapt_path(live, name): _adapt_plot_datapoints(live, plot)
         for name, plot in plots.items()
diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index b9b27161..dd707007 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -4,8 +4,7 @@
 import psutil
 from statistics import mean
 from threading import Event, Thread
-
-from .utils import append_dict
+from funcy import merge_with
 
 logger = logging.getLogger("dvclive")
 MEGABYTES_DIVIDER = 1024.0**2
@@ -14,44 +13,48 @@
 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30
 
 
-class CPUMetricsCallback:
+class CPUMetrics:
     def __init__(
         self,
-        duration: Union[int, float] = 5,
-        interval: Union[int, float] = 1.0,
+        interval: float = 0.5,
+        nb_samples: int = 10,
         plot: bool = True,
     ):
-        self.duration = duration
-        self.interval = interval
-        self.plot = plot
+        self._interval = interval  # seconds
+        self._nb_samples = nb_samples
+        self._plot = plot
         self._no_plot_metrics = ["system/cpu/count", "system/cpu/ram_total_GB"]
+        self._warn_user = True
 
     def __call__(self, live):
         self._live = live
         self._shutdown_event = Event()
         Thread(
-            target=self.monitoring_loop,
+            target=self._monitoring_loop,
         ).start()
 
-    def monitoring_loop(self):
+    def _monitoring_loop(self):
         while not self._shutdown_event.is_set():
             self._cpu_metrics = {}
-            for _ in range(int(self.duration // self.interval)):
+            for _ in range(self._nb_samples):
                 last_cpu_metrics = {}
                 try:
-                    last_cpu_metrics = get_cpus_metrics()
+                    last_cpu_metrics = _get_cpus_metrics()
                 except psutil.Error:
-                    logger.exception("Failed to monitor CPU metrics:")
-                self._cpu_metrics = append_dict(self._cpu_metrics, last_cpu_metrics)
-                self._shutdown_event.wait(self.interval)
+                    if self._warn_user:
+                        logger.exception("Failed to monitor CPU metrics")
+                        self._warn_user = False
+
+                self._cpu_metrics = merge_with(sum, self._cpu_metrics, last_cpu_metrics)
+                self._shutdown_event.wait(self._interval)
                 if self._shutdown_event.is_set():
                     break
             for metric_name, metric_values in self._cpu_metrics.items():
                 self._live.log_metric(
                     metric_name,
-                    mean(metric_values),
+                    metric_values / self._nb_samples,
                     timestamp=True,
-                    plot=self.plot
+                    plot=self._plot
                     if metric_name not in self._no_plot_metrics
                     else False,
                 )
@@ -60,7 +63,7 @@ def end(self):
         self._shutdown_event.set()
 
 
-def get_cpus_metrics() -> Dict[str, Union[float, int]]:
+def _get_cpus_metrics() -> Dict[str, Union[float, int]]:
     ram_info = psutil.virtual_memory()
     io_info = psutil.disk_io_counters()
     nb_cpus = psutil.cpu_count()
diff --git a/src/dvclive/utils.py b/src/dvclive/utils.py
index 0c293c4d..8b762590 100644
--- a/src/dvclive/utils.py
+++ b/src/dvclive/utils.py
@@ -6,7 +6,7 @@
 import shutil
 from pathlib import Path, PurePath
 from platform import uname
-from typing import Any, Union, List, Dict, TYPE_CHECKING
+from typing import Union, List, Dict, TYPE_CHECKING
 import webbrowser
 
 from .error import InvalidDataTypeError
@@ -245,13 +245,3 @@ def convert_datapoints_to_list_of_dicts(
 
     # Raise an error if the input is not a supported type
     raise InvalidDataTypeError("datapoints", type(datapoints))
-
-
-def append_dict(previous_values: Dict[str, List[Any]], new_value: Dict[str, Any]):
-    return {
-        **previous_values,
-        **{
-            metric_name: [*previous_values.get(metric_name, []), value]
-            for metric_name, value in new_value.items()
-        },
-    }
diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index 6d91d62d..3195800c 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -3,7 +3,7 @@
 import pytest
 
 from dvclive import Live
-from dvclive.system_metrics import CPUMetricsCallback, get_cpus_metrics
+from dvclive.system_metrics import _get_cpus_metrics
 from dvclive.utils import parse_metrics
 
 
@@ -50,26 +50,19 @@ def mock_psutil(mocker):
 )
 def test_get_cpus_metrics(mocker, metric_name):
     mock_psutil(mocker)
-    metrics = get_cpus_metrics()
+    metrics = _get_cpus_metrics()
     assert metric_name in metrics
 
 
-@pytest.mark.parametrize(
-    ("duration", "interval", "plot"),
-    [
-        (1, 0.5, True),
-        (2.0, 1, True),
-    ],
-)
-def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot):
+def test_monitor_system(tmp_dir):
     with Live(
         tmp_dir,
         save_dvc_exp=False,
-        callbacks=[CPUMetricsCallback(duration, interval, plot)],
+        monitor_system=True,
     ) as live:
-        time.sleep(duration * 2)
+        time.sleep(5 + 1)  # allow the thread to finish
         live.next_step()
-        time.sleep(duration * 2 + 0.1)  # allow the thread to finish
+        time.sleep(5 + 1)  # allow the thread to finish
         timeseries, latest = parse_metrics(live)
 
     assert "system" in latest
@@ -90,48 +83,8 @@ def test_cpumetricscallback_with_plot(tmp_dir, duration, interval, plot):
     assert any("ram_usage_percent.tsv" in key for key in timeseries)
     assert any("write_speed_MB.tsv" in key for key in timeseries)
     assert any("read_speed_MB.tsv" in key for key in timeseries)
-    # assert all(len(timeseries[key]) == 4 for key in timeseries if "system" in key)
+    assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key)
 
     # not plot for constant values
     assert all("count.tsv" not in key for key in timeseries)
     assert all("ram_total_GB.tsv" not in key for key in timeseries)
-
-
-@pytest.mark.parametrize(
-    ("duration", "interval", "plot"),
-    [
-        (1, 0.5, False),
-        (2.0, 1, False),
-    ],
-)
-def test_cpumetricscallback_without_plot(tmp_dir, duration, interval, plot):
-    with Live(
-        tmp_dir,
-        save_dvc_exp=False,
-        callbacks=[CPUMetricsCallback(duration, interval, plot)],
-    ) as live:
-        time.sleep(duration * 2)
-        live.next_step()
-        time.sleep(duration * 2 + 0.1)  # allow the thread to finish
-        timeseries, latest = parse_metrics(live)
-
-    assert "system" in latest
-    assert "cpu" in latest["system"]
-    assert "usage_avg_percent" in latest["system"]["cpu"]
-    assert "usage_max_percent" in latest["system"]["cpu"]
-    assert "count" in latest["system"]["cpu"]
-    assert "parallelism_percent" in latest["system"]["cpu"]
-    assert "ram_usage_percent" in latest["system"]["cpu"]
-    assert "ram_total_GB" in latest["system"]["cpu"]
-    assert "io" in latest["system"]
-    assert "read_speed_MB" in latest["system"]["io"]
-    assert "write_speed_MB" in latest["system"]["io"]
-
-    assert all("usage_avg_percent.tsv" not in key for key in timeseries)
-    assert all("usage_max_percent.tsv" not in key for key in timeseries)
-    assert all("count.tsv" not in key for key in timeseries)
-    assert all("parallelism_percent.tsv" not in key for key in timeseries)
-    assert all("ram_usage_percent.tsv" not in key for key in timeseries)
-    assert all("ram_total_GB.tsv" not in key for key in timeseries)
-    assert all("write_speed_MB.tsv" not in key for key in timeseries)
-    assert all("read_speed_MB.tsv" not in key for key in timeseries)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ab782a29..ffbc66ef 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,7 +5,6 @@
 from dvclive.utils import (
     standardize_metric_name,
     convert_datapoints_to_list_of_dicts,
-    append_dict,
 )
 from dvclive.error import InvalidDataTypeError
 
@@ -49,21 +48,3 @@ def test_unsupported_format():
         convert_datapoints_to_list_of_dicts("unsupported data format")
 
     assert "not supported type" in str(exc_info.value)
-
-
-@pytest.mark.parametrize(
-    ("aggregate", "new_metrics", "result"),
-    [
-        ({"a": [1], "b": [0.8]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [0.8, 1.6]}),
-        ({"a": [1]}, {"a": 2, "b": 1.6}, {"a": [1, 2], "b": [1.6]}),
-        (
-            {"a": [1], "b": [0.8], "c": [2]},
-            {"a": 2, "b": 1.6},
-            {"a": [1, 2], "b": [0.8, 1.6], "c": [2]},
-        ),
-        ({}, {"a": 2, "b": 1.6}, {"a": [2], "b": [1.6]}),
-        ({}, {}, {}),
-    ],
-)
-def test_append_dict(aggregate, new_metrics, result):
-    assert append_dict(aggregate, new_metrics) == result

From 5162ac23c244379858a641274dac652ed3576ab5 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Mon, 12 Feb 2024 18:19:19 +0100
Subject: [PATCH 17/39] remove hack lightning

---
 src/dvclive/lightning.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/dvclive/lightning.py b/src/dvclive/lightning.py
index 2c8d6006..dc159526 100644
--- a/src/dvclive/lightning.py
+++ b/src/dvclive/lightning.py
@@ -71,11 +71,6 @@ def log_metrics(
         sync: Optional[bool] = False,
     ) -> None:
         if not sync and _should_sync():
-            if step == self.experiment._latest_studio_step:  # noqa: SLF001
-                # We are in log_eval_end_metrics but there has been already
-                # a studio request sent with `step`.
-                # We decrease the number to bypass `live.studio._get_unsent_datapoints`
-                self.experiment._latest_studio_step -= 1  # noqa: SLF001
             sync = True
         super().log_metrics(metrics, step, sync)
 

From 93536456d66cf8c0072d9e78b83a914fb826ba8e Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Mon, 12 Feb 2024 18:47:53 +0100
Subject: [PATCH 18/39] fix lightning problem with steps in studio

---
 src/dvclive/live.py          |  1 +
 src/dvclive/studio.py        | 31 ++++++++++++-------------------
 tests/test_post_to_studio.py |  6 +-----
 3 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index c3bcbc66..bded46e3 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -133,6 +133,7 @@ def __init__(
         self._latest_studio_step = self.step if resume else -1
         self._studio_events_to_skip: Set[str] = set()
         self._dvc_studio_config: Dict[str, Any] = {}
+        self._nb_points_sent_to_studio = {}
         self._init_studio()
 
         self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else []
diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py
index bff27040..4ed80224 100644
--- a/src/dvclive/studio.py
+++ b/src/dvclive/studio.py
@@ -13,17 +13,11 @@
 logger = logging.getLogger("dvclive")
 
 
-def _get_unsent_datapoints(plot, latest_step):
-    return [x for x in plot if int(x["step"]) > latest_step]
-
-
 def _cast_to_numbers(datapoints):
     for datapoint in datapoints:
         for k, v in datapoint.items():
-            if k == "step":
+            if k in ["step", "timestamp"]:
                 datapoint[k] = int(v)
-            elif k == "timestamp":
-                continue
             else:
                 float_v = float(v)
                 if math.isnan(float_v) or math.isinf(float_v):
@@ -39,11 +33,6 @@ def _adapt_path(live, name):
     return name
 
 
-def _adapt_plot_datapoints(live, plot):
-    datapoints = _get_unsent_datapoints(plot, live._latest_studio_step)
-    return _cast_to_numbers(datapoints)
-
-
 def _adapt_image(image_path):
     with open(image_path, "rb") as fobj:
         return base64.b64encode(fobj.read()).decode("utf-8")
@@ -71,14 +60,18 @@ def get_studio_updates(live):
     metrics_file = _adapt_path(live, metrics_file)
     metrics = {metrics_file: {"data": metrics}}
 
-    plots = {
-        _adapt_path(live, name): _adapt_plot_datapoints(live, plot)
-        for name, plot in plots.items()
-    }
-    plots = {k: {"data": v} for k, v in plots.items()}
-    plots.update(_adapt_images(live))
+    plots_to_send = {}
+    for name, plot in plots.items():
+        path = _adapt_path(live, name)
+        nb_points_already_sent = live._nb_points_sent_to_studio.get(path, 0)
+        live._nb_points_sent_to_studio[path] = len(plot)
+
+        plots_to_send[path] = _cast_to_numbers(plot[nb_points_already_sent:])
+
+    plots_to_send = {k: {"data": v} for k, v in plots_to_send.items()}
+    plots_to_send.update(_adapt_images(live))
 
-    return metrics, params, plots
+    return metrics, params, plots_to_send
 
 
 def get_dvc_studio_config(live):
diff --git a/tests/test_post_to_studio.py b/tests/test_post_to_studio.py
index a22dfa55..6fc5dcfb 100644
--- a/tests/test_post_to_studio.py
+++ b/tests/test_post_to_studio.py
@@ -105,11 +105,7 @@ def test_post_to_studio_failed_data_request(
             "data",
             exp_name=live._exp_name,
             step=1,
-            plots={
-                f"{foo_path}": {
-                    "data": [{"step": 0, "foo": 1.0}, {"step": 1, "foo": 2.0}]
-                }
-            },
+            plots={f"{foo_path}": {"data": [{"step": 1, "foo": 2.0}]}},
         ),
     )
 

From 871bebc4942c639ba1cee5d1dd76b7e14893ed43 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 13 Feb 2024 09:12:57 +0100
Subject: [PATCH 19/39] simplify the metric names

---
 src/dvclive/system_metrics.py | 19 ++++++++--------
 tests/test_system_metrics.py  | 42 +++++++++++++++++------------------
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index dd707007..97760800 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -10,7 +10,7 @@
 MEGABYTES_DIVIDER = 1024.0**2
 GIGABYTES_DIVIDER = 1024.0**3
 
-MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 30
+MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20
 
 
 class CPUMetrics:
@@ -23,7 +23,7 @@ def __init__(
         self._interval = interval  # seconds
         self._nb_samples = nb_samples
         self._plot = plot
-        self._no_plot_metrics = ["system/cpu/count", "system/cpu/ram_total_GB"]
+        self._no_plot_metrics = ["system/cpu/count", "system/ram/total (GB)"]
         self._warn_user = True
 
     def __call__(self, live):
@@ -69,10 +69,9 @@ def _get_cpus_metrics() -> Dict[str, Union[float, int]]:
     nb_cpus = psutil.cpu_count()
     cpus_percent = psutil.cpu_percent(percpu=True)
     return {
-        "system/cpu/usage_avg_percent": mean(cpus_percent),
-        "system/cpu/usage_max_percent": max(cpus_percent),
+        "system/cpu/usage (%)": mean(cpus_percent),
         "system/cpu/count": nb_cpus,
-        "system/cpu/parallelism_percent": len(
+        "system/cpu/parallelization (%)": len(
             [
                 percent
                 for percent in cpus_percent
@@ -81,10 +80,12 @@ def _get_cpus_metrics() -> Dict[str, Union[float, int]]:
         )
         * 100
         / nb_cpus,
-        "system/cpu/ram_usage_percent": ram_info.percent,
-        "system/cpu/ram_total_GB": ram_info.total / GIGABYTES_DIVIDER,
-        "system/io/read_speed_MB": io_info.read_bytes
+        "system/ram/usage (%)": ram_info.percent,
+        "system/ram/usage (GB)": (ram_info.percent / 100)
+        * (ram_info.total / GIGABYTES_DIVIDER),
+        "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER,
+        "system/io/read speed (MB)": io_info.read_bytes
         / (io_info.read_time * MEGABYTES_DIVIDER),
-        "system/io/write_speed_MB": io_info.write_bytes
+        "system/io/write speed (MB)": io_info.write_bytes
         / (io_info.write_time * MEGABYTES_DIVIDER),
     }
diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index 3195800c..97b5ddf8 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -38,14 +38,14 @@ def mock_psutil(mocker):
 @pytest.mark.parametrize(
     ("metric_name"),
     [
-        "system/cpu/usage_avg_percent",
-        "system/cpu/usage_max_percent",
+        "system/cpu/usage (%)",
         "system/cpu/count",
-        "system/cpu/parallelism_percent",
-        "system/cpu/ram_usage_percent",
-        "system/cpu/ram_total_GB",
-        "system/io/read_speed_MB",
-        "system/io/write_speed_MB",
+        "system/cpu/parallelization (%)",
+        "system/ram/usage (%)",
+        "system/ram/usage (GB)",
+        "system/ram/total (GB)",
+        "system/io/read speed (MB)",
+        "system/io/write speed (MB)",
     ],
 )
 def test_get_cpus_metrics(mocker, metric_name):
@@ -67,24 +67,24 @@ def test_monitor_system(tmp_dir):
 
     assert "system" in latest
     assert "cpu" in latest["system"]
-    assert "usage_avg_percent" in latest["system"]["cpu"]
-    assert "usage_max_percent" in latest["system"]["cpu"]
+    assert "usage (%)" in latest["system"]["cpu"]
     assert "count" in latest["system"]["cpu"]
-    assert "parallelism_percent" in latest["system"]["cpu"]
-    assert "ram_usage_percent" in latest["system"]["cpu"]
-    assert "ram_total_GB" in latest["system"]["cpu"]
+    assert "parallelization (%)" in latest["system"]["cpu"]
+    assert "ram" in latest["system"]
+    assert "usage (%)" in latest["system"]["ram"]
+    assert "usage (GB)" in latest["system"]["ram"]
+    assert "total (GB)" in latest["system"]["ram"]
     assert "io" in latest["system"]
-    assert "read_speed_MB" in latest["system"]["io"]
-    assert "write_speed_MB" in latest["system"]["io"]
+    assert "read speed (MB)" in latest["system"]["io"]
+    assert "write speed (MB)" in latest["system"]["io"]
 
-    assert any("usage_avg_percent" in key for key in timeseries)
-    assert any("usage_max_percent.tsv" in key for key in timeseries)
-    assert any("parallelism_percent.tsv" in key for key in timeseries)
-    assert any("ram_usage_percent.tsv" in key for key in timeseries)
-    assert any("write_speed_MB.tsv" in key for key in timeseries)
-    assert any("read_speed_MB.tsv" in key for key in timeseries)
+    assert any("usage (%).tsv" in key for key in timeseries)
+    assert any("parallelization (%).tsv" in key for key in timeseries)
+    assert any("usage (GB).tsv" in key for key in timeseries)
+    assert any("read speed (MB).tsv" in key for key in timeseries)
+    assert any("write speed (MB).tsv" in key for key in timeseries)
     assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key)
 
     # not plot for constant values
     assert all("count.tsv" not in key for key in timeseries)
-    assert all("ram_total_GB.tsv" not in key for key in timeseries)
+    assert all("total (GB).tsv" not in key for key in timeseries)

From e169abc57e3a925e0c585390b7f27fbaf240b733 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Wed, 14 Feb 2024 16:24:33 +0100
Subject: [PATCH 20/39] don't increment `num_point_sent_to_studio` if studio
 didn't received the data

---
 src/dvclive/live.py          | 3 ++-
 src/dvclive/studio.py        | 8 +++++---
 tests/test_post_to_studio.py | 6 +++++-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index ac74e2d3..1c1879ca 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -172,7 +172,8 @@ def __init__(
         self._latest_studio_step = self.step if resume else -1
         self._studio_events_to_skip: Set[str] = set()
         self._dvc_studio_config: Dict[str, Any] = {}
-        self._nb_points_sent_to_studio = {}
+        self._num_points_sent_to_studio: Dict[str, int] = {}
+        self._num_points_read_from_file: Dict[str, int] = {}
         self._init_studio()
 
         self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else []
diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py
index 4ed80224..8a3109d9 100644
--- a/src/dvclive/studio.py
+++ b/src/dvclive/studio.py
@@ -61,12 +61,12 @@ def get_studio_updates(live):
     metrics = {metrics_file: {"data": metrics}}
 
     plots_to_send = {}
+    live._num_points_read_from_file = {}
     for name, plot in plots.items():
         path = _adapt_path(live, name)
-        nb_points_already_sent = live._nb_points_sent_to_studio.get(path, 0)
-        live._nb_points_sent_to_studio[path] = len(plot)
-
+        nb_points_already_sent = live._num_points_sent_to_studio.get(path, 0)
         plots_to_send[path] = _cast_to_numbers(plot[nb_points_already_sent:])
+        live._num_points_read_from_file.update({path: len(plot)})
 
     plots_to_send = {k: {"data": v} for k, v in plots_to_send.items()}
     plots_to_send.update(_adapt_images(live))
@@ -112,6 +112,8 @@ def post_to_studio(live, event):
             live._studio_events_to_skip.add("data")
             live._studio_events_to_skip.add("done")
     elif event == "data":
+        for path, num_points in live._num_points_read_from_file.items():
+            live._num_points_sent_to_studio[path] = num_points
         live._latest_studio_step = live.step
 
     if event == "done":
diff --git a/tests/test_post_to_studio.py b/tests/test_post_to_studio.py
index 6fc5dcfb..a22dfa55 100644
--- a/tests/test_post_to_studio.py
+++ b/tests/test_post_to_studio.py
@@ -105,7 +105,11 @@ def test_post_to_studio_failed_data_request(
             "data",
             exp_name=live._exp_name,
             step=1,
-            plots={f"{foo_path}": {"data": [{"step": 1, "foo": 2.0}]}},
+            plots={
+                f"{foo_path}": {
+                    "data": [{"step": 0, "foo": 1.0}, {"step": 1, "foo": 2.0}]
+                }
+            },
         ),
     )
 

From aa5b511e414938f07aad66d3502e4a21bdb6de3b Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 15 Feb 2024 10:46:17 +0100
Subject: [PATCH 21/39] add directory metrics to the list of metrics tracked +
 refacto

---
 src/dvclive/live.py           |  20 ++++--
 src/dvclive/system_metrics.py | 106 ++++++++++++++++++++------------
 tests/test_system_metrics.py  | 112 ++++++++++++++++++++++------------
 3 files changed, 156 insertions(+), 82 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 1c1879ca..ce6ba98f 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -176,9 +176,10 @@ def __init__(
         self._num_points_read_from_file: Dict[str, int] = {}
         self._init_studio()
 
-        self._system_monitoring_callbacks = [CPUMetrics()] if monitor_system else []
-        for callback in self._system_monitoring_callbacks:
-            callback(self)
+        self._cpu_metrics = None
+        if monitor_system:
+            self._cpu_metrics = CPUMetrics()
+            self._cpu_metrics(self)
 
     def _init_resume(self):
         self._read_params()
@@ -398,6 +399,13 @@ def step(self, value: int) -> None:
         self._step = value
         logger.debug(f"Step: {self.step}")
 
+    @step.setter
+    def cpu_metrics(self, cpu_metrics: CPUMetrics) -> None:
+        if self._cpu_metrics is not None:
+            self._cpu_metrics.end()
+        self._cpu_metrics = cpu_metrics
+        self._cpu_metrics(self)
+
     def sync(self):
         self.make_summary()
 
@@ -887,9 +895,9 @@ def end(self):
         if "step" in self.summary:
             self.step = self.summary["step"]
 
-        # Kill all independent threads that can be found in callbacks
-        for callback in self._system_monitoring_callbacks:
-            callback.end()
+        # Kill threads that monitor the system metrics
+        if self._cpu_metrics is not None:
+            self._cpu_metrics.end()
 
         self.sync()
 
diff --git a/src/dvclive/system_metrics.py b/src/dvclive/system_metrics.py
index 97760800..9bb4685e 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/system_metrics.py
@@ -1,5 +1,6 @@
 import logging
-from typing import Dict, Union
+from typing import Dict, Union, Optional, List
+from pathlib import Path
 
 import psutil
 from statistics import mean
@@ -13,7 +14,9 @@
 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20
 
 
-class CPUMetrics:
+class _SystemMetrics:
+    _plot_blacklist_prefix = ()
+
     def __init__(
         self,
         interval: float = 0.5,
@@ -23,7 +26,6 @@ def __init__(
         self._interval = interval  # seconds
         self._nb_samples = nb_samples
         self._plot = plot
-        self._no_plot_metrics = ["system/cpu/count", "system/ram/total (GB)"]
         self._warn_user = True
 
     def __call__(self, live):
@@ -35,57 +37,85 @@ def __call__(self, live):
 
     def _monitoring_loop(self):
         while not self._shutdown_event.is_set():
-            self._cpu_metrics = {}
+            self._metrics = {}
             for _ in range(self._nb_samples):
-                last_cpu_metrics = {}
+                last_metrics = {}
                 try:
-                    last_cpu_metrics = _get_cpus_metrics()
+                    last_metrics = self._get_metrics()
                 except psutil.Error:
                     if self._warn_user:
                         logger.exception("Failed to monitor CPU metrics")
                         self._warn_user = False
 
-                self._cpu_metrics = merge_with(sum, self._cpu_metrics, last_cpu_metrics)
+                self._metrics = merge_with(sum, self._metrics, last_metrics)
                 self._shutdown_event.wait(self._interval)
                 if self._shutdown_event.is_set():
                     break
-            for metric_name, metric_values in self._cpu_metrics.items():
+            for name, values in self._metrics.items():
+                blacklisted = any(
+                    name.startswith(prefix) for prefix in self._plot_blacklist_prefix
+                )
                 self._live.log_metric(
-                    metric_name,
-                    metric_values / self._nb_samples,
+                    name,
+                    values / self._nb_samples,
                     timestamp=True,
-                    plot=self._plot
-                    if metric_name not in self._no_plot_metrics
-                    else False,
+                    plot=None if blacklisted else self._plot,
                 )
 
+    def _get_metrics() -> Dict[str, Union[float, int]]:
+        pass
+
     def end(self):
         self._shutdown_event.set()
 
 
-def _get_cpus_metrics() -> Dict[str, Union[float, int]]:
-    ram_info = psutil.virtual_memory()
-    io_info = psutil.disk_io_counters()
-    nb_cpus = psutil.cpu_count()
-    cpus_percent = psutil.cpu_percent(percpu=True)
-    return {
-        "system/cpu/usage (%)": mean(cpus_percent),
-        "system/cpu/count": nb_cpus,
-        "system/cpu/parallelization (%)": len(
-            [
-                percent
-                for percent in cpus_percent
-                if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE
-            ]
+class CPUMetrics(_SystemMetrics):
+    _plot_blacklist_prefix = (
+        "system/cpu/count",
+        "system/ram/total (GB)",
+        "system/disk/total (GB)",
+    )
+
+    def __init__(
+        self,
+        interval: float = 0.5,
+        nb_samples: int = 10,
+        directories_to_monitor: Optional[List[str]] = None,
+        plot: bool = True,
+    ):
+        super().__init__(interval=interval, nb_samples=nb_samples, plot=plot)
+        self._directories_to_monitor = (
+            ["."] if directories_to_monitor is None else directories_to_monitor
         )
-        * 100
-        / nb_cpus,
-        "system/ram/usage (%)": ram_info.percent,
-        "system/ram/usage (GB)": (ram_info.percent / 100)
-        * (ram_info.total / GIGABYTES_DIVIDER),
-        "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER,
-        "system/io/read speed (MB)": io_info.read_bytes
-        / (io_info.read_time * MEGABYTES_DIVIDER),
-        "system/io/write speed (MB)": io_info.write_bytes
-        / (io_info.write_time * MEGABYTES_DIVIDER),
-    }
+
+    def _get_metrics(self) -> Dict[str, Union[float, int]]:
+        ram_info = psutil.virtual_memory()
+        nb_cpus = psutil.cpu_count()
+        cpus_percent = psutil.cpu_percent(percpu=True)
+        result = {
+            "system/cpu/usage (%)": mean(cpus_percent),
+            "system/cpu/count": nb_cpus,
+            "system/cpu/parallelization (%)": len(
+                [
+                    percent
+                    for percent in cpus_percent
+                    if percent >= MINIMUM_CPU_USAGE_TO_BE_ACTIVE
+                ]
+            )
+            * 100
+            / nb_cpus,
+            "system/ram/usage (%)": ram_info.percent,
+            "system/ram/usage (GB)": (ram_info.percent / 100)
+            * (ram_info.total / GIGABYTES_DIVIDER),
+            "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER,
+        }
+        for idx, directory in enumerate(self._directories_to_monitor):
+            if not Path(directory).exists():
+                continue
+            disk_info = psutil.disk_usage(directory)
+            result[f"system/disk/usage (%)/{idx}"] = disk_info.percent
+            result[f"system/disk/usage (GB)/{idx}"] = disk_info.used / GIGABYTES_DIVIDER
+            result[f"system/disk/total (GB)/{idx}"] = (
+                disk_info.total / GIGABYTES_DIVIDER
+            )
+        return result
diff --git a/tests/test_system_metrics.py b/tests/test_system_metrics.py
index 97b5ddf8..53ad9060 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_system_metrics.py
@@ -1,9 +1,8 @@
 import time
-
-import pytest
+from pathlib import Path
 
 from dvclive import Live
-from dvclive.system_metrics import _get_cpus_metrics
+from dvclive.system_metrics import CPUMetrics
 from dvclive.utils import parse_metrics
 
 
@@ -18,15 +17,14 @@ def mock_psutil(mocker):
     mocked_virtual_memory.percent = 20
     mocked_virtual_memory.total = 4 * 1024**3
 
-    mocked_disk_io_counters = mocker.MagicMock()
-    mocked_disk_io_counters.read_bytes = 2 * 1024**2
-    mocked_disk_io_counters.read_time = 1
-    mocked_disk_io_counters.write_bytes = 2 * 1024**2
-    mocked_disk_io_counters.write_time = 1
+    mocked_disk_usage = mocker.MagicMock()
+    mocked_disk_usage.used = 16 * 1024**3
+    mocked_disk_usage.percent = 50
+    mocked_disk_usage.total = 32 * 1024**3
 
     mocking_dict = {
         "virtual_memory": mocked_virtual_memory,
-        "disk_io_counters": mocked_disk_io_counters,
+        "disk_usage": mocked_disk_usage,
     }
     for function_name, return_value in mocking_dict.items():
         mocker.patch(
@@ -35,26 +33,54 @@ def mock_psutil(mocker):
         )
 
 
-@pytest.mark.parametrize(
-    ("metric_name"),
-    [
-        "system/cpu/usage (%)",
-        "system/cpu/count",
-        "system/cpu/parallelization (%)",
-        "system/ram/usage (%)",
-        "system/ram/usage (GB)",
-        "system/ram/total (GB)",
-        "system/io/read speed (MB)",
-        "system/io/write speed (MB)",
-    ],
-)
-def test_get_cpus_metrics(mocker, metric_name):
+def test_get_cpus_metrics_mocker(tmp_dir, mocker):
     mock_psutil(mocker)
-    metrics = _get_cpus_metrics()
-    assert metric_name in metrics
+    with Live(
+        tmp_dir,
+        save_dvc_exp=False,
+        monitor_system=False,
+    ) as live:
+        monitor = CPUMetrics(directories_to_monitor=["/", "/"])
+        monitor(live)
+        metrics = monitor._get_metrics()
+        monitor.end()
 
+    assert "system/cpu/usage (%)" in metrics
+    assert "system/cpu/count" in metrics
+    assert "system/cpu/parallelization (%)" in metrics
+    assert "system/ram/usage (%)" in metrics
+    assert "system/ram/usage (GB)" in metrics
+    assert "system/ram/total (GB)" in metrics
+    assert "system/disk/usage (%)/0" in metrics
+    assert "system/disk/usage (%)/1" in metrics
+    assert "system/disk/usage (GB)/0" in metrics
+    assert "system/disk/usage (GB)/1" in metrics
+    assert "system/disk/total (GB)/0" in metrics
+    assert "system/disk/total (GB)/1" in metrics
 
-def test_monitor_system(tmp_dir):
+
+def test_ignore_missing_directories(tmp_dir, mocker):
+    mock_psutil(mocker)
+    with Live(
+        tmp_dir,
+        save_dvc_exp=False,
+        monitor_system=False,
+    ) as live:
+        missing_directories = "______"
+        monitor = CPUMetrics(directories_to_monitor=["/", missing_directories])
+        monitor(live)
+        metrics = monitor._get_metrics()
+        monitor.end()
+
+    assert not Path(missing_directories).exists()
+
+    assert "system/disk/usage (%)/1" not in metrics
+    assert "system/disk/usage (GB)/1" not in metrics
+    assert "system/disk/total (GB)/1" not in metrics
+
+
+def test_monitor_system(tmp_dir, mocker):
+    mock_psutil(mocker)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
@@ -65,6 +91,7 @@ def test_monitor_system(tmp_dir):
         time.sleep(5 + 1)  # allow the thread to finish
         timeseries, latest = parse_metrics(live)
 
+    # metrics.json file contains all the system metrics
     assert "system" in latest
     assert "cpu" in latest["system"]
     assert "usage (%)" in latest["system"]["cpu"]
@@ -74,17 +101,26 @@ def test_monitor_system(tmp_dir):
     assert "usage (%)" in latest["system"]["ram"]
     assert "usage (GB)" in latest["system"]["ram"]
     assert "total (GB)" in latest["system"]["ram"]
-    assert "io" in latest["system"]
-    assert "read speed (MB)" in latest["system"]["io"]
-    assert "write speed (MB)" in latest["system"]["io"]
-
-    assert any("usage (%).tsv" in key for key in timeseries)
-    assert any("parallelization (%).tsv" in key for key in timeseries)
-    assert any("usage (GB).tsv" in key for key in timeseries)
-    assert any("read speed (MB).tsv" in key for key in timeseries)
-    assert any("write speed (MB).tsv" in key for key in timeseries)
+    assert "disk" in latest["system"]
+    assert "usage (%)" in latest["system"]["disk"]
+    assert "0" in latest["system"]["disk"]["usage (%)"]
+    assert "usage (GB)" in latest["system"]["disk"]
+    assert "total (GB)" in latest["system"]["disk"]
+
+    # timeseries contains all the system metrics
+    assert any(str(Path("system/cpu/usage (%).tsv")) in key for key in timeseries)
+    assert any(
+        str(Path("system/cpu/parallelization (%).tsv")) in key for key in timeseries
+    )
+    assert any(str(Path("system/ram/usage (%).tsv")) in key for key in timeseries)
+    assert any(str(Path("system/ram/usage (GB).tsv")) in key for key in timeseries)
+    assert any(str(Path("system/disk/usage (%)/0.tsv")) in key for key in timeseries)
+    assert any(str(Path("system/disk/usage (GB)/0.tsv")) in key for key in timeseries)
     assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key)
 
-    # not plot for constant values
-    assert all("count.tsv" not in key for key in timeseries)
-    assert all("total (GB).tsv" not in key for key in timeseries)
+    # blacklisted timeseries
+    assert all(str(Path("system/cpu/count.tsv")) not in key for key in timeseries)
+    assert all(str(Path("system/ram/total (GB).tsv")) not in key for key in timeseries)
+    assert all(
+        str(Path("system/disk/total (GB)/0.tsv")) not in key for key in timeseries
+    )

From 9d0e70d264b21e9b7eb0b3556e2d94db5a2e65dd Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 15 Feb 2024 11:05:46 +0100
Subject: [PATCH 22/39] clean code and split features into several PRs

---
 src/dvclive/lightning.py |  5 +++++
 src/dvclive/live.py      |  9 +++++----
 src/dvclive/studio.py    | 32 +++++++++++++++++++-------------
 tests/test_utils.py      |  5 +----
 4 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/src/dvclive/lightning.py b/src/dvclive/lightning.py
index dc159526..2c8d6006 100644
--- a/src/dvclive/lightning.py
+++ b/src/dvclive/lightning.py
@@ -71,6 +71,11 @@ def log_metrics(
         sync: Optional[bool] = False,
     ) -> None:
         if not sync and _should_sync():
+            if step == self.experiment._latest_studio_step:  # noqa: SLF001
+                # We are in log_eval_end_metrics but there has been already
+                # a studio request sent with `step`.
+                # We decrease the number to bypass `live.studio._get_unsent_datapoints`
+                self.experiment._latest_studio_step -= 1  # noqa: SLF001
             sync = True
         super().log_metrics(metrics, step, sync)
 
diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index ce6ba98f..4ae1006e 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -172,8 +172,6 @@ def __init__(
         self._latest_studio_step = self.step if resume else -1
         self._studio_events_to_skip: Set[str] = set()
         self._dvc_studio_config: Dict[str, Any] = {}
-        self._num_points_sent_to_studio: Dict[str, int] = {}
-        self._num_points_read_from_file: Dict[str, int] = {}
         self._init_studio()
 
         self._cpu_metrics = None
@@ -399,7 +397,11 @@ def step(self, value: int) -> None:
         self._step = value
         logger.debug(f"Step: {self.step}")
 
-    @step.setter
+    @property
+    def cpu_metrics(self) -> int:
+        return self._cpu_metrics or None
+
+    @cpu_metrics.setter
     def cpu_metrics(self, cpu_metrics: CPUMetrics) -> None:
         if self._cpu_metrics is not None:
             self._cpu_metrics.end()
@@ -474,7 +476,6 @@ def log_metric(
             self._metrics[name] = metric
 
         metric.step = self.step
-
         if plot:
             metric.dump(val, timestamp=timestamp)
 
diff --git a/src/dvclive/studio.py b/src/dvclive/studio.py
index 8a3109d9..f74db7ab 100644
--- a/src/dvclive/studio.py
+++ b/src/dvclive/studio.py
@@ -13,11 +13,17 @@
 logger = logging.getLogger("dvclive")
 
 
+def _get_unsent_datapoints(plot, latest_step):
+    return [x for x in plot if int(x["step"]) > latest_step]
+
+
 def _cast_to_numbers(datapoints):
     for datapoint in datapoints:
         for k, v in datapoint.items():
-            if k in ["step", "timestamp"]:
+            if k == "step":
                 datapoint[k] = int(v)
+            elif k == "timestamp":
+                continue
             else:
                 float_v = float(v)
                 if math.isnan(float_v) or math.isinf(float_v):
@@ -33,6 +39,11 @@ def _adapt_path(live, name):
     return name
 
 
+def _adapt_plot_datapoints(live, plot):
+    datapoints = _get_unsent_datapoints(plot, live._latest_studio_step)
+    return _cast_to_numbers(datapoints)
+
+
 def _adapt_image(image_path):
     with open(image_path, "rb") as fobj:
         return base64.b64encode(fobj.read()).decode("utf-8")
@@ -60,18 +71,15 @@ def get_studio_updates(live):
     metrics_file = _adapt_path(live, metrics_file)
     metrics = {metrics_file: {"data": metrics}}
 
-    plots_to_send = {}
-    live._num_points_read_from_file = {}
-    for name, plot in plots.items():
-        path = _adapt_path(live, name)
-        nb_points_already_sent = live._num_points_sent_to_studio.get(path, 0)
-        plots_to_send[path] = _cast_to_numbers(plot[nb_points_already_sent:])
-        live._num_points_read_from_file.update({path: len(plot)})
+    plots = {
+        _adapt_path(live, name): _adapt_plot_datapoints(live, plot)
+        for name, plot in plots.items()
+    }
+    plots = {k: {"data": v} for k, v in plots.items()}
 
-    plots_to_send = {k: {"data": v} for k, v in plots_to_send.items()}
-    plots_to_send.update(_adapt_images(live))
+    plots.update(_adapt_images(live))
 
-    return metrics, params, plots_to_send
+    return metrics, params, plots
 
 
 def get_dvc_studio_config(live):
@@ -112,8 +120,6 @@ def post_to_studio(live, event):
             live._studio_events_to_skip.add("data")
             live._studio_events_to_skip.add("done")
     elif event == "data":
-        for path, num_points in live._num_points_read_from_file.items():
-            live._num_points_sent_to_studio[path] = num_points
         live._latest_studio_step = live.step
 
     if event == "done":
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ffbc66ef..2b5d56ca 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,10 +2,7 @@
 import pandas as pd
 import pytest
 
-from dvclive.utils import (
-    standardize_metric_name,
-    convert_datapoints_to_list_of_dicts,
-)
+from dvclive.utils import standardize_metric_name, convert_datapoints_to_list_of_dicts
 from dvclive.error import InvalidDataTypeError
 
 

From ba72e0132e313b64cbc1ccbbd52e332b7f3bf724 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 15 Feb 2024 11:22:17 +0100
Subject: [PATCH 23/39] cleaner user interface

---
 src/dvclive/live.py                           | 28 +++++++++----------
 .../{system_metrics.py => monitor_system.py}  |  4 +--
 ...stem_metrics.py => test_monitor_system.py} | 12 ++++----
 3 files changed, 22 insertions(+), 22 deletions(-)
 rename src/dvclive/{system_metrics.py => monitor_system.py} (98%)
 rename tests/{test_system_metrics.py => test_monitor_system.py} (92%)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 4ae1006e..324414ac 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -51,7 +51,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
-from .system_metrics import CPUMetrics
+from .monitor_system import MonitorCPU
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -174,10 +174,10 @@ def __init__(
         self._dvc_studio_config: Dict[str, Any] = {}
         self._init_studio()
 
-        self._cpu_metrics = None
+        self._monitor_cpu = None
         if monitor_system:
-            self._cpu_metrics = CPUMetrics()
-            self._cpu_metrics(self)
+            self._monitor_cpu = MonitorCPU()
+            self._monitor_cpu(self)
 
     def _init_resume(self):
         self._read_params()
@@ -398,15 +398,15 @@ def step(self, value: int) -> None:
         logger.debug(f"Step: {self.step}")
 
     @property
-    def cpu_metrics(self) -> int:
-        return self._cpu_metrics or None
+    def monitor_cpu(self) -> int:
+        return self._monitor_cpu or None
 
-    @cpu_metrics.setter
-    def cpu_metrics(self, cpu_metrics: CPUMetrics) -> None:
-        if self._cpu_metrics is not None:
-            self._cpu_metrics.end()
-        self._cpu_metrics = cpu_metrics
-        self._cpu_metrics(self)
+    @monitor_cpu.setter
+    def monitor_cpu(self, monitor_cpu: MonitorCPU) -> None:
+        if self._monitor_cpu is not None:
+            self._monitor_cpu.end()
+        self._monitor_cpu = monitor_cpu
+        self._monitor_cpu(self)
 
     def sync(self):
         self.make_summary()
@@ -897,8 +897,8 @@ def end(self):
             self.step = self.summary["step"]
 
         # Kill threads that monitor the system metrics
-        if self._cpu_metrics is not None:
-            self._cpu_metrics.end()
+        if self._monitor_cpu is not None:
+            self._monitor_cpu.end()
 
         self.sync()
 
diff --git a/src/dvclive/system_metrics.py b/src/dvclive/monitor_system.py
similarity index 98%
rename from src/dvclive/system_metrics.py
rename to src/dvclive/monitor_system.py
index 9bb4685e..9e4f9f54 100644
--- a/src/dvclive/system_metrics.py
+++ b/src/dvclive/monitor_system.py
@@ -14,7 +14,7 @@
 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20
 
 
-class _SystemMetrics:
+class _MonitorSystem:
     _plot_blacklist_prefix = ()
 
     def __init__(
@@ -69,7 +69,7 @@ def end(self):
         self._shutdown_event.set()
 
 
-class CPUMetrics(_SystemMetrics):
+class MonitorCPU(_MonitorSystem):
     _plot_blacklist_prefix = (
         "system/cpu/count",
         "system/ram/total (GB)",
diff --git a/tests/test_system_metrics.py b/tests/test_monitor_system.py
similarity index 92%
rename from tests/test_system_metrics.py
rename to tests/test_monitor_system.py
index 53ad9060..9653c2e1 100644
--- a/tests/test_system_metrics.py
+++ b/tests/test_monitor_system.py
@@ -2,16 +2,16 @@
 from pathlib import Path
 
 from dvclive import Live
-from dvclive.system_metrics import CPUMetrics
+from dvclive.monitor_system import MonitorCPU
 from dvclive.utils import parse_metrics
 
 
 def mock_psutil(mocker):
     mocker.patch(
-        "dvclive.system_metrics.psutil.cpu_percent",
+        "dvclive.monitor_system.psutil.cpu_percent",
         return_value=[10, 20, 30, 40, 50, 60],
     )
-    mocker.patch("dvclive.system_metrics.psutil.cpu_count", return_value=6)
+    mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6)
 
     mocked_virtual_memory = mocker.MagicMock()
     mocked_virtual_memory.percent = 20
@@ -28,7 +28,7 @@ def mock_psutil(mocker):
     }
     for function_name, return_value in mocking_dict.items():
         mocker.patch(
-            f"dvclive.system_metrics.psutil.{function_name}",
+            f"dvclive.monitor_system.psutil.{function_name}",
             return_value=return_value,
         )
 
@@ -40,7 +40,7 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        monitor = CPUMetrics(directories_to_monitor=["/", "/"])
+        monitor = MonitorCPU(directories_to_monitor=["/", "/"])
         monitor(live)
         metrics = monitor._get_metrics()
         monitor.end()
@@ -67,7 +67,7 @@ def test_ignore_missing_directories(tmp_dir, mocker):
         monitor_system=False,
     ) as live:
         missing_directories = "______"
-        monitor = CPUMetrics(directories_to_monitor=["/", missing_directories])
+        monitor = MonitorCPU(directories_to_monitor=["/", missing_directories])
         monitor(live)
         metrics = monitor._get_metrics()
         monitor.end()

From 6e29ff6193cd601bc921fec40df179b95e5f2c1c Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 15 Feb 2024 12:26:14 +0100
Subject: [PATCH 24/39] add docstrings

---
 src/dvclive/monitor_system.py | 47 +++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 9e4f9f54..26324dc5 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -7,6 +7,9 @@
 from threading import Event, Thread
 from funcy import merge_with
 
+from .error import InvalidDataTypeError
+
+
 logger = logging.getLogger("dvclive")
 MEGABYTES_DIVIDER = 1024.0**2
 GIGABYTES_DIVIDER = 1024.0**3
@@ -15,16 +18,29 @@
 
 
 class _MonitorSystem:
+    """
+    Monitor system resources and log them to DVC Live.
+    Use a separate thread to call a `_get_metrics` function at  fix interval and
+    aggregate the results of this sampling using the average.
+    """
+
     _plot_blacklist_prefix = ()
 
     def __init__(
         self,
         interval: float = 0.5,
-        nb_samples: int = 10,
+        num_samples: int = 10,
         plot: bool = True,
     ):
+        if not isinstance(interval, (int, float)):
+            raise InvalidDataTypeError("MonitorSystem.interval", type(interval))
+        if not isinstance(num_samples, int):
+            raise InvalidDataTypeError("MonitorSystem.sample", type(num_samples))
+        if not isinstance(plot, bool):
+            raise InvalidDataTypeError("MonitorSystem.plot", type(plot))
+
         self._interval = interval  # seconds
-        self._nb_samples = nb_samples
+        self._nb_samples = num_samples
         self._plot = plot
         self._warn_user = True
 
@@ -79,14 +95,35 @@ class MonitorCPU(_MonitorSystem):
     def __init__(
         self,
         interval: float = 0.5,
-        nb_samples: int = 10,
+        num_samples: int = 10,
         directories_to_monitor: Optional[List[str]] = None,
         plot: bool = True,
     ):
-        super().__init__(interval=interval, nb_samples=nb_samples, plot=plot)
-        self._directories_to_monitor = (
+        """Monitor CPU resources and log them to DVC Live.
+
+        Args:
+            interval (float): interval in seconds between two measurements.
+                Defaults to 0.5.
+            num_samples (int): number of samples to average. Defaults to 10.
+            directories_to_monitor (Optional[List[str]]): paths of the directories that
+            needs to be monitor for disk storage metrics. Defaults to the current
+            directory.
+            plot (bool): should the system metrics be saved as plots. Defaults to True.
+
+        Raises:
+            InvalidDataTypeError: if the arguments passed to the function don't have a
+                supported type.
+        """
+        super().__init__(interval=interval, num_samples=num_samples, plot=plot)
+        directories_to_monitor = (
             ["."] if directories_to_monitor is None else directories_to_monitor
         )
+        for idx, path in enumerate(directories_to_monitor):
+            if not isinstance(path, str):
+                raise InvalidDataTypeError(
+                    f"MonitorCPU.directories_to_monitor[{idx}]", type(path)
+                )
+        self._directories_to_monitor = directories_to_monitor
 
     def _get_metrics(self) -> Dict[str, Union[float, int]]:
         ram_info = psutil.virtual_memory()

From 719087be3d6256b1306581471772aa680c2dca9d Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 15 Feb 2024 12:41:50 +0100
Subject: [PATCH 25/39] mypy conflicts

---
 src/dvclive/live.py           | 2 +-
 src/dvclive/monitor_system.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 324414ac..05ff9943 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -398,7 +398,7 @@ def step(self, value: int) -> None:
         logger.debug(f"Step: {self.step}")
 
     @property
-    def monitor_cpu(self) -> int:
+    def monitor_cpu(self) -> Optional[MonitorCPU]:
         return self._monitor_cpu or None
 
     @monitor_cpu.setter
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 26324dc5..6820a1f2 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Union, Optional, List
+from typing import Dict, Union, Optional, List, Tuple
 from pathlib import Path
 
 import psutil
@@ -24,7 +24,7 @@ class _MonitorSystem:
     aggregate the results of this sampling using the average.
     """
 
-    _plot_blacklist_prefix = ()
+    _plot_blacklist_prefix: Tuple = ()
 
     def __init__(
         self,
@@ -78,7 +78,7 @@ def _monitoring_loop(self):
                     plot=None if blacklisted else self._plot,
                 )
 
-    def _get_metrics() -> Dict[str, Union[float, int]]:
+    def _get_metrics(self):
         pass
 
     def end(self):
@@ -86,7 +86,7 @@ def end(self):
 
 
 class MonitorCPU(_MonitorSystem):
-    _plot_blacklist_prefix = (
+    _plot_blacklist_prefix: Tuple = (
         "system/cpu/count",
         "system/ram/total (GB)",
         "system/disk/total (GB)",

From 32de58ca4a4ceb119fad7489dba4f37e41b363e7 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 15 Feb 2024 13:08:27 +0100
Subject: [PATCH 26/39] monitor GPU metrics

---
 pyproject.toml                |  3 +-
 src/dvclive/live.py           | 10 +++-
 src/dvclive/monitor_system.py | 59 +++++++++++++++++++++++
 tests/test_monitor_system.py  | 91 ++++++++++++++++++++++++++++++++++-
 4 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 62294cfa..e425e8eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,8 @@ dependencies = [
   "gto",
   "ruamel.yaml",
   "scmrepo",
-  "psutil"
+  "psutil",
+  "py3nvml"
 ]
 
 [project.optional-dependencies]
diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 05ff9943..cecf2e01 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -9,6 +9,7 @@
 import tempfile
 
 from pathlib import Path, PurePath
+from py3nvml import get_num_procs
 from typing import (
     Any,
     Dict,
@@ -51,7 +52,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
-from .monitor_system import MonitorCPU
+from .monitor_system import MonitorCPU, MonitorGPU
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -175,10 +176,15 @@ def __init__(
         self._init_studio()
 
         self._monitor_cpu = None
+        self._monitor_gpu = None
         if monitor_system:
             self._monitor_cpu = MonitorCPU()
             self._monitor_cpu(self)
 
+            if len(get_num_procs()):
+                self._monitor_gpu = MonitorGPU()
+                self._monitor_gpu(self)
+
     def _init_resume(self):
         self._read_params()
         self.summary = self.read_latest()
@@ -899,6 +905,8 @@ def end(self):
         # Kill threads that monitor the system metrics
         if self._monitor_cpu is not None:
             self._monitor_cpu.end()
+        if self._monitor_gpu is not None:
+            self._monitor_gpu.end()
 
         self.sync()
 
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 6820a1f2..4725066d 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -6,6 +6,14 @@
 from statistics import mean
 from threading import Event, Thread
 from funcy import merge_with
+from py3nvml.py3nvml import (
+    nvmlInit,
+    nvmlDeviceGetCount,
+    nvmlDeviceGetHandleByIndex,
+    nvmlDeviceGetMemoryInfo,
+    nvmlDeviceGetUtilizationRates,
+    nvmlShutdown,
+)
 
 from .error import InvalidDataTypeError
 
@@ -156,3 +164,54 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
                 disk_info.total / GIGABYTES_DIVIDER
             )
         return result
+
+
+class MonitorGPU(_MonitorSystem):
+    _plot_blacklist_prefix = ("system/gpu/count", "system/vram/total (GB)")
+
+    def __init__(
+        self,
+        interval: float = 0.5,
+        num_samples: int = 10,
+        plot: bool = True,
+    ):
+        """Monitor GPU resources and log them to DVC Live.
+
+        Args:
+            interval (float): interval in seconds between two measurements.
+                Defaults to 0.5.
+            num_samples (int): number of samples to average. Defaults to 10.
+            plot (bool): should the system metrics be saved as plots. Defaults to True.
+
+        Raises:
+            InvalidDataTypeError: if the arguments passed to the function don't have a
+                supported type.
+        """
+        super().__init__(interval=interval, num_samples=num_samples, plot=plot)
+
+    def _get_metrics(self) -> Dict[str, Union[float, int]]:
+        nvmlInit()
+        num_gpus = nvmlDeviceGetCount()
+        gpu_metrics = {
+            "system/gpu/count": num_gpus,
+        }
+
+        for gpu_idx in range(num_gpus):
+            gpu_handle = nvmlDeviceGetHandleByIndex(gpu_idx)
+            memory_info = nvmlDeviceGetMemoryInfo(gpu_handle)
+            usage_info = nvmlDeviceGetUtilizationRates(gpu_handle)
+
+            gpu_metrics[f"system/gpu/usage (%)/{gpu_idx}"] = (
+                100 * usage_info.memory / usage_info.gpu if usage_info.gpu else 0
+            )
+            gpu_metrics[f"system/vram/usage (%)/{gpu_idx}"] = (
+                100 * memory_info.used / memory_info.total
+            )
+            gpu_metrics[f"system/vram/usage (GB)/{gpu_idx}"] = (
+                memory_info.used / GIGABYTES_DIVIDER
+            )
+            gpu_metrics[f"system/vram/total (GB)/{gpu_idx}"] = (
+                memory_info.total / GIGABYTES_DIVIDER
+            )
+        nvmlShutdown()
+        return gpu_metrics
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 9653c2e1..d0fbffbf 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 
 from dvclive import Live
-from dvclive.monitor_system import MonitorCPU
+from dvclive.monitor_system import MonitorCPU, MonitorGPU
 from dvclive.utils import parse_metrics
 
 
@@ -124,3 +124,92 @@ def test_monitor_system(tmp_dir, mocker):
     assert all(
         str(Path("system/disk/total (GB)/0.tsv")) not in key for key in timeseries
     )
+
+
+def mock_py3nvml(mocker, nb_gpus=2, crash_index=None):
+    mocker.patch("dvclive.live.get_num_procs", return_value=[1 for _ in range(nb_gpus)])
+    mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=nb_gpus)
+
+    mocked_memory_info = mocker.MagicMock()
+    mocked_memory_info.used = 3 * 1024**3
+    mocked_memory_info.total = 5 * 1024**3
+
+    mocked_utilization_rate = mocker.MagicMock()
+    mocked_utilization_rate.memory = 5
+    mocked_utilization_rate.gpu = 10
+
+    mocking_dict = {
+        "nvmlDeviceGetHandleByIndex": None,
+        "nvmlDeviceGetMemoryInfo": mocked_memory_info,
+        "nvmlDeviceGetUtilizationRates": mocked_utilization_rate,
+    }
+
+    for function_name, return_value in mocking_dict.items():
+        mocker.patch(
+            f"dvclive.monitor_system.{function_name}",
+            return_value=return_value,
+        )
+
+
+def test_get_gpus_metrics_mocker(mocker, tmp_dir):
+    mock_py3nvml(mocker, nb_gpus=2)
+    with Live(
+        tmp_dir,
+        save_dvc_exp=False,
+        monitor_system=False,
+    ) as live:
+        monitor = MonitorGPU()
+        monitor(live)
+        metrics = monitor._get_metrics()
+        monitor.end()
+    assert "system/gpu/usage (%)/0" in metrics
+    assert "system/gpu/usage (%)/1" in metrics
+    assert "system/vram/usage (%)/0" in metrics
+    assert "system/vram/usage (%)/1" in metrics
+    assert "system/vram/usage (GB)/0" in metrics
+    assert "system/vram/usage (GB)/1" in metrics
+    assert "system/vram/total (GB)/0" in metrics
+    assert "system/vram/total (GB)/1" in metrics
+
+
+def test_monitor_gpu_system(tmp_dir, mocker):
+    mock_psutil(mocker)
+    mock_py3nvml(mocker, nb_gpus=1)
+    with Live(
+        tmp_dir,
+        save_dvc_exp=False,
+        monitor_system=True,
+    ) as live:
+        time.sleep(5 + 1)  # allow the thread to finish
+        live.next_step()
+        time.sleep(5 + 1)  # allow the thread to finish
+        timeseries, latest = parse_metrics(live)
+
+    # metrics.json records CPU and RAM metrics if GPU detected
+    assert "system" in latest
+    assert "cpu" in latest["system"]
+    assert "ram" in latest["system"]
+    assert "disk" in latest["system"]
+
+    # metrics.json file contains all the system metrics
+    assert "gpu" in latest["system"]
+    assert "count" in latest["system"]["gpu"]
+    assert "usage (%)" in latest["system"]["gpu"]
+    assert "0" in latest["system"]["gpu"]["usage (%)"]
+    assert "vram" in latest["system"]
+    assert "usage (%)" in latest["system"]["vram"]
+    assert "0" in latest["system"]["vram"]["usage (%)"]
+    assert "usage (GB)" in latest["system"]["vram"]
+    assert "0" in latest["system"]["vram"]["usage (GB)"]
+    assert "total (GB)" in latest["system"]["vram"]
+    assert "0" in latest["system"]["vram"]["total (GB)"]
+
+    # timeseries contains all the system metrics
+    assert any(str(Path("system/gpu/usage (%)/0.tsv")) in key for key in timeseries)
+    assert any(str(Path("system/vram/usage (%)/0.tsv")) in key for key in timeseries)
+    assert any(str(Path("system/vram/usage (GB)/0.tsv")) in key for key in timeseries)
+    assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key)
+
+    # blacklisted timeseries
+    assert all(str(Path("system/gpu/count.tsv")) not in key for key in timeseries)
+    assert all(str(Path("system/vram/total (GB).tsv")) not in key for key in timeseries)

From b9e9bac9999384b6246ad8b55f3fdaf0756b4068 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 15 Feb 2024 16:25:05 +0100
Subject: [PATCH 27/39] detect GPUs with nmvl import

---
 src/dvclive/live.py           |  7 +++----
 src/dvclive/monitor_system.py | 22 ++++++++++++++--------
 tests/test_monitor_system.py  | 10 +++++-----
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index cecf2e01..c0a85274 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -9,7 +9,6 @@
 import tempfile
 
 from pathlib import Path, PurePath
-from py3nvml import get_num_procs
 from typing import (
     Any,
     Dict,
@@ -22,6 +21,7 @@
     Literal,
 )
 
+
 if TYPE_CHECKING:
     import numpy as np
     import pandas as pd
@@ -52,7 +52,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
-from .monitor_system import MonitorCPU, MonitorGPU
+from .monitor_system import GPU_AVAILABLE, MonitorCPU, MonitorGPU
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -180,8 +180,7 @@ def __init__(
         if monitor_system:
             self._monitor_cpu = MonitorCPU()
             self._monitor_cpu(self)
-
-            if len(get_num_procs()):
+            if GPU_AVAILABLE:
                 self._monitor_gpu = MonitorGPU()
                 self._monitor_gpu(self)
 
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 4725066d..78710b40 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -6,18 +6,24 @@
 from statistics import mean
 from threading import Event, Thread
 from funcy import merge_with
-from py3nvml.py3nvml import (
-    nvmlInit,
-    nvmlDeviceGetCount,
-    nvmlDeviceGetHandleByIndex,
-    nvmlDeviceGetMemoryInfo,
-    nvmlDeviceGetUtilizationRates,
-    nvmlShutdown,
-)
 
 from .error import InvalidDataTypeError
 
 
+try:
+    from py3nvml.py3nvml import (
+        nvmlInit,
+        nvmlDeviceGetCount,
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlDeviceGetUtilizationRates,
+        nvmlShutdown,
+    )
+
+    GPU_AVAILABLE = True
+except ImportError:
+    GPU_AVAILABLE = False
+
 logger = logging.getLogger("dvclive")
 MEGABYTES_DIVIDER = 1024.0**2
 GIGABYTES_DIVIDER = 1024.0**3
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index d0fbffbf..73d0e155 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -126,9 +126,9 @@ def test_monitor_system(tmp_dir, mocker):
     )
 
 
-def mock_py3nvml(mocker, nb_gpus=2, crash_index=None):
-    mocker.patch("dvclive.live.get_num_procs", return_value=[1 for _ in range(nb_gpus)])
-    mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=nb_gpus)
+def mock_py3nvml(mocker, num_gpus=2, crash_index=None):
+    mocker.patch("dvclive.monitor_system.GPU_AVAILABLE", return_value=num_gpus)
+    mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=num_gpus)
 
     mocked_memory_info = mocker.MagicMock()
     mocked_memory_info.used = 3 * 1024**3
@@ -152,7 +152,7 @@ def mock_py3nvml(mocker, nb_gpus=2, crash_index=None):
 
 
 def test_get_gpus_metrics_mocker(mocker, tmp_dir):
-    mock_py3nvml(mocker, nb_gpus=2)
+    mock_py3nvml(mocker, num_gpus=2)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
@@ -174,7 +174,7 @@ def test_get_gpus_metrics_mocker(mocker, tmp_dir):
 
 def test_monitor_gpu_system(tmp_dir, mocker):
     mock_psutil(mocker)
-    mock_py3nvml(mocker, nb_gpus=1)
+    mock_py3nvml(mocker, num_gpus=1)
     with Live(
         tmp_dir,
         save_dvc_exp=False,

From b44881b0b794ef4b6ef134e179a2c8e0bb65f365 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 16 Feb 2024 09:56:01 +0100
Subject: [PATCH 28/39] replace py3nvml to pynvml and add setter to monitor_gpu

---
 pyproject.toml                |  2 +-
 src/dvclive/live.py           | 11 +++++++++++
 src/dvclive/monitor_system.py |  2 +-
 tests/test_monitor_system.py  |  8 +++++---
 4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e425e8eb..4366ab64 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
   "ruamel.yaml",
   "scmrepo",
   "psutil",
-  "py3nvml"
+  "pynvml"
 ]
 
 [project.optional-dependencies]
diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index c0a85274..e1813ddc 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -413,6 +413,17 @@ def monitor_cpu(self, monitor_cpu: MonitorCPU) -> None:
         self._monitor_cpu = monitor_cpu
         self._monitor_cpu(self)
 
+    @property
+    def monitor_gpu(self) -> Optional[MonitorGPU]:
+        return self._monitor_gpu or None
+
+    @monitor_gpu.setter
+    def monitor_gpu(self, monitor_gpu: MonitorGPU) -> None:
+        if self._monitor_gpu is not None:
+            self._monitor_gpu.end()
+        self._monitor_gpu = monitor_gpu
+        self._monitor_gpu(self)
+
     def sync(self):
         self.make_summary()
 
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 78710b40..33352008 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -11,7 +11,7 @@
 
 
 try:
-    from py3nvml.py3nvml import (
+    from pynvml import (
         nvmlInit,
         nvmlDeviceGetCount,
         nvmlDeviceGetHandleByIndex,
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 73d0e155..5165cde3 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -126,9 +126,11 @@ def test_monitor_system(tmp_dir, mocker):
     )
 
 
-def mock_py3nvml(mocker, num_gpus=2, crash_index=None):
+def mock_pynvml(mocker, num_gpus=2, crash_index=None):
     mocker.patch("dvclive.monitor_system.GPU_AVAILABLE", return_value=num_gpus)
     mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=num_gpus)
+    mocker.patch("dvclive.monitor_system.nvmlInit", return_value=None)
+    mocker.patch("dvclive.monitor_system.nvmlShutdown", return_value=None)
 
     mocked_memory_info = mocker.MagicMock()
     mocked_memory_info.used = 3 * 1024**3
@@ -152,7 +154,7 @@ def mock_py3nvml(mocker, num_gpus=2, crash_index=None):
 
 
 def test_get_gpus_metrics_mocker(mocker, tmp_dir):
-    mock_py3nvml(mocker, num_gpus=2)
+    mock_pynvml(mocker, num_gpus=2)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
@@ -174,7 +176,7 @@ def test_get_gpus_metrics_mocker(mocker, tmp_dir):
 
 def test_monitor_gpu_system(tmp_dir, mocker):
     mock_psutil(mocker)
-    mock_py3nvml(mocker, num_gpus=1)
+    mock_pynvml(mocker, num_gpus=1)
     with Live(
         tmp_dir,
         save_dvc_exp=False,

From 8654aee32e2b4cf01ed1c60c761368808d24e870 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 16 Feb 2024 11:46:30 +0100
Subject: [PATCH 29/39] change error types and `monitor_cpu` to `cpu_monitor`

---
 src/dvclive/live.py           | 40 ++++++++------------
 src/dvclive/monitor_system.py | 69 ++++++++++++++++++++---------------
 tests/test_monitor_system.py  | 39 +++++++++-----------
 3 files changed, 73 insertions(+), 75 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 05ff9943..b71f3b6c 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -9,17 +9,7 @@
 import tempfile
 
 from pathlib import Path, PurePath
-from typing import (
-    Any,
-    Dict,
-    List,
-    Optional,
-    Set,
-    Tuple,
-    Union,
-    TYPE_CHECKING,
-    Literal,
-)
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING, Literal
 
 if TYPE_CHECKING:
     import numpy as np
@@ -51,7 +41,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
-from .monitor_system import MonitorCPU
+from .monitor_system import CPUMonitor
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -174,10 +164,10 @@ def __init__(
         self._dvc_studio_config: Dict[str, Any] = {}
         self._init_studio()
 
-        self._monitor_cpu = None
+        self._cpu_monitor = None
         if monitor_system:
-            self._monitor_cpu = MonitorCPU()
-            self._monitor_cpu(self)
+            self._cpu_monitor = CPUMonitor()
+            self._cpu_monitor(self)
 
     def _init_resume(self):
         self._read_params()
@@ -398,15 +388,15 @@ def step(self, value: int) -> None:
         logger.debug(f"Step: {self.step}")
 
     @property
-    def monitor_cpu(self) -> Optional[MonitorCPU]:
-        return self._monitor_cpu or None
+    def cpu_monitor(self) -> Optional[CPUMonitor]:
+        return self._cpu_monitor or None
 
-    @monitor_cpu.setter
-    def monitor_cpu(self, monitor_cpu: MonitorCPU) -> None:
-        if self._monitor_cpu is not None:
-            self._monitor_cpu.end()
-        self._monitor_cpu = monitor_cpu
-        self._monitor_cpu(self)
+    @cpu_monitor.setter
+    def cpu_monitor(self, cpu_monitor: CPUMonitor) -> None:
+        if self._cpu_monitor is not None:
+            self._cpu_monitor.end()
+        self._cpu_monitor = cpu_monitor
+        self._cpu_monitor(self)
 
     def sync(self):
         self.make_summary()
@@ -897,8 +887,8 @@ def end(self):
             self.step = self.summary["step"]
 
         # Kill threads that monitor the system metrics
-        if self._monitor_cpu is not None:
-            self._monitor_cpu.end()
+        if self._cpu_monitor is not None:
+            self._cpu_monitor.end()
 
         self.sync()
 
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 6820a1f2..67a1fc29 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -1,3 +1,4 @@
+import abc
 import logging
 from typing import Dict, Union, Optional, List, Tuple
 from pathlib import Path
@@ -7,8 +8,6 @@
 from threading import Event, Thread
 from funcy import merge_with
 
-from .error import InvalidDataTypeError
-
 
 logger = logging.getLogger("dvclive")
 MEGABYTES_DIVIDER = 1024.0**2
@@ -17,10 +16,10 @@
 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20
 
 
-class _MonitorSystem:
+class _SystemMonitor(abc.ABC):
     """
     Monitor system resources and log them to DVC Live.
-    Use a separate thread to call a `_get_metrics` function at  fix interval and
+    Use a separate thread to call a `_get_metrics` function at fix interval and
     aggregate the results of this sampling using the average.
     """
 
@@ -33,11 +32,19 @@ def __init__(
         plot: bool = True,
     ):
         if not isinstance(interval, (int, float)):
-            raise InvalidDataTypeError("MonitorSystem.interval", type(interval))
+            raise TypeError(  # noqa: TRY003
+                "System monitoring `interval` should be an `int` or a `float`, but got "
+                f"{type(interval)}"
+            )
         if not isinstance(num_samples, int):
-            raise InvalidDataTypeError("MonitorSystem.sample", type(num_samples))
+            raise TypeError(  # noqa: TRY003
+                "System monitoring `num_samples` should be an `int`, but got "
+                f"{type(num_samples)}"
+            )
         if not isinstance(plot, bool):
-            raise InvalidDataTypeError("MonitorSystem.plot", type(plot))
+            raise TypeError(  # noqa: TRY003
+                f"System monitoring `plot` should be a `bool`, but got {type(plot)}"
+            )
 
         self._interval = interval  # seconds
         self._nb_samples = num_samples
@@ -78,14 +85,15 @@ def _monitoring_loop(self):
                     plot=None if blacklisted else self._plot,
                 )
 
-    def _get_metrics(self):
+    @abc.abstractmethod
+    def _get_metrics(self) -> Dict[str, Union[float, int]]:
         pass
 
     def end(self):
         self._shutdown_event.set()
 
 
-class MonitorCPU(_MonitorSystem):
+class CPUMonitor(_SystemMonitor):
     _plot_blacklist_prefix: Tuple = (
         "system/cpu/count",
         "system/ram/total (GB)",
@@ -96,7 +104,7 @@ def __init__(
         self,
         interval: float = 0.5,
         num_samples: int = 10,
-        directories_to_monitor: Optional[List[str]] = None,
+        disks_to_monitor: Optional[List[str]] = None,
         plot: bool = True,
     ):
         """Monitor CPU resources and log them to DVC Live.
@@ -105,25 +113,23 @@ def __init__(
             interval (float): interval in seconds between two measurements.
                 Defaults to 0.5.
             num_samples (int): number of samples to average. Defaults to 10.
-            directories_to_monitor (Optional[List[str]]): paths of the directories that
-            needs to be monitor for disk storage metrics. Defaults to the current
-            directory.
+            disks_to_monitor (Optional[List[str]]): paths to the disks or partitions to
+                monitor disk usage statistics. Defaults to "/".
             plot (bool): should the system metrics be saved as plots. Defaults to True.
 
         Raises:
-            InvalidDataTypeError: if the arguments passed to the function don't have a
+            TypeError: if the arguments passed to the function don't have a
                 supported type.
         """
         super().__init__(interval=interval, num_samples=num_samples, plot=plot)
-        directories_to_monitor = (
-            ["."] if directories_to_monitor is None else directories_to_monitor
-        )
-        for idx, path in enumerate(directories_to_monitor):
+        disks_to_monitor = ["/"] if disks_to_monitor is None else disks_to_monitor
+        for idx, path in enumerate(disks_to_monitor):
             if not isinstance(path, str):
-                raise InvalidDataTypeError(
-                    f"MonitorCPU.directories_to_monitor[{idx}]", type(path)
+                raise TypeError(  # noqa: TRY003
+                    "CPU monitoring `partitions_to_monitor` should be a `List[str]`, "
+                    f"but got {type(path)} at position {idx}"
                 )
-        self._directories_to_monitor = directories_to_monitor
+        self._disks_to_monitor = disks_to_monitor
 
     def _get_metrics(self) -> Dict[str, Union[float, int]]:
         ram_info = psutil.virtual_memory()
@@ -146,13 +152,18 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             * (ram_info.total / GIGABYTES_DIVIDER),
             "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER,
         }
-        for idx, directory in enumerate(self._directories_to_monitor):
-            if not Path(directory).exists():
+        for disk_name in self._disks_to_monitor:
+            if not Path(disk_name).exists():
                 continue
-            disk_info = psutil.disk_usage(directory)
-            result[f"system/disk/usage (%)/{idx}"] = disk_info.percent
-            result[f"system/disk/usage (GB)/{idx}"] = disk_info.used / GIGABYTES_DIVIDER
-            result[f"system/disk/total (GB)/{idx}"] = (
-                disk_info.total / GIGABYTES_DIVIDER
-            )
+            disk_info = psutil.disk_usage(disk_name)
+            disk_path = Path(disk_name).as_posix().lstrip("/")
+            disk_metrics = {
+                f"system/disk/usage (%)/{disk_path}": disk_info.percent,
+                f"system/disk/usage (GB)/{disk_path}": disk_info.used
+                / GIGABYTES_DIVIDER,
+                f"system/disk/total (GB)/{disk_path}": disk_info.total
+                / GIGABYTES_DIVIDER,
+            }
+            disk_metrics = {k.rstrip("/"): v for k, v in disk_metrics.items()}
+            result.update(disk_metrics)
         return result
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 9653c2e1..1ddee6ec 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 
 from dvclive import Live
-from dvclive.monitor_system import MonitorCPU
+from dvclive.monitor_system import CPUMonitor
 from dvclive.utils import parse_metrics
 
 
@@ -40,7 +40,7 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        monitor = MonitorCPU(directories_to_monitor=["/", "/"])
+        monitor = CPUMonitor(disks_to_monitor=["/", "/home"])
         monitor(live)
         metrics = monitor._get_metrics()
         monitor.end()
@@ -51,32 +51,32 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker):
     assert "system/ram/usage (%)" in metrics
     assert "system/ram/usage (GB)" in metrics
     assert "system/ram/total (GB)" in metrics
-    assert "system/disk/usage (%)/0" in metrics
-    assert "system/disk/usage (%)/1" in metrics
-    assert "system/disk/usage (GB)/0" in metrics
-    assert "system/disk/usage (GB)/1" in metrics
-    assert "system/disk/total (GB)/0" in metrics
-    assert "system/disk/total (GB)/1" in metrics
+    assert "system/disk/usage (%)" in metrics
+    assert "system/disk/usage (%)/home" in metrics
+    assert "system/disk/usage (GB)" in metrics
+    assert "system/disk/usage (GB)/home" in metrics
+    assert "system/disk/total (GB)" in metrics
+    assert "system/disk/total (GB)/home" in metrics
 
 
-def test_ignore_missing_directories(tmp_dir, mocker):
+def test_ignore_non_existent_directories(tmp_dir, mocker):
     mock_psutil(mocker)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        missing_directories = "______"
-        monitor = MonitorCPU(directories_to_monitor=["/", missing_directories])
+        non_existent_directories = "/non-existent"
+        monitor = CPUMonitor(disks_to_monitor=["/", non_existent_directories])
         monitor(live)
         metrics = monitor._get_metrics()
         monitor.end()
 
-    assert not Path(missing_directories).exists()
+    assert not Path(non_existent_directories).exists()
 
-    assert "system/disk/usage (%)/1" not in metrics
-    assert "system/disk/usage (GB)/1" not in metrics
-    assert "system/disk/total (GB)/1" not in metrics
+    assert "system/disk/usage (%)/non-existent" not in metrics
+    assert "system/disk/usage (GB)/non-existent" not in metrics
+    assert "system/disk/total (GB)/non-existent" not in metrics
 
 
 def test_monitor_system(tmp_dir, mocker):
@@ -103,7 +103,6 @@ def test_monitor_system(tmp_dir, mocker):
     assert "total (GB)" in latest["system"]["ram"]
     assert "disk" in latest["system"]
     assert "usage (%)" in latest["system"]["disk"]
-    assert "0" in latest["system"]["disk"]["usage (%)"]
     assert "usage (GB)" in latest["system"]["disk"]
     assert "total (GB)" in latest["system"]["disk"]
 
@@ -114,13 +113,11 @@ def test_monitor_system(tmp_dir, mocker):
     )
     assert any(str(Path("system/ram/usage (%).tsv")) in key for key in timeseries)
     assert any(str(Path("system/ram/usage (GB).tsv")) in key for key in timeseries)
-    assert any(str(Path("system/disk/usage (%)/0.tsv")) in key for key in timeseries)
-    assert any(str(Path("system/disk/usage (GB)/0.tsv")) in key for key in timeseries)
+    assert any(str(Path("system/disk/usage (%).tsv")) in key for key in timeseries)
+    assert any(str(Path("system/disk/usage (GB).tsv")) in key for key in timeseries)
     assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key)
 
     # blacklisted timeseries
     assert all(str(Path("system/cpu/count.tsv")) not in key for key in timeseries)
     assert all(str(Path("system/ram/total (GB).tsv")) not in key for key in timeseries)
-    assert all(
-        str(Path("system/disk/total (GB)/0.tsv")) not in key for key in timeseries
-    )
+    assert all(str(Path("system/disk/total (GB).tsv")) not in key for key in timeseries)

From f0d6234e157bd8fa64ad60b282c0e3ef7eb2383a Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 16 Feb 2024 15:18:12 +0100
Subject: [PATCH 30/39] add unit tests about _num_points_sent_to_studio
 behavior

---
 pyproject.toml                |   6 +-
 src/dvclive/monitor_system.py |  52 ++++++----
 tests/test_monitor_system.py  | 172 ++++++++++++++++++++++++----------
 3 files changed, 162 insertions(+), 68 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 62294cfa..68fd9dce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,12 +52,14 @@ tests = [
   "pytest-cov>=3.0.0,<4.0",
   "pytest-mock>=3.8.2,<4.0",
   "dvclive[image,plots,markdown]",
-  "ipython"
+  "ipython",
+  "pytest_voluptuous"
 ]
 dev = [
   "dvclive[all,tests]",
   "mypy>=1.1.1",
-  "types-PyYAML"
+  "types-PyYAML",
+  "voluptuous"
 ]
 mmcv = ["mmcv"]
 tf = ["tensorflow"]
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 67a1fc29..bb9fbfa0 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -1,6 +1,7 @@
 import abc
 import logging
-from typing import Dict, Union, Optional, List, Tuple
+import os
+from typing import Dict, Union, Optional, Tuple
 from pathlib import Path
 
 import psutil
@@ -34,16 +35,16 @@ def __init__(
         if not isinstance(interval, (int, float)):
             raise TypeError(  # noqa: TRY003
                 "System monitoring `interval` should be an `int` or a `float`, but got "
-                f"{type(interval)}"
+                f"'{type(interval)}'"
             )
         if not isinstance(num_samples, int):
             raise TypeError(  # noqa: TRY003
                 "System monitoring `num_samples` should be an `int`, but got "
-                f"{type(num_samples)}"
+                f"'{type(num_samples)}'"
             )
         if not isinstance(plot, bool):
             raise TypeError(  # noqa: TRY003
-                f"System monitoring `plot` should be a `bool`, but got {type(plot)}"
+                f"System monitoring `plot` should be a `bool`, but got '{type(plot)}'"
             )
 
         self._interval = interval  # seconds
@@ -104,7 +105,7 @@ def __init__(
         self,
         interval: float = 0.5,
         num_samples: int = 10,
-        disks_to_monitor: Optional[List[str]] = None,
+        disks_to_monitor: Optional[Dict[str, str]] = None,
         plot: bool = True,
     ):
         """Monitor CPU resources and log them to DVC Live.
@@ -113,8 +114,10 @@ def __init__(
             interval (float): interval in seconds between two measurements.
                 Defaults to 0.5.
             num_samples (int): number of samples to average. Defaults to 10.
-            disks_to_monitor (Optional[List[str]]): paths to the disks or partitions to
-                monitor disk usage statistics. Defaults to "/".
+            disks_to_monitor (Optional[Dict[str, str]]): paths to the disks or
+                partitions to monitor disk usage statistics. The key is the name that
+                will be displayed in the metric name. The value is the path to the disk
+                or partition. Defaults to {"main": "/"}.
             plot (bool): should the system metrics be saved as plots. Defaults to True.
 
         Raises:
@@ -122,12 +125,24 @@ def __init__(
                 supported type.
         """
         super().__init__(interval=interval, num_samples=num_samples, plot=plot)
-        disks_to_monitor = ["/"] if disks_to_monitor is None else disks_to_monitor
-        for idx, path in enumerate(disks_to_monitor):
-            if not isinstance(path, str):
+        disks_to_monitor = (
+            {"main": "/"} if disks_to_monitor is None else disks_to_monitor
+        )
+        for disk_name, disk_path in disks_to_monitor.items():
+            if not isinstance(disk_name, str):
                 raise TypeError(  # noqa: TRY003
-                    "CPU monitoring `partitions_to_monitor` should be a `List[str]`, "
-                    f"but got {type(path)} at position {idx}"
+                    "Keys for `partitions_to_monitor` should be a `str`"
+                    f", but got '{type(disk_name)}'"
+                )
+            if not isinstance(disk_path, str):
+                raise TypeError(  # noqa: TRY003
+                    "Value for `partitions_to_monitor` should be a `str`"
+                    f", but got '{type(disk_path)}'"
+                )
+            if disk_name != os.path.normpath(disk_name):
+                raise ValueError(  # noqa: TRY003
+                    "Keys for `partitions_to_monitor` should be a valid name"
+                    f", but got '{disk_name}'"
                 )
         self._disks_to_monitor = disks_to_monitor
 
@@ -152,16 +167,15 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             * (ram_info.total / GIGABYTES_DIVIDER),
             "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER,
         }
-        for disk_name in self._disks_to_monitor:
-            if not Path(disk_name).exists():
+        for disk_name, disk_path in self._disks_to_monitor.items():
+            if not Path(disk_path).exists():
                 continue
-            disk_info = psutil.disk_usage(disk_name)
-            disk_path = Path(disk_name).as_posix().lstrip("/")
+            disk_info = psutil.disk_usage(disk_path)
             disk_metrics = {
-                f"system/disk/usage (%)/{disk_path}": disk_info.percent,
-                f"system/disk/usage (GB)/{disk_path}": disk_info.used
+                f"system/disk/usage (%)/{disk_name}": disk_info.percent,
+                f"system/disk/usage (GB)/{disk_name}": disk_info.used
                 / GIGABYTES_DIVIDER,
-                f"system/disk/total (GB)/{disk_path}": disk_info.total
+                f"system/disk/total (GB)/{disk_name}": disk_info.total
                 / GIGABYTES_DIVIDER,
             }
             disk_metrics = {k.rstrip("/"): v for k, v in disk_metrics.items()}
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 1ddee6ec..798471e7 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -5,6 +5,9 @@
 from dvclive.monitor_system import CPUMonitor
 from dvclive.utils import parse_metrics
 
+from voluptuous import Union
+from pytest_voluptuous import S
+
 
 def mock_psutil(mocker):
     mocker.patch(
@@ -40,23 +43,53 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        monitor = CPUMonitor(disks_to_monitor=["/", "/home"])
+        monitor = CPUMonitor(disks_to_monitor={"main": "/", "home": "/"})
         monitor(live)
         metrics = monitor._get_metrics()
         monitor.end()
 
-    assert "system/cpu/usage (%)" in metrics
-    assert "system/cpu/count" in metrics
-    assert "system/cpu/parallelization (%)" in metrics
-    assert "system/ram/usage (%)" in metrics
-    assert "system/ram/usage (GB)" in metrics
-    assert "system/ram/total (GB)" in metrics
-    assert "system/disk/usage (%)" in metrics
-    assert "system/disk/usage (%)/home" in metrics
-    assert "system/disk/usage (GB)" in metrics
-    assert "system/disk/usage (GB)/home" in metrics
-    assert "system/disk/total (GB)" in metrics
-    assert "system/disk/total (GB)/home" in metrics
+    assert metrics == S(
+        {
+            "system/cpu/usage (%)": Union(int, float),
+            "system/cpu/count": int,
+            "system/cpu/parallelization (%)": Union(int, float),
+            "system/ram/usage (%)": Union(int, float),
+            "system/ram/usage (GB)": Union(int, float),
+            "system/ram/total (GB)": Union(int, float),
+            "system/disk/usage (%)/main": Union(int, float),
+            "system/disk/usage (%)/home": Union(int, float),
+            "system/disk/usage (GB)/main": Union(int, float),
+            "system/disk/usage (GB)/home": Union(int, float),
+            "system/disk/total (GB)/main": Union(int, float),
+            "system/disk/total (GB)/home": Union(int, float),
+        }
+    )
+
+
+def test_monitor_system_is_false(tmp_dir, mocker):
+    mock_psutil(mocker)
+    with Live(
+        tmp_dir,
+        save_dvc_exp=False,
+        monitor_system=False,
+    ) as live:
+        assert live._cpu_monitor is None
+
+
+def test_monitor_system_is_true(tmp_dir, mocker):
+    mock_psutil(mocker)
+    with Live(
+        tmp_dir,
+        save_dvc_exp=False,
+        monitor_system=True,
+    ) as live:
+        cpu_monitor = live._cpu_monitor
+        assert isinstance(live._cpu_monitor, CPUMonitor)
+        # start is called but end not called
+        assert cpu_monitor._shutdown_event._flag is False
+
+    # end was called
+    assert cpu_monitor._shutdown_event._flag is True
 
 
 def test_ignore_non_existent_directories(tmp_dir, mocker):
@@ -66,58 +99,103 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        non_existent_directories = "/non-existent"
-        monitor = CPUMonitor(disks_to_monitor=["/", non_existent_directories])
+        non_existent_disk = "/non-existent"
+        monitor = CPUMonitor(
+            disks_to_monitor={"main": "/", "non-existent": non_existent_disk}
+        )
         monitor(live)
         metrics = monitor._get_metrics()
         monitor.end()
 
-    assert not Path(non_existent_directories).exists()
+    assert not Path(non_existent_disk).exists()
 
     assert "system/disk/usage (%)/non-existent" not in metrics
     assert "system/disk/usage (GB)/non-existent" not in metrics
     assert "system/disk/total (GB)/non-existent" not in metrics
 
 
-def test_monitor_system(tmp_dir, mocker):
+def test_monitor_system_metrics(tmp_dir, mocker):
     mock_psutil(mocker)
+    interval = 0.05
+    num_samples = 4
     with Live(
         tmp_dir,
         save_dvc_exp=False,
-        monitor_system=True,
+        monitor_system=False,
     ) as live:
-        time.sleep(5 + 1)  # allow the thread to finish
+        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+        time.sleep(interval * num_samples + interval)  # log metrics once
         live.next_step()
-        time.sleep(5 + 1)  # allow the thread to finish
-        timeseries, latest = parse_metrics(live)
+        time.sleep(interval * num_samples + interval)  # log metrics twice
+        live.make_summary()
+
+        _, latest = parse_metrics(live)
 
     # metrics.json file contains all the system metrics
-    assert "system" in latest
-    assert "cpu" in latest["system"]
-    assert "usage (%)" in latest["system"]["cpu"]
-    assert "count" in latest["system"]["cpu"]
-    assert "parallelization (%)" in latest["system"]["cpu"]
-    assert "ram" in latest["system"]
-    assert "usage (%)" in latest["system"]["ram"]
-    assert "usage (GB)" in latest["system"]["ram"]
-    assert "total (GB)" in latest["system"]["ram"]
-    assert "disk" in latest["system"]
-    assert "usage (%)" in latest["system"]["disk"]
-    assert "usage (GB)" in latest["system"]["disk"]
-    assert "total (GB)" in latest["system"]["disk"]
+    assert latest == S(
+        {
+            "step": Union(int, float),
+            "system": {
+                "cpu": {
+                    "usage (%)": Union(int, float),
+                    "count": Union(int, float),
+                    "parallelization (%)": Union(int, float),
+                },
+                "ram": {
+                    "usage (%)": Union(int, float),
+                    "usage (GB)": Union(int, float),
+                    "total (GB)": Union(int, float),
+                },
+                "disk": {
+                    "usage (%)": {"main": Union(int, float)},
+                    "usage (GB)": {"main": Union(int, float)},
+                    "total (GB)": {"main": Union(int, float)},
+                },
+            },
+        }
+    )
+
+
+def test_monitor_system_timeseries(tmp_dir, mocker):
+    mock_psutil(mocker)
+    interval = 0.05
+    num_samples = 4
+    with Live(
+        tmp_dir,
+        save_dvc_exp=False,
+        monitor_system=False,
+    ) as live:
+        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+        time.sleep(interval * num_samples + interval)  # log metrics once
+        live.next_step()
+        time.sleep(interval * num_samples + interval)  # log metrics twice
+        live.make_summary()
+
+        timeseries, _ = parse_metrics(live)
+
+    def timeserie_metric_schema(name):
+        return [{name: str, "timestamp": str, "step": str} for _ in range(2)]
 
     # timeseries contains all the system metrics
-    assert any(str(Path("system/cpu/usage (%).tsv")) in key for key in timeseries)
-    assert any(
-        str(Path("system/cpu/parallelization (%).tsv")) in key for key in timeseries
+    assert timeseries == S(
+        {
+            str(
+                Path(tmp_dir) / "plots/metrics" / "system/cpu/usage (%).tsv"
+            ): timeserie_metric_schema("usage (%)"),
+            str(
+                Path(tmp_dir) / "plots/metrics" / "system/cpu/parallelization (%).tsv"
+            ): timeserie_metric_schema("parallelization (%)"),
+            str(
+                Path(tmp_dir) / "plots/metrics" / "system/ram/usage (%).tsv"
+            ): timeserie_metric_schema("usage (%)"),
+            str(
+                Path(tmp_dir) / "plots/metrics" / "system/ram/usage (GB).tsv"
+            ): timeserie_metric_schema("usage (GB)"),
+            str(
+                Path(tmp_dir) / "plots/metrics" / "system/disk/usage (%)/main.tsv"
+            ): timeserie_metric_schema("main"),
+            str(
+                Path(tmp_dir) / "plots/metrics" / "system/disk/usage (GB)/main.tsv"
+            ): timeserie_metric_schema("main"),
+        }
     )
-    assert any(str(Path("system/ram/usage (%).tsv")) in key for key in timeseries)
-    assert any(str(Path("system/ram/usage (GB).tsv")) in key for key in timeseries)
-    assert any(str(Path("system/disk/usage (%).tsv")) in key for key in timeseries)
-    assert any(str(Path("system/disk/usage (GB).tsv")) in key for key in timeseries)
-    assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key)
-
-    # blacklisted timeseries
-    assert all(str(Path("system/cpu/count.tsv")) not in key for key in timeseries)
-    assert all(str(Path("system/ram/total (GB).tsv")) not in key for key in timeseries)
-    assert all(str(Path("system/disk/total (GB).tsv")) not in key for key in timeseries)

From d2c1c84fdc15acdfbfac92b032caba8e3960b014 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Fri, 16 Feb 2024 17:08:15 +0100
Subject: [PATCH 31/39] use constant values for metrics names

---
 pyproject.toml                |   1 +
 src/dvclive/monitor_system.py |  34 +++++++---
 tests/test_monitor_system.py  | 124 ++++++++++++++++++----------------
 3 files changed, 88 insertions(+), 71 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 68fd9dce..c622102d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
   "dvc>=3.33.3",
   "dvc-render>=1.0.0,<2",
   "dvc-studio-client>=0.17.1,<1",
+  "dpath",
   "funcy",
   "gto",
   "ruamel.yaml",
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index bb9fbfa0..43bd8786 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -16,6 +16,18 @@
 
 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20
 
+METRIC_CPU_COUNT = "system/cpu/count"
+METRIC_CPU_USAGE_PERCENT = "system/cpu/usage (%)"
+METRIC_CPU_PARALLELIZATION_PERCENT = "system/cpu/parallelization (%)"
+
+METRIC_RAM_USAGE_PERCENT = "system/ram/usage (%)"
+METRIC_RAM_USAGE_GB = "system/ram/usage (GB)"
+METRIC_RAM_TOTAL_GB = "system/ram/total (GB)"
+
+METRIC_DISK_USAGE_PERCENT = "system/disk/usage (%)"
+METRIC_DISK_USAGE_GB = "system/disk/usage (GB)"
+METRIC_DISK_TOTAL_GB = "system/disk/total (GB)"
+
 
 class _SystemMonitor(abc.ABC):
     """
@@ -96,8 +108,8 @@ def end(self):
 
 class CPUMonitor(_SystemMonitor):
     _plot_blacklist_prefix: Tuple = (
-        "system/cpu/count",
-        "system/ram/total (GB)",
+        METRIC_CPU_COUNT,
+        METRIC_RAM_TOTAL_GB,
         "system/disk/total (GB)",
     )
 
@@ -151,9 +163,9 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
         nb_cpus = psutil.cpu_count()
         cpus_percent = psutil.cpu_percent(percpu=True)
         result = {
-            "system/cpu/usage (%)": mean(cpus_percent),
-            "system/cpu/count": nb_cpus,
-            "system/cpu/parallelization (%)": len(
+            METRIC_CPU_COUNT: nb_cpus,
+            METRIC_CPU_USAGE_PERCENT: mean(cpus_percent),
+            METRIC_CPU_PARALLELIZATION_PERCENT: len(
                 [
                     percent
                     for percent in cpus_percent
@@ -162,20 +174,20 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             )
             * 100
             / nb_cpus,
-            "system/ram/usage (%)": ram_info.percent,
-            "system/ram/usage (GB)": (ram_info.percent / 100)
+            METRIC_RAM_USAGE_PERCENT: ram_info.percent,
+            METRIC_RAM_USAGE_GB: (ram_info.percent / 100)
             * (ram_info.total / GIGABYTES_DIVIDER),
-            "system/ram/total (GB)": ram_info.total / GIGABYTES_DIVIDER,
+            METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER,
         }
         for disk_name, disk_path in self._disks_to_monitor.items():
             if not Path(disk_path).exists():
                 continue
             disk_info = psutil.disk_usage(disk_path)
             disk_metrics = {
-                f"system/disk/usage (%)/{disk_name}": disk_info.percent,
-                f"system/disk/usage (GB)/{disk_name}": disk_info.used
+                f"{METRIC_DISK_USAGE_PERCENT}/{disk_name}": disk_info.percent,
+                f"{METRIC_DISK_USAGE_GB}/{disk_name}": disk_info.used
                 / GIGABYTES_DIVIDER,
-                f"system/disk/total (GB)/{disk_name}": disk_info.total
+                f"{METRIC_DISK_TOTAL_GB}/{disk_name}": disk_info.total
                 / GIGABYTES_DIVIDER,
             }
             disk_metrics = {k.rstrip("/"): v for k, v in disk_metrics.items()}
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 798471e7..832c576f 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -1,8 +1,20 @@
 import time
 from pathlib import Path
+import dpath
 
 from dvclive import Live
-from dvclive.monitor_system import CPUMonitor
+from dvclive.monitor_system import (
+    CPUMonitor,
+    METRIC_CPU_COUNT,
+    METRIC_CPU_USAGE_PERCENT,
+    METRIC_CPU_PARALLELIZATION_PERCENT,
+    METRIC_RAM_USAGE_PERCENT,
+    METRIC_RAM_USAGE_GB,
+    METRIC_RAM_TOTAL_GB,
+    METRIC_DISK_USAGE_PERCENT,
+    METRIC_DISK_USAGE_GB,
+    METRIC_DISK_TOTAL_GB,
+)
 from dvclive.utils import parse_metrics
 
 from voluptuous import Union
@@ -50,18 +62,18 @@ def test_get_cpus_metrics_mocker(tmp_dir, mocker):
 
     assert metrics == S(
         {
-            "system/cpu/usage (%)": Union(int, float),
-            "system/cpu/count": int,
-            "system/cpu/parallelization (%)": Union(int, float),
-            "system/ram/usage (%)": Union(int, float),
-            "system/ram/usage (GB)": Union(int, float),
-            "system/ram/total (GB)": Union(int, float),
-            "system/disk/usage (%)/main": Union(int, float),
-            "system/disk/usage (%)/home": Union(int, float),
-            "system/disk/usage (GB)/main": Union(int, float),
-            "system/disk/usage (GB)/home": Union(int, float),
-            "system/disk/total (GB)/main": Union(int, float),
-            "system/disk/total (GB)/home": Union(int, float),
+            METRIC_CPU_COUNT: int,
+            METRIC_CPU_USAGE_PERCENT: Union(int, float),
+            METRIC_CPU_PARALLELIZATION_PERCENT: Union(int, float),
+            METRIC_RAM_USAGE_PERCENT: Union(int, float),
+            METRIC_RAM_USAGE_GB: Union(int, float),
+            METRIC_RAM_TOTAL_GB: Union(int, float),
+            f"{METRIC_DISK_USAGE_PERCENT}/main": Union(int, float),
+            f"{METRIC_DISK_USAGE_PERCENT}/home": Union(int, float),
+            f"{METRIC_DISK_USAGE_GB}/main": Union(int, float),
+            f"{METRIC_DISK_USAGE_GB}/home": Union(int, float),
+            f"{METRIC_DISK_TOTAL_GB}/main": Union(int, float),
+            f"{METRIC_DISK_TOTAL_GB}/home": Union(int, float),
         }
     )
 
@@ -109,9 +121,9 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
 
     assert not Path(non_existent_disk).exists()
 
-    assert "system/disk/usage (%)/non-existent" not in metrics
-    assert "system/disk/usage (GB)/non-existent" not in metrics
-    assert "system/disk/total (GB)/non-existent" not in metrics
+    assert f"{METRIC_DISK_USAGE_PERCENT}/non-existent" not in metrics
+    assert f"{METRIC_DISK_USAGE_GB}/non-existent" not in metrics
+    assert f"{METRIC_DISK_TOTAL_GB}/non-existent" not in metrics
 
 
 def test_monitor_system_metrics(tmp_dir, mocker):
@@ -123,7 +135,7 @@ def test_monitor_system_metrics(tmp_dir, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+        live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples)
         time.sleep(interval * num_samples + interval)  # log metrics once
         live.next_step()
         time.sleep(interval * num_samples + interval)  # log metrics twice
@@ -131,29 +143,22 @@ def test_monitor_system_metrics(tmp_dir, mocker):
 
         _, latest = parse_metrics(live)
 
-    # metrics.json file contains all the system metrics
-    assert latest == S(
-        {
-            "step": Union(int, float),
-            "system": {
-                "cpu": {
-                    "usage (%)": Union(int, float),
-                    "count": Union(int, float),
-                    "parallelization (%)": Union(int, float),
-                },
-                "ram": {
-                    "usage (%)": Union(int, float),
-                    "usage (GB)": Union(int, float),
-                    "total (GB)": Union(int, float),
-                },
-                "disk": {
-                    "usage (%)": {"main": Union(int, float)},
-                    "usage (GB)": {"main": Union(int, float)},
-                    "total (GB)": {"main": Union(int, float)},
-                },
-            },
-        }
-    )
+    schema = {}
+    for name in [
+        "step",
+        METRIC_CPU_COUNT,
+        METRIC_CPU_USAGE_PERCENT,
+        METRIC_CPU_PARALLELIZATION_PERCENT,
+        METRIC_RAM_USAGE_PERCENT,
+        METRIC_RAM_USAGE_GB,
+        METRIC_RAM_TOTAL_GB,
+        f"{METRIC_DISK_USAGE_PERCENT}/main",
+        f"{METRIC_DISK_USAGE_GB}/main",
+        f"{METRIC_DISK_TOTAL_GB}/main",
+    ]:
+        dpath.new(schema, name, Union(int, float))
+
+    assert latest == S(schema)
 
 
 def test_monitor_system_timeseries(tmp_dir, mocker):
@@ -165,7 +170,7 @@ def test_monitor_system_timeseries(tmp_dir, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+        live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples)
         time.sleep(interval * num_samples + interval)  # log metrics once
         live.next_step()
         time.sleep(interval * num_samples + interval)  # log metrics twice
@@ -173,29 +178,28 @@ def test_monitor_system_timeseries(tmp_dir, mocker):
 
         timeseries, _ = parse_metrics(live)
 
-    def timeserie_metric_schema(name):
+    def timeserie_schema(name):
         return [{name: str, "timestamp": str, "step": str} for _ in range(2)]
 
     # timeseries contains all the system metrics
+    prefix = Path(tmp_dir) / "plots/metrics"
     assert timeseries == S(
         {
-            str(
-                Path(tmp_dir) / "plots/metrics" / "system/cpu/usage (%).tsv"
-            ): timeserie_metric_schema("usage (%)"),
-            str(
-                Path(tmp_dir) / "plots/metrics" / "system/cpu/parallelization (%).tsv"
-            ): timeserie_metric_schema("parallelization (%)"),
-            str(
-                Path(tmp_dir) / "plots/metrics" / "system/ram/usage (%).tsv"
-            ): timeserie_metric_schema("usage (%)"),
-            str(
-                Path(tmp_dir) / "plots/metrics" / "system/ram/usage (GB).tsv"
-            ): timeserie_metric_schema("usage (GB)"),
-            str(
-                Path(tmp_dir) / "plots/metrics" / "system/disk/usage (%)/main.tsv"
-            ): timeserie_metric_schema("main"),
-            str(
-                Path(tmp_dir) / "plots/metrics" / "system/disk/usage (GB)/main.tsv"
-            ): timeserie_metric_schema("main"),
+            str(prefix / f"{METRIC_CPU_USAGE_PERCENT}.tsv"): timeserie_schema(
+                METRIC_CPU_USAGE_PERCENT.split("/")[-1]
+            ),
+            str(prefix / f"{METRIC_CPU_PARALLELIZATION_PERCENT}.tsv"): timeserie_schema(
+                METRIC_CPU_PARALLELIZATION_PERCENT.split("/")[-1]
+            ),
+            str(prefix / f"{METRIC_RAM_USAGE_PERCENT}.tsv"): timeserie_schema(
+                METRIC_RAM_USAGE_PERCENT.split("/")[-1]
+            ),
+            str(prefix / f"{METRIC_RAM_USAGE_GB}.tsv"): timeserie_schema(
+                METRIC_RAM_USAGE_GB.split("/")[-1]
+            ),
+            str(prefix / f"{METRIC_DISK_USAGE_PERCENT}/main.tsv"): timeserie_schema(
+                "main"
+            ),
+            str(prefix / f"{METRIC_DISK_USAGE_GB}/main.tsv"): timeserie_schema("main"),
         }
     )

From 7ab891500b37a1b17720fa8738b7c70d6a590d7f Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Mon, 19 Feb 2024 16:57:48 +0100
Subject: [PATCH 32/39] improve test and user inputs

---
 pyproject.toml                |   8 +--
 src/dvclive/monitor_system.py |  77 +++++++++++-----------
 tests/test_monitor_system.py  | 116 +++++++++++++++++++---------------
 3 files changed, 106 insertions(+), 95 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c622102d..5c0544f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,6 @@ dependencies = [
   "dvc>=3.33.3",
   "dvc-render>=1.0.0,<2",
   "dvc-studio-client>=0.17.1,<1",
-  "dpath",
   "funcy",
   "gto",
   "ruamel.yaml",
@@ -54,13 +53,14 @@ tests = [
   "pytest-mock>=3.8.2,<4.0",
   "dvclive[image,plots,markdown]",
   "ipython",
-  "pytest_voluptuous"
+  "pytest_voluptuous",
+  "dpath",
+  "voluptuous"
 ]
 dev = [
   "dvclive[all,tests]",
   "mypy>=1.1.1",
-  "types-PyYAML",
-  "voluptuous"
+  "types-PyYAML"
 ]
 mmcv = ["mmcv"]
 tf = ["tensorflow"]
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 43bd8786..6277ce26 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -2,7 +2,6 @@
 import logging
 import os
 from typing import Dict, Union, Optional, Tuple
-from pathlib import Path
 
 import psutil
 from statistics import mean
@@ -40,23 +39,25 @@ class _SystemMonitor(abc.ABC):
 
     def __init__(
         self,
-        interval: float = 0.5,
-        num_samples: int = 10,
+        interval: float,
+        num_samples: int,
         plot: bool = True,
     ):
-        if not isinstance(interval, (int, float)):
-            raise TypeError(  # noqa: TRY003
-                "System monitoring `interval` should be an `int` or a `float`, but got "
-                f"'{type(interval)}'"
+        max_interval = 0.1
+        if interval > max_interval:
+            interval = max_interval
+            logger.warning(
+                f"System monitoring `interval` should be less than {max_interval} "
+                f"seconds. Setting `interval` to {interval} seconds."
             )
-        if not isinstance(num_samples, int):
-            raise TypeError(  # noqa: TRY003
-                "System monitoring `num_samples` should be an `int`, but got "
-                f"'{type(num_samples)}'"
-            )
-        if not isinstance(plot, bool):
-            raise TypeError(  # noqa: TRY003
-                f"System monitoring `plot` should be a `bool`, but got '{type(plot)}'"
+
+        min_num_samples = 1
+        max_num_samples = 30
+        if min_num_samples < num_samples < max_num_samples:
+            num_samples = max(min(num_samples, max_num_samples), min_num_samples)
+            logger.warning(
+                f"System monitoring `num_samples` should be between {min_num_samples} "
+                f"and {max_num_samples}. Setting `num_samples` to {num_samples}."
             )
 
         self._interval = interval  # seconds
@@ -115,9 +116,9 @@ class CPUMonitor(_SystemMonitor):
 
     def __init__(
         self,
-        interval: float = 0.5,
-        num_samples: int = 10,
-        disks_to_monitor: Optional[Dict[str, str]] = None,
+        interval: float = 0.1,
+        num_samples: int = 20,
+        folders_to_monitor: Optional[Dict[str, str]] = None,
         plot: bool = True,
     ):
         """Monitor CPU resources and log them to DVC Live.
@@ -126,37 +127,35 @@ def __init__(
             interval (float): interval in seconds between two measurements.
                 Defaults to 0.5.
             num_samples (int): number of samples to average. Defaults to 10.
-            disks_to_monitor (Optional[Dict[str, str]]): paths to the disks or
-                partitions to monitor disk usage statistics. The key is the name that
-                will be displayed in the metric name. The value is the path to the disk
-                or partition. Defaults to {"main": "/"}.
+            folders_to_monitor (Optional[Dict[str, str]]): monitor disk usage
+                statistics about the partition which contains the given paths. The
+                statistics include total and used space in gygabytes and percent.
+                This argument expect a dict where the key is the name that will be used
+                in the metric's name and the value is the path to the folder to monitor.
+                Defaults to {"main": "/"}.
             plot (bool): should the system metrics be saved as plots. Defaults to True.
 
         Raises:
-            TypeError: if the arguments passed to the function don't have a
+            ValueError: if the arguments passed to the function don't have a
                 supported type.
         """
         super().__init__(interval=interval, num_samples=num_samples, plot=plot)
-        disks_to_monitor = (
-            {"main": "/"} if disks_to_monitor is None else disks_to_monitor
+        folders_to_monitor = (
+            {"main": "/"} if folders_to_monitor is None else folders_to_monitor
         )
-        for disk_name, disk_path in disks_to_monitor.items():
-            if not isinstance(disk_name, str):
-                raise TypeError(  # noqa: TRY003
-                    "Keys for `partitions_to_monitor` should be a `str`"
-                    f", but got '{type(disk_name)}'"
-                )
-            if not isinstance(disk_path, str):
-                raise TypeError(  # noqa: TRY003
-                    "Value for `partitions_to_monitor` should be a `str`"
-                    f", but got '{type(disk_path)}'"
-                )
+        self._disks_to_monitor = {}
+        for disk_name, disk_path in folders_to_monitor.items():
             if disk_name != os.path.normpath(disk_name):
                 raise ValueError(  # noqa: TRY003
                     "Keys for `partitions_to_monitor` should be a valid name"
-                    f", but got '{disk_name}'"
+                    f", but got '{disk_name}'."
                 )
-        self._disks_to_monitor = disks_to_monitor
+            try:
+                psutil.disk_usage(disk_path)
+            except OSError:
+                logger.warning(f"Couldn't find partition '{disk_path}', ignoring it.")
+                continue
+            self._disks_to_monitor[disk_name] = disk_path
 
     def _get_metrics(self) -> Dict[str, Union[float, int]]:
         ram_info = psutil.virtual_memory()
@@ -180,8 +179,6 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER,
         }
         for disk_name, disk_path in self._disks_to_monitor.items():
-            if not Path(disk_path).exists():
-                continue
             disk_info = psutil.disk_usage(disk_path)
             disk_metrics = {
                 f"{METRIC_DISK_USAGE_PERCENT}/{disk_name}": disk_info.percent,
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 832c576f..ae06de70 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -1,6 +1,10 @@
 import time
 from pathlib import Path
+import pytest
+
 import dpath
+from voluptuous import Union
+from pytest_voluptuous import S
 
 from dvclive import Live
 from dvclive.monitor_system import (
@@ -17,9 +21,6 @@
 )
 from dvclive.utils import parse_metrics
 
-from voluptuous import Union
-from pytest_voluptuous import S
-
 
 def mock_psutil(mocker):
     mocker.patch(
@@ -48,64 +49,75 @@ def mock_psutil(mocker):
         )
 
 
-def test_get_cpus_metrics_mocker(tmp_dir, mocker):
-    mock_psutil(mocker)
-    with Live(
-        tmp_dir,
-        save_dvc_exp=False,
-        monitor_system=False,
-    ) as live:
-        monitor = CPUMonitor(disks_to_monitor={"main": "/", "home": "/"})
-        monitor(live)
-        metrics = monitor._get_metrics()
-        monitor.end()
+def mock_psutil_with_oserror(mocker):
+    mocker.patch(
+        "dvclive.monitor_system.psutil.cpu_percent",
+        return_value=[10, 20, 30, 40, 50, 60],
+    )
+    mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6)
 
-    assert metrics == S(
-        {
-            METRIC_CPU_COUNT: int,
-            METRIC_CPU_USAGE_PERCENT: Union(int, float),
-            METRIC_CPU_PARALLELIZATION_PERCENT: Union(int, float),
-            METRIC_RAM_USAGE_PERCENT: Union(int, float),
-            METRIC_RAM_USAGE_GB: Union(int, float),
-            METRIC_RAM_TOTAL_GB: Union(int, float),
-            f"{METRIC_DISK_USAGE_PERCENT}/main": Union(int, float),
-            f"{METRIC_DISK_USAGE_PERCENT}/home": Union(int, float),
-            f"{METRIC_DISK_USAGE_GB}/main": Union(int, float),
-            f"{METRIC_DISK_USAGE_GB}/home": Union(int, float),
-            f"{METRIC_DISK_TOTAL_GB}/main": Union(int, float),
-            f"{METRIC_DISK_TOTAL_GB}/home": Union(int, float),
-        }
+    mocked_virtual_memory = mocker.MagicMock()
+    mocked_virtual_memory.percent = 20
+    mocked_virtual_memory.total = 4 * 1024**3
+
+    mocked_disk_usage = mocker.MagicMock()
+    mocked_disk_usage.used = 16 * 1024**3
+    mocked_disk_usage.percent = 50
+    mocked_disk_usage.total = 32 * 1024**3
+
+    mocker.patch(
+        "dvclive.monitor_system.psutil.virtual_memory",
+        return_value=mocked_virtual_memory,
+    )
+    mocker.patch(
+        "dvclive.monitor_system.psutil.disk_usage",
+        side_effect=[
+            mocked_disk_usage,
+            OSError,
+            mocked_disk_usage,
+            mocked_disk_usage,
+        ],
     )
 
 
 def test_monitor_system_is_false(tmp_dir, mocker):
     mock_psutil(mocker)
+
+    cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor")
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        assert live._cpu_monitor is None
+        assert live.cpu_monitor is None
+
+    cpu_monitor_mock.assert_not_called()
 
 
 def test_monitor_system_is_true(tmp_dir, mocker):
     mock_psutil(mocker)
+    cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor", spec=CPUMonitor)
+
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=True,
     ) as live:
-        cpu_monitor = live._cpu_monitor
-        assert isinstance(live._cpu_monitor, CPUMonitor)
-        # start is called but end not called
-        assert cpu_monitor._shutdown_event._flag is False
+        cpu_monitor = live.cpu_monitor
+
+        assert isinstance(cpu_monitor, CPUMonitor)
+        cpu_monitor_mock.assert_called_once()
 
-    # end was called
-    assert cpu_monitor._shutdown_event._flag is True
+        end_spy = mocker.spy(cpu_monitor, "end")
+        end_spy.assert_not_called()
+
+    # check the monitoring thread is stopped
+    end_spy.assert_called_once()
 
 
 def test_ignore_non_existent_directories(tmp_dir, mocker):
-    mock_psutil(mocker)
+    mock_psutil_with_oserror(mocker)
+
     with Live(
         tmp_dir,
         save_dvc_exp=False,
@@ -113,7 +125,7 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
     ) as live:
         non_existent_disk = "/non-existent"
         monitor = CPUMonitor(
-            disks_to_monitor={"main": "/", "non-existent": non_existent_disk}
+            folders_to_monitor={"main": "/", "non-existent": non_existent_disk}
         )
         monitor(live)
         metrics = monitor._get_metrics()
@@ -126,20 +138,20 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
     assert f"{METRIC_DISK_TOTAL_GB}/non-existent" not in metrics
 
 
+@pytest.mark.timeout(2)
 def test_monitor_system_metrics(tmp_dir, mocker):
     mock_psutil(mocker)
-    interval = 0.05
-    num_samples = 4
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples)
-        time.sleep(interval * num_samples + interval)  # log metrics once
+        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+        # wait for the metrics to be logged.
+        # METRIC_DISK_TOTAL_GB is the last metric to be logged.
+        while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0:
+            time.sleep(0.001)
         live.next_step()
-        time.sleep(interval * num_samples + interval)  # log metrics twice
-        live.make_summary()
 
         _, latest = parse_metrics(live)
 
@@ -161,25 +173,27 @@ def test_monitor_system_metrics(tmp_dir, mocker):
     assert latest == S(schema)
 
 
+@pytest.mark.timeout(2)
 def test_monitor_system_timeseries(tmp_dir, mocker):
     mock_psutil(mocker)
-    interval = 0.05
-    num_samples = 4
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.cpu_monitor = CPUMonitor(interval=interval, num_samples=num_samples)
-        time.sleep(interval * num_samples + interval)  # log metrics once
+        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+
+        # wait for the metrics to be logged.
+        # METRIC_DISK_TOTAL_GB is the last metric to be logged.
+        while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0:
+            time.sleep(0.001)
+
         live.next_step()
-        time.sleep(interval * num_samples + interval)  # log metrics twice
-        live.make_summary()
 
         timeseries, _ = parse_metrics(live)
 
     def timeserie_schema(name):
-        return [{name: str, "timestamp": str, "step": str} for _ in range(2)]
+        return [{name: str, "timestamp": str, "step": str(0)}]
 
     # timeseries contains all the system metrics
     prefix = Path(tmp_dir) / "plots/metrics"

From a8d1bfe0063e7a0e52e75d4822f6528ef06bc57e Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 20 Feb 2024 09:30:39 +0100
Subject: [PATCH 33/39] improve tests and error catching

---
 pyproject.toml                |   3 +-
 src/dvclive/monitor_system.py |  37 ++++++-----
 tests/test_monitor_system.py  | 114 ++++++++++++++++------------------
 3 files changed, 74 insertions(+), 80 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5c0544f2..aec95d71 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,8 +54,7 @@ tests = [
   "dvclive[image,plots,markdown]",
   "ipython",
   "pytest_voluptuous",
-  "dpath",
-  "voluptuous"
+  "dpath"
 ]
 dev = [
   "dvclive[all,tests]",
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 6277ce26..69a7db6e 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -10,7 +10,6 @@
 
 
 logger = logging.getLogger("dvclive")
-MEGABYTES_DIVIDER = 1024.0**2
 GIGABYTES_DIVIDER = 1024.0**3
 
 MINIMUM_CPU_USAGE_TO_BE_ACTIVE = 20
@@ -118,7 +117,7 @@ def __init__(
         self,
         interval: float = 0.1,
         num_samples: int = 20,
-        folders_to_monitor: Optional[Dict[str, str]] = None,
+        directories_to_monitor: Optional[Dict[str, str]] = None,
         plot: bool = True,
     ):
         """Monitor CPU resources and log them to DVC Live.
@@ -127,12 +126,12 @@ def __init__(
             interval (float): interval in seconds between two measurements.
                 Defaults to 0.5.
             num_samples (int): number of samples to average. Defaults to 10.
-            folders_to_monitor (Optional[Dict[str, str]]): monitor disk usage
+            directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage
                 statistics about the partition which contains the given paths. The
                 statistics include total and used space in gygabytes and percent.
                 This argument expect a dict where the key is the name that will be used
-                in the metric's name and the value is the path to the folder to monitor.
-                Defaults to {"main": "/"}.
+                in the metric's name and the value is the path to the directory to
+                monitor. Defaults to {"main": "/"}.
             plot (bool): should the system metrics be saved as plots. Defaults to True.
 
         Raises:
@@ -140,23 +139,20 @@ def __init__(
                 supported type.
         """
         super().__init__(interval=interval, num_samples=num_samples, plot=plot)
-        folders_to_monitor = (
-            {"main": "/"} if folders_to_monitor is None else folders_to_monitor
+        directories_to_monitor = (
+            {"main": "/"} if directories_to_monitor is None else directories_to_monitor
         )
         self._disks_to_monitor = {}
-        for disk_name, disk_path in folders_to_monitor.items():
+        for disk_name, disk_path in directories_to_monitor.items():
             if disk_name != os.path.normpath(disk_name):
                 raise ValueError(  # noqa: TRY003
-                    "Keys for `partitions_to_monitor` should be a valid name"
+                    "Keys for `directories_to_monitor` should be a valid name"
                     f", but got '{disk_name}'."
                 )
-            try:
-                psutil.disk_usage(disk_path)
-            except OSError:
-                logger.warning(f"Couldn't find partition '{disk_path}', ignoring it.")
-                continue
             self._disks_to_monitor[disk_name] = disk_path
 
+        self._warn_disk_doesnt_exist: Dict[str, bool] = {}
+
     def _get_metrics(self) -> Dict[str, Union[float, int]]:
         ram_info = psutil.virtual_memory()
         nb_cpus = psutil.cpu_count()
@@ -174,12 +170,19 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             * 100
             / nb_cpus,
             METRIC_RAM_USAGE_PERCENT: ram_info.percent,
-            METRIC_RAM_USAGE_GB: (ram_info.percent / 100)
-            * (ram_info.total / GIGABYTES_DIVIDER),
+            METRIC_RAM_USAGE_GB: ram_info.used / GIGABYTES_DIVIDER,
             METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER,
         }
         for disk_name, disk_path in self._disks_to_monitor.items():
-            disk_info = psutil.disk_usage(disk_path)
+            try:
+                disk_info = psutil.disk_usage(disk_path)
+            except OSError:
+                if self._warn_disk_doesnt_exist.get(disk_name, True):
+                    logger.warning(
+                        f"Couldn't find directory '{disk_path}', ignoring it."
+                    )
+                    self._warn_disk_doesnt_exist[disk_name] = False
+                continue
             disk_metrics = {
                 f"{METRIC_DISK_USAGE_PERCENT}/{disk_name}": disk_info.percent,
                 f"{METRIC_DISK_USAGE_GB}/{disk_name}": disk_info.used
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index ae06de70..49011639 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -3,7 +3,6 @@
 import pytest
 
 import dpath
-from voluptuous import Union
 from pytest_voluptuous import S
 
 from dvclive import Live
@@ -18,71 +17,57 @@
     METRIC_DISK_USAGE_PERCENT,
     METRIC_DISK_USAGE_GB,
     METRIC_DISK_TOTAL_GB,
+    GIGABYTES_DIVIDER,
 )
 from dvclive.utils import parse_metrics
 
 
-def mock_psutil(mocker):
+def mock_psutil_cpu(mocker):
     mocker.patch(
         "dvclive.monitor_system.psutil.cpu_percent",
-        return_value=[10, 20, 30, 40, 50, 60],
+        return_value=[10, 10, 10, 40, 50, 60],
     )
     mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6)
 
-    mocked_virtual_memory = mocker.MagicMock()
-    mocked_virtual_memory.percent = 20
-    mocked_virtual_memory.total = 4 * 1024**3
-
-    mocked_disk_usage = mocker.MagicMock()
-    mocked_disk_usage.used = 16 * 1024**3
-    mocked_disk_usage.percent = 50
-    mocked_disk_usage.total = 32 * 1024**3
-
-    mocking_dict = {
-        "virtual_memory": mocked_virtual_memory,
-        "disk_usage": mocked_disk_usage,
-    }
-    for function_name, return_value in mocking_dict.items():
-        mocker.patch(
-            f"dvclive.monitor_system.psutil.{function_name}",
-            return_value=return_value,
-        )
-
 
-def mock_psutil_with_oserror(mocker):
+def mock_psutil_ram(mocker):
+    mocked_ram = mocker.MagicMock()
+    mocked_ram.percent = 50
+    mocked_ram.used = 2 * GIGABYTES_DIVIDER
+    mocked_ram.total = 4 * GIGABYTES_DIVIDER
     mocker.patch(
-        "dvclive.monitor_system.psutil.cpu_percent",
-        return_value=[10, 20, 30, 40, 50, 60],
+        "dvclive.monitor_system.psutil.virtual_memory", return_value=mocked_ram
     )
-    mocker.patch("dvclive.monitor_system.psutil.cpu_count", return_value=6)
 
-    mocked_virtual_memory = mocker.MagicMock()
-    mocked_virtual_memory.percent = 20
-    mocked_virtual_memory.total = 4 * 1024**3
 
-    mocked_disk_usage = mocker.MagicMock()
-    mocked_disk_usage.used = 16 * 1024**3
-    mocked_disk_usage.percent = 50
-    mocked_disk_usage.total = 32 * 1024**3
+def mock_psutil_disk(mocker):
+    mocked_disk = mocker.MagicMock()
+    mocked_disk.percent = 50
+    mocked_disk.used = 16 * GIGABYTES_DIVIDER
+    mocked_disk.total = 32 * GIGABYTES_DIVIDER
+    mocker.patch("dvclive.monitor_system.psutil.disk_usage", return_value=mocked_disk)
 
-    mocker.patch(
-        "dvclive.monitor_system.psutil.virtual_memory",
-        return_value=mocked_virtual_memory,
-    )
+
+def mock_psutil_disk_with_oserror(mocker):
+    mocked_disk = mocker.MagicMock()
+    mocked_disk.percent = 50
+    mocked_disk.used = 16 * GIGABYTES_DIVIDER
+    mocked_disk.total = 32 * GIGABYTES_DIVIDER
     mocker.patch(
         "dvclive.monitor_system.psutil.disk_usage",
         side_effect=[
-            mocked_disk_usage,
+            mocked_disk,
+            OSError,
+            mocked_disk,
             OSError,
-            mocked_disk_usage,
-            mocked_disk_usage,
         ],
     )
 
 
 def test_monitor_system_is_false(tmp_dir, mocker):
-    mock_psutil(mocker)
-
+    mock_psutil_cpu(mocker)
+    mock_psutil_ram(mocker)
+    mock_psutil_disk(mocker)
     cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor")
     with Live(
         tmp_dir,
@@ -95,7 +80,9 @@ def test_monitor_system_is_false(tmp_dir, mocker):
 
 
 def test_monitor_system_is_true(tmp_dir, mocker):
-    mock_psutil(mocker)
+    mock_psutil_cpu(mocker)
+    mock_psutil_ram(mocker)
+    mock_psutil_disk(mocker)
     cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor", spec=CPUMonitor)
 
     with Live(
@@ -116,8 +103,9 @@ def test_monitor_system_is_true(tmp_dir, mocker):
 
 
 def test_ignore_non_existent_directories(tmp_dir, mocker):
-    mock_psutil_with_oserror(mocker)
-
+    mock_psutil_cpu(mocker)
+    mock_psutil_ram(mocker)
+    mock_psutil_disk_with_oserror(mocker)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
@@ -125,7 +113,7 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
     ) as live:
         non_existent_disk = "/non-existent"
         monitor = CPUMonitor(
-            folders_to_monitor={"main": "/", "non-existent": non_existent_disk}
+            directories_to_monitor={"main": "/", "non-existent": non_existent_disk}
         )
         monitor(live)
         metrics = monitor._get_metrics()
@@ -140,7 +128,9 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
 
 @pytest.mark.timeout(2)
 def test_monitor_system_metrics(tmp_dir, mocker):
-    mock_psutil(mocker)
+    mock_psutil_cpu(mocker)
+    mock_psutil_ram(mocker)
+    mock_psutil_disk(mocker)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
@@ -156,26 +146,28 @@ def test_monitor_system_metrics(tmp_dir, mocker):
         _, latest = parse_metrics(live)
 
     schema = {}
-    for name in [
-        "step",
-        METRIC_CPU_COUNT,
-        METRIC_CPU_USAGE_PERCENT,
-        METRIC_CPU_PARALLELIZATION_PERCENT,
-        METRIC_RAM_USAGE_PERCENT,
-        METRIC_RAM_USAGE_GB,
-        METRIC_RAM_TOTAL_GB,
-        f"{METRIC_DISK_USAGE_PERCENT}/main",
-        f"{METRIC_DISK_USAGE_GB}/main",
-        f"{METRIC_DISK_TOTAL_GB}/main",
-    ]:
-        dpath.new(schema, name, Union(int, float))
+    for name, value in {
+        "step": 0,
+        METRIC_CPU_COUNT: 6,
+        METRIC_CPU_USAGE_PERCENT: 30.0,
+        METRIC_CPU_PARALLELIZATION_PERCENT: 50.0,
+        METRIC_RAM_USAGE_PERCENT: 50.0,
+        METRIC_RAM_USAGE_GB: 2.0,
+        METRIC_RAM_TOTAL_GB: 4.0,
+        f"{METRIC_DISK_USAGE_PERCENT}/main": 50.0,
+        f"{METRIC_DISK_USAGE_GB}/main": 16.0,
+        f"{METRIC_DISK_TOTAL_GB}/main": 32.0,
+    }.items():
+        dpath.new(schema, name, value)
 
     assert latest == S(schema)
 
 
 @pytest.mark.timeout(2)
 def test_monitor_system_timeseries(tmp_dir, mocker):
-    mock_psutil(mocker)
+    mock_psutil_cpu(mocker)
+    mock_psutil_ram(mocker)
+    mock_psutil_disk(mocker)
     with Live(
         tmp_dir,
         save_dvc_exp=False,

From 33590016daf0476dbea57e252dccf51aa3b8c0ee Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 20 Feb 2024 10:06:32 +0100
Subject: [PATCH 34/39] add docstring

---
 src/dvclive/live.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 4addceb3..f1a15f63 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -121,6 +121,8 @@ def __init__(
                 provided string will be passed to `dvc exp save --message`.
                 If DVCLive is used inside `dvc exp run`, the option will be ignored, use
                 `dvc exp run --message` instead.
+            monitor_system (bool): if `True`, DVCLive will monitor CPU, ram, and disk
+                usage. Defaults to `False`.
         """
         self.summary: Dict[str, Any] = {}
 

From 3f94e241e900d2f8b36878d73ba2a2702c9b15fc Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Tue, 20 Feb 2024 18:27:45 +0100
Subject: [PATCH 35/39] merge cpu and gpu monitoring into a system monitor
 object

general improvements in the code as well
---
 src/dvclive/live.py           |  45 ++----
 src/dvclive/monitor_system.py | 245 +++++++++++++++---------------
 tests/test_monitor_system.py  | 276 ++++++++++++++++++----------------
 3 files changed, 286 insertions(+), 280 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 3f1b2391..51501c0c 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -42,7 +42,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
-from .monitor_system import GPU_AVAILABLE, CPUMonitor, GPUMonitor
+from .monitor_system import SystemMonitor
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -169,14 +169,10 @@ def __init__(
         self._dvc_studio_config: Dict[str, Any] = {}
         self._init_studio()
 
-        self._monitor_cpu = None
-        self._monitor_gpu = None
+        self._system_monitor = None
         if monitor_system:
-            self._monitor_cpu = CPUMonitor()
-            self._monitor_cpu(self)
-            if GPU_AVAILABLE:
-                self._monitor_gpu = GPUMonitor()
-                self._monitor_gpu(self)
+            self._system_monitor = SystemMonitor()
+            self._system_monitor(self)
 
     def _init_resume(self):
         self._read_params()
@@ -382,26 +378,15 @@ def step(self, value: int) -> None:
         logger.debug(f"Step: {self.step}")
 
     @property
-    def cpu_monitor(self) -> Optional[CPUMonitor]:
-        return self._monitor_cpu or None
+    def system_monitor(self) -> Optional[SystemMonitor]:
+        return self._system_monitor or None
 
-    @cpu_monitor.setter
-    def cpu_monitor(self, cpu_monitor: CPUMonitor) -> None:
-        if self._monitor_cpu is not None:
-            self._monitor_cpu.end()
-        self._monitor_cpu = cpu_monitor
-        self._monitor_cpu(self)
-
-    @property
-    def monitor_gpu(self) -> Optional[GPUMonitor]:
-        return self._monitor_gpu or None
-
-    @monitor_gpu.setter
-    def monitor_gpu(self, monitor_gpu: GPUMonitor) -> None:
-        if self._monitor_gpu is not None:
-            self._monitor_gpu.end()
-        self._monitor_gpu = monitor_gpu
-        self._monitor_gpu(self)
+    @system_monitor.setter
+    def system_monitor(self, system_monitor: SystemMonitor) -> None:
+        if self._system_monitor is not None:
+            self._system_monitor.end()
+        self._system_monitor = system_monitor
+        self._system_monitor(self)
 
     def sync(self):
         self.make_summary()
@@ -892,10 +877,8 @@ def end(self):
             self.step = self.summary["step"]
 
         # Kill threads that monitor the system metrics
-        if self._monitor_cpu is not None:
-            self._monitor_cpu.end()
-        if self._monitor_gpu is not None:
-            self._monitor_gpu.end()
+        if self._system_monitor is not None:
+            self._system_monitor.end()
 
         self.sync()
 
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index efaa071b..df0293f9 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -1,4 +1,3 @@
-import abc
 import logging
 import os
 from typing import Dict, Union, Optional, Tuple
@@ -17,6 +16,7 @@
         nvmlDeviceGetMemoryInfo,
         nvmlDeviceGetUtilizationRates,
         nvmlShutdown,
+        NVMLError,
     )
 
     GPU_AVAILABLE = True
@@ -40,43 +40,98 @@
 METRIC_DISK_USAGE_GB = "system/disk/usage (GB)"
 METRIC_DISK_TOTAL_GB = "system/disk/total (GB)"
 
-
-class _SystemMonitor(abc.ABC):
-    """
-    Monitor system resources and log them to DVC Live.
-    Use a separate thread to call a `_get_metrics` function at fix interval and
-    aggregate the results of this sampling using the average.
+METRIC_GPU_COUNT = "system/gpu/count"
+METRIC_GPU_USAGE_PERCENT = "system/gpu/usage (%)"
+METRIC_VRAM_USAGE_PERCENT = "system/vram/usage (%)"
+METRIC_VRAM_USAGE_GB = "system/vram/usage (GB)"
+METRIC_VRAM_TOTAL_GB = "system/vram/total (GB)"
+
+
+class SystemMonitor:
+    """Monitor CPU, ram, and disk resources and log them to DVC Live. Monitor also the
+    GPU resources if available.
+
+    Args:
+        interval (float): interval in seconds between two measurements.
+            Defaults to 0.05.
+        num_samples (int): number of samples to average. Defaults to 20.
+        directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage
+            statistics about the partition which contains the given paths. The
+            statistics include total and used space in gygabytes and percent.
+            This argument expect a dict where the key is the name that will be used
+            in the metric's name and the value is the path to the directory to
+            monitor. Defaults to {"main": "/"}.
+        plot (bool): should the system metrics be saved as plots. Defaults to True.
+
+    Raises:
+        ValueError: if the keys in `directories_to_monitor` contains invalid characters
+        as defined by `os.path.normpath`.
     """
 
-    _plot_blacklist_prefix: Tuple = ()
+    _plot_blacklist_prefix: Tuple = (
+        METRIC_CPU_COUNT,
+        METRIC_RAM_TOTAL_GB,
+        METRIC_DISK_TOTAL_GB,
+        METRIC_GPU_COUNT,
+        METRIC_VRAM_TOTAL_GB,
+    )
 
     def __init__(
         self,
-        interval: float,
-        num_samples: int,
+        interval: float = 0.05,  # seconds
+        num_samples: int = 20,
+        directories_to_monitor: Optional[Dict[str, str]] = None,
         plot: bool = True,
     ):
-        max_interval = 0.1
+        self._interval = self._check_interval(interval, max_interval=0.1)
+        self._num_samples = self._check_num_samples(
+            num_samples, min_num_samples=1, max_num_samples=30
+        )
+        self._disks_to_monitor = self._check_directories_to_monitor(
+            directories_to_monitor
+        )
+        self._plot = plot
+        self._warn_cpu_problem = True
+        self._warn_gpu_problem = True
+        self._warn_disk_doesnt_exist: Dict[str, bool] = {}
+
+    def _check_interval(self, interval: float, max_interval: float) -> float:
         if interval > max_interval:
-            interval = max_interval
             logger.warning(
                 f"System monitoring `interval` should be less than {max_interval} "
-                f"seconds. Setting `interval` to {interval} seconds."
+                f"seconds. Setting `interval` to {max_interval} seconds."
             )
+            return max_interval
+        return interval
 
+    def _check_num_samples(
+        self, num_samples: int, min_num_samples: int, max_num_samples: int
+    ) -> int:
         min_num_samples = 1
         max_num_samples = 30
-        if min_num_samples < num_samples < max_num_samples:
+        if not min_num_samples < num_samples < max_num_samples:
             num_samples = max(min(num_samples, max_num_samples), min_num_samples)
             logger.warning(
                 f"System monitoring `num_samples` should be between {min_num_samples} "
                 f"and {max_num_samples}. Setting `num_samples` to {num_samples}."
             )
+        return num_samples
 
-        self._interval = interval  # seconds
-        self._nb_samples = num_samples
-        self._plot = plot
-        self._warn_user = True
+    def _check_directories_to_monitor(
+        self, directories_to_monitor: Optional[Dict[str, str]]
+    ) -> Dict[str, str]:
+        if directories_to_monitor is None:
+            return {"main": "/"}
+
+        disks_to_monitor = {}
+        for disk_name, disk_path in directories_to_monitor.items():
+            if disk_name != os.path.normpath(disk_name):
+                raise ValueError(  # noqa: TRY003
+                    "Keys for `directories_to_monitor` should be a valid name"
+                    f", but got '{disk_name}'."
+                )
+            disks_to_monitor[disk_name] = disk_path
+        return disks_to_monitor
 
     def __call__(self, live):
         self._live = live
@@ -88,14 +143,17 @@ def __call__(self, live):
     def _monitoring_loop(self):
         while not self._shutdown_event.is_set():
             self._metrics = {}
-            for _ in range(self._nb_samples):
-                last_metrics = {}
+            for _ in range(self._num_samples):
                 try:
                     last_metrics = self._get_metrics()
                 except psutil.Error:
-                    if self._warn_user:
+                    if self._warn_cpu_problem:
                         logger.exception("Failed to monitor CPU metrics")
-                        self._warn_user = False
+                        self._warn_cpu_problem = False
+                except NVMLError:
+                    if self._warn_gpu_problem:
+                        logger.exception("Failed to monitor GPU metrics")
+                        self._warn_gpu_problem = False
 
                 self._metrics = merge_with(sum, self._metrics, last_metrics)
                 self._shutdown_event.wait(self._interval)
@@ -107,72 +165,34 @@ def _monitoring_loop(self):
                 )
                 self._live.log_metric(
                     name,
-                    values / self._nb_samples,
+                    values / self._num_samples,
                     timestamp=True,
                     plot=None if blacklisted else self._plot,
                 )
 
-    @abc.abstractmethod
     def _get_metrics(self) -> Dict[str, Union[float, int]]:
-        pass
-
-    def end(self):
-        self._shutdown_event.set()
-
-
-class CPUMonitor(_SystemMonitor):
-    _plot_blacklist_prefix: Tuple = (
-        METRIC_CPU_COUNT,
-        METRIC_RAM_TOTAL_GB,
-        "system/disk/total (GB)",
-    )
-
-    def __init__(
-        self,
-        interval: float = 0.1,
-        num_samples: int = 20,
-        directories_to_monitor: Optional[Dict[str, str]] = None,
-        plot: bool = True,
-    ):
-        """Monitor CPU resources and log them to DVC Live.
-
-        Args:
-            interval (float): interval in seconds between two measurements.
-                Defaults to 0.5.
-            num_samples (int): number of samples to average. Defaults to 10.
-            directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage
-                statistics about the partition which contains the given paths. The
-                statistics include total and used space in gygabytes and percent.
-                This argument expect a dict where the key is the name that will be used
-                in the metric's name and the value is the path to the directory to
-                monitor. Defaults to {"main": "/"}.
-            plot (bool): should the system metrics be saved as plots. Defaults to True.
-
-        Raises:
-            ValueError: if the arguments passed to the function don't have a
-                supported type.
-        """
-        super().__init__(interval=interval, num_samples=num_samples, plot=plot)
-        directories_to_monitor = (
-            {"main": "/"} if directories_to_monitor is None else directories_to_monitor
-        )
-        self._disks_to_monitor = {}
-        for disk_name, disk_path in directories_to_monitor.items():
-            if disk_name != os.path.normpath(disk_name):
-                raise ValueError(  # noqa: TRY003
-                    "Keys for `directories_to_monitor` should be a valid name"
-                    f", but got '{disk_name}'."
-                )
-            self._disks_to_monitor[disk_name] = disk_path
-
-        self._warn_disk_doesnt_exist: Dict[str, bool] = {}
+        result = {
+            **self._get_cpu_info(),
+            **self._get_ram_info(),
+            **self._get_disk_info(),
+        }
+        if GPU_AVAILABLE:
+            result.update(self.__get_gpu_info())
+        return result
 
-    def _get_metrics(self) -> Dict[str, Union[float, int]]:
+    def _get_ram_info(self) -> Dict[str, Union[float, int]]:
         ram_info = psutil.virtual_memory()
-        nb_cpus = psutil.cpu_count()
+        return {
+            METRIC_RAM_USAGE_PERCENT: ram_info.percent,
+            METRIC_RAM_USAGE_GB: ram_info.used / GIGABYTES_DIVIDER,
+            METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER,
+        }
+
+    def _get_cpu_info(self) -> Dict[str, Union[float, int]]:
+        num_cpus = psutil.cpu_count()
         cpus_percent = psutil.cpu_percent(percpu=True)
-        result = {
-            METRIC_CPU_COUNT: nb_cpus,
+        return {
+            METRIC_CPU_COUNT: num_cpus,
             METRIC_CPU_USAGE_PERCENT: mean(cpus_percent),
             METRIC_CPU_PARALLELIZATION_PERCENT: len(
                 [
@@ -182,11 +202,11 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
                 ]
             )
             * 100
-            / nb_cpus,
-            METRIC_RAM_USAGE_PERCENT: ram_info.percent,
-            METRIC_RAM_USAGE_GB: ram_info.used / GIGABYTES_DIVIDER,
-            METRIC_RAM_TOTAL_GB: ram_info.total / GIGABYTES_DIVIDER,
+            / num_cpus,
         }
+
+    def _get_disk_info(self) -> Dict[str, Union[float, int]]:
+        result = {}
         for disk_name, disk_path in self._disks_to_monitor.items():
             try:
                 disk_info = psutil.disk_usage(disk_path)
@@ -208,31 +228,7 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             result.update(disk_metrics)
         return result
 
-
-class GPUMonitor(_SystemMonitor):
-    _plot_blacklist_prefix = ("system/gpu/count", "system/vram/total (GB)")
-
-    def __init__(
-        self,
-        interval: float = 0.5,
-        num_samples: int = 10,
-        plot: bool = True,
-    ):
-        """Monitor GPU resources and log them to DVC Live.
-
-        Args:
-            interval (float): interval in seconds between two measurements.
-                Defaults to 0.5.
-            num_samples (int): number of samples to average. Defaults to 10.
-            plot (bool): should the system metrics be saved as plots. Defaults to True.
-
-        Raises:
-            InvalidDataTypeError: if the arguments passed to the function don't have a
-                supported type.
-        """
-        super().__init__(interval=interval, num_samples=num_samples, plot=plot)
-
-    def _get_metrics(self) -> Dict[str, Union[float, int]]:
+    def __get_gpu_info(self) -> Dict[str, Union[float, int]]:
         nvmlInit()
         num_gpus = nvmlDeviceGetCount()
         gpu_metrics = {
@@ -244,17 +240,26 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             memory_info = nvmlDeviceGetMemoryInfo(gpu_handle)
             usage_info = nvmlDeviceGetUtilizationRates(gpu_handle)
 
-            gpu_metrics[f"system/gpu/usage (%)/{gpu_idx}"] = (
-                100 * usage_info.memory / usage_info.gpu if usage_info.gpu else 0
-            )
-            gpu_metrics[f"system/vram/usage (%)/{gpu_idx}"] = (
-                100 * memory_info.used / memory_info.total
-            )
-            gpu_metrics[f"system/vram/usage (GB)/{gpu_idx}"] = (
-                memory_info.used / GIGABYTES_DIVIDER
-            )
-            gpu_metrics[f"system/vram/total (GB)/{gpu_idx}"] = (
-                memory_info.total / GIGABYTES_DIVIDER
+            gpu_metrics.update(
+                {
+                    f"{METRIC_GPU_USAGE_PERCENT}/{gpu_idx}": (
+                        100 * usage_info.memory / usage_info.gpu
+                        if usage_info.gpu
+                        else 0
+                    ),
+                    f"{METRIC_VRAM_USAGE_PERCENT}/{gpu_idx}": (
+                        100 * memory_info.used / memory_info.total
+                    ),
+                    f"{METRIC_VRAM_USAGE_GB}/{gpu_idx}": (
+                        memory_info.used / GIGABYTES_DIVIDER
+                    ),
+                    f"{METRIC_VRAM_TOTAL_GB}/{gpu_idx}": (
+                        memory_info.total / GIGABYTES_DIVIDER
+                    ),
+                }
             )
         nvmlShutdown()
         return gpu_metrics
+
+    def end(self):
+        self._shutdown_event.set()
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 8f993659..2476a0f0 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -7,8 +7,7 @@
 
 from dvclive import Live
 from dvclive.monitor_system import (
-    CPUMonitor,
-    GPUMonitor,
+    SystemMonitor,
     METRIC_CPU_COUNT,
     METRIC_CPU_USAGE_PERCENT,
     METRIC_CPU_PARALLELIZATION_PERCENT,
@@ -18,6 +17,11 @@
     METRIC_DISK_USAGE_PERCENT,
     METRIC_DISK_USAGE_GB,
     METRIC_DISK_TOTAL_GB,
+    METRIC_GPU_COUNT,
+    METRIC_GPU_USAGE_PERCENT,
+    METRIC_VRAM_USAGE_PERCENT,
+    METRIC_VRAM_USAGE_GB,
+    METRIC_VRAM_TOTAL_GB,
     GIGABYTES_DIVIDER,
 )
 from dvclive.utils import parse_metrics
@@ -65,38 +69,115 @@ def mock_psutil_disk_with_oserror(mocker):
     )
 
 
+def mock_pynvml(mocker, num_gpus=2):
+    prefix = "dvclive.monitor_system"
+    mocker.patch(f"{prefix}.GPU_AVAILABLE", bool(num_gpus))
+    mocker.patch(f"{prefix}.nvmlDeviceGetCount", return_value=num_gpus)
+    mocker.patch(f"{prefix}.nvmlInit", return_value=None)
+    mocker.patch(f"{prefix}.nvmlShutdown", return_value=None)
+    mocker.patch(f"{prefix}.nvmlDeviceGetHandleByIndex", return_value=None)
+
+    vram_info = mocker.MagicMock()
+    vram_info.used = 3 * 1024**3
+    vram_info.total = 6 * 1024**3
+
+    gpu_usage = mocker.MagicMock()
+    gpu_usage.memory = 5
+    gpu_usage.gpu = 10
+
+    mocker.patch(f"{prefix}.nvmlDeviceGetMemoryInfo", return_value=vram_info)
+    mocker.patch(f"{prefix}.nvmlDeviceGetUtilizationRates", return_value=gpu_usage)
+
+
+@pytest.fixture()
+def cpu_metrics():
+    content = {
+        METRIC_CPU_COUNT: 6,
+        METRIC_CPU_USAGE_PERCENT: 30.0,
+        METRIC_CPU_PARALLELIZATION_PERCENT: 50.0,
+        METRIC_RAM_USAGE_PERCENT: 50.0,
+        METRIC_RAM_USAGE_GB: 2.0,
+        METRIC_RAM_TOTAL_GB: 4.0,
+        f"{METRIC_DISK_USAGE_PERCENT}/main": 50.0,
+        f"{METRIC_DISK_USAGE_GB}/main": 16.0,
+        f"{METRIC_DISK_TOTAL_GB}/main": 32.0,
+    }
+    result = {}
+    for name, value in content.items():
+        dpath.new(result, name, value)
+    return result
+
+
+def _timeserie_schema(name, value):
+    return [{name: str(value), "timestamp": str, "step": "0"}]
+
+
+@pytest.fixture()
+def cpu_timeseries():
+    return {
+        f"{METRIC_CPU_USAGE_PERCENT}.tsv": _timeserie_schema(
+            METRIC_CPU_USAGE_PERCENT.split("/")[-1], 30.0
+        ),
+        f"{METRIC_CPU_PARALLELIZATION_PERCENT}.tsv": _timeserie_schema(
+            METRIC_CPU_PARALLELIZATION_PERCENT.split("/")[-1], 50.0
+        ),
+        f"{METRIC_RAM_USAGE_PERCENT}.tsv": _timeserie_schema(
+            METRIC_RAM_USAGE_PERCENT.split("/")[-1], 50.0
+        ),
+        f"{METRIC_RAM_USAGE_GB}.tsv": _timeserie_schema(
+            METRIC_RAM_USAGE_GB.split("/")[-1], 2.0
+        ),
+        f"{METRIC_DISK_USAGE_PERCENT}/main.tsv": _timeserie_schema("main", 50.0),
+        f"{METRIC_DISK_USAGE_GB}/main.tsv": _timeserie_schema("main", 16.0),
+    }
+
+
+@pytest.fixture()
+def gpu_timeseries():
+    return {
+        f"{METRIC_GPU_USAGE_PERCENT}/0.tsv": _timeserie_schema("0", 50.0),
+        f"{METRIC_GPU_USAGE_PERCENT}/1.tsv": _timeserie_schema("1", 50.0),
+        f"{METRIC_VRAM_USAGE_PERCENT}/0.tsv": _timeserie_schema("0", 50.0),
+        f"{METRIC_VRAM_USAGE_PERCENT}/1.tsv": _timeserie_schema("1", 50.0),
+        f"{METRIC_VRAM_USAGE_GB}/0.tsv": _timeserie_schema("0", 3.0),
+        f"{METRIC_VRAM_USAGE_GB}/1.tsv": _timeserie_schema("1", 3.0),
+    }
+
+
 def test_monitor_system_is_false(tmp_dir, mocker):
     mock_psutil_cpu(mocker)
     mock_psutil_ram(mocker)
     mock_psutil_disk(mocker)
-    cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor")
+    mock_pynvml(mocker, num_gpus=0)
+    system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        assert live.cpu_monitor is None
+        assert live.system_monitor is None
 
-    cpu_monitor_mock.assert_not_called()
+    system_monitor_mock.assert_not_called()
 
 
 def test_monitor_system_is_true(tmp_dir, mocker):
     mock_psutil_cpu(mocker)
     mock_psutil_ram(mocker)
     mock_psutil_disk(mocker)
-    cpu_monitor_mock = mocker.patch("dvclive.live.CPUMonitor", spec=CPUMonitor)
+    mock_pynvml(mocker, num_gpus=0)
+    system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor)
 
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=True,
     ) as live:
-        cpu_monitor = live.cpu_monitor
+        system_monitor = live.system_monitor
 
-        assert isinstance(cpu_monitor, CPUMonitor)
-        cpu_monitor_mock.assert_called_once()
+        assert isinstance(system_monitor, SystemMonitor)
+        system_monitor_mock.assert_called_once()
 
-        end_spy = mocker.spy(cpu_monitor, "end")
+        end_spy = mocker.spy(system_monitor, "end")
         end_spy.assert_not_called()
 
     # check the monitoring thread is stopped
@@ -107,13 +188,14 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
     mock_psutil_cpu(mocker)
     mock_psutil_ram(mocker)
     mock_psutil_disk_with_oserror(mocker)
+    mock_pynvml(mocker, num_gpus=0)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
         non_existent_disk = "/non-existent"
-        monitor = CPUMonitor(
+        monitor = SystemMonitor(
             directories_to_monitor={"main": "/", "non-existent": non_existent_disk}
         )
         monitor(live)
@@ -128,16 +210,17 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
 
 
 @pytest.mark.timeout(2)
-def test_monitor_system_metrics(tmp_dir, mocker):
+def test_monitor_system_metrics(tmp_dir, cpu_metrics, mocker):
     mock_psutil_cpu(mocker)
     mock_psutil_ram(mocker)
     mock_psutil_disk(mocker)
+    mock_pynvml(mocker, num_gpus=0)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
         # wait for the metrics to be logged.
         # METRIC_DISK_TOTAL_GB is the last metric to be logged.
         while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0:
@@ -146,35 +229,22 @@ def test_monitor_system_metrics(tmp_dir, mocker):
 
         _, latest = parse_metrics(live)
 
-    schema = {}
-    for name, value in {
-        "step": 0,
-        METRIC_CPU_COUNT: 6,
-        METRIC_CPU_USAGE_PERCENT: 30.0,
-        METRIC_CPU_PARALLELIZATION_PERCENT: 50.0,
-        METRIC_RAM_USAGE_PERCENT: 50.0,
-        METRIC_RAM_USAGE_GB: 2.0,
-        METRIC_RAM_TOTAL_GB: 4.0,
-        f"{METRIC_DISK_USAGE_PERCENT}/main": 50.0,
-        f"{METRIC_DISK_USAGE_GB}/main": 16.0,
-        f"{METRIC_DISK_TOTAL_GB}/main": 32.0,
-    }.items():
-        dpath.new(schema, name, value)
-
+    schema = {"step": 0, **cpu_metrics}
     assert latest == S(schema)
 
 
 @pytest.mark.timeout(2)
-def test_monitor_system_timeseries(tmp_dir, mocker):
+def test_monitor_system_timeseries(tmp_dir, cpu_timeseries, mocker):
     mock_psutil_cpu(mocker)
     mock_psutil_ram(mocker)
     mock_psutil_disk(mocker)
+    mock_pynvml(mocker, num_gpus=0)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.cpu_monitor = CPUMonitor(interval=0.05, num_samples=4)
+        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
 
         # wait for the metrics to be logged.
         # METRIC_DISK_TOTAL_GB is the last metric to be logged.
@@ -185,121 +255,69 @@ def test_monitor_system_timeseries(tmp_dir, mocker):
 
         timeseries, _ = parse_metrics(live)
 
-    def timeserie_schema(name):
-        return [{name: str, "timestamp": str, "step": str(0)}]
-
-    # timeseries contains all the system metrics
     prefix = Path(tmp_dir) / "plots/metrics"
-    assert timeseries == S(
-        {
-            str(prefix / f"{METRIC_CPU_USAGE_PERCENT}.tsv"): timeserie_schema(
-                METRIC_CPU_USAGE_PERCENT.split("/")[-1]
-            ),
-            str(prefix / f"{METRIC_CPU_PARALLELIZATION_PERCENT}.tsv"): timeserie_schema(
-                METRIC_CPU_PARALLELIZATION_PERCENT.split("/")[-1]
-            ),
-            str(prefix / f"{METRIC_RAM_USAGE_PERCENT}.tsv"): timeserie_schema(
-                METRIC_RAM_USAGE_PERCENT.split("/")[-1]
-            ),
-            str(prefix / f"{METRIC_RAM_USAGE_GB}.tsv"): timeserie_schema(
-                METRIC_RAM_USAGE_GB.split("/")[-1]
-            ),
-            str(prefix / f"{METRIC_DISK_USAGE_PERCENT}/main.tsv"): timeserie_schema(
-                "main"
-            ),
-            str(prefix / f"{METRIC_DISK_USAGE_GB}/main.tsv"): timeserie_schema("main"),
-        }
-    )
-
+    schema = {str(prefix / name): value for name, value in cpu_timeseries.items()}
+    assert timeseries == S(schema)
 
-def mock_pynvml(mocker, num_gpus=2, crash_index=None):
-    mocker.patch("dvclive.monitor_system.GPU_AVAILABLE", return_value=num_gpus)
-    mocker.patch("dvclive.monitor_system.nvmlDeviceGetCount", return_value=num_gpus)
-    mocker.patch("dvclive.monitor_system.nvmlInit", return_value=None)
-    mocker.patch("dvclive.monitor_system.nvmlShutdown", return_value=None)
 
-    mocked_memory_info = mocker.MagicMock()
-    mocked_memory_info.used = 3 * 1024**3
-    mocked_memory_info.total = 5 * 1024**3
-
-    mocked_utilization_rate = mocker.MagicMock()
-    mocked_utilization_rate.memory = 5
-    mocked_utilization_rate.gpu = 10
-
-    mocking_dict = {
-        "nvmlDeviceGetHandleByIndex": None,
-        "nvmlDeviceGetMemoryInfo": mocked_memory_info,
-        "nvmlDeviceGetUtilizationRates": mocked_utilization_rate,
-    }
-
-    for function_name, return_value in mocking_dict.items():
-        mocker.patch(
-            f"dvclive.monitor_system.{function_name}",
-            return_value=return_value,
-        )
-
-
-def test_get_gpus_metrics_mocker(mocker, tmp_dir):
+@pytest.mark.timeout(2)
+def test_monitor_system_metrics_with_gpu(tmp_dir, cpu_metrics, mocker):
+    mock_psutil_cpu(mocker)
+    mock_psutil_ram(mocker)
+    mock_psutil_disk(mocker)
     mock_pynvml(mocker, num_gpus=2)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        monitor = GPUMonitor()
-        monitor(live)
-        metrics = monitor._get_metrics()
-        monitor.end()
-    assert "system/gpu/usage (%)/0" in metrics
-    assert "system/gpu/usage (%)/1" in metrics
-    assert "system/vram/usage (%)/0" in metrics
-    assert "system/vram/usage (%)/1" in metrics
-    assert "system/vram/usage (GB)/0" in metrics
-    assert "system/vram/usage (GB)/1" in metrics
-    assert "system/vram/total (GB)/0" in metrics
-    assert "system/vram/total (GB)/1" in metrics
+        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
+        # wait for the metrics to be logged.
+        # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU.
+        while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0:
+            time.sleep(0.001)
+        live.next_step()
 
+        _, latest = parse_metrics(live)
 
-def test_monitor_gpu_system(tmp_dir, mocker):
+    schema = {"step": 0, **cpu_metrics}
+    gpu_content = {
+        METRIC_GPU_COUNT: 2,
+        f"{METRIC_GPU_USAGE_PERCENT}": {"0": 50.0, "1": 50.0},
+        f"{METRIC_VRAM_USAGE_PERCENT}": {"0": 50.0, "1": 50.0},
+        f"{METRIC_VRAM_USAGE_GB}": {"0": 3.0, "1": 3.0},
+        f"{METRIC_VRAM_TOTAL_GB}": {"0": 6.0, "1": 6.0},
+    }
+    for name, value in gpu_content.items():
+        dpath.new(schema, name, value)
+    assert latest == S(schema)
+
+
+@pytest.mark.timeout(2)
+def test_monitor_system_timeseries_with_gpu(
+    tmp_dir, cpu_timeseries, gpu_timeseries, mocker
+):
     mock_psutil_cpu(mocker)
     mock_psutil_ram(mocker)
     mock_psutil_disk(mocker)
-    mock_pynvml(mocker, num_gpus=1)
+    mock_pynvml(mocker, num_gpus=2)
     with Live(
         tmp_dir,
         save_dvc_exp=False,
-        monitor_system=True,
+        monitor_system=False,
     ) as live:
-        time.sleep(5 + 1)  # allow the thread to finish
+        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
+
+        # wait for the metrics to be logged.
+        # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU.
+        while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0:
+            time.sleep(0.001)
+
         live.next_step()
-        time.sleep(5 + 1)  # allow the thread to finish
-        timeseries, latest = parse_metrics(live)
-
-    # metrics.json records CPU and RAM metrics if GPU detected
-    assert "system" in latest
-    assert "cpu" in latest["system"]
-    assert "ram" in latest["system"]
-    assert "disk" in latest["system"]
-
-    # metrics.json file contains all the system metrics
-    assert "gpu" in latest["system"]
-    assert "count" in latest["system"]["gpu"]
-    assert "usage (%)" in latest["system"]["gpu"]
-    assert "0" in latest["system"]["gpu"]["usage (%)"]
-    assert "vram" in latest["system"]
-    assert "usage (%)" in latest["system"]["vram"]
-    assert "0" in latest["system"]["vram"]["usage (%)"]
-    assert "usage (GB)" in latest["system"]["vram"]
-    assert "0" in latest["system"]["vram"]["usage (GB)"]
-    assert "total (GB)" in latest["system"]["vram"]
-    assert "0" in latest["system"]["vram"]["total (GB)"]
-
-    # timeseries contains all the system metrics
-    assert any(str(Path("system/gpu/usage (%)/0.tsv")) in key for key in timeseries)
-    assert any(str(Path("system/vram/usage (%)/0.tsv")) in key for key in timeseries)
-    assert any(str(Path("system/vram/usage (GB)/0.tsv")) in key for key in timeseries)
-    assert all(len(timeseries[key]) == 2 for key in timeseries if "system" in key)
-
-    # blacklisted timeseries
-    assert all(str(Path("system/gpu/count.tsv")) not in key for key in timeseries)
-    assert all(str(Path("system/vram/total (GB).tsv")) not in key for key in timeseries)
+
+        timeseries, _ = parse_metrics(live)
+
+    prefix = Path(tmp_dir) / "plots/metrics"
+    schema = {str(prefix / name): value for name, value in cpu_timeseries.items()}
+    schema.update({str(prefix / name): value for name, value in gpu_timeseries.items()})
+    assert timeseries == S(schema)

From 506dd5211e39d33baa7e7a206023a0f73314ef22 Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Wed, 21 Feb 2024 10:09:26 +0100
Subject: [PATCH 36/39] change pyproject dependencies and fix typo

---
 pyproject.toml                | 9 ++++-----
 src/dvclive/monitor_system.py | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7f1a9575..970ec8ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,9 +37,7 @@ dependencies = [
   "funcy",
   "gto",
   "ruamel.yaml",
-  "scmrepo",
-  "psutil",
-  "pynvml"
+  "scmrepo"
 ]
 
 [project.optional-dependencies]
@@ -52,7 +50,7 @@ tests = [
   "pytest-sugar>=0.9.6,<1.0",
   "pytest-cov>=3.0.0,<4.0",
   "pytest-mock>=3.8.2,<4.0",
-  "dvclive[image,plots,markdown]",
+  "dvclive[image,plots,markdown,system]",
   "ipython",
   "pytest_voluptuous",
   "dpath"
@@ -71,8 +69,9 @@ catalyst = ["catalyst>22"]
 fastai = ["fastai"]
 lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"]
 optuna = ["optuna"]
+system = ["psutil", "pynvml"]
 all = [
-  "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]"
+  "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown,system]"
 ]
 
 [project.urls]
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index df0293f9..84803917 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -177,7 +177,7 @@ def _get_metrics(self) -> Dict[str, Union[float, int]]:
             **self._get_disk_info(),
         }
         if GPU_AVAILABLE:
-            result.update(self.__get_gpu_info())
+            result.update(self._get_gpu_info())
         return result
 
     def _get_ram_info(self) -> Dict[str, Union[float, int]]:
@@ -228,7 +228,7 @@ def _get_disk_info(self) -> Dict[str, Union[float, int]]:
             result.update(disk_metrics)
         return result
 
-    def __get_gpu_info(self) -> Dict[str, Union[float, int]]:
+    def _get_gpu_info(self) -> Dict[str, Union[float, int]]:
         nvmlInit()
         num_gpus = nvmlDeviceGetCount()
         gpu_metrics = {

From 5df5d48adf260ff7445897bf6a191b6095f9e55b Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 22 Feb 2024 09:40:56 +0100
Subject: [PATCH 37/39] install psutil and pynvml by default

---
 pyproject.toml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 970ec8ba..7f1a9575 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,9 @@ dependencies = [
   "funcy",
   "gto",
   "ruamel.yaml",
-  "scmrepo"
+  "scmrepo",
+  "psutil",
+  "pynvml"
 ]
 
 [project.optional-dependencies]
@@ -50,7 +52,7 @@ tests = [
   "pytest-sugar>=0.9.6,<1.0",
   "pytest-cov>=3.0.0,<4.0",
   "pytest-mock>=3.8.2,<4.0",
-  "dvclive[image,plots,markdown,system]",
+  "dvclive[image,plots,markdown]",
   "ipython",
   "pytest_voluptuous",
   "dpath"
@@ -69,9 +71,8 @@ catalyst = ["catalyst>22"]
 fastai = ["fastai"]
 lightning = ["lightning>=2.0", "torch", "jsonargparse[signatures]>=4.26.1"]
 optuna = ["optuna"]
-system = ["psutil", "pynvml"]
 all = [
-  "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown,system]"
+  "dvclive[image,mmcv,tf,xgb,lgbm,huggingface,catalyst,fastai,lightning,optuna,plots,markdown]"
 ]
 
 [project.urls]

From 29477ba72d329cf29b7a2d5f62ba9204eb18b1dd Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 22 Feb 2024 10:00:57 +0100
Subject: [PATCH 38/39] remove plot argument in SystemMonitor

---
 src/dvclive/live.py           | 4 ++--
 src/dvclive/monitor_system.py | 5 +----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 51501c0c..50ed33f0 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -122,8 +122,8 @@ def __init__(
                 provided string will be passed to `dvc exp save --message`.
                 If DVCLive is used inside `dvc exp run`, the option will be ignored, use
                 `dvc exp run --message` instead.
-            monitor_system (bool): if `True`, DVCLive will monitor CPU, ram, and disk
-                usage. Defaults to `False`.
+            monitor_system (bool): if `True`, DVCLive will monitor GPU, CPU, ram, and
+                disk usage. Defaults to `False`.
         """
         self.summary: Dict[str, Any] = {}
 
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 84803917..4e3e406c 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -61,7 +61,6 @@ class SystemMonitor:
             This argument expect a dict where the key is the name that will be used
             in the metric's name and the value is the path to the directory to
             monitor. Defaults to {"main": "/"}.
-        plot (bool): should the system metrics be saved as plots. Defaults to True.
 
     Raises:
         ValueError: if the keys in `directories_to_monitor` contains invalid characters
@@ -81,7 +80,6 @@ def __init__(
         interval: float = 0.05,  # seconds
         num_samples: int = 20,
         directories_to_monitor: Optional[Dict[str, str]] = None,
-        plot: bool = True,
     ):
         self._interval = self._check_interval(interval, max_interval=0.1)
         self._num_samples = self._check_num_samples(
@@ -90,7 +88,6 @@ def __init__(
         self._disks_to_monitor = self._check_directories_to_monitor(
             directories_to_monitor
         )
-        self._plot = plot
         self._warn_cpu_problem = True
         self._warn_gpu_problem = True
         self._warn_disk_doesnt_exist: Dict[str, bool] = {}
@@ -167,7 +164,7 @@ def _monitoring_loop(self):
                     name,
                     values / self._num_samples,
                     timestamp=True,
-                    plot=None if blacklisted else self._plot,
+                    plot=None if blacklisted else True,
                 )
 
     def _get_metrics(self) -> Dict[str, Union[float, int]]:

From 53d83d5fc14ad479ef917bfcaf45605b9b1c4ddb Mon Sep 17 00:00:00 2001
From: AlexandreKempf <alexandre.kempf@cri-paris.org>
Date: Thu, 22 Feb 2024 11:08:18 +0100
Subject: [PATCH 39/39] change call to the SystemMonitor object to be a Live
 method

---
 src/dvclive/live.py           | 47 ++++++++++++++++++------
 src/dvclive/monitor_system.py | 58 ++++++++++--------------------
 tests/test_monitor_system.py  | 68 ++++++++++++++++++++---------------
 3 files changed, 93 insertions(+), 80 deletions(-)

diff --git a/src/dvclive/live.py b/src/dvclive/live.py
index 50ed33f0..466887cd 100644
--- a/src/dvclive/live.py
+++ b/src/dvclive/live.py
@@ -42,7 +42,7 @@
 from .report import BLANK_NOTEBOOK_REPORT, make_report
 from .serialize import dump_json, dump_yaml, load_yaml
 from .studio import get_dvc_studio_config, post_to_studio
-from .monitor_system import SystemMonitor
+from .monitor_system import _SystemMonitor
 from .utils import (
     StrPath,
     catch_and_warn,
@@ -169,10 +169,9 @@ def __init__(
         self._dvc_studio_config: Dict[str, Any] = {}
         self._init_studio()
 
-        self._system_monitor = None
+        self._system_monitor: Optional[_SystemMonitor] = None  # Monitoring thread
         if monitor_system:
-            self._system_monitor = SystemMonitor()
-            self._system_monitor(self)
+            self.monitor_system()
 
     def _init_resume(self):
         self._read_params()
@@ -377,16 +376,42 @@ def step(self, value: int) -> None:
         self._step = value
         logger.debug(f"Step: {self.step}")
 
-    @property
-    def system_monitor(self) -> Optional[SystemMonitor]:
-        return self._system_monitor or None
+    def monitor_system(
+        self,
+        interval: float = 0.05,  # seconds
+        num_samples: int = 20,
+        directories_to_monitor: Optional[Dict[str, str]] = None,
+    ) -> None:
+        """Monitor GPU, CPU, ram, and disk resources and log them to DVC Live.
+
+        Args:
+            interval (float): the time interval between samples in seconds. To keep the
+                sampling interval small, the maximum value allowed is 0.1 seconds.
+                Default to 0.05.
+            num_samples (int): the number of samples to collect before the aggregation.
+                The value should be between 1 and 30 samples. Default to 20.
+            directories_to_monitor (Optional[Dict[str, str]]): a dictionary with the
+                information about which directories to monitor. The `key` would be the
+                name of the metric and the `value` is the path to the directory.
+                The metric tracked concerns the partition that contains the directory.
+                Default to `{"main": "/"}`.
+
+        Raises:
+            ValueError: if the keys in `directories_to_monitor` contains invalid
+                characters as defined by `os.path.normpath`.
+        """
+        if directories_to_monitor is None:
+            directories_to_monitor = {"main": "/"}
 
-    @system_monitor.setter
-    def system_monitor(self, system_monitor: SystemMonitor) -> None:
         if self._system_monitor is not None:
             self._system_monitor.end()
-        self._system_monitor = system_monitor
-        self._system_monitor(self)
+
+        self._system_monitor = _SystemMonitor(
+            live=self,
+            interval=interval,
+            num_samples=num_samples,
+            directories_to_monitor=directories_to_monitor,
+        )
 
     def sync(self):
         self.make_summary()
diff --git a/src/dvclive/monitor_system.py b/src/dvclive/monitor_system.py
index 4e3e406c..f6d26d83 100644
--- a/src/dvclive/monitor_system.py
+++ b/src/dvclive/monitor_system.py
@@ -1,13 +1,12 @@
 import logging
 import os
-from typing import Dict, Union, Optional, Tuple
+from typing import Dict, Union, Tuple
 
 import psutil
 from statistics import mean
 from threading import Event, Thread
 from funcy import merge_with
 
-
 try:
     from pynvml import (
         nvmlInit,
@@ -47,26 +46,7 @@
 METRIC_VRAM_TOTAL_GB = "system/vram/total (GB)"
 
 
-class SystemMonitor:
-    """Monitor CPU, ram, and disk resources and log them to DVC Live. Monitor also the
-    GPU resources if available.
-
-    Args:
-        interval (float): interval in seconds between two measurements.
-            Defaults to 0.05.
-        num_samples (int): number of samples to average. Defaults to 20.
-        directories_to_monitor (Optional[Dict[str, str]]): monitor disk usage
-            statistics about the partition which contains the given paths. The
-            statistics include total and used space in gygabytes and percent.
-            This argument expect a dict where the key is the name that will be used
-            in the metric's name and the value is the path to the directory to
-            monitor. Defaults to {"main": "/"}.
-
-    Raises:
-        ValueError: if the keys in `directories_to_monitor` contains invalid characters
-        as defined by `os.path.normpath`.
-    """
-
+class _SystemMonitor:
     _plot_blacklist_prefix: Tuple = (
         METRIC_CPU_COUNT,
         METRIC_RAM_TOTAL_GB,
@@ -77,10 +57,12 @@ class SystemMonitor:
 
     def __init__(
         self,
-        interval: float = 0.05,  # seconds
-        num_samples: int = 20,
-        directories_to_monitor: Optional[Dict[str, str]] = None,
+        live,
+        interval: float,  # seconds
+        num_samples: int,
+        directories_to_monitor: Dict[str, str],
     ):
+        self._live = live
         self._interval = self._check_interval(interval, max_interval=0.1)
         self._num_samples = self._check_num_samples(
             num_samples, min_num_samples=1, max_num_samples=30
@@ -92,6 +74,11 @@ def __init__(
         self._warn_gpu_problem = True
         self._warn_disk_doesnt_exist: Dict[str, bool] = {}
 
+        self._shutdown_event = Event()
+        Thread(
+            target=self._monitoring_loop,
+        ).start()
+
     def _check_interval(self, interval: float, max_interval: float) -> float:
         if interval > max_interval:
             logger.warning(
@@ -115,11 +102,8 @@ def _check_num_samples(
         return num_samples
 
     def _check_directories_to_monitor(
-        self, directories_to_monitor: Optional[Dict[str, str]]
+        self, directories_to_monitor: Dict[str, str]
     ) -> Dict[str, str]:
-        if directories_to_monitor is None:
-            return {"main": "/"}
-
         disks_to_monitor = {}
         for disk_name, disk_path in directories_to_monitor.items():
             if disk_name != os.path.normpath(disk_name):
@@ -130,13 +114,6 @@ def _check_directories_to_monitor(
             disks_to_monitor[disk_name] = disk_path
         return disks_to_monitor
 
-    def __call__(self, live):
-        self._live = live
-        self._shutdown_event = Event()
-        Thread(
-            target=self._monitoring_loop,
-        ).start()
-
     def _monitoring_loop(self):
         while not self._shutdown_event.is_set():
             self._metrics = {}
@@ -168,14 +145,12 @@ def _monitoring_loop(self):
                 )
 
     def _get_metrics(self) -> Dict[str, Union[float, int]]:
-        result = {
+        return {
+            **self._get_gpu_info(),
             **self._get_cpu_info(),
             **self._get_ram_info(),
             **self._get_disk_info(),
         }
-        if GPU_AVAILABLE:
-            result.update(self._get_gpu_info())
-        return result
 
     def _get_ram_info(self) -> Dict[str, Union[float, int]]:
         ram_info = psutil.virtual_memory()
@@ -226,6 +201,9 @@ def _get_disk_info(self) -> Dict[str, Union[float, int]]:
         return result
 
     def _get_gpu_info(self) -> Dict[str, Union[float, int]]:
+        if not GPU_AVAILABLE:
+            return {}
+
         nvmlInit()
         num_gpus = nvmlDeviceGetCount()
         gpu_metrics = {
diff --git a/tests/test_monitor_system.py b/tests/test_monitor_system.py
index 2476a0f0..d704f00b 100644
--- a/tests/test_monitor_system.py
+++ b/tests/test_monitor_system.py
@@ -7,7 +7,7 @@
 
 from dvclive import Live
 from dvclive.monitor_system import (
-    SystemMonitor,
+    _SystemMonitor,
     METRIC_CPU_COUNT,
     METRIC_CPU_USAGE_PERCENT,
     METRIC_CPU_PARALLELIZATION_PERCENT,
@@ -149,14 +149,10 @@ def test_monitor_system_is_false(tmp_dir, mocker):
     mock_psutil_ram(mocker)
     mock_psutil_disk(mocker)
     mock_pynvml(mocker, num_gpus=0)
-    system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor)
-    with Live(
-        tmp_dir,
-        save_dvc_exp=False,
-        monitor_system=False,
-    ) as live:
-        assert live.system_monitor is None
-
+    system_monitor_mock = mocker.patch(
+        "dvclive.live._SystemMonitor", spec=_SystemMonitor
+    )
+    Live(tmp_dir, save_dvc_exp=False, monitor_system=False)
     system_monitor_mock.assert_not_called()
 
 
@@ -165,23 +161,35 @@ def test_monitor_system_is_true(tmp_dir, mocker):
     mock_psutil_ram(mocker)
     mock_psutil_disk(mocker)
     mock_pynvml(mocker, num_gpus=0)
-    system_monitor_mock = mocker.patch("dvclive.live.SystemMonitor", spec=SystemMonitor)
+    system_monitor_mock = mocker.patch(
+        "dvclive.live._SystemMonitor", spec=_SystemMonitor
+    )
+
+    Live(tmp_dir, save_dvc_exp=False, monitor_system=True)
+    system_monitor_mock.assert_called_once()
+
+
+def test_all_threads_close(tmp_dir, mocker):
+    mock_psutil_cpu(mocker)
+    mock_psutil_ram(mocker)
+    mock_psutil_disk(mocker)
+    mock_pynvml(mocker, num_gpus=0)
 
     with Live(
         tmp_dir,
         save_dvc_exp=False,
         monitor_system=True,
     ) as live:
-        system_monitor = live.system_monitor
+        first_end_spy = mocker.spy(live._system_monitor, "end")
+        first_end_spy.assert_not_called()
 
-        assert isinstance(system_monitor, SystemMonitor)
-        system_monitor_mock.assert_called_once()
+        live.monitor_system(interval=0.01)
+        first_end_spy.assert_called_once()
 
-        end_spy = mocker.spy(system_monitor, "end")
-        end_spy.assert_not_called()
+        second_end_spy = mocker.spy(live._system_monitor, "end")
 
     # check the monitoring thread is stopped
-    end_spy.assert_called_once()
+    second_end_spy.assert_called_once()
 
 
 def test_ignore_non_existent_directories(tmp_dir, mocker):
@@ -195,12 +203,14 @@ def test_ignore_non_existent_directories(tmp_dir, mocker):
         monitor_system=False,
     ) as live:
         non_existent_disk = "/non-existent"
-        monitor = SystemMonitor(
-            directories_to_monitor={"main": "/", "non-existent": non_existent_disk}
+        system_monitor = _SystemMonitor(
+            live=live,
+            interval=0.1,
+            num_samples=4,
+            directories_to_monitor={"main": "/", "non-existent": non_existent_disk},
         )
-        monitor(live)
-        metrics = monitor._get_metrics()
-        monitor.end()
+        metrics = system_monitor._get_metrics()
+        system_monitor.end()
 
     assert not Path(non_existent_disk).exists()
 
@@ -220,7 +230,7 @@ def test_monitor_system_metrics(tmp_dir, cpu_metrics, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
+        live.monitor_system(interval=0.05, num_samples=4)
         # wait for the metrics to be logged.
         # METRIC_DISK_TOTAL_GB is the last metric to be logged.
         while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0:
@@ -244,7 +254,7 @@ def test_monitor_system_timeseries(tmp_dir, cpu_timeseries, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
+        live.monitor_system(interval=0.05, num_samples=4)
 
         # wait for the metrics to be logged.
         # METRIC_DISK_TOTAL_GB is the last metric to be logged.
@@ -271,10 +281,10 @@ def test_monitor_system_metrics_with_gpu(tmp_dir, cpu_metrics, mocker):
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
+        live.monitor_system(interval=0.05, num_samples=4)
         # wait for the metrics to be logged.
-        # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU.
-        while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0:
+        # METRIC_DISK_TOTAL_GB is the last metric to be logged.
+        while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0:
             time.sleep(0.001)
         live.next_step()
 
@@ -306,11 +316,11 @@ def test_monitor_system_timeseries_with_gpu(
         save_dvc_exp=False,
         monitor_system=False,
     ) as live:
-        live.system_monitor = SystemMonitor(interval=0.05, num_samples=4)
+        live.monitor_system(interval=0.05, num_samples=4)
 
         # wait for the metrics to be logged.
-        # f"{METRIC_VRAM_TOTAL_GB}/1" is the last metric to be logged if there is a GPU.
-        while len(dpath.search(live.summary, f"{METRIC_VRAM_TOTAL_GB}/1")) == 0:
+        # METRIC_DISK_TOTAL_GB is the last metric to be logged.
+        while len(dpath.search(live.summary, METRIC_DISK_TOTAL_GB)) == 0:
             time.sleep(0.001)
 
         live.next_step()